linux-btrfs.vger.kernel.org archive mirror
* [PATCH v2] btrfs: fix check_shared for fiemap ioctl
@ 2016-06-01  5:48 Lu Fengqi
  2016-06-01 21:15 ` Mark Fasheh
  2016-06-03 14:02 ` Josef Bacik
  0 siblings, 2 replies; 12+ messages in thread
From: Lu Fengqi @ 2016-06-01  5:48 UTC (permalink / raw)
  To: linux-btrfs; +Cc: dsterba, Lu Fengqi

check_shared only identified an extent as shared when it was referenced
with a different root_id or a different object_id. However, if an extent
is referenced from different offsets of the same file, it should also be
identified as shared. In addition, check_shared's loop is at least O(n^3),
so an extent with too many references can even cause a soft lockup.

First, add all delayed refs to the ref_tree and calculate the unique_refs;
if unique_refs is greater than one, return BACKREF_FOUND_SHARED. Then add
the on-disk references (inline/keyed) to the ref_tree one by one and check
after each addition whether unique_refs exceeds one. Because we return
SHARED as soon as there are two references, the time complexity is close
to constant.

Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
---
 fs/btrfs/backref.c   | 349 +++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/extent_io.c |  18 ++-
 2 files changed, 357 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 80e8472..325a144 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/vmalloc.h>
+#include <linux/rbtree.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
@@ -34,6 +35,253 @@ struct extent_inode_elem {
 	struct extent_inode_elem *next;
 };
 
+/*
+ * ref_root is used as the root of the ref tree that holds a collection
+ * of unique references.
+ */
+struct ref_root {
+	struct rb_root rb_root;
+
+	/*
+	 * unique_refs is the number of ref_nodes with a positive count
+	 * stored in the tree. Even if a ref_node with a count greater than
+	 * one is added, unique_refs only increases by one.
+	 */
+	unsigned int unique_refs;
+};
+
+/* ref_node is used to store a unique reference to the ref tree. */
+struct ref_node {
+	struct rb_node rb_node;
+
+	/* for NORMAL_REF, otherwise all these fields should be set to 0 */
+	u64 root_id;
+	u64 object_id;
+	u64 offset;
+
+	/* for SHARED_REF, otherwise parent field should be set to 0 */
+	u64 parent;
+
+	/* ref to the ref_mod of btrfs_delayed_ref_node(delayed-ref.h) */
+	int ref_mod;
+};
+
+/* dynamically allocate and initialize a ref_root */
+static struct ref_root *ref_root_alloc(void)
+{
+	struct ref_root *ref_tree;
+
+	ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
+	if (!ref_tree)
+		return NULL;
+
+	ref_tree->rb_root = RB_ROOT;
+	ref_tree->unique_refs = 0;
+
+	return ref_tree;
+}
+
+/* free all nodes in the ref tree, and reinit ref_root */
+static void ref_root_fini(struct ref_root *ref_tree)
+{
+	struct ref_node *node;
+	struct rb_node *next;
+
+	while ((next = rb_first(&ref_tree->rb_root)) != NULL) {
+		node = rb_entry(next, struct ref_node, rb_node);
+		rb_erase(next, &ref_tree->rb_root);
+		kfree(node);
+	}
+
+	ref_tree->rb_root = RB_ROOT;
+	ref_tree->unique_refs = 0;
+}
+
+/* free dynamically allocated ref_root */
+static void ref_root_free(struct ref_root *ref_tree)
+{
+	if (!ref_tree)
+		return;
+
+	ref_root_fini(ref_tree);
+	kfree(ref_tree);
+}
+
+/*
+ * search ref_node with (root_id, object_id, offset, parent) in the tree
+ *
+ * if found, the pointer of the ref_node will be returned;
+ * if not found, NULL will be returned; pos will point to the rb_node for
+ * insertion, and pos_parent will point to pos's parent;
+ */
+static struct ref_node *__ref_tree_search(struct ref_root *ref_tree,
+					  struct rb_node ***pos,
+					  struct rb_node **pos_parent,
+					  u64 root_id, u64 object_id,
+					  u64 offset, u64 parent)
+{
+	struct ref_node *cur = NULL;
+
+	*pos = &ref_tree->rb_root.rb_node;
+
+	while (**pos) {
+		*pos_parent = **pos;
+		cur = rb_entry(*pos_parent, struct ref_node, rb_node);
+
+		if (cur->root_id < root_id) {
+			*pos = &(**pos)->rb_right;
+			continue;
+		} else if (cur->root_id > root_id) {
+			*pos = &(**pos)->rb_left;
+			continue;
+		}
+
+		if (cur->object_id < object_id) {
+			*pos = &(**pos)->rb_right;
+			continue;
+		} else if (cur->object_id > object_id) {
+			*pos = &(**pos)->rb_left;
+			continue;
+		}
+
+		if (cur->offset < offset) {
+			*pos = &(**pos)->rb_right;
+			continue;
+		} else if (cur->offset > offset) {
+			*pos = &(**pos)->rb_left;
+			continue;
+		}
+
+		if (cur->parent < parent) {
+			*pos = &(**pos)->rb_right;
+			continue;
+		} else if (cur->parent > parent) {
+			*pos = &(**pos)->rb_left;
+			continue;
+		}
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * insert a ref_node into the ref tree
+ * @pos specifies the position at which to insert
+ * @pos_parent specifies pos's parent
+ *
+ * on success, return 0;
+ * if the ref_node already exists, return -EEXIST;
+ */
+static int ref_tree_insert(struct ref_root *ref_tree, struct rb_node **pos,
+			   struct rb_node *pos_parent, struct ref_node *ins)
+{
+	struct rb_node **p = NULL;
+	struct rb_node *parent = NULL;
+	struct ref_node *cur = NULL;
+
+	if (!pos) {
+		cur = __ref_tree_search(ref_tree, &p, &parent, ins->root_id,
+					ins->object_id, ins->offset,
+					ins->parent);
+		if (cur)
+			return -EEXIST;
+	} else {
+		p = pos;
+		parent = pos_parent;
+	}
+
+	rb_link_node(&ins->rb_node, parent, p);
+	rb_insert_color(&ins->rb_node, &ref_tree->rb_root);
+
+	return 0;
+}
+
+/* erase and free ref_node, caller should update ref_root->unique_refs */
+static void ref_tree_remove(struct ref_root *ref_tree, struct ref_node *node)
+{
+	rb_erase(&node->rb_node, &ref_tree->rb_root);
+	kfree(node);
+}
+
+/*
+ * update ref_root->unique_refs
+ *
+ * call __ref_tree_search
+ *	1. if the ref_node doesn't exist, insert it with ref_tree_insert
+ *	and update ref_root->unique_refs:
+ *		if ref_node->ref_mod > 0, ref_root->unique_refs++;
+ *		if ref_node->ref_mod < 0, do nothing;
+ *
+ *	2. if the ref_node is found, record the original ref_node->ref_mod
+ *	and update ref_node->ref_mod.
+ *		if ref_node->ref_mod becomes 0, call ref_tree_remove
+ *
+ *		according to origin_mod and new_mod, update ref_root->unique_refs:
+ *		+----------------+--------------+-------------+
+ *		|		 |new_count <= 0|new_count > 0|
+ *		+----------------+--------------+-------------+
+ *		|origin_count < 0|       0      |      1      |
+ *		+----------------+--------------+-------------+
+ *		|origin_count > 0|      -1      |      0      |
+ *		+----------------+--------------+-------------+
+ *
+ * In case of allocation failure, -ENOMEM is returned and the ref_tree stays
+ * unaltered.
+ * On success, 0 is returned.
+ */
+static int ref_tree_add(struct ref_root *ref_tree, u64 root_id, u64 object_id,
+			u64 offset, u64 parent, int count)
+{
+	struct ref_node *node = NULL;
+	struct rb_node **pos = NULL;
+	struct rb_node *pos_parent = NULL;
+	int origin_count;
+	int ret;
+
+	if (!count)
+		return 0;
+
+	node = __ref_tree_search(ref_tree, &pos, &pos_parent, root_id,
+				 object_id, offset, parent);
+	if (node == NULL) {
+		node = kmalloc(sizeof(*node), GFP_KERNEL);
+		if (!node)
+			return -ENOMEM;
+
+		node->root_id = root_id;
+		node->object_id = object_id;
+		node->offset = offset;
+		node->parent = parent;
+		node->ref_mod = count;
+
+		ret = ref_tree_insert(ref_tree, pos, pos_parent, node);
+		ASSERT(!ret);
+		if (ret) {
+			kfree(node);
+			return ret;
+		}
+
+		ref_tree->unique_refs += node->ref_mod > 0 ? 1 : 0;
+
+		return 0;
+	}
+
+	origin_count = node->ref_mod;
+	node->ref_mod += count;
+
+	if (node->ref_mod > 0)
+		ref_tree->unique_refs += origin_count > 0 ? 0 : 1;
+	else
+		ref_tree->unique_refs += origin_count > 0 ? -1 : 0;
+
+	/* remove last: ref_tree_remove() frees the node */
+	if (!node->ref_mod)
+		ref_tree_remove(ref_tree, node);
+
+	return 0;
+}
+
 static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
 				struct btrfs_file_extent_item *fi,
 				u64 extent_item_pos,
@@ -699,6 +947,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
 			     int *info_level, struct list_head *prefs,
+			     struct ref_root *ref_tree,
 			     u64 *total_refs, u64 inum)
 {
 	int ret = 0;
@@ -766,6 +1015,13 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
 					       bytenr, count, GFP_NOFS);
+			if (ref_tree) {
+				if (!ret)
+					ret = ref_tree_add(ref_tree, 0, 0, 0,
+							   bytenr, count);
+				if (!ret && ref_tree->unique_refs > 1)
+					ret = BACKREF_FOUND_SHARED;
+			}
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
@@ -793,6 +1049,15 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
+			if (ref_tree) {
+				if (!ret)
+					ret = ref_tree_add(ref_tree, root,
+							   key.objectid,
+							   key.offset, 0,
+							   count);
+				if (!ret && ref_tree->unique_refs > 1)
+					ret = BACKREF_FOUND_SHARED;
+			}
 			break;
 		}
 		default:
@@ -811,7 +1076,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
  */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			    struct btrfs_path *path, u64 bytenr,
-			    int info_level, struct list_head *prefs, u64 inum)
+			    int info_level, struct list_head *prefs,
+			    struct ref_root *ref_tree, u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -854,6 +1120,13 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 			ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
 						bytenr, count, GFP_NOFS);
+			if (ref_tree) {
+				if (!ret)
+					ret = ref_tree_add(ref_tree, 0, 0, 0,
+							   bytenr, count);
+				if (!ret && ref_tree->unique_refs > 1)
+					ret = BACKREF_FOUND_SHARED;
+			}
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
@@ -882,6 +1155,15 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
+			if (ref_tree) {
+				if (!ret)
+					ret = ref_tree_add(ref_tree, root,
+							   key.objectid,
+							   key.offset, 0,
+							   count);
+				if (!ret && ref_tree->unique_refs > 1)
+					ret = BACKREF_FOUND_SHARED;
+			}
 			break;
 		}
 		default:
@@ -908,13 +1190,16 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  * commit root.
  * The special case is for qgroup to search roots in commit_transaction().
  *
+ * If check_shared is set to 1, BACKREF_FOUND_SHARED is returned immediately
+ * for any extent that has more than one ref item.
+ *
  * FIXME some caching might speed things up
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 time_seq, struct ulist *refs,
 			     struct ulist *roots, const u64 *extent_item_pos,
-			     u64 root_objectid, u64 inum)
+			     u64 root_objectid, u64 inum, int check_shared)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -926,6 +1211,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct list_head prefs;
 	struct __prelim_ref *ref;
 	struct extent_inode_elem *eie = NULL;
+	struct ref_root *ref_tree = NULL;
 	u64 total_refs = 0;
 
 	INIT_LIST_HEAD(&prefs);
@@ -957,6 +1243,18 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 again:
 	head = NULL;
 
+	if (check_shared) {
+		if (!ref_tree) {
+			ref_tree = ref_root_alloc();
+			if (!ref_tree) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		} else {
+			ref_root_fini(ref_tree);
+		}
+	}
+
 	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -1001,6 +1299,36 @@ again:
 		} else {
 			spin_unlock(&delayed_refs->lock);
 		}
+
+		if (check_shared && !list_empty(&prefs_delayed)) {
+			/*
+			 * Add all delayed refs to the ref_tree and check
+			 * whether multiple ref items were added.
+			 */
+			list_for_each_entry(ref, &prefs_delayed, list) {
+				if (ref->key_for_search.type) {
+					ret = ref_tree_add(ref_tree,
+						ref->root_id,
+						ref->key_for_search.objectid,
+						ref->key_for_search.offset,
+						0, ref->count);
+					if (ret)
+						goto out;
+				} else {
+					ret = ref_tree_add(ref_tree, 0, 0, 0,
+						     ref->parent, ref->count);
+					if (ret)
+						goto out;
+				}
+			}
+
+			if (ref_tree->unique_refs > 1) {
+				ret = BACKREF_FOUND_SHARED;
+				goto out;
+			}
+		}
 	}
 
 	if (path->slots[0]) {
@@ -1016,11 +1344,13 @@ again:
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
 						&info_level, &prefs,
-						&total_refs, inum);
+						ref_tree, &total_refs,
+						inum);
 			if (ret)
 				goto out;
 			ret = __add_keyed_refs(fs_info, path, bytenr,
-					       info_level, &prefs, inum);
+					       info_level, &prefs,
+					       ref_tree, inum);
 			if (ret)
 				goto out;
 		}
@@ -1105,6 +1435,7 @@ again:
 
 out:
 	btrfs_free_path(path);
+	ref_root_free(ref_tree);
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		list_del(&ref->list);
@@ -1158,8 +1489,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 	if (!*leafs)
 		return -ENOMEM;
 
-	ret = find_parent_nodes(trans, fs_info, bytenr,
-				time_seq, *leafs, NULL, extent_item_pos, 0, 0);
+	ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
+				*leafs, NULL, extent_item_pos, 0, 0, 0);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1201,8 +1532,8 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
-		ret = find_parent_nodes(trans, fs_info, bytenr,
-					time_seq, tmp, *roots, NULL, 0, 0);
+		ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
+					tmp, *roots, NULL, 0, 0, 0);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
@@ -1272,7 +1603,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
-					roots, NULL, root_objectid, inum);
+					roots, NULL, root_objectid, inum, 1);
 		if (ret == BACKREF_FOUND_SHARED) {
 			/* this is the only condition under which we return 1 */
 			ret = 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 76a0c85..a242e37 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,6 +20,7 @@
 #include "locking.h"
 #include "rcu-string.h"
 #include "backref.h"
+#include "transaction.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -4471,21 +4472,36 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
 		} else if (fieinfo->fi_extents_max) {
+			struct btrfs_trans_handle *trans;
+
 			u64 bytenr = em->block_start -
 				(em->start - em->orig_start);
 
 			disko = em->block_start + offset_in_extent;
 
 			/*
+			 * We need a trans handle to get delayed refs
+			 */
+			trans = btrfs_join_transaction(root);
+			/*
+			 * It's OK if we can't start a trans;
+			 * we can still check from the commit root.
+			 */
+			if (IS_ERR(trans))
+				trans = NULL;
+
+			/*
 			 * As btrfs supports shared space, this information
 			 * can be exported to userspace tools via
 			 * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
 			 * then we're just getting a count and we can skip the
 			 * lookup stuff.
 			 */
-			ret = btrfs_check_shared(NULL, root->fs_info,
+			ret = btrfs_check_shared(trans, root->fs_info,
 						 root->objectid,
 						 btrfs_ino(inode), bytenr);
+			if (trans)
+				btrfs_end_transaction(trans, root);
 			if (ret < 0)
 				goto out_free;
 			if (ret)
-- 
2.5.5





* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-01  5:48 [PATCH v2] btrfs: fix check_shared for fiemap ioctl Lu Fengqi
@ 2016-06-01 21:15 ` Mark Fasheh
  2016-06-01 21:24   ` Mark Fasheh
                     ` (2 more replies)
  2016-06-03 14:02 ` Josef Bacik
  1 sibling, 3 replies; 12+ messages in thread
From: Mark Fasheh @ 2016-06-01 21:15 UTC (permalink / raw)
  To: Lu Fengqi; +Cc: linux-btrfs, dsterba

Thanks for trying to fix this problem, comments below.

On Wed, Jun 01, 2016 at 01:48:05PM +0800, Lu Fengqi wrote:
> check_shared only identified an extent as shared when it was referenced
> with a different root_id or a different object_id. However, if an extent
> is referenced from different offsets of the same file, it should also be
> identified as shared. In addition, check_shared's loop is at least O(n^3),
> so an extent with too many references can even cause a soft lockup.
> 
> First, add all delayed refs to the ref_tree and calculate the unique_refs;
> if unique_refs is greater than one, return BACKREF_FOUND_SHARED. Then add
> the on-disk references (inline/keyed) to the ref_tree one by one and check
> after each addition whether unique_refs exceeds one. Because we return
> SHARED as soon as there are two references, the time complexity is close
> to constant.

Constant time in the best case, but still n^3 in the worst case right? I'm
not complaining btw, I just want to be sure we're not over promising  :)


> @@ -34,6 +35,253 @@ struct extent_inode_elem {
>  	struct extent_inode_elem *next;
>  };
>  
> +/*
> + * ref_root is used as the root of the ref tree that holds a collection
> + * of unique references.
> + */
> +struct ref_root {
> +	struct rb_root rb_root;
> +
> +	/*
> +	 * unique_refs is the number of ref_nodes with a positive count
> +	 * stored in the tree. Even if a ref_node with a count greater than
> +	 * one is added, unique_refs only increases by one.
> +	 */
> +	unsigned int unique_refs;
> +};
> +
> +/* ref_node is used to store a unique reference to the ref tree. */
> +struct ref_node {
> +	struct rb_node rb_node;
> +
> +	/* for NORMAL_REF, otherwise all these fields should be set to 0 */
> +	u64 root_id;
> +	u64 object_id;
> +	u64 offset;
> +
> +	/* for SHARED_REF, otherwise parent field should be set to 0 */
> +	u64 parent;
> +
> +	/* ref to the ref_mod of btrfs_delayed_ref_node(delayed-ref.h) */
> +	int ref_mod;
> +};

Why are we mirroring so much of the backref structures here? It seems like
we're just throwing layers on top of layers. Can't we modify the backref
structures and code to handle whatever small amount of unique accounting you
must do?



> +/* dynamically allocate and initialize a ref_root */
> +static struct ref_root *ref_root_alloc(void)
> +{
> +	struct ref_root *ref_tree;
> +
> +	ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);

I'm pretty sure we want GFP_NOFS here.


> +
> +/*
> + * search ref_node with (root_id, object_id, offset, parent) in the tree
> + *
> + * if found, the pointer of the ref_node will be returned;
> + * if not found, NULL will be returned; pos will point to the rb_node for
> + * insertion, and pos_parent will point to pos's parent;
> + */
> +static struct ref_node *__ref_tree_search(struct ref_root *ref_tree,
> +					  struct rb_node ***pos,
> +					  struct rb_node **pos_parent,
> +					  u64 root_id, u64 object_id,
> +					  u64 offset, u64 parent)
> +{
> +	struct ref_node *cur = NULL;
> +
> +	*pos = &ref_tree->rb_root.rb_node;
> +
> +	while (**pos) {
> +		*pos_parent = **pos;
> +		cur = rb_entry(*pos_parent, struct ref_node, rb_node);
> +
> +		if (cur->root_id < root_id) {
> +			*pos = &(**pos)->rb_right;
> +			continue;
> +		} else if (cur->root_id > root_id) {
> +			*pos = &(**pos)->rb_left;
> +			continue;
> +		}
> +
> +		if (cur->object_id < object_id) {
> +			*pos = &(**pos)->rb_right;
> +			continue;
> +		} else if (cur->object_id > object_id) {
> +			*pos = &(**pos)->rb_left;
> +			continue;
> +		}
> +
> +		if (cur->offset < offset) {
> +			*pos = &(**pos)->rb_right;
> +			continue;
> +		} else if (cur->offset > offset) {
> +			*pos = &(**pos)->rb_left;
> +			continue;
> +		}
> +
> +		if (cur->parent < parent) {
> +			*pos = &(**pos)->rb_right;
> +			continue;
> +		} else if (cur->parent > parent) {
> +			*pos = &(**pos)->rb_left;
> +			continue;
> +		}

These 4 comparisons need to be in their own cmp() function - we want to
avoid open coded comparisons with an rbtree search. This is a pretty
standard way to handle it.
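
For illustration, such a helper might look like this (just a sketch, not
part of the patch; ref_node_cmp is a hypothetical name):

static int ref_node_cmp(struct ref_node *cur, u64 root_id, u64 object_id,
			u64 offset, u64 parent)
{
	/* compare keys in (root_id, object_id, offset, parent) order */
	if (cur->root_id != root_id)
		return cur->root_id < root_id ? -1 : 1;
	if (cur->object_id != object_id)
		return cur->object_id < object_id ? -1 : 1;
	if (cur->offset != offset)
		return cur->offset < offset ? -1 : 1;
	if (cur->parent != parent)
		return cur->parent < parent ? -1 : 1;
	return 0;
}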


> +
> +		return cur;
> +	}
> +
> +	return NULL;
> +}
> +
> +/*
> + * insert a ref_node into the ref tree
> + * @pos specifies the position at which to insert
> + * @pos_parent specifies pos's parent
> + *
> + * on success, return 0;
> + * if the ref_node already exists, return -EEXIST;
> + */
> +static int ref_tree_insert(struct ref_root *ref_tree, struct rb_node **pos,
> +			   struct rb_node *pos_parent, struct ref_node *ins)
> +{
> +	struct rb_node **p = NULL;
> +	struct rb_node *parent = NULL;
> +	struct ref_node *cur = NULL;
> +
> +	if (!pos) {
> +		cur = __ref_tree_search(ref_tree, &p, &parent, ins->root_id,
> +					ins->object_id, ins->offset,
> +					ins->parent);
> +		if (cur)
> +			return -EEXIST;
> +	} else {
> +		p = pos;
> +		parent = pos_parent;
> +	}
> +
> +	rb_link_node(&ins->rb_node, parent, p);
> +	rb_insert_color(&ins->rb_node, &ref_tree->rb_root);
> +
> +	return 0;
> +}
> +
> +/* erase and free ref_node, caller should update ref_root->unique_refs */
> +static void ref_tree_remove(struct ref_root *ref_tree, struct ref_node *node)
> +{
> +	rb_erase(&node->rb_node, &ref_tree->rb_root);
> +	kfree(node);
> +}
> +
> +/*
> + * update ref_root->unique_refs
> + *
> + * call __ref_tree_search
> + *	1. if the ref_node doesn't exist, insert it with ref_tree_insert
> + *	and update ref_root->unique_refs:
> + *		if ref_node->ref_mod > 0, ref_root->unique_refs++;
> + *		if ref_node->ref_mod < 0, do nothing;
> + *
> + *	2. if the ref_node is found, record the original ref_node->ref_mod
> + *	and update ref_node->ref_mod.
> + *		if ref_node->ref_mod becomes 0, call ref_tree_remove
> + *
> + *		according to origin_mod and new_mod, update ref_root->unique_refs:
> + *		+----------------+--------------+-------------+
> + *		|		 |new_count <= 0|new_count > 0|
> + *		+----------------+--------------+-------------+
> + *		|origin_count < 0|       0      |      1      |
> + *		+----------------+--------------+-------------+
> + *		|origin_count > 0|      -1      |      0      |
> + *		+----------------+--------------+-------------+
> + *
> + * In case of allocation failure, -ENOMEM is returned and the ref_tree stays
> + * unaltered.
> + * On success, 0 is returned.
> + */
> +static int ref_tree_add(struct ref_root *ref_tree, u64 root_id, u64 object_id,
> +			u64 offset, u64 parent, int count)
> +{
> +	struct ref_node *node = NULL;
> +	struct rb_node **pos = NULL;
> +	struct rb_node *pos_parent = NULL;
> +	int origin_count;
> +	int ret;
> +
> +	if (!count)
> +		return 0;
> +
> +	node = __ref_tree_search(ref_tree, &pos, &pos_parent, root_id,
> +				 object_id, offset, parent);
> +	if (node == NULL) {
> +		node = kmalloc(sizeof(*node), GFP_KERNEL);
> +		if (!node)
> +			return -ENOMEM;
> +
> +		node->root_id = root_id;
> +		node->object_id = object_id;
> +		node->offset = offset;
> +		node->parent = parent;
> +		node->ref_mod = count;
> +
> +		ret = ref_tree_insert(ref_tree, pos, pos_parent, node);
> +		ASSERT(!ret);
> +		if (ret) {
> +			kfree(node);
> +			return ret;
> +		}

If you put the open coded comparisons into their own function, then it
should be trivial to call that here and we can have a 'standard' looking
rbtree insert instead of this custom version. See the rbtree.h header for an
example.

We really need to avoid open coded, 'custom' versions of standard linux
kernel programming conventions. rbtree's are coded in a specific and
standard way across the entire kernel that everyone can quickly understand
and verify. There is no reason for an exception here.

Also, __ref_tree_search() and ref_tree_insert() are both re-doing the same
search so your custom rbtree() code is also slower than what we'd get if
you used the usual methods.
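
The usual pattern from the rbtree documentation would look roughly like
this (a sketch assuming the hypothetical ref_node_cmp() helper above;
search and link-finding happen in a single walk):

static int ref_tree_insert(struct ref_root *ref_tree, struct ref_node *ins)
{
	struct rb_node **new = &ref_tree->rb_root.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct ref_node *cur = rb_entry(*new, struct ref_node,
						rb_node);
		int cmp = ref_node_cmp(cur, ins->root_id, ins->object_id,
				       ins->offset, ins->parent);

		parent = *new;
		if (cmp < 0)		/* cur sorts before ins, go right */
			new = &(*new)->rb_right;
		else if (cmp > 0)	/* cur sorts after ins, go left */
			new = &(*new)->rb_left;
		else
			return -EEXIST;
	}

	rb_link_node(&ins->rb_node, parent, new);
	rb_insert_color(&ins->rb_node, &ref_tree->rb_root);
	return 0;
}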


Please CC me on any revisions to this patch as I am very interested to see
btrfs fiemap performance get back in line with other filesystems.

Thanks,
	--Mark

--
Mark Fasheh


* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-01 21:15 ` Mark Fasheh
@ 2016-06-01 21:24   ` Mark Fasheh
  2016-06-02  5:46   ` Lu Fengqi
  2016-06-02 17:07   ` David Sterba
  2 siblings, 0 replies; 12+ messages in thread
From: Mark Fasheh @ 2016-06-01 21:24 UTC (permalink / raw)
  To: Lu Fengqi; +Cc: linux-btrfs, dsterba

On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
> > +static int ref_tree_add(struct ref_root *ref_tree, u64 root_id, u64 object_id,
> > +			u64 offset, u64 parent, int count)
> > +{
> > +	struct ref_node *node = NULL;
> > +	struct rb_node **pos = NULL;
> > +	struct rb_node *pos_parent = NULL;
> > +	int origin_count;
> > +	int ret;
> > +
> > +	if (!count)
> > +		return 0;
> > +
> > +	node = __ref_tree_search(ref_tree, &pos, &pos_parent, root_id,
> > +				 object_id, offset, parent);
> > +	if (node == NULL) {
> > +		node = kmalloc(sizeof(*node), GFP_KERNEL);
> > +		if (!node)
> > +			return -ENOMEM;
> > +
> > +		node->root_id = root_id;
> > +		node->object_id = object_id;
> > +		node->offset = offset;
> > +		node->parent = parent;
> > +		node->ref_mod = count;
> > +
> > +		ret = ref_tree_insert(ref_tree, pos, pos_parent, node);
> > +		ASSERT(!ret);
> > +		if (ret) {
> > +			kfree(node);
> > +			return ret;
> > +		}
> 
> If you put the open coded comparisons into their own function, then it
> should be trivial to call that here and we can have a 'standard' looking
> rbtree insert instead of this custom version. See the rbtree.h header for an
> example.

oops, I meant Documentation/rbtree.txt instead of include/linux/rbtree.h.
	--Mark

--
Mark Fasheh


* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-01 21:15 ` Mark Fasheh
  2016-06-01 21:24   ` Mark Fasheh
@ 2016-06-02  5:46   ` Lu Fengqi
  2016-06-02 20:30     ` Mark Fasheh
  2016-06-02 17:07   ` David Sterba
  2 siblings, 1 reply; 12+ messages in thread
From: Lu Fengqi @ 2016-06-02  5:46 UTC (permalink / raw)
  To: Mark Fasheh, dsterba; +Cc: linux-btrfs@vger.kernel.org


At 06/02/2016 05:15 AM, Mark Fasheh wrote:
> Thanks for trying to fix this problem, comments below.
>
> On Wed, Jun 01, 2016 at 01:48:05PM +0800, Lu Fengqi wrote:
>> check_shared only identified an extent as shared when it was referenced
>> with a different root_id or a different object_id. However, if an extent
>> is referenced from different offsets of the same file, it should also be
>> identified as shared. In addition, check_shared's loop is at least O(n^3),
>> so an extent with too many references can even cause a soft lockup.
>>
>> First, add all delayed refs to the ref_tree and calculate the unique_refs;
>> if unique_refs is greater than one, return BACKREF_FOUND_SHARED. Then add
>> the on-disk references (inline/keyed) to the ref_tree one by one and check
>> after each addition whether unique_refs exceeds one. Because we return
>> SHARED as soon as there are two references, the time complexity is close
>> to constant.
> Constant time in the best case, but still n^3 in the worst case right? I'm
> not complaining btw, I just want to be sure we're not over promising  :)
Only in the case of a large number of delayed refs will the worst-case
time complexity be O(n^2 log n). Otherwise, it will be constant even if
there are many on-disk references.
>> @@ -34,6 +35,253 @@ struct extent_inode_elem {
>>   	struct extent_inode_elem *next;
>>   };
>>
>> +/*
>> + * ref_root is used as the root of the ref tree that holds a collection
>> + * of unique references.
>> + */
>> +struct ref_root {
>> +	struct rb_root rb_root;
>> +
>> +	/*
>> +	 * unique_refs is the number of ref_nodes with a positive count
>> +	 * stored in the tree. Even if a ref_node with a count greater than
>> +	 * one is added, unique_refs only increases by one.
>> +	 */
>> +	unsigned int unique_refs;
>> +};
>> +
>> +/* ref_node is used to store a unique reference to the ref tree. */
>> +struct ref_node {
>> +	struct rb_node rb_node;
>> +
>> +	/* for NORMAL_REF, otherwise all these fields should be set to 0 */
>> +	u64 root_id;
>> +	u64 object_id;
>> +	u64 offset;
>> +
>> +	/* for SHARED_REF, otherwise parent field should be set to 0 */
>> +	u64 parent;
>> +
>> +	/* ref to the ref_mod of btrfs_delayed_ref_node(delayed-ref.h) */
>> +	int ref_mod;
>> +};
> Why are we mirroring so much of the backref structures here? It seems like
> we're just throwing layers on top of layers. Can't we modify the backref
> structures and code to handle whatever small amount of unique accounting you
> must do?
The original structure (struct __prelim_ref) stores references in a list,
and I have to perform many search operations that a list is not suited
for. However, modifying the original structure would require a lot of
rework, so I just want to fix fiemap with this patch. If necessary, we
can use this structure to replace the original one later.
>
>> +/* dynamically allocate and initialize a ref_root */
>> +static struct ref_root *ref_root_alloc(void)
>> +{
>> +	struct ref_root *ref_tree;
>> +
>> +	ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> I'm pretty sure we want GFP_NOFS here.
Yes, perhaps you're right.
> Because there's no need to narrow the allocation constraints. GFP_NOFS
> is necessary when the caller is on a critical path that must not recurse
> back to the filesystem through the allocation (ie. if the allocator
> decides to free some memory and tries to write dirty data). FIEMAP is
> called from an ioctl.
But David seems to have a different point of view from yours, so I would
like to ask for his advice again.
>> +
>> +/*
>> + * search ref_node with (root_id, object_id, offset, parent) in the tree
>> + *
>> + * if found, the pointer of the ref_node will be returned;
>> + * if not found, NULL will be returned; pos will point to the rb_node for
>> + * insertion, and pos_parent will point to pos's parent;
>> + */
>> +static struct ref_node *__ref_tree_search(struct ref_root *ref_tree,
>> +					  struct rb_node ***pos,
>> +					  struct rb_node **pos_parent,
>> +					  u64 root_id, u64 object_id,
>> +					  u64 offset, u64 parent)
>> +{
>> +	struct ref_node *cur = NULL;
>> +
>> +	*pos = &ref_tree->rb_root.rb_node;
>> +
>> +	while (**pos) {
>> +		*pos_parent = **pos;
>> +		cur = rb_entry(*pos_parent, struct ref_node, rb_node);
>> +
>> +		if (cur->root_id < root_id) {
>> +			*pos = &(**pos)->rb_right;
>> +			continue;
>> +		} else if (cur->root_id > root_id) {
>> +			*pos = &(**pos)->rb_left;
>> +			continue;
>> +		}
>> +
>> +		if (cur->object_id < object_id) {
>> +			*pos = &(**pos)->rb_right;
>> +			continue;
>> +		} else if (cur->object_id > object_id) {
>> +			*pos = &(**pos)->rb_left;
>> +			continue;
>> +		}
>> +
>> +		if (cur->offset < offset) {
>> +			*pos = &(**pos)->rb_right;
>> +			continue;
>> +		} else if (cur->offset > offset) {
>> +			*pos = &(**pos)->rb_left;
>> +			continue;
>> +		}
>> +
>> +		if (cur->parent < parent) {
>> +			*pos = &(**pos)->rb_right;
>> +			continue;
>> +		} else if (cur->parent > parent) {
>> +			*pos = &(**pos)->rb_left;
>> +			continue;
>> +		}
> These 4 comparisons need to be in their own cmp() function - we want to
> avoid open coded comparisons with an rbtree search. This is a pretty
> standard way to handle it.
>
Yes, you are right. I'll update the rb_tree-related functions.
>> +
>> +		return cur;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +/*
>> + * insert a ref_node into the ref tree
>> + * @pos specifies the position at which to insert
>> + * @pos_parent specifies pos's parent
>> + *
>> + * on success, return 0;
>> + * if the ref_node already exists, return -EEXIST;
>> + */
>> +static int ref_tree_insert(struct ref_root *ref_tree, struct rb_node **pos,
>> +			   struct rb_node *pos_parent, struct ref_node *ins)
>> +{
>> +	struct rb_node **p = NULL;
>> +	struct rb_node *parent = NULL;
>> +	struct ref_node *cur = NULL;
>> +
>> +	if (!pos) {
>> +		cur = __ref_tree_search(ref_tree, &p, &parent, ins->root_id,
>> +					ins->object_id, ins->offset,
>> +					ins->parent);
>> +		if (cur)
>> +			return -EEXIST;
>> +	} else {
>> +		p = pos;
>> +		parent = pos_parent;
>> +	}
>> +
>> +	rb_link_node(&ins->rb_node, parent, p);
>> +	rb_insert_color(&ins->rb_node, &ref_tree->rb_root);
>> +
>> +	return 0;
>> +}
>> +
>> +/* erase and free ref_node, caller should update ref_root->unique_refs */
>> +static void ref_tree_remove(struct ref_root *ref_tree, struct ref_node *node)
>> +{
>> +	rb_erase(&node->rb_node, &ref_tree->rb_root);
>> +	kfree(node);
>> +}
>> +
>> +/*
>> + * update ref_root->unique_refs
>> + *
>> + * call __ref_tree_search
>> + *	1. if the ref_node doesn't exist, insert it with ref_tree_insert
>> + *	and update ref_root->unique_refs:
>> + *		if ref_node->ref_mod > 0, ref_root->unique_refs++;
>> + *		if ref_node->ref_mod < 0, do nothing;
>> + *
>> + *	2. if the ref_node is found, record the original ref_node->ref_mod
>> + *	and update ref_node->ref_mod.
>> + *		if ref_node->ref_mod becomes 0, call ref_tree_remove
>> + *
>> + *		according to origin_mod and new_mod, update ref_root->unique_refs:
>> + *		+----------------+--------------+-------------+
>> + *		|		 |new_count <= 0|new_count > 0|
>> + *		+----------------+--------------+-------------+
>> + *		|origin_count < 0|       0      |      1      |
>> + *		+----------------+--------------+-------------+
>> + *		|origin_count > 0|      -1      |      0      |
>> + *		+----------------+--------------+-------------+
>> + *
>> + * In case of allocation failure, -ENOMEM is returned and the ref_tree stays
>> + * unaltered.
>> + * On success, 0 is returned.
>> + */
>> +static int ref_tree_add(struct ref_root *ref_tree, u64 root_id, u64 object_id,
>> +			u64 offset, u64 parent, int count)
>> +{
>> +	struct ref_node *node = NULL;
>> +	struct rb_node **pos = NULL;
>> +	struct rb_node *pos_parent = NULL;
>> +	int origin_count;
>> +	int ret;
>> +
>> +	if (!count)
>> +		return 0;
>> +
>> +	node = __ref_tree_search(ref_tree, &pos, &pos_parent, root_id,
>> +				 object_id, offset, parent);
>> +	if (node == NULL) {
>> +		node = kmalloc(sizeof(*node), GFP_KERNEL);
>> +		if (!node)
>> +			return -ENOMEM;
>> +
>> +		node->root_id = root_id;
>> +		node->object_id = object_id;
>> +		node->offset = offset;
>> +		node->parent = parent;
>> +		node->ref_mod = count;
>> +
>> +		ret = ref_tree_insert(ref_tree, pos, pos_parent, node);
>> +		ASSERT(!ret);
>> +		if (ret) {
>> +			kfree(node);
>> +			return ret;
>> +		}
> If you put the open coded comparisons into their own function, then it
> should be trivial to call that here and we can have a 'standard' looking
> rbtree insert instead of this custom version. See the rbtree.h header for an
> example.
>
> We really need to avoid open coded, 'custom' versions of standard linux
> kernel programming conventions. rbtree's are coded in a specific and
> standard way across the entire kernel that everyone can quickly understand
> and verify. There is no reason for an exception here.
>
> Also, __ref_tree_search() and ref_tree_insert() are both re-doing the same
> search so your custom rbtree() code is also slower than what we'd get if
> you used the usual methods.
ref_tree_insert() uses these parameters (pos and pos_parent) precisely to
avoid repeating the same search.
Of course, you are right that we need to avoid open-coded versions; I'll
use the standard methods.
>
> Please CC me on any revisions to this patch as I am very interested to see
> btrfs fiemap performance get back in line with other filesystems.
>
> Thanks,
> 	--Mark
>
> --
> Mark Fasheh
Thanks,
Lu






* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-01 21:15 ` Mark Fasheh
  2016-06-01 21:24   ` Mark Fasheh
  2016-06-02  5:46   ` Lu Fengqi
@ 2016-06-02 17:07   ` David Sterba
  2016-06-02 19:08     ` Mark Fasheh
  2 siblings, 1 reply; 12+ messages in thread
From: David Sterba @ 2016-06-02 17:07 UTC (permalink / raw)
  To: Mark Fasheh; +Cc: Lu Fengqi, linux-btrfs, dsterba

On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
> > +/* dynamically allocate and initialize a ref_root */
> > +static struct ref_root *ref_root_alloc(void)
> > +{
> > +	struct ref_root *ref_tree;
> > +
> > +	ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> 
> I'm pretty sure we want GFP_NOFS here.

Then please explain to me why/where the reasoning below is wrong:

> > Because there's no need to narrow the allocation constraints. GFP_NOFS
> > is necessary when the caller is on a critical path that must not recurse
> > back to the filesystem through the allocation (ie. if the allocator
> > decides to free some memory and tries to write dirty data). FIEMAP is
> > called from an ioctl.


* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-02 17:07   ` David Sterba
@ 2016-06-02 19:08     ` Mark Fasheh
  2016-06-02 20:56       ` Jeff Mahoney
  0 siblings, 1 reply; 12+ messages in thread
From: Mark Fasheh @ 2016-06-02 19:08 UTC (permalink / raw)
  To: dsterba, Lu Fengqi, linux-btrfs

On Thu, Jun 02, 2016 at 07:07:32PM +0200, David Sterba wrote:
> On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
> > > +/* dynamically allocate and initialize a ref_root */
> > > +static struct ref_root *ref_root_alloc(void)
> > > +{
> > > +	struct ref_root *ref_tree;
> > > +
> > > +	ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> > 
> > I'm pretty sure we want GFP_NOFS here.
> 
> Then please explain to me why/where the reasoning below is wrong:

The general reasoning of when to use GFP_NOFS below is fine, I don't
disagree with that at all. If there is no way a recursion back into btrfs
can happen at that allocation site then we can use GFP_KERNEL.

That said, have you closely audited this path? Does the allocation happen
completely outside any locks that might be shared with the writeout path?
What happens if we have to do writeout of the inode being fiemapped in order
to allocate space? If the answer to all my questions is "there is no way
this can deadlock" then by all means, we should use GFP_KERNEL. Otherwise
GFP_NOFS is a sensible guard against possible future deadlocks.
	--Mark

--
Mark Fasheh


* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-02  5:46   ` Lu Fengqi
@ 2016-06-02 20:30     ` Mark Fasheh
  0 siblings, 0 replies; 12+ messages in thread
From: Mark Fasheh @ 2016-06-02 20:30 UTC (permalink / raw)
  To: Lu Fengqi; +Cc: dsterba, linux-btrfs@vger.kernel.org, Josef Bacik

On Thu, Jun 02, 2016 at 01:46:27PM +0800, Lu Fengqi wrote:
> 
> At 06/02/2016 05:15 AM, Mark Fasheh wrote:
> >Thanks for trying to fix this problem, comments below.
> >
> >On Wed, Jun 01, 2016 at 01:48:05PM +0800, Lu Fengqi wrote:
> >>check_shared only identified an extent as shared when it was referenced
> >>with a different root_id or a different object_id. However, if an extent
> >>is referenced from different offsets of the same file, it should also be
> >>identified as shared. In addition, check_shared's loop is at least O(n^3),
> >>so an extent with too many references can even cause a soft lockup.
> >>
> >>First, add all delayed refs to the ref_tree and calculate the unique_refs;
> >>if unique_refs is greater than one, return BACKREF_FOUND_SHARED. Then add
> >>the on-disk references (inline/keyed) to the ref_tree one by one and check
> >>after each addition whether unique_refs exceeds one. Because we return
> >>SHARED as soon as there are two references, the time complexity is close
> >>to constant.
> >Constant time in the best case, but still n^3 in the worst case right? I'm
> >not complaining btw, I just want to be sure we're not over promising  :)
> Only in the case of a large number of delayed refs will the worst-case
> time complexity be O(n^2 log n). Otherwise, it will be constant even if
> there are many on-disk references.

Ahh ok so it's driven more by the # of delayed refs. That makes sense,
thanks.


> >>@@ -34,6 +35,253 @@ struct extent_inode_elem {
> >>  	struct extent_inode_elem *next;
> >>  };
> >>
> >>+/*
> >>+ * ref_root is used as the root of the ref tree that holds a collection
> >>+ * of unique references.
> >>+ */
> >>+struct ref_root {
> >>+	struct rb_root rb_root;
> >>+
> >>+	/*
> >>+	 * unique_refs is the number of ref_nodes with a positive count
> >>+	 * stored in the tree. Even if a ref_node with a count greater than
> >>+	 * one is added, unique_refs only increases by one.
> >>+	 */
> >>+	unsigned int unique_refs;
> >>+};
> >>+
> >>+/* ref_node is used to store a unique reference to the ref tree. */
> >>+struct ref_node {
> >>+	struct rb_node rb_node;
> >>+
> >>+	/* for NORMAL_REF, otherwise all these fields should be set to 0 */
> >>+	u64 root_id;
> >>+	u64 object_id;
> >>+	u64 offset;
> >>+
> >>+	/* for SHARED_REF, otherwise parent field should be set to 0 */
> >>+	u64 parent;
> >>+
> >>+	/* ref to the ref_mod of btrfs_delayed_ref_node(delayed-ref.h) */
> >>+	int ref_mod;
> >>+};
> >Why are we mirroring so much of the backref structures here? It seems like
> >we're just throwing layers on top of layers. Can't we modify the backref
> >structures and code to handle whatever small amount of unique accounting you
> >must do?
> The original structure (struct __prelim_ref) stores references in a
> list, and I have to perform many search operations that a list is not
> suited for. However, modifying the original structure would require a
> lot of rework, so I just want to fix fiemap with this patch. If
> necessary, we can use this structure to replace the original one later.

Well there's room for an rb_node on that structure so we can solve the 'it
only uses a list' problem trivially. I definitely understand your reluctance
to modify the backref code, but to me that just sounds like we need someone
who is familiar with that code to review your work and provide advice when
needed.
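
For concreteness, the change could be as small as this (a sketch; the
other fields follow struct __prelim_ref as it currently exists in
backref.c, only the rb_node member is new):

struct __prelim_ref {
	struct list_head list;
	struct rb_node rb_node;	/* hypothetical: index refs for fast lookup */
	u64 root_id;
	struct btrfs_key key_for_search;
	int level;
	int count;
	struct extent_inode_elem *inode_list;
	u64 parent;
	u64 wanted_disk_byte;
};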

Otherwise, I believe my point holds. If there's some technical reason why
this is a bad idea, that's a different story. So far though this just seems
like a situation where we need some extra review from the primary
developers. I cc'd Josef in the hopes he could shed some light for us.


> >>+/* dynamically allocate and initialize a ref_root */
> >>+static struct ref_root *ref_root_alloc(void)
> >>+{
> >>+	struct ref_root *ref_tree;
> >>+
> >>+	ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> >I'm pretty sure we want GFP_NOFS here.
> Yes, perhaps you're right.
> >Because there's no need to narrow the allocation constraints. GFP_NOFS
> >is necessary when the caller is on a critical path that must not recurse
> >back to the filesystem through the allocation (ie. if the allocator
> >decides to free some memory and tries tro write dirty data). FIEMAP is
> >called from an ioctl.
> But David seems to have a different point of view from yours, so I
> would like to ask for his advice again.

Sounds good, hopefully David and I can figure it out  :)

Thanks again Lu,
	--Mark

--
Mark Fasheh


* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-02 19:08     ` Mark Fasheh
@ 2016-06-02 20:56       ` Jeff Mahoney
  2016-06-02 21:17         ` Mark Fasheh
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Mahoney @ 2016-06-02 20:56 UTC (permalink / raw)
  To: Mark Fasheh, dsterba, Lu Fengqi, linux-btrfs



On 6/2/16 3:08 PM, Mark Fasheh wrote:
> On Thu, Jun 02, 2016 at 07:07:32PM +0200, David Sterba wrote:
>> On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
>>>> +/* dynamically allocate and initialize a ref_root */
>>>> +static struct ref_root *ref_root_alloc(void)
>>>> +{
>>>> +	struct ref_root *ref_tree;
>>>> +
>>>> +	ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
>>>
>>> I'm pretty sure we want GFP_NOFS here.
>>
>> Then please explain to me why/where the reasoning below is wrong:
> 
> The general reasoning of when to use GFP_NOFS below is fine, I don't
> disagree with that at all. If there is no way a recursion back into btrfs
> can happen at that allocation site then we can use GFP_KERNEL.
> 
> That said, have you closely audited this path? Does the allocation happen
> completely outside any locks that might be shared with the writeout path?
> What happens if we have to do writeout of the inode being fiemapped in order
> to allocate space? If the answer to all my questions is "there is no way
> this can deadlock" then by all means, we should use GFP_KERNEL. Otherwise
> GFP_NOFS is a sensible guard against possible future deadlocks.

This is exactly the situation we discussed at LSF/MM this year.  The MM
folks are pushing back because the fs folks tend to use GFP_NOFS as a
talisman.  The audit needs to happen, otherwise that last sentence is
another talisman.

-Jeff

-- 
Jeff Mahoney
SUSE Labs




* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-02 20:56       ` Jeff Mahoney
@ 2016-06-02 21:17         ` Mark Fasheh
  2016-06-02 21:40           ` Mark Fasheh
  0 siblings, 1 reply; 12+ messages in thread
From: Mark Fasheh @ 2016-06-02 21:17 UTC (permalink / raw)
  To: Jeff Mahoney; +Cc: dsterba, Lu Fengqi, linux-btrfs

On Thu, Jun 02, 2016 at 04:56:06PM -0400, Jeff Mahoney wrote:
> On 6/2/16 3:08 PM, Mark Fasheh wrote:
> > On Thu, Jun 02, 2016 at 07:07:32PM +0200, David Sterba wrote:
> >> On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
> >>>> +/* dynamically allocate and initialize a ref_root */
> >>>> +static struct ref_root *ref_root_alloc(void)
> >>>> +{
> >>>> +	struct ref_root *ref_tree;
> >>>> +
> >>>> +	ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> >>>
> >>> I'm pretty sure we want GFP_NOFS here.
> >>
> >> Then please explain to me why/where the reasoning below is wrong:
> > 
> > The general reasoning of when to use GFP_NOFS below is fine, I don't
> > disagree with that at all. If there is no way a recursion back into btrfs
> > can happen at that allocation site then we can use GFP_KERNEL.
> > 
> > That said, have you closely audited this path? Does the allocation happen
> > completely outside any locks that might be shared with the writeout path?
> > What happens if we have to do writeout of the inode being fiemapped in order
> > to allocate space? If the answer to all my questions is "there is no way
> > this can deadlock" then by all means, we should use GFP_KERNEL. Otherwise
> > GFP_NOFS is a sensible guard against possible future deadlocks.
> 
> This is exactly the situation we discussed at LSF/MM this year.  The MM
> folks are pushing back because the fs folks tend to use GFP_NOFS as a
> talisman.  The audit needs to happen, otherwise that last sentence is
> another talisman.

There's nothing here I disagree with. I'm not seeing a strong technical
justification, which is what I want (being called from an ioctl means
nothing in this case).
	--Mark

--
Mark Fasheh


* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-02 21:17         ` Mark Fasheh
@ 2016-06-02 21:40           ` Mark Fasheh
  0 siblings, 0 replies; 12+ messages in thread
From: Mark Fasheh @ 2016-06-02 21:40 UTC (permalink / raw)
  To: Jeff Mahoney; +Cc: dsterba, Lu Fengqi, linux-btrfs

On Thu, Jun 02, 2016 at 02:17:40PM -0700, Mark Fasheh wrote:
> On Thu, Jun 02, 2016 at 04:56:06PM -0400, Jeff Mahoney wrote:
> > On 6/2/16 3:08 PM, Mark Fasheh wrote:
> > > On Thu, Jun 02, 2016 at 07:07:32PM +0200, David Sterba wrote:
> > >> On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
> > >>>> +/* dynamically allocate and initialize a ref_root */
> > >>>> +static struct ref_root *ref_root_alloc(void)
> > >>>> +{
> > >>>> +	struct ref_root *ref_tree;
> > >>>> +
> > >>>> +	ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> > >>>
> > >>> I'm pretty sure we want GFP_NOFS here.
> > >>
> > >> Then please explain to me why/where the reasoning below is wrong:
> > > 
> > > The general reasoning of when to use GFP_NOFS below is fine, I don't
> > > disagree with that at all. If there is no way a recursion back into btrfs
> > > can happen at that allocation site then we can use GFP_KERNEL.
> > > 
> > > That said, have you closely audited this path? Does the allocation happen
> > > completely outside any locks that might be shared with the writeout path?
> > > What happens if we have to do writeout of the inode being fiemapped in order
> > > to allocate space? If the answer to all my questions is "there is no way
> > > this can deadlock" then by all means, we should use GFP_KERNEL. Otherwise
> > > GFP_NOFS is a sensible guard against possible future deadlocks.
> > 
> > This is exactly the situation we discussed at LSF/MM this year.  The MM
> > folks are pushing back because the fs folks tend to use GFP_NOFS as a
> > talisman.  The audit needs to happen, otherwise that last sentence is
> > another talisman.
> 
> There's nothing here I disagree with. I'm not seeing a strong technical
> justification, which is what I want (being called from an ioctl means
> nothing in this case).

A small amount of searching shows me that extent_fiemap() does
lock_extent_bits() and writepage_delalloc() also calls lock_extent_bits()
(via find_lock_delalloc_range()).
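
In other words, the chain I'm worried about would be something like this
(a hypothetical reconstruction from the calls above):

/*
 * extent_fiemap()
 *   lock_extent_bits()               <- fiemap takes the extent range lock
 *   kmalloc(GFP_KERNEL)              <- reclaim may kick off writeback
 *     writepage_delalloc()
 *       find_lock_delalloc_range()
 *         lock_extent_bits()         <- blocks on the lock fiemap holds
 */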

I'm no expert on the extent locking but that seems pretty deadlocky to me.
	--Mark

--
Mark Fasheh


* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-01  5:48 [PATCH v2] btrfs: fix check_shared for fiemap ioctl Lu Fengqi
  2016-06-01 21:15 ` Mark Fasheh
@ 2016-06-03 14:02 ` Josef Bacik
  2016-06-06  2:16   ` Lu Fengqi
  1 sibling, 1 reply; 12+ messages in thread
From: Josef Bacik @ 2016-06-03 14:02 UTC (permalink / raw)
  To: Lu Fengqi, linux-btrfs; +Cc: dsterba

On 06/01/2016 01:48 AM, Lu Fengqi wrote:
> check_shared only identified an extent as shared when it was referenced
> with a different root_id or a different object_id. However, if an extent
> is referenced from different offsets of the same file, it should also be
> identified as shared. In addition, check_shared's loop is at least O(n^3),
> so an extent with too many references can even cause a soft lockup.
>
> First, add all delayed refs to the ref_tree and calculate the unique_refs;
> if unique_refs is greater than one, return BACKREF_FOUND_SHARED. Then add
> the on-disk references (inline/keyed) to the ref_tree one by one and check
> after each addition whether unique_refs exceeds one. Because we return
> SHARED as soon as there are two references, the time complexity is close
> to constant.
>
> Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
> Signed-off-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>

This is a lot of work for just wanting to know if something is shared. 
Instead let's adjust this slightly.  Instead of passing down a 
root_objectid/inum, noticing this, and returning shared, add a new way 
to iterate refs.  Currently we go gather all the refs and then do the 
iterate dance, which is what takes so long.  So instead add another 
helper that calls the provided function every time it has a match, and 
then we can pass in whatever context we want, and we return when 
something matches.  This way we don't have all this extra accounting, 
and we're no longer passing root_objectid/inum around and testing for 
some magic scenario.  Thanks,

Josef
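
A minimal sketch of the iteration interface Josef describes (all names
below are hypothetical, not existing btrfs API):

/* called once per backref found; returning nonzero stops the iteration */
typedef int (*backref_iter_fn)(u64 root_id, u64 objectid, u64 offset,
			       u64 parent, void *ctx);

int btrfs_iterate_refs(struct btrfs_fs_info *fs_info, u64 bytenr,
		       backref_iter_fn fn, void *ctx);

A shared-extent check would then be a callback that returns nonzero as
soon as it sees a second unique reference, so the walk stops immediately
instead of gathering and post-processing every ref.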



* Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl
  2016-06-03 14:02 ` Josef Bacik
@ 2016-06-06  2:16   ` Lu Fengqi
  0 siblings, 0 replies; 12+ messages in thread
From: Lu Fengqi @ 2016-06-06  2:16 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, dsterba

At 06/03/2016 10:02 PM, Josef Bacik wrote:
> On 06/01/2016 01:48 AM, Lu Fengqi wrote:
>> check_shared only identified an extent as shared when it was referenced
>> with a different root_id or a different object_id. However, if an extent
>> is referenced from different offsets of the same file, it should also be
>> identified as shared. In addition, check_shared's loop is at least O(n^3),
>> so an extent with too many references can even cause a soft lockup.
>>
>> First, add all delayed refs to the ref_tree and calculate the unique_refs;
>> if unique_refs is greater than one, return BACKREF_FOUND_SHARED. Then add
>> the on-disk references (inline/keyed) to the ref_tree one by one and check
>> after each addition whether unique_refs exceeds one. Because we return
>> SHARED as soon as there are two references, the time complexity is close
>> to constant.
>>
>> Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
>> Signed-off-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
>
> This is a lot of work for just wanting to know if something is shared.
> Instead let's adjust this slightly.  Instead of passing down a
> root_objectid/inum, noticing this, and returning shared, add a new way
> to iterate refs.  Currently we go gather all the refs and then do the
> iterate dance, which is what takes so long.  So instead add another
> helper that calls the provided function every time it has a match, and
> then we can pass in whatever context we want, and we return when
> something matches.  This way we don't have all this extra accounting,
> and we're no longer passing root_objectid/inum around and testing for
> some magic scenario.  Thanks,
>
> Josef
>
>
>

With this patch, we can quickly find an extent that has more than one 
reference (delayed, inline, or keyed) and return SHARED immediately. 
However, for indirect refs we still have to resolve them to their parent 
bytenr and check whether that parent bytenr is shared, so the original 
function remains necessary. The original refs list reduces search 
efficiency, so maybe we can replace it with an rb_tree to optimize the 
original function in the future. For now, we just want to solve the 
check_shared problem.

-- 
Thanks,
Lu



