* [PATCH v2] btrfs: implement ref_tracker for delayed_nodes
@ 2025-07-29  7:08 Leo Martins
  2025-08-04 13:57 ` David Sterba
  0 siblings, 1 reply; 6+ messages in thread
From: Leo Martins @ 2025-07-29  7:08 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

This patch adds ref_tracker infrastructure for btrfs_delayed_node.

A ref_tracker object is allocated every time the reference count is
increased and freed when the reference count is decreased. The
ref_tracker object contains stack_trace information about where the
ref count is increased and decreased. The ref_tracker_dir embedded in
btrfs_delayed_node keeps track of all current references, and some
previous references. This information allows for detection of reference
leaks or double frees.

Here is a common example of taking a reference to a delayed_node and
releasing it with ref_tracker.

```C
struct ref_tracker *tracker;
struct btrfs_delayed_node *node;

node = btrfs_get_delayed_node(inode, &tracker);
// use delayed_node...
btrfs_release_delayed_node(node, &tracker);
```

There are two special cases where the delayed_node reference is long
term, meaning that the thread that takes the reference and the thread
that releases it are different: the inode_cache, which is the
btrfs_delayed_node reference stored in the associated btrfs_inode, and
the node_list, which is the btrfs_delayed_node reference stored in the
btrfs_delayed_root node_list/prepare_list. In these cases the
ref_tracker is stored in the delayed_node itself.

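For example, releasing the long term inode_cache reference in
btrfs_remove_delayed_node() uses the tracker embedded in the node itself
(a sketch; the CONFIG_BTRFS_DEBUG guard around the tracker field is
omitted here):

```C
struct btrfs_delayed_node *node = inode->delayed_node;

inode->delayed_node = NULL;
/* The long term ref_tracker lives in the delayed_node itself. */
btrfs_release_delayed_node(node, &node->inode_cache_tracker);
```
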
This patch introduces some new wrappers (btrfs_delayed_node_ref_tracker_*)
to ensure that when BTRFS_DEBUG is disabled everything gets compiled out.

Signed-off-by: Leo Martins <loemra.dev@gmail.com>
---
v1: https://lore.kernel.org/linux-btrfs/fa19acf9dc38e93546183fc083c365cdb237e89b.1752098515.git.loemra.dev@gmail.com/

v1->v2:
- remove typedefs, now functions always take struct ref_tracker **
- put delayed_node::count back to original position to not change
  delayed_node struct size
- cleanup ref_tracker_dir if btrfs_get_or_create_delayed_node
  fails to create a delayed_node
- remove CONFIG_BTRFS_DELAYED_NODE_REF_TRACKER and use
  CONFIG_BTRFS_DEBUG
---
 fs/btrfs/delayed-inode.c | 229 ++++++++++++++++++++++++++++-----------
 fs/btrfs/delayed-inode.h |  61 ++++++++++-
 2 files changed, 226 insertions(+), 64 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0f8d8e275143..0b40308c96a6 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -57,6 +57,8 @@ static inline void btrfs_init_delayed_node(
 	delayed_node->root = root;
 	delayed_node->inode_id = inode_id;
 	refcount_set(&delayed_node->refs, 0);
+	btrfs_delayed_node_ref_tracker_dir_init(delayed_node, 16,
+						"delayed_node");
 	delayed_node->ins_root = RB_ROOT_CACHED;
 	delayed_node->del_root = RB_ROOT_CACHED;
 	mutex_init(&delayed_node->mutex);
@@ -64,16 +66,19 @@ static inline void btrfs_init_delayed_node(
 	INIT_LIST_HEAD(&delayed_node->p_list);
 }
 
-static struct btrfs_delayed_node *btrfs_get_delayed_node(
-		struct btrfs_inode *btrfs_inode)
+static struct btrfs_delayed_node *
+btrfs_get_delayed_node(struct btrfs_inode *btrfs_inode,
+		       struct ref_tracker **tracker)
 {
 	struct btrfs_root *root = btrfs_inode->root;
 	u64 ino = btrfs_ino(btrfs_inode);
 	struct btrfs_delayed_node *node;
+	struct ref_tracker **inode_cache_tracker = NULL;
 
 	node = READ_ONCE(btrfs_inode->delayed_node);
 	if (node) {
 		refcount_inc(&node->refs);
+		btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS);
 		return node;
 	}
 
@@ -83,6 +88,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 	if (node) {
 		if (btrfs_inode->delayed_node) {
 			refcount_inc(&node->refs);	/* can be accessed */
+			btrfs_delayed_node_ref_tracker_alloc(node, tracker,
+							     GFP_ATOMIC);
 			BUG_ON(btrfs_inode->delayed_node != node);
 			xa_unlock(&root->delayed_nodes);
 			return node;
@@ -106,6 +113,13 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 		 */
 		if (refcount_inc_not_zero(&node->refs)) {
 			refcount_inc(&node->refs);
+#ifdef CONFIG_BTRFS_DEBUG
+			inode_cache_tracker = &node->inode_cache_tracker;
+#endif
+			btrfs_delayed_node_ref_tracker_alloc(node, tracker,
+							     GFP_ATOMIC);
+			btrfs_delayed_node_ref_tracker_alloc(
+				node, inode_cache_tracker, GFP_ATOMIC);
 			btrfs_inode->delayed_node = node;
 		} else {
 			node = NULL;
@@ -125,17 +139,19 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
  *
  * Return the delayed node, or error pointer on failure.
  */
-static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
-		struct btrfs_inode *btrfs_inode)
+static struct btrfs_delayed_node *
+btrfs_get_or_create_delayed_node(struct btrfs_inode *btrfs_inode,
+				 struct ref_tracker **tracker)
 {
 	struct btrfs_delayed_node *node;
+	struct ref_tracker **inode_cache_tracker = NULL;
 	struct btrfs_root *root = btrfs_inode->root;
 	u64 ino = btrfs_ino(btrfs_inode);
 	int ret;
 	void *ptr;
 
 again:
-	node = btrfs_get_delayed_node(btrfs_inode);
+	node = btrfs_get_delayed_node(btrfs_inode, tracker);
 	if (node)
 		return node;
 
@@ -144,12 +160,10 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 		return ERR_PTR(-ENOMEM);
 	btrfs_init_delayed_node(node, root, ino);
 
-	/* Cached in the inode and can be accessed. */
-	refcount_set(&node->refs, 2);
-
 	/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
 	ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
 	if (ret == -ENOMEM) {
+		btrfs_delayed_node_ref_tracker_dir_exit(node);
 		kmem_cache_free(delayed_node_cache, node);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -158,6 +172,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 	if (ptr) {
 		/* Somebody inserted it, go back and read it. */
 		xa_unlock(&root->delayed_nodes);
+		btrfs_delayed_node_ref_tracker_dir_exit(node);
 		kmem_cache_free(delayed_node_cache, node);
 		node = NULL;
 		goto again;
@@ -166,6 +181,15 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 	ASSERT(xa_err(ptr) != -EINVAL);
 	ASSERT(xa_err(ptr) != -ENOMEM);
 	ASSERT(ptr == NULL);
+
+	/* Cached in the inode and can be accessed. */
+	refcount_set(&node->refs, 2);
+#ifdef CONFIG_BTRFS_DEBUG
+	inode_cache_tracker = &node->inode_cache_tracker;
+#endif
+	btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+	btrfs_delayed_node_ref_tracker_alloc(node, inode_cache_tracker, GFP_ATOMIC);
+
 	btrfs_inode->delayed_node = node;
 	xa_unlock(&root->delayed_nodes);
 
@@ -181,6 +205,8 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
 				     struct btrfs_delayed_node *node,
 				     int mod)
 {
+	struct ref_tracker **node_list_tracker = NULL;
+
 	spin_lock(&root->lock);
 	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
 		if (!list_empty(&node->p_list))
@@ -191,6 +217,10 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
 		list_add_tail(&node->n_list, &root->node_list);
 		list_add_tail(&node->p_list, &root->prepare_list);
 		refcount_inc(&node->refs);	/* inserted into list */
+#ifdef CONFIG_BTRFS_DEBUG
+		node_list_tracker = &node->node_list_tracker;
+#endif
+		btrfs_delayed_node_ref_tracker_alloc(node, node_list_tracker, GFP_ATOMIC);
 		root->nodes++;
 		set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
 	}
@@ -201,9 +231,15 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
 static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
 				       struct btrfs_delayed_node *node)
 {
+	struct ref_tracker **node_list_tracker = NULL;
+
 	spin_lock(&root->lock);
 	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
 		root->nodes--;
+#ifdef CONFIG_BTRFS_DEBUG
+		node_list_tracker = &node->node_list_tracker;
+#endif
+		btrfs_delayed_node_ref_tracker_free(node, node_list_tracker);
 		refcount_dec(&node->refs);	/* not in the list */
 		list_del_init(&node->n_list);
 		if (!list_empty(&node->p_list))
@@ -213,23 +249,27 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
 	spin_unlock(&root->lock);
 }
 
-static struct btrfs_delayed_node *btrfs_first_delayed_node(
-			struct btrfs_delayed_root *delayed_root)
+static struct btrfs_delayed_node *
+btrfs_first_delayed_node(struct btrfs_delayed_root *delayed_root,
+			 struct ref_tracker **tracker)
 {
 	struct btrfs_delayed_node *node;
 
 	spin_lock(&delayed_root->lock);
 	node = list_first_entry_or_null(&delayed_root->node_list,
 					struct btrfs_delayed_node, n_list);
-	if (node)
+	if (node) {
 		refcount_inc(&node->refs);
+		btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+	}
 	spin_unlock(&delayed_root->lock);
 
 	return node;
 }
 
-static struct btrfs_delayed_node *btrfs_next_delayed_node(
-						struct btrfs_delayed_node *node)
+static struct btrfs_delayed_node *
+btrfs_next_delayed_node(struct btrfs_delayed_node *node,
+			struct ref_tracker **tracker)
 {
 	struct btrfs_delayed_root *delayed_root;
 	struct list_head *p;
@@ -249,15 +289,16 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
 
 	next = list_entry(p, struct btrfs_delayed_node, n_list);
 	refcount_inc(&next->refs);
+	btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC);
 out:
 	spin_unlock(&delayed_root->lock);
 
 	return next;
 }
 
-static void __btrfs_release_delayed_node(
-				struct btrfs_delayed_node *delayed_node,
-				int mod)
+static void
+__btrfs_release_delayed_node(struct btrfs_delayed_node *delayed_node, int mod,
+			     struct ref_tracker **tracker)
 {
 	struct btrfs_delayed_root *delayed_root;
 
@@ -273,6 +314,7 @@ static void __btrfs_release_delayed_node(
 		btrfs_dequeue_delayed_node(delayed_root, delayed_node);
 	mutex_unlock(&delayed_node->mutex);
 
+	btrfs_delayed_node_ref_tracker_free(delayed_node, tracker);
 	if (refcount_dec_and_test(&delayed_node->refs)) {
 		struct btrfs_root *root = delayed_node->root;
 
@@ -282,17 +324,20 @@ static void __btrfs_release_delayed_node(
 		 * back up.  We can delete it now.
 		 */
 		ASSERT(refcount_read(&delayed_node->refs) == 0);
+		btrfs_delayed_node_ref_tracker_dir_exit(delayed_node);
 		kmem_cache_free(delayed_node_cache, delayed_node);
 	}
 }
 
-static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node,
+					      struct ref_tracker **tracker)
 {
-	__btrfs_release_delayed_node(node, 0);
+	__btrfs_release_delayed_node(node, 0, tracker);
 }
 
-static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
-					struct btrfs_delayed_root *delayed_root)
+static struct btrfs_delayed_node *
+btrfs_first_prepared_delayed_node(struct btrfs_delayed_root *delayed_root,
+				  struct ref_tracker **tracker)
 {
 	struct btrfs_delayed_node *node;
 
@@ -302,16 +347,18 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
 	if (node) {
 		list_del_init(&node->p_list);
 		refcount_inc(&node->refs);
+		btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
 	}
 	spin_unlock(&delayed_root->lock);
 
 	return node;
 }
 
-static inline void btrfs_release_prepared_delayed_node(
-					struct btrfs_delayed_node *node)
+static inline void
+btrfs_release_prepared_delayed_node(struct btrfs_delayed_node *node,
+				    struct ref_tracker **tracker)
 {
-	__btrfs_release_delayed_node(node, 1);
+	__btrfs_release_delayed_node(node, 1, tracker);
 }
 
 static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
@@ -1126,6 +1173,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_delayed_node *curr_node, *prev_node;
+	struct ref_tracker *curr_delayed_node_tracker,
+		*prev_delayed_node_tracker;
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret = 0;
@@ -1143,7 +1192,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
 
 	delayed_root = fs_info->delayed_root;
 
-	curr_node = btrfs_first_delayed_node(delayed_root);
+	curr_node = btrfs_first_delayed_node(delayed_root,
+					     &curr_delayed_node_tracker);
 	while (curr_node && (!count || nr--)) {
 		ret = __btrfs_commit_inode_delayed_items(trans, path,
 							 curr_node);
@@ -1153,7 +1203,9 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
 		}
 
 		prev_node = curr_node;
-		curr_node = btrfs_next_delayed_node(curr_node);
+		prev_delayed_node_tracker = curr_delayed_node_tracker;
+		curr_node = btrfs_next_delayed_node(curr_node,
+						    &curr_delayed_node_tracker);
 		/*
 		 * See the comment below about releasing path before releasing
 		 * node. If the commit of delayed items was successful the path
@@ -1161,7 +1213,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
 		 * point to locked extent buffers (a leaf at the very least).
 		 */
 		ASSERT(path->nodes[0] == NULL);
-		btrfs_release_delayed_node(prev_node);
+		btrfs_release_delayed_node(prev_node,
+					   &prev_delayed_node_tracker);
 	}
 
 	/*
@@ -1174,7 +1227,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
 	btrfs_free_path(path);
 
 	if (curr_node)
-		btrfs_release_delayed_node(curr_node);
+		btrfs_release_delayed_node(curr_node,
+					   &curr_delayed_node_tracker);
 	trans->block_rsv = block_rsv;
 
 	return ret;
@@ -1193,7 +1247,9 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
 int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 				     struct btrfs_inode *inode)
 {
-	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct ref_tracker *delayed_node_tracker;
+	struct btrfs_delayed_node *delayed_node =
+		btrfs_get_delayed_node(inode, &delayed_node_tracker);
 	BTRFS_PATH_AUTO_FREE(path);
 	struct btrfs_block_rsv *block_rsv;
 	int ret;
@@ -1204,14 +1260,14 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 	mutex_lock(&delayed_node->mutex);
 	if (!delayed_node->count) {
 		mutex_unlock(&delayed_node->mutex);
-		btrfs_release_delayed_node(delayed_node);
+		btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 		return 0;
 	}
 	mutex_unlock(&delayed_node->mutex);
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		btrfs_release_delayed_node(delayed_node);
+		btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 		return -ENOMEM;
 	}
 
@@ -1220,7 +1276,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 
 	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 
-	btrfs_release_delayed_node(delayed_node);
+	btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 	trans->block_rsv = block_rsv;
 
 	return ret;
@@ -1230,7 +1286,9 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct ref_tracker *delayed_node_tracker;
+	struct btrfs_delayed_node *delayed_node =
+		btrfs_get_delayed_node(inode, &delayed_node_tracker);
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret;
@@ -1241,7 +1299,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
 	mutex_lock(&delayed_node->mutex);
 	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
 		mutex_unlock(&delayed_node->mutex);
-		btrfs_release_delayed_node(delayed_node);
+		btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 		return 0;
 	}
 	mutex_unlock(&delayed_node->mutex);
@@ -1275,7 +1333,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
 	btrfs_end_transaction(trans);
 	btrfs_btree_balance_dirty(fs_info);
 out:
-	btrfs_release_delayed_node(delayed_node);
+	btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 
 	return ret;
 }
@@ -1283,13 +1341,20 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
 void btrfs_remove_delayed_node(struct btrfs_inode *inode)
 {
 	struct btrfs_delayed_node *delayed_node;
+	struct ref_tracker **tracker;
 
 	delayed_node = READ_ONCE(inode->delayed_node);
 	if (!delayed_node)
 		return;
 
 	inode->delayed_node = NULL;
-	btrfs_release_delayed_node(delayed_node);
+
+#ifdef CONFIG_BTRFS_DEBUG
+	tracker = &delayed_node->inode_cache_tracker;
+#else
+	tracker = NULL;
+#endif
+	btrfs_release_delayed_node(delayed_node, tracker);
 }
 
 struct btrfs_async_delayed_work {
@@ -1305,6 +1370,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
 	struct btrfs_delayed_node *delayed_node = NULL;
+	struct ref_tracker *delayed_node_tracker;
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
 	int total_done = 0;
@@ -1321,7 +1387,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
 		    BTRFS_DELAYED_BACKGROUND / 2)
 			break;
 
-		delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+		delayed_node = btrfs_first_prepared_delayed_node(
+			delayed_root, &delayed_node_tracker);
 		if (!delayed_node)
 			break;
 
@@ -1330,7 +1397,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
 			btrfs_release_path(path);
-			btrfs_release_prepared_delayed_node(delayed_node);
+			btrfs_release_prepared_delayed_node(
+				delayed_node, &delayed_node_tracker);
 			total_done++;
 			continue;
 		}
@@ -1345,7 +1413,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
 		btrfs_btree_balance_dirty_nodelay(root->fs_info);
 
 		btrfs_release_path(path);
-		btrfs_release_prepared_delayed_node(delayed_node);
+		btrfs_release_prepared_delayed_node(delayed_node,
+						    &delayed_node_tracker);
 		total_done++;
 
 	} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
@@ -1377,10 +1446,15 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
 
 void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root);
+	struct ref_tracker *delayed_node_tracker;
+	struct btrfs_delayed_node *node = btrfs_first_delayed_node(
+		fs_info->delayed_root, &delayed_node_tracker);
 
-	if (WARN_ON(node))
+	if (WARN_ON(node)) {
+		btrfs_delayed_node_ref_tracker_free(node,
+						    &delayed_node_tracker);
 		refcount_dec(&node->refs);
+	}
 }
 
 static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
@@ -1454,13 +1528,15 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
 	struct btrfs_delayed_node *delayed_node;
+	struct ref_tracker *delayed_node_tracker;
 	struct btrfs_delayed_item *delayed_item;
 	struct btrfs_dir_item *dir_item;
 	bool reserve_leaf_space;
 	u32 data_len;
 	int ret;
 
-	delayed_node = btrfs_get_or_create_delayed_node(dir);
+	delayed_node =
+		btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
 	if (IS_ERR(delayed_node))
 		return PTR_ERR(delayed_node);
 
@@ -1536,7 +1612,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 	mutex_unlock(&delayed_node->mutex);
 
 release_node:
-	btrfs_release_delayed_node(delayed_node);
+	btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 	return ret;
 }
 
@@ -1591,10 +1667,11 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
 				   struct btrfs_inode *dir, u64 index)
 {
 	struct btrfs_delayed_node *node;
+	struct ref_tracker *delayed_node_tracker;
 	struct btrfs_delayed_item *item;
 	int ret;
 
-	node = btrfs_get_or_create_delayed_node(dir);
+	node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
 	if (IS_ERR(node))
 		return PTR_ERR(node);
 
@@ -1635,13 +1712,15 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
 	}
 	mutex_unlock(&node->mutex);
 end:
-	btrfs_release_delayed_node(node);
+	btrfs_release_delayed_node(node, &delayed_node_tracker);
 	return ret;
 }
 
 int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
 {
-	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct ref_tracker *delayed_node_tracker;
+	struct btrfs_delayed_node *delayed_node =
+		btrfs_get_delayed_node(inode, &delayed_node_tracker);
 
 	if (!delayed_node)
 		return -ENOENT;
@@ -1652,12 +1731,12 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
 	 * is updated now. So we needn't lock the delayed node.
 	 */
 	if (!delayed_node->index_cnt) {
-		btrfs_release_delayed_node(delayed_node);
+		btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 		return -EINVAL;
 	}
 
 	inode->index_cnt = delayed_node->index_cnt;
-	btrfs_release_delayed_node(delayed_node);
+	btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 	return 0;
 }
 
@@ -1668,8 +1747,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
 {
 	struct btrfs_delayed_node *delayed_node;
 	struct btrfs_delayed_item *item;
+	struct ref_tracker *delayed_node_tracker;
 
-	delayed_node = btrfs_get_delayed_node(inode);
+	delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
 	if (!delayed_node)
 		return false;
 
@@ -1704,6 +1784,8 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
 	 * insert/delete delayed items in this period. So we also needn't
 	 * requeue or dequeue this delayed node.
 	 */
+	btrfs_delayed_node_ref_tracker_free(delayed_node,
+					    &delayed_node_tracker);
 	refcount_dec(&delayed_node->refs);
 
 	return true;
@@ -1845,17 +1927,18 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_delayed_node *delayed_node;
+	struct ref_tracker *delayed_node_tracker;
 	struct btrfs_inode_item *inode_item;
 	struct inode *vfs_inode = &inode->vfs_inode;
 
-	delayed_node = btrfs_get_delayed_node(inode);
+	delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
 	if (!delayed_node)
 		return -ENOENT;
 
 	mutex_lock(&delayed_node->mutex);
 	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
 		mutex_unlock(&delayed_node->mutex);
-		btrfs_release_delayed_node(delayed_node);
+		btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 		return -ENOENT;
 	}
 
@@ -1895,7 +1978,7 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
 		inode->index_cnt = (u64)-1;
 
 	mutex_unlock(&delayed_node->mutex);
-	btrfs_release_delayed_node(delayed_node);
+	btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 	return 0;
 }
 
@@ -1904,9 +1987,11 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_delayed_node *delayed_node;
+	struct ref_tracker *delayed_node_tracker;
 	int ret = 0;
 
-	delayed_node = btrfs_get_or_create_delayed_node(inode);
+	delayed_node =
+		btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
 	if (IS_ERR(delayed_node))
 		return PTR_ERR(delayed_node);
 
@@ -1926,7 +2011,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 	atomic_inc(&root->fs_info->delayed_root->items);
 release_node:
 	mutex_unlock(&delayed_node->mutex);
-	btrfs_release_delayed_node(delayed_node);
+	btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 	return ret;
 }
 
@@ -1934,6 +2019,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_delayed_node *delayed_node;
+	struct ref_tracker *delayed_node_tracker;
 
 	/*
 	 * we don't do delayed inode updates during log recovery because it
@@ -1943,7 +2029,8 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
 	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
 		return -EAGAIN;
 
-	delayed_node = btrfs_get_or_create_delayed_node(inode);
+	delayed_node =
+		btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
 	if (IS_ERR(delayed_node))
 		return PTR_ERR(delayed_node);
 
@@ -1970,7 +2057,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
 	atomic_inc(&fs_info->delayed_root->items);
 release_node:
 	mutex_unlock(&delayed_node->mutex);
-	btrfs_release_delayed_node(delayed_node);
+	btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 	return 0;
 }
 
@@ -2014,19 +2101,21 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
 void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
 {
 	struct btrfs_delayed_node *delayed_node;
+	struct ref_tracker *delayed_node_tracker;
 
-	delayed_node = btrfs_get_delayed_node(inode);
+	delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
 	if (!delayed_node)
 		return;
 
 	__btrfs_kill_delayed_node(delayed_node);
-	btrfs_release_delayed_node(delayed_node);
+	btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
 }
 
 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 {
 	unsigned long index = 0;
 	struct btrfs_delayed_node *delayed_nodes[8];
+	struct ref_tracker *delayed_node_trackers[8];
 
 	while (1) {
 		struct btrfs_delayed_node *node;
@@ -2045,6 +2134,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 			 * about to be removed from the tree in the loop below
 			 */
 			if (refcount_inc_not_zero(&node->refs)) {
+				btrfs_delayed_node_ref_tracker_alloc(
+					node, &delayed_node_trackers[count],
+					GFP_ATOMIC);
 				delayed_nodes[count] = node;
 				count++;
 			}
@@ -2056,7 +2148,8 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 
 		for (int i = 0; i < count; i++) {
 			__btrfs_kill_delayed_node(delayed_nodes[i]);
-			btrfs_release_delayed_node(delayed_nodes[i]);
+			btrfs_release_delayed_node(delayed_nodes[i],
+						   &delayed_node_trackers[i]);
 		}
 	}
 }
@@ -2064,14 +2157,20 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_delayed_node *curr_node, *prev_node;
+	struct ref_tracker *curr_delayed_node_tracker,
+		*prev_delayed_node_tracker;
 
-	curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
+	curr_node = btrfs_first_delayed_node(fs_info->delayed_root,
+					     &curr_delayed_node_tracker);
 	while (curr_node) {
 		__btrfs_kill_delayed_node(curr_node);
 
 		prev_node = curr_node;
-		curr_node = btrfs_next_delayed_node(curr_node);
-		btrfs_release_delayed_node(prev_node);
+		prev_delayed_node_tracker = curr_delayed_node_tracker;
+		curr_node = btrfs_next_delayed_node(curr_node,
+						    &curr_delayed_node_tracker);
+		btrfs_release_delayed_node(prev_node,
+					   &prev_delayed_node_tracker);
 	}
 }
 
@@ -2081,8 +2180,9 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
 {
 	struct btrfs_delayed_node *node;
 	struct btrfs_delayed_item *item;
+	struct ref_tracker *delayed_node_tracker;
 
-	node = btrfs_get_delayed_node(inode);
+	node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
 	if (!node)
 		return;
 
@@ -2140,6 +2240,7 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
 	 * delete delayed items.
 	 */
 	ASSERT(refcount_read(&node->refs) > 1);
+	btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
 	refcount_dec(&node->refs);
 }
 
@@ -2150,8 +2251,9 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
 	struct btrfs_delayed_node *node;
 	struct btrfs_delayed_item *item;
 	struct btrfs_delayed_item *next;
+	struct ref_tracker *delayed_node_tracker;
 
-	node = btrfs_get_delayed_node(inode);
+	node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
 	if (!node)
 		return;
 
@@ -2183,5 +2285,6 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
 	 * delete delayed items.
 	 */
 	ASSERT(refcount_read(&node->refs) > 1);
+	btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
 	refcount_dec(&node->refs);
 }
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index e6e763ad2d42..4377f60e3457 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -7,6 +7,7 @@
 #ifndef BTRFS_DELAYED_INODE_H
 #define BTRFS_DELAYED_INODE_H
 
+#include <linux/ref_tracker.h>
 #include <linux/types.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
@@ -63,8 +64,19 @@ struct btrfs_delayed_node {
 	struct rb_root_cached del_root;
 	struct mutex mutex;
 	struct btrfs_inode_item inode_item;
-	refcount_t refs;
+
 	int count;
+	refcount_t refs;
+
+#ifdef CONFIG_BTRFS_DEBUG
+	/* Used to track all references to this delayed node. */
+	struct ref_tracker_dir ref_dir;
+	/* Used to track delayed node reference stored in node list. */
+	struct ref_tracker *node_list_tracker;
+	/* Used to track delayed node reference stored in inode cache. */
+	struct ref_tracker *inode_cache_tracker;
+#endif
+
 	u64 index_cnt;
 	unsigned long flags;
 	/*
@@ -106,6 +118,53 @@ struct btrfs_delayed_item {
 	char data[] __counted_by(data_len);
 };
 
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node, 
+						       unsigned int quarantine_count,
+						       const char *name)
+{
+	ref_tracker_dir_init(&node->ref_dir, quarantine_count, name);
+}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node)
+{
+	ref_tracker_dir_exit(&node->ref_dir);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+						    struct ref_tracker **tracker,
+						    gfp_t gfp)
+{
+	return ref_tracker_alloc(&node->ref_dir, tracker, gfp);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+						   struct ref_tracker **tracker)
+{
+	return ref_tracker_free(&node->ref_dir, tracker);
+}
+#else
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node, 
+						       unsigned int quarantine_count,
+						       const char *name) {}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) {}
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+						    struct ref_tracker **tracker,
+						    gfp_t gfp)
+{
+	return 0;
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+						   struct ref_tracker **tracker)
+{
+	return 0;
+}
+#endif
+
+
 void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root);
 int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 				   const char *name, int name_len,
-- 
2.47.3


* Re: [PATCH v2] btrfs: implement ref_tracker for delayed_nodes
  2025-07-29  7:08 [PATCH v2] btrfs: implement ref_tracker for delayed_nodes Leo Martins
@ 2025-08-04 13:57 ` David Sterba
  2025-08-04 19:00   ` Leo Martins
  0 siblings, 1 reply; 6+ messages in thread
From: David Sterba @ 2025-08-04 13:57 UTC (permalink / raw)
  To: Leo Martins; +Cc: linux-btrfs, kernel-team

On Tue, Jul 29, 2025 at 12:08:04AM -0700, Leo Martins wrote:
> This patch adds ref_tracker infrastructure for btrfs_delayed_node.
> 
> A ref_tracker object is allocated every time the reference count is
> increased and freed when the reference count is decreased. The
> ref_tracker object contains stack_trace information about where the
> ref count is increased and decreased. The ref_tracker_dir embedded in
> btrfs_delayed_node keeps track of all current references, and some
> previous references. This information allows for detection of reference
> leaks or double frees.
> 
> Here is a common example of taking a reference to a delayed_node and
> freeing it with ref_tracker.
> 
> ```C
> struct ref_tracker *tracker;
> struct btrfs_delayed_node *node;
> 
> node = btrfs_get_delayed_node(inode, tracker);
> // use delayed_node...
> btrfs_release_delayed_node(node, tracker);
> ```
> 
> There are two special cases where the delayed_node reference is long
> term, meaning that the thread that takes the reference and the thread
> that frees the reference are different. The inode_cache which is the
> btrfs_delayed_node reference stored in the associated btrfs_inode, and
> the node_list which is the btrfs_delayed_node reference stored in the
> btrfs_delayed_root node_list/prepare_list. In these cases the
> ref_tracker is stored in the delayed_node itself.
> 
> This patch introduces some new wrappers (btrfs_delayed_node_ref_tracker_*)
> to ensure that when BTRFS_DEBUG is disabled everything gets compiled out.
> 
> Signed-off-by: Leo Martins <loemra.dev@gmail.com>

I'm still missing why we need to add this whole infrastructure; there
weren't any recent bugs in delayed inode tracking. I understand what the
ref tracker does, but it still increases the structure size and adds a
perf hit that is acceptable for debugging kernels, but it is a cost.

We have the ref-verify code and mount option; from what I've heard it
was used sporadically when touching related code. We can let the ref
tracking work like that too, not turned on by default, i.e. requiring a
mount option.

In case you'd want to enhance more structures with the ref tracker, the
mount option can enable them selectively.

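A gating check in the alloc/free wrappers could be as simple as this
sketch (BTRFS_MOUNT_REF_TRACKER would be a new, hypothetical option):

if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
	return 0;
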
> @@ -106,6 +113,13 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
>  		 */
>  		if (refcount_inc_not_zero(&node->refs)) {
>  			refcount_inc(&node->refs);
> +#ifdef CONFIG_BTRFS_DEBUG
> +			inode_cache_tracker = &node->inode_cache_tracker;
> +#endif

We try to avoid #ifdefs if it's a repeating pattern; this should be
abstracted away in a helper.

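E.g. a minimal sketch of such a helper (the name is illustrative):

static inline struct ref_tracker **btrfs_delayed_node_inode_cache_tracker(
					struct btrfs_delayed_node *node)
{
#ifdef CONFIG_BTRFS_DEBUG
	return &node->inode_cache_tracker;
#else
	/* The alloc/free wrappers are no-ops anyway without BTRFS_DEBUG. */
	return NULL;
#endif
}
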
> +			btrfs_delayed_node_ref_tracker_alloc(node, tracker,
> +							     GFP_ATOMIC);
> +			btrfs_delayed_node_ref_tracker_alloc(
> +				node, inode_cache_tracker, GFP_ATOMIC);
>  			btrfs_inode->delayed_node = node;
>  		} else {
>  			node = NULL;
> @@ -125,17 +139,19 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
>   *
>   * Return the delayed node, or error pointer on failure.
>   */
> -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
> -		struct btrfs_inode *btrfs_inode)
> +static struct btrfs_delayed_node *
> +btrfs_get_or_create_delayed_node(struct btrfs_inode *btrfs_inode,
> +				 struct ref_tracker **tracker)

And another pattern: please don't change the style of the function
definition line. The return value is on the same line as the name; if
the parameters don't fit under a reasonable column limit like 90 chars
then they go on the next line. Most of the code has been fixed so you
can just extend it.

>  {
>  	struct btrfs_delayed_node *node;
> +	struct ref_tracker **inode_cache_tracker = NULL;
>  	struct btrfs_root *root = btrfs_inode->root;
>  	u64 ino = btrfs_ino(btrfs_inode);
>  	int ret;
>  	void *ptr;
>  

> --- a/fs/btrfs/delayed-inode.h
> +++ b/fs/btrfs/delayed-inode.h
> @@ -7,6 +7,7 @@
>  #ifndef BTRFS_DELAYED_INODE_H
>  #define BTRFS_DELAYED_INODE_H
>  
> +#include <linux/ref_tracker.h>
>  #include <linux/types.h>
>  #include <linux/rbtree.h>
>  #include <linux/spinlock.h>
> @@ -63,8 +64,19 @@ struct btrfs_delayed_node {
>  	struct rb_root_cached del_root;
>  	struct mutex mutex;
>  	struct btrfs_inode_item inode_item;
> -	refcount_t refs;
> +
>  	int count;
> +	refcount_t refs;
> +
> +#ifdef CONFIG_BTRFS_DEBUG
> +	/* Used to track all references to this delayed node. */
> +	struct ref_tracker_dir ref_dir;
> +	/* Used to track delayed node reference stored in node list. */
> +	struct ref_tracker *node_list_tracker;
> +	/* Used to track delayed node reference stored in inode cache. */
> +	struct ref_tracker *inode_cache_tracker;
> +#endif

Optional and debugging structure members are best placed near the end.
In this case they are related to the refs, so this could be mentioned in
a comment, as 'refs' should not be moved.

> +
>  	u64 index_cnt;
>  	unsigned long flags;
>  	/*

* Re: [PATCH v2] btrfs: implement ref_tracker for delayed_nodes
  2025-08-04 13:57 ` David Sterba
@ 2025-08-04 19:00   ` Leo Martins
  2025-08-05 15:14     ` David Sterba
  0 siblings, 1 reply; 6+ messages in thread
From: Leo Martins @ 2025-08-04 19:00 UTC (permalink / raw)
  To: David Sterba; +Cc: linux-btrfs, kernel-team

On Mon, 4 Aug 2025 15:57:50 +0200 David Sterba <dsterba@suse.cz> wrote:

> On Tue, Jul 29, 2025 at 12:08:04AM -0700, Leo Martins wrote:
> > This patch adds ref_tracker infrastructure for btrfs_delayed_node.
> > 
> > A ref_tracker object is allocated every time the reference count is
> > increased and freed when the reference count is decreased. The
> > ref_tracker object contains stack_trace information about where the
> > ref count is increased and decreased. The ref_tracker_dir embedded in
> > btrfs_delayed_node keeps track of all current references, and some
> > previous references. This information allows for detection of reference
> > leaks or double frees.
> > 
> > Here is a common example of taking a reference to a delayed_node and
> > freeing it with ref_tracker.
> > 
> > ```C
> > struct ref_tracker *tracker;
> > struct btrfs_delayed_node *node;
> > 
> > node = btrfs_get_delayed_node(inode, tracker);
> > // use delayed_node...
> > btrfs_release_delayed_node(node, tracker);
> > ```
> > 
> > There are two special cases where the delayed_node reference is long
> > term, meaning that the thread that takes the reference and the thread
> > that frees the reference are different. The inode_cache which is the
> > btrfs_delayed_node reference stored in the associated btrfs_inode, and
> > the node_list which is the btrfs_delayed_node reference stored in the
> > btrfs_delayed_root node_list/prepare_list. In these cases the
> > ref_tracker is stored in the delayed_node itself.
> > 
> > This patch introduces some new wrappers (btrfs_delayed_node_ref_tracker_*)
> > to ensure that when BTRFS_DEBUG is disabled everything gets compiled out.
> > 
> > Signed-off-by: Leo Martins <loemra.dev@gmail.com>
> 
> I'm still missing why we need to add this whole infrastructure, there
> weren't any recent bugs in delayed inode tracking. I understand what the
> ref tracker does but it's still increasing structure size and adds a
> perf hit that is acceptable for debugging kernels but still.

Our leading btrfs crash is a soft lockup related to delayed_nodes.

[21017.354271] [     C96] Call Trace:
[21017.354273] [     C96]  <IRQ>
[21017.354278] [     C96]  ? rcu_dump_cpu_stacks+0xa1/0xe0
[21017.354282] [     C96]  ? print_cpu_stall+0x113/0x200
[21017.354284] [     C96]  ? rcu_sched_clock_irq+0x633/0x730
[21017.354287] [     C96]  ? update_process_times+0x66/0xa0
[21017.354288] [     C96]  ? dma_map_page_attrs+0x250/0x250
[21017.354289] [     C96]  ? tick_nohz_handler+0x84/0x1d0
[21017.354291] [     C96]  ? hrtimer_interrupt+0x1a1/0x5b0
[21017.354293] [     C96]  ? __sysvec_apic_timer_interrupt+0x44/0xe0
[21017.354295] [     C96]  ? sysvec_apic_timer_interrupt+0x6b/0x80
[21017.354296] [     C96]  </IRQ>
[21017.354297] [     C96]  <TASK>
[21017.354297] [     C96]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
[21017.354302] [     C96]  ? xa_find+0xa/0xb0
[21017.354305] [     C96]  btrfs_kill_all_delayed_nodes+0x6b/0x150
[21017.354307] [     C96]  ? __switch_to+0x133/0x530
[21017.354308] [     C96]  ? schedule+0xff4/0x1380
[21017.354310] [     C96]  btrfs_clean_one_deleted_snapshot+0x70/0xe0
[21017.354313] [     C96]  cleaner_kthread+0x83/0x120
[21017.354315] [     C96]  ? exportfs_encode_inode_fh+0x60/0x60
[21017.354317] [     C96]  kthread+0xae/0xe0
[21017.354319] [     C96]  ? __efi_queue_work+0x130/0x130
[21017.354320] [     C96]  ret_from_fork+0x2f/0x40
[21017.354322] [     C96]  ? __efi_queue_work+0x130/0x130
[21017.354323] [     C96]  ret_from_fork_asm+0x11/0x20
[21017.354326] [     C96]  </TASK>
[21042.628340] [     C96] watchdog: BUG: soft lockup - CPU#96 stuck for 45s! [btrfs-cleaner:1438]

btrfs_kill_all_delayed_nodes is getting stuck in an infinite loop because
the root->delayed_nodes xarray has a delayed_node that never gets erased.
Inspecting the crash dump reveals that the delayed_node has a reference count
of one, indicating there is a reference leak.

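With the ref_tracker infrastructure, the stack trace of the leaked
reference could be read straight out of the node's tracking directory,
e.g. (a debugging sketch):

/* Print up to 16 outstanding/quarantined trackers for this node. */
ref_tracker_dir_print(&delayed_node->ref_dir, 16);
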
I believe this bug was introduced or at least magnified somewhere between
6.9 and 6.11.

Let me know if you want any more details about the crash.

> 
> We have the ref-verify code and mount option, from what I've heard it
> was used sporadically when touching related code. We can let the ref
> tracking work like that too, not turned on by default ie. requiring a
> mount option.

The only issue with a mount option is that we would not be able to
conditionally compile out the ref_tracker_dir and ref_tracker pointers
in struct btrfs_delayed_node, permanently increasing the size of the
struct from 304 bytes to 400 bytes.

Alternatively, there could be a delayed_node ref_tracker helper struct that we
could store a pointer to.

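Something like this sketch (names illustrative):

/* Allocated only when ref tracking is enabled. */
struct btrfs_delayed_node_trackers {
	struct ref_tracker_dir ref_dir;
	struct ref_tracker *node_list_tracker;
	struct ref_tracker *inode_cache_tracker;
};

so that struct btrfs_delayed_node only grows by one pointer.
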
> 
> In case you'd want to enhance more structures with ref tracker the mount
> option can enable them selectively.
> 
> > @@ -106,6 +113,13 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
> >  		 */
> >  		if (refcount_inc_not_zero(&node->refs)) {
> >  			refcount_inc(&node->refs);
> > +#ifdef CONFIG_BTRFS_DEBUG
> > +			inode_cache_tracker = &node->inode_cache_tracker;
> > +#endif
> 
> We try to avoid #ifdefs if it's a repeating pattern, this should be
> abstracted away in a helper.

Sounds good.

For the record, there would be no need for it if we used the typedef
approach :-). There is a good example of how the original
author of ref_tracker implemented it in include/net/net_trackers.h.

> 
> > +			btrfs_delayed_node_ref_tracker_alloc(node, tracker,
> > +							     GFP_ATOMIC);
> > +			btrfs_delayed_node_ref_tracker_alloc(
> > +				node, inode_cache_tracker, GFP_ATOMIC);
> >  			btrfs_inode->delayed_node = node;
> >  		} else {
> >  			node = NULL;
> > @@ -125,17 +139,19 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
> >   *
> >   * Return the delayed node, or error pointer on failure.
> >   */
> > -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
> > -		struct btrfs_inode *btrfs_inode)
> > +static struct btrfs_delayed_node *
> > +btrfs_get_or_create_delayed_node(struct btrfs_inode *btrfs_inode,
> > +				 struct ref_tracker **tracker)
> 
> And another pattern, please don't change the style of the function
> definition line, the return value is on the same line as name, if
> parameters don't fit under reasonable column limit like 90 chars then
> it's on the next line. Most of the code has been fixed so you can just
> extend it.

Got it, I was just using the default kernel .clang-format.
Is there a btrfs-specific .clang-format I could use?

> 
> >  {
> >  	struct btrfs_delayed_node *node;
> > +	struct ref_tracker **inode_cache_tracker = NULL;
> >  	struct btrfs_root *root = btrfs_inode->root;
> >  	u64 ino = btrfs_ino(btrfs_inode);
> >  	int ret;
> >  	void *ptr;
> >  
> 
> > --- a/fs/btrfs/delayed-inode.h
> > +++ b/fs/btrfs/delayed-inode.h
> > @@ -7,6 +7,7 @@
> >  #ifndef BTRFS_DELAYED_INODE_H
> >  #define BTRFS_DELAYED_INODE_H
> >  
> > +#include <linux/ref_tracker.h>
> >  #include <linux/types.h>
> >  #include <linux/rbtree.h>
> >  #include <linux/spinlock.h>
> > @@ -63,8 +64,19 @@ struct btrfs_delayed_node {
> >  	struct rb_root_cached del_root;
> >  	struct mutex mutex;
> >  	struct btrfs_inode_item inode_item;
> > -	refcount_t refs;
> > +
> >  	int count;
> > +	refcount_t refs;
> > +
> > +#ifdef CONFIG_BTRFS_DEBUG
> > +	/* Used to track all references to this delayed node. */
> > +	struct ref_tracker_dir ref_dir;
> > +	/* Used to track delayed node reference stored in node list. */
> > +	struct ref_tracker *node_list_tracker;
> > +	/* Used to track delayed node reference stored in inode cache. */
> > +	struct ref_tracker *inode_cache_tracker;
> > +#endif
> 
> Optional and debugging structure members are best placed near the end.
> In this case it's related to the refs so this could be mentioned in
> comments as 'refs' should not be moved.

Ok.

> 
> > +
> >  	u64 index_cnt;
> >  	unsigned long flags;
> >  	/*

Sent using hkml (https://github.com/sjp38/hackermail)

* Re: [PATCH v2] btrfs: implement ref_tracker for delayed_nodes
  2025-08-04 19:00   ` Leo Martins
@ 2025-08-05 15:14     ` David Sterba
  2025-08-05 18:07       ` Boris Burkov
  2025-08-05 19:48       ` Leo Martins
  0 siblings, 2 replies; 6+ messages in thread
From: David Sterba @ 2025-08-05 15:14 UTC (permalink / raw)
  To: Leo Martins; +Cc: David Sterba, linux-btrfs, kernel-team

On Mon, Aug 04, 2025 at 12:00:12PM -0700, Leo Martins wrote:
> On Mon, 4 Aug 2025 15:57:50 +0200 David Sterba <dsterba@suse.cz> wrote:
> 
> > On Tue, Jul 29, 2025 at 12:08:04AM -0700, Leo Martins wrote:
> > > This patch adds ref_tracker infrastructure for btrfs_delayed_node.
> > > 
> > > A ref_tracker object is allocated every time the reference count is
> > > increased and freed when the reference count is decreased. The
> > > ref_tracker object contains stack_trace information about where the
> > > ref count is increased and decreased. The ref_tracker_dir embedded in
> > > btrfs_delayed_node keeps track of all current references, and some
> > > previous references. This information allows for detection of reference
> > > leaks or double frees.
> > > 
> > > Here is a common example of taking a reference to a delayed_node and
> > > freeing it with ref_tracker.
> > > 
> > > ```C
> > > struct ref_tracker *tracker;
> > > struct btrfs_delayed_node *node;
> > > 
> > > node = btrfs_get_delayed_node(inode, tracker);
> > > // use delayed_node...
> > > btrfs_release_delayed_node(node, tracker);
> > > ```
> > > 
> > > There are two special cases where the delayed_node reference is long
> > > term, meaning that the thread that takes the reference and the thread
> > > that frees the reference are different. The inode_cache which is the
> > > btrfs_delayed_node reference stored in the associated btrfs_inode, and
> > > the node_list which is the btrfs_delayed_node reference stored in the
> > > btrfs_delayed_root node_list/prepare_list. In these cases the
> > > ref_tracker is stored in the delayed_node itself.
> > > 
> > > This patch introduces some new wrappers (btrfs_delayed_node_ref_tracker_*)
> > > to ensure that when BTRFS_DEBUG is disabled everything gets compiled out.
> > > 
> > > Signed-off-by: Leo Martins <loemra.dev@gmail.com>
> > 
> > I'm still missing why we need to add this whole infrastructure, there
> > weren't any recent bugs in delayed inode tracking. I understand what the
> > ref tracker does but it's still increasing structure size and adds a
> > perf hit that is acceptable for debugging kernels but still.
> 
> Our leading btrfs crash is a soft lockup related to delayed_nodes.

Adding infrastructure for real problems makes sense; this is a good
justification and should be mentioned in the patch. On which version do
you observe it?

> [21017.354271] [     C96] Call Trace:
> [21017.354273] [     C96]  <IRQ>
> [21017.354278] [     C96]  ? rcu_dump_cpu_stacks+0xa1/0xe0
> [21017.354282] [     C96]  ? print_cpu_stall+0x113/0x200
> [21017.354284] [     C96]  ? rcu_sched_clock_irq+0x633/0x730
> [21017.354287] [     C96]  ? update_process_times+0x66/0xa0
> [21017.354288] [     C96]  ? dma_map_page_attrs+0x250/0x250
> [21017.354289] [     C96]  ? tick_nohz_handler+0x84/0x1d0
> [21017.354291] [     C96]  ? hrtimer_interrupt+0x1a1/0x5b0
> [21017.354293] [     C96]  ? __sysvec_apic_timer_interrupt+0x44/0xe0
> [21017.354295] [     C96]  ? sysvec_apic_timer_interrupt+0x6b/0x80
> [21017.354296] [     C96]  </IRQ>
> [21017.354297] [     C96]  <TASK>
> [21017.354297] [     C96]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
> [21017.354302] [     C96]  ? xa_find+0xa/0xb0
> [21017.354305] [     C96]  btrfs_kill_all_delayed_nodes+0x6b/0x150
> [21017.354307] [     C96]  ? __switch_to+0x133/0x530
> [21017.354308] [     C96]  ? schedule+0xff4/0x1380
> [21017.354310] [     C96]  btrfs_clean_one_deleted_snapshot+0x70/0xe0
> [21017.354313] [     C96]  cleaner_kthread+0x83/0x120
> [21017.354315] [     C96]  ? exportfs_encode_inode_fh+0x60/0x60
> [21017.354317] [     C96]  kthread+0xae/0xe0
> [21017.354319] [     C96]  ? __efi_queue_work+0x130/0x130
> [21017.354320] [     C96]  ret_from_fork+0x2f/0x40
> [21017.354322] [     C96]  ? __efi_queue_work+0x130/0x130
> [21017.354323] [     C96]  ret_from_fork_asm+0x11/0x20
> [21017.354326] [     C96]  </TASK>
> [21042.628340] [     C96] watchdog: BUG: soft lockup - CPU#96 stuck for 45s! [btrfs-cleaner:1438]
> 
> btrfs_kill_all_delayed_nodes is getting stuck in an infinite loop because
> the root->delayed_nodes xarray has a delayed_node that never gets erased.
> Inspecting the crash dump reveals that the delayed_node has a reference count
> of one, indicating there is a reference leak.
> 
> I believe this bug was introduced or at least magnified somewhere between
> 6.9 and 6.11.

That's likely to be related to the xarray conversion in 6.8.

> Let me know if you want anymore details about the crash.
> 
> > 
> > We have the ref-verify code and mount option, from what I've heard it
> > was used sporadically when touching related code. We can let the ref
> > tracking work like that too, not turned on by default ie. requiring a
> > mount option.
> 
> The only issue with a mount option is that we would not be able to conditionally
> compile ref_tracker_dir and ref_tracker pointers in struct btrfs_delayed_node,
> permanently increasing the size of the struct from 304 bytes to 400 bytes.

That's not a small difference, though for delayed inodes it may still be
within the acceptable range. Turning on debugging features for other
subsystems also bloats structures; lockdep makes a 4B spinlock something
like 70B, also indirectly hidden in other embedded structures.

> Alternatively, there could be a delayed_node ref_tracker helper struct that we
> could store a pointer to.

This might make more sense for the first implementation.

> > In case you'd want to enhance more structures with ref tracker the mount
> > option can enable them selectively.
> > 
> > > @@ -106,6 +113,13 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
> > >  		 */
> > >  		if (refcount_inc_not_zero(&node->refs)) {
> > >  			refcount_inc(&node->refs);
> > > +#ifdef CONFIG_BTRFS_DEBUG
> > > +			inode_cache_tracker = &node->inode_cache_tracker;
> > > +#endif
> > 
> > We try to avoid #ifdefs if it's a repeating pattern, this should be
> > abstracted away in a helper.
> 
> Sounds good.
> 
> For the record, there would be no need for it if we used the typedef
> approach :-). There is a good example of how the original
> author of ref_tracker implemented it in include/net/net_trackers.h.

We also try to stick to consistent coding style and patterns even if
there's something possibly better. Dealing with various preferences in
a big code base becomes quite distracting over time.

Could the typedef be replaced by a struct that's abstracting the
conditional type? Something like

struct delayed_inode_tracker {
#ifdef ...
	struct ref_tracker tracker;
#else
	struct {} tracker;
#endif
};

Visually it's basically the same as what I see in net_trackers.h, and we
won't have to argue about the typedefs.

> > > +			btrfs_delayed_node_ref_tracker_alloc(node, tracker,
> > > +							     GFP_ATOMIC);
> > > +			btrfs_delayed_node_ref_tracker_alloc(
> > > +				node, inode_cache_tracker, GFP_ATOMIC);
> > >  			btrfs_inode->delayed_node = node;
> > >  		} else {
> > >  			node = NULL;
> > > @@ -125,17 +139,19 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
> > >   *
> > >   * Return the delayed node, or error pointer on failure.
> > >   */
> > > -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
> > > -		struct btrfs_inode *btrfs_inode)
> > > +static struct btrfs_delayed_node *
> > > +btrfs_get_or_create_delayed_node(struct btrfs_inode *btrfs_inode,
> > > +				 struct ref_tracker **tracker)
> > 
> > And another pattern, please don't change the style of the function
> > definition line, the return value is on the same line as name, if
> > parameters don't fit under reasonable column limit like 90 chars then
> > it's on the next line. Most of the code has been fixed so you can just
> > extend it.
> 
> Got it, I was just using the default kernel .clang-format.
> Is there a btrfs specific .clang-format I could use?

Though there's a "default kernel" .clang-format, many subsystems have
their own style. I think Boris mentioned an adjusted .clang-format for
btrfs, but we have hand-tuned code that I'm not sure can be expressed in
the config. The best we can do is to let it format things that have
basically no exceptions and not touch the rest.

* Re: [PATCH v2] btrfs: implement ref_tracker for delayed_nodes
  2025-08-05 15:14     ` David Sterba
@ 2025-08-05 18:07       ` Boris Burkov
  2025-08-05 19:48       ` Leo Martins
  1 sibling, 0 replies; 6+ messages in thread
From: Boris Burkov @ 2025-08-05 18:07 UTC (permalink / raw)
  To: David Sterba; +Cc: Leo Martins, linux-btrfs, kernel-team

On Tue, Aug 05, 2025 at 05:14:47PM +0200, David Sterba wrote:
> On Mon, Aug 04, 2025 at 12:00:12PM -0700, Leo Martins wrote:
> > On Mon, 4 Aug 2025 15:57:50 +0200 David Sterba <dsterba@suse.cz> wrote:
> > 
> > > On Tue, Jul 29, 2025 at 12:08:04AM -0700, Leo Martins wrote:
> > > > This patch adds ref_tracker infrastructure for btrfs_delayed_node.
> > > > 
> > > > A ref_tracker object is allocated every time the reference count is
> > > > increased and freed when the reference count is decreased. The
> > > > ref_tracker object contains stack_trace information about where the
> > > > ref count is increased and decreased. The ref_tracker_dir embedded in
> > > > btrfs_delayed_node keeps track of all current references, and some
> > > > previous references. This information allows for detection of reference
> > > > leaks or double frees.
> > > > 
> > > > Here is a common example of taking a reference to a delayed_node and
> > > > freeing it with ref_tracker.
> > > > 
> > > > ```C
> > > > struct ref_tracker *tracker;
> > > > struct btrfs_delayed_node *node;
> > > > 
> > > > node = btrfs_get_delayed_node(inode, tracker);
> > > > // use delayed_node...
> > > > btrfs_release_delayed_node(node, tracker);
> > > > ```
> > > > 
> > > > There are two special cases where the delayed_node reference is long
> > > > term, meaning that the thread that takes the reference and the thread
> > > > that frees the reference are different. The inode_cache which is the
> > > > btrfs_delayed_node reference stored in the associated btrfs_inode, and
> > > > the node_list which is the btrfs_delayed_node reference stored in the
> > > > btrfs_delayed_root node_list/prepare_list. In these cases the
> > > > ref_tracker is stored in the delayed_node itself.
> > > > 
> > > > This patch introduces some new wrappers (btrfs_delayed_node_ref_tracker_*)
> > > > to ensure that when BTRFS_DEBUG is disabled everything gets compiled out.
> > > > 
> > > > Signed-off-by: Leo Martins <loemra.dev@gmail.com>
> > > 
> > > I'm still missing why we need to add this whole infrastructure, there
> > > weren't any recent bugs in delayed inode tracking. I understand what the
> > > ref tracker does but it's still increasing structure size and adds a
> > > perf hit that is acceptable for debugging kernels but still.
> > 
> > Our leading btrfs crash is a soft lockup related to delayed_nodes.
> 
> Adding infrastructure for real problems makes sense, this is a good
> justification and should be mentioned in the patch. On which version do
> you observe it?
> 
> > [21017.354271] [     C96] Call Trace:
> > [21017.354273] [     C96]  <IRQ>
> > [21017.354278] [     C96]  ? rcu_dump_cpu_stacks+0xa1/0xe0
> > [21017.354282] [     C96]  ? print_cpu_stall+0x113/0x200
> > [21017.354284] [     C96]  ? rcu_sched_clock_irq+0x633/0x730
> > [21017.354287] [     C96]  ? update_process_times+0x66/0xa0
> > [21017.354288] [     C96]  ? dma_map_page_attrs+0x250/0x250
> > [21017.354289] [     C96]  ? tick_nohz_handler+0x84/0x1d0
> > [21017.354291] [     C96]  ? hrtimer_interrupt+0x1a1/0x5b0
> > [21017.354293] [     C96]  ? __sysvec_apic_timer_interrupt+0x44/0xe0
> > [21017.354295] [     C96]  ? sysvec_apic_timer_interrupt+0x6b/0x80
> > [21017.354296] [     C96]  </IRQ>
> > [21017.354297] [     C96]  <TASK>
> > [21017.354297] [     C96]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
> > [21017.354302] [     C96]  ? xa_find+0xa/0xb0
> > [21017.354305] [     C96]  btrfs_kill_all_delayed_nodes+0x6b/0x150
> > [21017.354307] [     C96]  ? __switch_to+0x133/0x530
> > [21017.354308] [     C96]  ? schedule+0xff4/0x1380
> > [21017.354310] [     C96]  btrfs_clean_one_deleted_snapshot+0x70/0xe0
> > [21017.354313] [     C96]  cleaner_kthread+0x83/0x120
> > [21017.354315] [     C96]  ? exportfs_encode_inode_fh+0x60/0x60
> > [21017.354317] [     C96]  kthread+0xae/0xe0
> > [21017.354319] [     C96]  ? __efi_queue_work+0x130/0x130
> > [21017.354320] [     C96]  ret_from_fork+0x2f/0x40
> > [21017.354322] [     C96]  ? __efi_queue_work+0x130/0x130
> > [21017.354323] [     C96]  ret_from_fork_asm+0x11/0x20
> > [21017.354326] [     C96]  </TASK>
> > [21042.628340] [     C96] watchdog: BUG: soft lockup - CPU#96 stuck for 45s! [btrfs-cleaner:1438]
> > 
> > btrfs_kill_all_delayed_nodes is getting stuck in an infinite loop because
> > the root->delayed_nodes xarray has a delayed_node that never gets erased.
> > Inspecting the crash dump reveals that the delayed_node has a reference count
> > of one, indicating there is a reference leak.
> > 
> > I believe this bug was introduced or at least magnified somewhere between
> > 6.9 and 6.11.
> 
> That's likely to be related to the xarray conversion in 6.8.
> 

We suspected as much, but despite a lot of staring we cannot see
anything wrong with it. It could be something super subtle, like a
missing READ_ONCE, or the fact that the locks are no longer the same
for the two users.
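
To make the concern concrete, here is a minimal sketch of the kind of
pairing we were staring at (hypothetical helpers, not the actual btrfs
code):

```C
/* Hypothetical illustration only, not the real delayed_node code. */
static struct btrfs_delayed_node *peek_delayed_node(struct btrfs_inode *inode)
{
	/* Lockless reader: relies on READ_ONCE for a sane snapshot. */
	return READ_ONCE(inode->delayed_node);
}

static void clear_delayed_node(struct btrfs_inode *inode)
{
	/*
	 * Writer publishes under its own lock. If a reader or writer
	 * skips the _ONCE accessors, or the two users stop agreeing on
	 * which lock protects the pointer, a reference can leak and the
	 * refcount never reaches zero.
	 */
	WRITE_ONCE(inode->delayed_node, NULL);
}
```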

> > Let me know if you want any more details about the crash.
> > 
> > > 
> > > We have the ref-verify code and mount option, from what I've heard it
> > > was used sporadically when touching related code. We can let the ref
> > > tracking work like that too, not turned on by default, i.e. requiring a
> > > mount option.
> > 
> > The only issue with a mount option is that we would not be able to conditionally
> > compile ref_tracker_dir and ref_tracker pointers in struct btrfs_delayed_node,
> > permanently increasing the size of the struct from 304 bytes to 400 bytes.
> 
> That's not a small difference, though for delayed inodes it may still be
> within the acceptable range. Turning on debugging features for other
> subsystems also bloats structures; lockdep turns a 4B spinlock into
> something like 70B, also indirectly hidden in other embedded structures.
> 
> > Alternatively, there could be a delayed_node ref_tracker helper struct that we
> > could store a pointer to.
> 
> This might make more sense for the first implementation.
> 
> > > In case you'd want to enhance more structures with ref tracker the mount
> > > option can enable them selectively.
> > > 
> > > > @@ -106,6 +113,13 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
> > > >  		 */
> > > >  		if (refcount_inc_not_zero(&node->refs)) {
> > > >  			refcount_inc(&node->refs);
> > > > +#ifdef CONFIG_BTRFS_DEBUG
> > > > +			inode_cache_tracker = &node->inode_cache_tracker;
> > > > +#endif
> > > 
> > > We try to avoid #ifdefs if it's a repeating pattern, this should be
> > > abstracted away in a helper.
> > 
> > Sounds good.
> > 
> > For the record, there would be no need for it if we used the typedef
> > approach :-). There is a good example of how the original
> > author of ref_tracker implemented it in include/net/net_trackers.h.
> 
> We also try to stick to consistent coding style and patterns even if
> there's something possibly better. Dealing with various preferences in
> a big code base becomes quite distracting over time.
> 
> Could the typedef be replaced by a struct that's abstracting the
> conditional type? Something like
> 
> struct delayed_inode_tracker {
> #ifdef ...
> 	struct ref_tracker tracker;
> #else
> 	struct {} tracker;
> #endif
> };
> 
> Visually it's basically the same as what I see in net_trackers.h, and we
> won't have to argue about the typedefs.
> 

Oh that's clever, I like it :)

> > > > +			btrfs_delayed_node_ref_tracker_alloc(node, tracker,
> > > > +							     GFP_ATOMIC);
> > > > +			btrfs_delayed_node_ref_tracker_alloc(
> > > > +				node, inode_cache_tracker, GFP_ATOMIC);
> > > >  			btrfs_inode->delayed_node = node;
> > > >  		} else {
> > > >  			node = NULL;
> > > > @@ -125,17 +139,19 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
> > > >   *
> > > >   * Return the delayed node, or error pointer on failure.
> > > >   */
> > > > -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
> > > > -		struct btrfs_inode *btrfs_inode)
> > > > +static struct btrfs_delayed_node *
> > > > +btrfs_get_or_create_delayed_node(struct btrfs_inode *btrfs_inode,
> > > > +				 struct ref_tracker **tracker)
> > > 
> > > And another pattern, please don't change the style of the function
> > > definition line, the return value is on the same line as name, if
> > > parameters don't fit under reasonable column limit like 90 chars then
> > > it's on the next line. Most of the code has been fixed so you can just
> > > extend it.
> > 
> > Got it, I was just using the default kernel .clang-format.
> > Is there a btrfs specific .clang-format I could use?
> 
> Though there's a "default kernel" .clang-format, many subsystems have
> their own style. I think Boris mentioned an adjusted .clang-format for
> btrfs, but we have hand-tuned code that I'm not sure can be expressed
> in the config. The best we can do is to let it format things that have
> basically no exceptions and not touch the rest.

FWIW, I've never attempted to write clang-format rules for btrfs, though
I would love it if we had them. Josef said that he tried a few years back
and was unsuccessful. Could be worth another attempt, maybe the rule
writing has become more sophisticated :)

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] btrfs: implement ref_tracker for delayed_nodes
  2025-08-05 15:14     ` David Sterba
  2025-08-05 18:07       ` Boris Burkov
@ 2025-08-05 19:48       ` Leo Martins
  1 sibling, 0 replies; 6+ messages in thread
From: Leo Martins @ 2025-08-05 19:48 UTC (permalink / raw)
  To: David Sterba; +Cc: linux-btrfs, kernel-team

On Tue, 5 Aug 2025 17:14:47 +0200 David Sterba <dsterba@suse.cz> wrote:

> On Mon, Aug 04, 2025 at 12:00:12PM -0700, Leo Martins wrote:
> > On Mon, 4 Aug 2025 15:57:50 +0200 David Sterba <dsterba@suse.cz> wrote:
> > 
> > > On Tue, Jul 29, 2025 at 12:08:04AM -0700, Leo Martins wrote:
> > > > This patch adds ref_tracker infrastructure for btrfs_delayed_node.
> > > > 
> > > > A ref_tracker object is allocated every time the reference count is
> > > > increased and freed when the reference count is decreased. The
> > > > ref_tracker object contains stack_trace information about where the
> > > > ref count is increased and decreased. The ref_tracker_dir embedded in
> > > > btrfs_delayed_node keeps track of all current references, and some
> > > > previous references. This information allows for detection of reference
> > > > leaks or double frees.
> > > > 
> > > > Here is a common example of taking a reference to a delayed_node and
> > > > freeing it with ref_tracker.
> > > > 
> > > > ```C
> > > > struct ref_tracker *tracker;
> > > > struct btrfs_delayed_node *node;
> > > > 
> > > > node = btrfs_get_delayed_node(inode, tracker);
> > > > // use delayed_node...
> > > > btrfs_release_delayed_node(node, tracker);
> > > > ```
> > > > 
> > > > There are two special cases where the delayed_node reference is long
> > > > term, meaning that the thread that takes the reference and the thread
> > > > that frees the reference are different. The inode_cache which is the
> > > > btrfs_delayed_node reference stored in the associated btrfs_inode, and
> > > > the node_list which is the btrfs_delayed_node reference stored in the
> > > > btrfs_delayed_root node_list/prepare_list. In these cases the
> > > > ref_tracker is stored in the delayed_node itself.
> > > > 
> > > > This patch introduces some new wrappers (btrfs_delayed_node_ref_tracker_*)
> > > > to ensure that when BTRFS_DEBUG is disabled everything gets compiled out.
> > > > 
> > > > Signed-off-by: Leo Martins <loemra.dev@gmail.com>
> > > 
> > > I'm still missing why we need to add this whole infrastructure, there
> > > weren't any recent bugs in delayed inode tracking. I understand what the
> > > ref tracker does but it's still increasing structure size and adds a
> > > perf hit that is acceptable for debugging kernels but still.
> > 
> > Our leading btrfs crash is a soft lockup related to delayed_nodes.
> 
> Adding infrastructure for real problems makes sense, this is a good
> justification and should be mentioned in the patch. On which version do
> you observe it?

We have a significant number of hosts running 6.9, 6.11, and 6.13. This
soft lockup happens on 6.11 and 6.13, but not on 6.9. It also appears to
be more common on 6.11, happening at twice the rate of 6.13.

> 
> > [21017.354271] [     C96] Call Trace:
> > [21017.354273] [     C96]  <IRQ>
> > [21017.354278] [     C96]  ? rcu_dump_cpu_stacks+0xa1/0xe0
> > [21017.354282] [     C96]  ? print_cpu_stall+0x113/0x200
> > [21017.354284] [     C96]  ? rcu_sched_clock_irq+0x633/0x730
> > [21017.354287] [     C96]  ? update_process_times+0x66/0xa0
> > [21017.354288] [     C96]  ? dma_map_page_attrs+0x250/0x250
> > [21017.354289] [     C96]  ? tick_nohz_handler+0x84/0x1d0
> > [21017.354291] [     C96]  ? hrtimer_interrupt+0x1a1/0x5b0
> > [21017.354293] [     C96]  ? __sysvec_apic_timer_interrupt+0x44/0xe0
> > [21017.354295] [     C96]  ? sysvec_apic_timer_interrupt+0x6b/0x80
> > [21017.354296] [     C96]  </IRQ>
> > [21017.354297] [     C96]  <TASK>
> > [21017.354297] [     C96]  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
> > [21017.354302] [     C96]  ? xa_find+0xa/0xb0
> > [21017.354305] [     C96]  btrfs_kill_all_delayed_nodes+0x6b/0x150
> > [21017.354307] [     C96]  ? __switch_to+0x133/0x530
> > [21017.354308] [     C96]  ? schedule+0xff4/0x1380
> > [21017.354310] [     C96]  btrfs_clean_one_deleted_snapshot+0x70/0xe0
> > [21017.354313] [     C96]  cleaner_kthread+0x83/0x120
> > [21017.354315] [     C96]  ? exportfs_encode_inode_fh+0x60/0x60
> > [21017.354317] [     C96]  kthread+0xae/0xe0
> > [21017.354319] [     C96]  ? __efi_queue_work+0x130/0x130
> > [21017.354320] [     C96]  ret_from_fork+0x2f/0x40
> > [21017.354322] [     C96]  ? __efi_queue_work+0x130/0x130
> > [21017.354323] [     C96]  ret_from_fork_asm+0x11/0x20
> > [21017.354326] [     C96]  </TASK>
> > [21042.628340] [     C96] watchdog: BUG: soft lockup - CPU#96 stuck for 45s! [btrfs-cleaner:1438]
> > 
> > btrfs_kill_all_delayed_nodes is getting stuck in an infinite loop because
> > the root->delayed_nodes xarray has a delayed_node that never gets erased.
> > Inspecting the crash dump reveals that the delayed_node has a reference count
> > of one, indicating there is a reference leak.
> > 
> > I believe this bug was introduced or at least magnified somewhere between
> > 6.9 and 6.11.
> 
> That's likely to be related to the xarray conversion in 6.8.

I took a look at this, but couldn't find any obvious bugs.
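
With the tracker in place, the debugging plan is simple: once a debug
kernel hits the lockup again, dump the surviving references for the
stuck node. A rough sketch (the "ref_dir" field name is a placeholder
for whatever the v3 patch ends up using):

```C
/* Sketch: print up to 16 outstanding reference stack traces. */
static void dump_delayed_node_refs(struct btrfs_delayed_node *node)
{
	ref_tracker_dir_print(&node->ref_dir, 16);
}
```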

> 
> > Let me know if you want any more details about the crash.
> > 
> > > 
> > > We have the ref-verify code and mount option, from what I've heard it
> > > was used sporadically when touching related code. We can let the ref
> > > tracking work like that too, not turned on by default, i.e. requiring a
> > > mount option.
> > 
> > The only issue with a mount option is that we would not be able to conditionally
> > compile ref_tracker_dir and ref_tracker pointers in struct btrfs_delayed_node,
> > permanently increasing the size of the struct from 304 bytes to 400 bytes.
> 
> That's not a small difference, though for delayed inodes it may still be
> within the acceptable range. Turning on debugging features for other
> subsystems also bloats structures; lockdep turns a 4B spinlock into
> something like 70B, also indirectly hidden in other embedded structures.
> 
> > Alternatively, there could be a delayed_node ref_tracker helper struct that we
> > could store a pointer to.
> 
> This might make more sense for the first implementation.

Got it, will give this a shot for v3.
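
Roughly what I have in mind: a single pointer in btrfs_delayed_node
instead of the embedded fields (sketch only, names are placeholders):

```C
/* Sketch for v3: all tracking state lives behind one pointer. */
struct btrfs_delayed_node_trackers {
	struct ref_tracker_dir dir;
	/* Long-term references, where taker and releaser differ. */
	struct ref_tracker *inode_cache_tracker;
	struct ref_tracker *node_list_tracker;
};
```

That keeps the unconditional cost at one pointer (8 bytes), and the
helper struct can be allocated, or skipped, at btrfs_init_delayed_node()
time depending on the mount option.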

> 
> > > In case you'd want to enhance more structures with ref tracker the mount
> > > option can enable them selectively.
> > > 
> > > > @@ -106,6 +113,13 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
> > > >  		 */
> > > >  		if (refcount_inc_not_zero(&node->refs)) {
> > > >  			refcount_inc(&node->refs);
> > > > +#ifdef CONFIG_BTRFS_DEBUG
> > > > +			inode_cache_tracker = &node->inode_cache_tracker;
> > > > +#endif
> > > 
> > > We try to avoid #ifdefs if it's a repeating pattern, this should be
> > > abstracted away in a helper.
> > 
> > Sounds good.
> > 
> > For the record, there would be no need for it if we used the typedef
> > approach :-). There is a good example of how the original
> > author of ref_tracker implemented it in include/net/net_trackers.h.
> 
> We also try to stick to consistent coding style and patterns even if
> there's something possibly better. Dealing with various preferences in
> a big code base becomes quite distracting over time.

Makes sense.

> 
> Could the typedef be replaced by a struct that's abstracting the
> conditional type? Something like
> 
> struct delayed_inode_tracker {
> #ifdef ...
> 	struct ref_tracker tracker;
> #else
> 	struct {} tracker;
> #endif
> };
> 
> Visually it's basically the same as what I see in net_trackers.h, and we
> won't have to argue about the typedefs.

Yeah, this is great!
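
For example, the alloc wrapper could then consume it with no #ifdef at
the call site. A sketch, using a pointer member since ref_tracker_alloc()
takes a struct ref_tracker **, and assuming the node grows a "ref_dir"
field (name is a placeholder):

```C
struct btrfs_delayed_node_tracker {
#ifdef CONFIG_BTRFS_DEBUG
	struct ref_tracker *tracker;
#else
	struct {} tracker;
#endif
};

static inline void btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
							 struct btrfs_delayed_node_tracker *t,
							 gfp_t gfp)
{
#ifdef CONFIG_BTRFS_DEBUG
	/* Records the current stack trace in the node's tracker dir. */
	ref_tracker_alloc(&node->ref_dir, &t->tracker, gfp);
#endif
}
```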

> 
> > > > +			btrfs_delayed_node_ref_tracker_alloc(node, tracker,
> > > > +							     GFP_ATOMIC);
> > > > +			btrfs_delayed_node_ref_tracker_alloc(
> > > > +				node, inode_cache_tracker, GFP_ATOMIC);
> > > >  			btrfs_inode->delayed_node = node;
> > > >  		} else {
> > > >  			node = NULL;
> > > > @@ -125,17 +139,19 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
> > > >   *
> > > >   * Return the delayed node, or error pointer on failure.
> > > >   */
> > > > -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
> > > > -		struct btrfs_inode *btrfs_inode)
> > > > +static struct btrfs_delayed_node *
> > > > +btrfs_get_or_create_delayed_node(struct btrfs_inode *btrfs_inode,
> > > > +				 struct ref_tracker **tracker)
> > > 
> > > And another pattern, please don't change the style of the function
> > > definition line, the return value is on the same line as name, if
> > > parameters don't fit under reasonable column limit like 90 chars then
> > > it's on the next line. Most of the code has been fixed so you can just
> > > extend it.
> > 
> > Got it, I was just using the default kernel .clang-format.
> > Is there a btrfs specific .clang-format I could use?
> 
> Though there's a "default kernel" .clang-format, many subsystems have
> their own style. I think Boris mentioned an adjusted .clang-format for
> btrfs, but we have hand-tuned code that I'm not sure can be expressed
> in the config. The best we can do is to let it format things that have
> basically no exceptions and not touch the rest.

Understood.
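
For my own notes, extending the original definition in the existing
style would then look like:

```C
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
		struct btrfs_inode *btrfs_inode, struct ref_tracker **tracker);
```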

Sent using hkml (https://github.com/sjp38/hackermail)

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-08-05 19:48 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-07-29  7:08 [PATCH v2] btrfs: implement ref_tracker for delayed_nodes Leo Martins
2025-08-04 13:57 ` David Sterba
2025-08-04 19:00   ` Leo Martins
2025-08-05 15:14     ` David Sterba
2025-08-05 18:07       ` Boris Burkov
2025-08-05 19:48       ` Leo Martins

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).