public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
  2026-01-19 15:31 buffer cache simplification Christoph Hellwig
@ 2026-01-19 15:31 ` Christoph Hellwig
  2026-01-20  2:39   ` Darrick J. Wong
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-19 15:31 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Dave Chinner, linux-xfs, syzbot+0391d34e801643e2809b

The per-AG buffer hashes were added when all buffer lookups took a
per-hash lock.  Since then we've made lookups entirely lockless and
removed the need for a hash-wide lock for inserts and removals as
well.  With this there is no need to shard the hash, so reduce the
used resources by using a per-buftarg hash for all buftargs.

Long after writing this initially, syzbot found a problem in the
buffer cache teardown order, which this happens to fix as well.

Reported-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Tested-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_ag.c | 13 ++---------
 fs/xfs/libxfs/xfs_ag.h |  2 --
 fs/xfs/xfs_buf.c       | 51 +++++++++++-------------------------------
 fs/xfs/xfs_buf.h       | 10 +--------
 fs/xfs/xfs_buf_mem.c   | 11 ++-------
 5 files changed, 18 insertions(+), 69 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 586918ed1cbf..a41d782e8e8c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -110,10 +110,7 @@ xfs_perag_uninit(
 	struct xfs_group	*xg)
 {
 #ifdef __KERNEL__
-	struct xfs_perag	*pag = to_perag(xg);
-
-	cancel_delayed_work_sync(&pag->pag_blockgc_work);
-	xfs_buf_cache_destroy(&pag->pag_bcache);
+	cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
 #endif
 }
 
@@ -235,10 +232,6 @@ xfs_perag_alloc(
 	INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 #endif /* __KERNEL__ */
 
-	error = xfs_buf_cache_init(&pag->pag_bcache);
-	if (error)
-		goto out_free_perag;
-
 	/*
 	 * Pre-calculated geometry
 	 */
@@ -250,12 +243,10 @@ xfs_perag_alloc(
 
 	error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
 	if (error)
-		goto out_buf_cache_destroy;
+		goto out_free_perag;
 
 	return 0;
 
-out_buf_cache_destroy:
-	xfs_buf_cache_destroy(&pag->pag_bcache);
 out_free_perag:
 	kfree(pag);
 	return error;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 1f24cfa27321..f02323416973 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -85,8 +85,6 @@ struct xfs_perag {
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
-	struct xfs_buf_cache	pag_bcache;
-
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;
 #endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 348c91335163..76eb7c5a73f1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -363,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
 	.obj_cmpfn		= _xfs_buf_obj_cmp,
 };
 
-int
-xfs_buf_cache_init(
-	struct xfs_buf_cache	*bch)
-{
-	return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
-}
-
-void
-xfs_buf_cache_destroy(
-	struct xfs_buf_cache	*bch)
-{
-	rhashtable_destroy(&bch->bc_hash);
-}
-
 static int
 xfs_buf_map_verify(
 	struct xfs_buftarg	*btp,
@@ -434,7 +420,7 @@ xfs_buf_find_lock(
 
 static inline int
 xfs_buf_lookup(
-	struct xfs_buf_cache	*bch,
+	struct xfs_buftarg	*btp,
 	struct xfs_buf_map	*map,
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp)
@@ -443,7 +429,7 @@ xfs_buf_lookup(
 	int			error;
 
 	rcu_read_lock();
-	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
+	bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
 	if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
 		rcu_read_unlock();
 		return -ENOENT;
@@ -468,7 +454,6 @@ xfs_buf_lookup(
 static int
 xfs_buf_find_insert(
 	struct xfs_buftarg	*btp,
-	struct xfs_buf_cache	*bch,
 	struct xfs_perag	*pag,
 	struct xfs_buf_map	*cmap,
 	struct xfs_buf_map	*map,
@@ -488,7 +473,7 @@ xfs_buf_find_insert(
 	new_bp->b_pag = pag;
 
 	rcu_read_lock();
-	bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
+	bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
 			&new_bp->b_rhash_head, xfs_buf_hash_params);
 	if (IS_ERR(bp)) {
 		rcu_read_unlock();
@@ -530,16 +515,6 @@ xfs_buftarg_get_pag(
 	return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
 }
 
-static inline struct xfs_buf_cache *
-xfs_buftarg_buf_cache(
-	struct xfs_buftarg		*btp,
-	struct xfs_perag		*pag)
-{
-	if (pag)
-		return &pag->pag_bcache;
-	return btp->bt_cache;
-}
-
 /*
  * Assembles a buffer covering the specified range. The code is optimised for
  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -553,7 +528,6 @@ xfs_buf_get_map(
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp)
 {
-	struct xfs_buf_cache	*bch;
 	struct xfs_perag	*pag;
 	struct xfs_buf		*bp = NULL;
 	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
@@ -570,9 +544,8 @@ xfs_buf_get_map(
 		return error;
 
 	pag = xfs_buftarg_get_pag(btp, &cmap);
-	bch = xfs_buftarg_buf_cache(btp, pag);
 
-	error = xfs_buf_lookup(bch, &cmap, flags, &bp);
+	error = xfs_buf_lookup(btp, &cmap, flags, &bp);
 	if (error && error != -ENOENT)
 		goto out_put_perag;
 
@@ -584,7 +557,7 @@ xfs_buf_get_map(
 			goto out_put_perag;
 
 		/* xfs_buf_find_insert() consumes the perag reference. */
-		error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
+		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
 				flags, &bp);
 		if (error)
 			return error;
@@ -848,11 +821,8 @@ xfs_buf_destroy(
 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 
 	if (!xfs_buf_is_uncached(bp)) {
-		struct xfs_buf_cache	*bch =
-			xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
-
-		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
-				xfs_buf_hash_params);
+		rhashtable_remove_fast(&bp->b_target->bt_hash,
+				&bp->b_rhash_head, xfs_buf_hash_params);
 
 		if (bp->b_pag)
 			xfs_perag_put(bp->b_pag);
@@ -1619,6 +1589,7 @@ xfs_destroy_buftarg(
 	ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
 	percpu_counter_destroy(&btp->bt_readahead_count);
 	list_lru_destroy(&btp->bt_lru);
+	rhashtable_destroy(&btp->bt_hash);
 }
 
 void
@@ -1713,8 +1684,10 @@ xfs_init_buftarg(
 	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
 			     DEFAULT_RATELIMIT_BURST);
 
-	if (list_lru_init(&btp->bt_lru))
+	if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
 		return -ENOMEM;
+	if (list_lru_init(&btp->bt_lru))
+		goto out_destroy_hash;
 	if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
 		goto out_destroy_lru;
 
@@ -1732,6 +1705,8 @@ xfs_init_buftarg(
 	percpu_counter_destroy(&btp->bt_readahead_count);
 out_destroy_lru:
 	list_lru_destroy(&btp->bt_lru);
+out_destroy_hash:
+	rhashtable_destroy(&btp->bt_hash);
 	return -ENOMEM;
 }
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3a1d066e1c13..bf39d89f0f6d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -69,13 +69,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_INCORE,		"INCORE" }, \
 	{ XBF_TRYLOCK,		"TRYLOCK" }
 
-struct xfs_buf_cache {
-	struct rhashtable	bc_hash;
-};
-
-int xfs_buf_cache_init(struct xfs_buf_cache *bch);
-void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
-
 /*
  * The xfs_buftarg contains 2 notions of "sector size" -
  *
@@ -113,8 +106,7 @@ struct xfs_buftarg {
 	unsigned int		bt_awu_min;
 	unsigned int		bt_awu_max;
 
-	/* built-in cache, if we're not using the perag one */
-	struct xfs_buf_cache	bt_cache[];
+	struct rhashtable	bt_hash;
 };
 
 struct xfs_buf_map {
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 0106da0a9f44..86dbec5ee203 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -58,7 +58,7 @@ xmbuf_alloc(
 	struct xfs_buftarg	*btp;
 	int			error;
 
-	btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
+	btp = kzalloc(sizeof(*btp), GFP_KERNEL);
 	if (!btp)
 		return -ENOMEM;
 
@@ -81,10 +81,6 @@ xmbuf_alloc(
 	/* ensure all writes are below EOF to avoid pagecache zeroing */
 	i_size_write(inode, inode->i_sb->s_maxbytes);
 
-	error = xfs_buf_cache_init(btp->bt_cache);
-	if (error)
-		goto out_file;
-
 	/* Initialize buffer target */
 	btp->bt_mount = mp;
 	btp->bt_dev = (dev_t)-1U;
@@ -95,15 +91,13 @@ xmbuf_alloc(
 
 	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
 	if (error)
-		goto out_bcache;
+		goto out_file;
 
 	trace_xmbuf_create(btp);
 
 	*btpp = btp;
 	return 0;
 
-out_bcache:
-	xfs_buf_cache_destroy(btp->bt_cache);
 out_file:
 	fput(file);
 out_free_btp:
@@ -122,7 +116,6 @@ xmbuf_free(
 	trace_xmbuf_free(btp);
 
 	xfs_destroy_buftarg(btp);
-	xfs_buf_cache_destroy(btp->bt_cache);
 	fput(btp->bt_file);
 	kfree(btp);
 }
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
  2026-01-19 15:31 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
@ 2026-01-20  2:39   ` Darrick J. Wong
  2026-01-20  7:06     ` Christoph Hellwig
  0 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2026-01-20  2:39 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Carlos Maiolino, Dave Chinner, linux-xfs,
	syzbot+0391d34e801643e2809b

On Mon, Jan 19, 2026 at 04:31:37PM +0100, Christoph Hellwig wrote:
> The per-AG buffer hashes were added when all buffer lookups took a
> per-hash look.  Since then we've made lookups entirely lockless and
> removed the need for a hash-wide lock for inserts and removals as
> well.  With this there is no need to sharding the hash, so reduce the
> used resources by using a per-buftarg hash for all buftargs.

Hey, not having all the per-ag buffer cache sounds neat!

> Long after writing this initially, syzbot found a problem in the
> buffer cache teardown order, which this happens to fix as well.

What did we get wrong, specifically?

Also: Is there a simpler fix for this bug that we can stuff into old lts
kernels?  Or is this fix independent of the b_hold and lockref changes
in the previous patches?

--D

> Reported-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
> Tested-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_ag.c | 13 ++---------
>  fs/xfs/libxfs/xfs_ag.h |  2 --
>  fs/xfs/xfs_buf.c       | 51 +++++++++++-------------------------------
>  fs/xfs/xfs_buf.h       | 10 +--------
>  fs/xfs/xfs_buf_mem.c   | 11 ++-------
>  5 files changed, 18 insertions(+), 69 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> index 586918ed1cbf..a41d782e8e8c 100644
> --- a/fs/xfs/libxfs/xfs_ag.c
> +++ b/fs/xfs/libxfs/xfs_ag.c
> @@ -110,10 +110,7 @@ xfs_perag_uninit(
>  	struct xfs_group	*xg)
>  {
>  #ifdef __KERNEL__
> -	struct xfs_perag	*pag = to_perag(xg);
> -
> -	cancel_delayed_work_sync(&pag->pag_blockgc_work);
> -	xfs_buf_cache_destroy(&pag->pag_bcache);
> +	cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
>  #endif
>  }
>  
> @@ -235,10 +232,6 @@ xfs_perag_alloc(
>  	INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
>  #endif /* __KERNEL__ */
>  
> -	error = xfs_buf_cache_init(&pag->pag_bcache);
> -	if (error)
> -		goto out_free_perag;
> -
>  	/*
>  	 * Pre-calculated geometry
>  	 */
> @@ -250,12 +243,10 @@ xfs_perag_alloc(
>  
>  	error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
>  	if (error)
> -		goto out_buf_cache_destroy;
> +		goto out_free_perag;
>  
>  	return 0;
>  
> -out_buf_cache_destroy:
> -	xfs_buf_cache_destroy(&pag->pag_bcache);
>  out_free_perag:
>  	kfree(pag);
>  	return error;
> diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
> index 1f24cfa27321..f02323416973 100644
> --- a/fs/xfs/libxfs/xfs_ag.h
> +++ b/fs/xfs/libxfs/xfs_ag.h
> @@ -85,8 +85,6 @@ struct xfs_perag {
>  	int		pag_ici_reclaimable;	/* reclaimable inodes */
>  	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
>  
> -	struct xfs_buf_cache	pag_bcache;
> -
>  	/* background prealloc block trimming */
>  	struct delayed_work	pag_blockgc_work;
>  #endif /* __KERNEL__ */
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 348c91335163..76eb7c5a73f1 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -363,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
>  	.obj_cmpfn		= _xfs_buf_obj_cmp,
>  };
>  
> -int
> -xfs_buf_cache_init(
> -	struct xfs_buf_cache	*bch)
> -{
> -	return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
> -}
> -
> -void
> -xfs_buf_cache_destroy(
> -	struct xfs_buf_cache	*bch)
> -{
> -	rhashtable_destroy(&bch->bc_hash);
> -}
> -
>  static int
>  xfs_buf_map_verify(
>  	struct xfs_buftarg	*btp,
> @@ -434,7 +420,7 @@ xfs_buf_find_lock(
>  
>  static inline int
>  xfs_buf_lookup(
> -	struct xfs_buf_cache	*bch,
> +	struct xfs_buftarg	*btp,
>  	struct xfs_buf_map	*map,
>  	xfs_buf_flags_t		flags,
>  	struct xfs_buf		**bpp)
> @@ -443,7 +429,7 @@ xfs_buf_lookup(
>  	int			error;
>  
>  	rcu_read_lock();
> -	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
> +	bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
>  	if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
>  		rcu_read_unlock();
>  		return -ENOENT;
> @@ -468,7 +454,6 @@ xfs_buf_lookup(
>  static int
>  xfs_buf_find_insert(
>  	struct xfs_buftarg	*btp,
> -	struct xfs_buf_cache	*bch,
>  	struct xfs_perag	*pag,
>  	struct xfs_buf_map	*cmap,
>  	struct xfs_buf_map	*map,
> @@ -488,7 +473,7 @@ xfs_buf_find_insert(
>  	new_bp->b_pag = pag;
>  
>  	rcu_read_lock();
> -	bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
> +	bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
>  			&new_bp->b_rhash_head, xfs_buf_hash_params);
>  	if (IS_ERR(bp)) {
>  		rcu_read_unlock();
> @@ -530,16 +515,6 @@ xfs_buftarg_get_pag(
>  	return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
>  }
>  
> -static inline struct xfs_buf_cache *
> -xfs_buftarg_buf_cache(
> -	struct xfs_buftarg		*btp,
> -	struct xfs_perag		*pag)
> -{
> -	if (pag)
> -		return &pag->pag_bcache;
> -	return btp->bt_cache;
> -}
> -
>  /*
>   * Assembles a buffer covering the specified range. The code is optimised for
>   * cache hits, as metadata intensive workloads will see 3 orders of magnitude
> @@ -553,7 +528,6 @@ xfs_buf_get_map(
>  	xfs_buf_flags_t		flags,
>  	struct xfs_buf		**bpp)
>  {
> -	struct xfs_buf_cache	*bch;
>  	struct xfs_perag	*pag;
>  	struct xfs_buf		*bp = NULL;
>  	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
> @@ -570,9 +544,8 @@ xfs_buf_get_map(
>  		return error;
>  
>  	pag = xfs_buftarg_get_pag(btp, &cmap);
> -	bch = xfs_buftarg_buf_cache(btp, pag);
>  
> -	error = xfs_buf_lookup(bch, &cmap, flags, &bp);
> +	error = xfs_buf_lookup(btp, &cmap, flags, &bp);
>  	if (error && error != -ENOENT)
>  		goto out_put_perag;
>  
> @@ -584,7 +557,7 @@ xfs_buf_get_map(
>  			goto out_put_perag;
>  
>  		/* xfs_buf_find_insert() consumes the perag reference. */
> -		error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
> +		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
>  				flags, &bp);
>  		if (error)
>  			return error;
> @@ -848,11 +821,8 @@ xfs_buf_destroy(
>  	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
>  
>  	if (!xfs_buf_is_uncached(bp)) {
> -		struct xfs_buf_cache	*bch =
> -			xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
> -
> -		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
> -				xfs_buf_hash_params);
> +		rhashtable_remove_fast(&bp->b_target->bt_hash,
> +				&bp->b_rhash_head, xfs_buf_hash_params);
>  
>  		if (bp->b_pag)
>  			xfs_perag_put(bp->b_pag);
> @@ -1619,6 +1589,7 @@ xfs_destroy_buftarg(
>  	ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
>  	percpu_counter_destroy(&btp->bt_readahead_count);
>  	list_lru_destroy(&btp->bt_lru);
> +	rhashtable_destroy(&btp->bt_hash);
>  }
>  
>  void
> @@ -1713,8 +1684,10 @@ xfs_init_buftarg(
>  	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
>  			     DEFAULT_RATELIMIT_BURST);
>  
> -	if (list_lru_init(&btp->bt_lru))
> +	if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
>  		return -ENOMEM;
> +	if (list_lru_init(&btp->bt_lru))
> +		goto out_destroy_hash;
>  	if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
>  		goto out_destroy_lru;
>  
> @@ -1732,6 +1705,8 @@ xfs_init_buftarg(
>  	percpu_counter_destroy(&btp->bt_readahead_count);
>  out_destroy_lru:
>  	list_lru_destroy(&btp->bt_lru);
> +out_destroy_hash:
> +	rhashtable_destroy(&btp->bt_hash);
>  	return -ENOMEM;
>  }
>  
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 3a1d066e1c13..bf39d89f0f6d 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -69,13 +69,6 @@ typedef unsigned int xfs_buf_flags_t;
>  	{ XBF_INCORE,		"INCORE" }, \
>  	{ XBF_TRYLOCK,		"TRYLOCK" }
>  
> -struct xfs_buf_cache {
> -	struct rhashtable	bc_hash;
> -};
> -
> -int xfs_buf_cache_init(struct xfs_buf_cache *bch);
> -void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
> -
>  /*
>   * The xfs_buftarg contains 2 notions of "sector size" -
>   *
> @@ -113,8 +106,7 @@ struct xfs_buftarg {
>  	unsigned int		bt_awu_min;
>  	unsigned int		bt_awu_max;
>  
> -	/* built-in cache, if we're not using the perag one */
> -	struct xfs_buf_cache	bt_cache[];
> +	struct rhashtable	bt_hash;
>  };
>  
>  struct xfs_buf_map {
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 0106da0a9f44..86dbec5ee203 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -58,7 +58,7 @@ xmbuf_alloc(
>  	struct xfs_buftarg	*btp;
>  	int			error;
>  
> -	btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
> +	btp = kzalloc(sizeof(*btp), GFP_KERNEL);
>  	if (!btp)
>  		return -ENOMEM;
>  
> @@ -81,10 +81,6 @@ xmbuf_alloc(
>  	/* ensure all writes are below EOF to avoid pagecache zeroing */
>  	i_size_write(inode, inode->i_sb->s_maxbytes);
>  
> -	error = xfs_buf_cache_init(btp->bt_cache);
> -	if (error)
> -		goto out_file;
> -
>  	/* Initialize buffer target */
>  	btp->bt_mount = mp;
>  	btp->bt_dev = (dev_t)-1U;
> @@ -95,15 +91,13 @@ xmbuf_alloc(
>  
>  	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
>  	if (error)
> -		goto out_bcache;
> +		goto out_file;
>  
>  	trace_xmbuf_create(btp);
>  
>  	*btpp = btp;
>  	return 0;
>  
> -out_bcache:
> -	xfs_buf_cache_destroy(btp->bt_cache);
>  out_file:
>  	fput(file);
>  out_free_btp:
> @@ -122,7 +116,6 @@ xmbuf_free(
>  	trace_xmbuf_free(btp);
>  
>  	xfs_destroy_buftarg(btp);
> -	xfs_buf_cache_destroy(btp->bt_cache);
>  	fput(btp->bt_file);
>  	kfree(btp);
>  }
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
  2026-01-20  2:39   ` Darrick J. Wong
@ 2026-01-20  7:06     ` Christoph Hellwig
  2026-01-20 15:50       ` Darrick J. Wong
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-20  7:06 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs,
	syzbot+0391d34e801643e2809b

On Mon, Jan 19, 2026 at 06:39:18PM -0800, Darrick J. Wong wrote:
> On Mon, Jan 19, 2026 at 04:31:37PM +0100, Christoph Hellwig wrote:
> > The per-AG buffer hashes were added when all buffer lookups took a
> > per-hash look.  Since then we've made lookups entirely lockless and
> > removed the need for a hash-wide lock for inserts and removals as
> > well.  With this there is no need to sharding the hash, so reduce the
> > used resources by using a per-buftarg hash for all buftargs.
> 
> Hey, not having all the per-ag buffer cache sounds neat!
> 
> > Long after writing this initially, syzbot found a problem in the
> > buffer cache teardown order, which this happens to fix as well.
> 
> What did we get wrong, specifically?

Dave has a really good analysis here:

https://lore.kernel.org/linux-xfs/aLeUdemAZ5wmtZel@dread.disaster.area/

> Also: Is there a simpler fix for this bug that we can stuff into old lts
> kernels?

I can't really think of anything much simpler, just different.  It would
require some careful reordering of the unmount path, which is always
hairy.

> Or is this fix independent of the b_hold and lockref changes
> in the previous patches?

In theory it is, except that the old tricks with the refcount would
make it very difficult.  I tried to reorder this twice and failed
both times.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
  2026-01-20  7:06     ` Christoph Hellwig
@ 2026-01-20 15:50       ` Darrick J. Wong
  0 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2026-01-20 15:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Carlos Maiolino, Dave Chinner, linux-xfs,
	syzbot+0391d34e801643e2809b

On Tue, Jan 20, 2026 at 08:06:15AM +0100, Christoph Hellwig wrote:
> On Mon, Jan 19, 2026 at 06:39:18PM -0800, Darrick J. Wong wrote:
> > On Mon, Jan 19, 2026 at 04:31:37PM +0100, Christoph Hellwig wrote:
> > > The per-AG buffer hashes were added when all buffer lookups took a
> > > per-hash look.  Since then we've made lookups entirely lockless and
> > > removed the need for a hash-wide lock for inserts and removals as
> > > well.  With this there is no need to sharding the hash, so reduce the
> > > used resources by using a per-buftarg hash for all buftargs.
> > 
> > Hey, not having all the per-ag buffer cache sounds neat!
> > 
> > > Long after writing this initially, syzbot found a problem in the
> > > buffer cache teardown order, which this happens to fix as well.
> > 
> > What did we get wrong, specifically?
> 
> Dave has a really good analysis here:
> 
> https://lore.kernel.org/linux-xfs/aLeUdemAZ5wmtZel@dread.disaster.area/

Can you Link: to that in this patch?  If I'm reading linus' most recent
exposition correctly, links to other threads are still allowed.

> > Also: Is there a simpler fix for this bug that we can stuff into old lts
> > kernels?
> 
> I can't really think of anything much simpler, just different.  It would
> require some careful reordering of the unmount path, which is always
> hairy.
> 
> > Or is this fix independent of the b_hold and lockref changes
> > in the previous patches?
> 
> In theory it is, except that the old tricks with the refcount would
> make it very difficult.  I tried to reorder this twice and failed
> both times.

<nod> I figured that might be the case seeing as you cleaned up the
confusing b_hold rules.

--D

^ permalink raw reply	[flat|nested] 15+ messages in thread

* buffer cache simplification v2
@ 2026-01-22  5:26 Christoph Hellwig
  2026-01-22  5:26 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-22  5:26 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Dave Chinner, linux-xfs

Hi all,

this series has a few old patches that simplify the LRU handling, and
moves back to only having a per-buftarg hash now that the buffer hash
is using the scalable rhashtable.  While some of this looks like
performance work, performance and scalability is unchanged even on the
80 core dual socket system I test this on.  Besides cleaning up the
code nicely, it also happens to fix a syzkaller reported use after free
during buffer shutdown, which happened incidentally because of how the
tear down of the buftarg vs the perag structures is handled.

Changes since v1:
 - add more details and a link to a commit message

Diffstat:
 libxfs/xfs_ag.c |   13 ---
 libxfs/xfs_ag.h |    2 
 xfs_buf.c       |  234 ++++++++++++++++++--------------------------------------
 xfs_buf.h       |   20 ----
 xfs_buf_mem.c   |   11 --
 xfs_trace.h     |   10 +-
 6 files changed, 91 insertions(+), 199 deletions(-)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
  2026-01-22  5:26 buffer cache simplification v2 Christoph Hellwig
@ 2026-01-22  5:26 ` Christoph Hellwig
  2026-01-23 11:55   ` Carlos Maiolino
  2026-01-23 16:01   ` Brian Foster
  2026-01-22  5:26 ` [PATCH 2/3] xfs: use a lockref for the buffer reference count Christoph Hellwig
  2026-01-22  5:26 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
  2 siblings, 2 replies; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-22  5:26 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Dave Chinner, linux-xfs, Darrick J. Wong

Currently the buffer cache adds a reference to b_hold for buffers that
are on the LRU.  This seems to go all the way back and allows releasing
buffers from the LRU using xfs_buf_rele.  But it makes xfs_buf_rele
really complicated in differs from how other LRUs are implemented in
Linux.

Switch to not having a reference for buffers in the LRU, and use a
separate negative hold value to mark buffers as dead.  This simplifies
xfs_buf_rele, which now just deals with the last "real" reference,
and prepares for using the lockref primitive.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
 fs/xfs/xfs_buf.h |   8 +--
 2 files changed, 54 insertions(+), 94 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index db46883991de..aacdf080e400 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,11 +80,8 @@ xfs_buf_stale(
 
 	spin_lock(&bp->b_lock);
 	atomic_set(&bp->b_lru_ref, 0);
-	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
-	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
-		bp->b_hold--;
-
-	ASSERT(bp->b_hold >= 1);
+	if (bp->b_hold >= 0)
+		list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
 	spin_unlock(&bp->b_lock);
 }
 
@@ -442,7 +439,7 @@ xfs_buf_try_hold(
 	struct xfs_buf		*bp)
 {
 	spin_lock(&bp->b_lock);
-	if (bp->b_hold == 0) {
+	if (bp->b_hold == -1) {
 		spin_unlock(&bp->b_lock);
 		return false;
 	}
@@ -862,76 +859,24 @@ xfs_buf_hold(
 }
 
 static void
-xfs_buf_rele_uncached(
-	struct xfs_buf		*bp)
-{
-	ASSERT(list_empty(&bp->b_lru));
-
-	spin_lock(&bp->b_lock);
-	if (--bp->b_hold) {
-		spin_unlock(&bp->b_lock);
-		return;
-	}
-	spin_unlock(&bp->b_lock);
-	xfs_buf_free(bp);
-}
-
-static void
-xfs_buf_rele_cached(
+xfs_buf_destroy(
 	struct xfs_buf		*bp)
 {
-	struct xfs_buftarg	*btp = bp->b_target;
-	struct xfs_perag	*pag = bp->b_pag;
-	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
-	bool			freebuf = false;
-
-	trace_xfs_buf_rele(bp, _RET_IP_);
-
-	spin_lock(&bp->b_lock);
-	ASSERT(bp->b_hold >= 1);
-	if (bp->b_hold > 1) {
-		bp->b_hold--;
-		goto out_unlock;
-	}
+	ASSERT(bp->b_hold < 0);
+	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 
-	/* we are asked to drop the last reference */
-	if (atomic_read(&bp->b_lru_ref)) {
-		/*
-		 * If the buffer is added to the LRU, keep the reference to the
-		 * buffer for the LRU and clear the (now stale) dispose list
-		 * state flag, else drop the reference.
-		 */
-		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
-			bp->b_state &= ~XFS_BSTATE_DISPOSE;
-		else
-			bp->b_hold--;
-	} else {
-		bp->b_hold--;
-		/*
-		 * most of the time buffers will already be removed from the
-		 * LRU, so optimise that case by checking for the
-		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
-		 * was on was the disposal list
-		 */
-		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-			list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
-		} else {
-			ASSERT(list_empty(&bp->b_lru));
-		}
+	if (!xfs_buf_is_uncached(bp)) {
+		struct xfs_buf_cache	*bch =
+			xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
 
-		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
 				xfs_buf_hash_params);
-		if (pag)
-			xfs_perag_put(pag);
-		freebuf = true;
-	}
 
-out_unlock:
-	spin_unlock(&bp->b_lock);
+		if (bp->b_pag)
+			xfs_perag_put(bp->b_pag);
+	}
 
-	if (freebuf)
-		xfs_buf_free(bp);
+	xfs_buf_free(bp);
 }
 
 /*
@@ -942,10 +887,22 @@ xfs_buf_rele(
 	struct xfs_buf		*bp)
 {
 	trace_xfs_buf_rele(bp, _RET_IP_);
-	if (xfs_buf_is_uncached(bp))
-		xfs_buf_rele_uncached(bp);
-	else
-		xfs_buf_rele_cached(bp);
+
+	spin_lock(&bp->b_lock);
+	if (!--bp->b_hold) {
+		if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
+			goto kill;
+		list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
+	}
+	spin_unlock(&bp->b_lock);
+	return;
+
+kill:
+	bp->b_hold = -1;
+	list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+	spin_unlock(&bp->b_lock);
+
+	xfs_buf_destroy(bp);
 }
 
 /*
@@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
 
 /*
  * To simulate an I/O failure, the buffer must be locked and held with at least
- * three references. The LRU reference is dropped by the stale call. The buf
- * item reference is dropped via ioend processing. The third reference is owned
- * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ * two references.
+ *
+ * The buf item reference is dropped via ioend processing. The second reference
+ * is owned by the caller and is dropped on I/O completion if the buffer is
+ * XBF_ASYNC.
  */
 void
 xfs_buf_ioend_fail(
@@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
 
 	if (!spin_trylock(&bp->b_lock))
 		return LRU_SKIP;
-	if (bp->b_hold > 1) {
+	if (bp->b_hold > 0) {
 		/* need to wait, so skip it this pass */
 		spin_unlock(&bp->b_lock);
 		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
 		return LRU_SKIP;
 	}
 
-	/*
-	 * clear the LRU reference count so the buffer doesn't get
-	 * ignored in xfs_buf_rele().
-	 */
-	atomic_set(&bp->b_lru_ref, 0);
-	bp->b_state |= XFS_BSTATE_DISPOSE;
+	bp->b_hold = -1;
 	list_lru_isolate_move(lru, item, dispose);
 	spin_unlock(&bp->b_lock);
 	return LRU_REMOVED;
@@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
 					(long long)xfs_buf_daddr(bp));
 			}
-			xfs_buf_rele(bp);
+			xfs_buf_destroy(bp);
 		}
 		if (loop++ != 0)
 			delay(100);
@@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
 	struct list_head	*dispose = arg;
 
 	/*
-	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
-	 * If we fail to get the lock, just skip it.
+	 * We are inverting the lru lock vs bp->b_lock order here, so use a
+	 * trylock. If we fail to get the lock, just skip the buffer.
 	 */
 	if (!spin_trylock(&bp->b_lock))
 		return LRU_SKIP;
+
+	/*
+	 * If the buffer is in use, remove it from the LRU for now as we can't
+	 * free it.  It will be added to the LRU again when the reference count
+	 * hits zero.
+	 */
+	if (bp->b_hold > 0) {
+		list_lru_isolate(lru, &bp->b_lru);
+		spin_unlock(&bp->b_lock);
+		return LRU_REMOVED;
+	}
+
 	/*
 	 * Decrement the b_lru_ref count unless the value is already
 	 * zero. If the value is already zero, we need to reclaim the
@@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
 		return LRU_ROTATE;
 	}
 
-	bp->b_state |= XFS_BSTATE_DISPOSE;
+	bp->b_hold = -1;
 	list_lru_isolate_move(lru, item, dispose);
 	spin_unlock(&bp->b_lock);
 	return LRU_REMOVED;
@@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
 		struct xfs_buf *bp;
 		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 		list_del_init(&bp->b_lru);
-		xfs_buf_rele(bp);
+		xfs_buf_destroy(bp);
 	}
 
 	return freed;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e25cd2a160f3..1117cd9cbfb9 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_INCORE,		"INCORE" }, \
 	{ XBF_TRYLOCK,		"TRYLOCK" }
 
-/*
- * Internal state flags.
- */
-#define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
-
 struct xfs_buf_cache {
 	struct rhashtable	bc_hash;
 };
@@ -159,6 +154,7 @@ struct xfs_buf {
 
 	xfs_daddr_t		b_rhash_key;	/* buffer cache index */
 	int			b_length;	/* size of buffer in BBs */
+	spinlock_t		b_lock;		/* internal state lock */
 	unsigned int		b_hold;		/* reference count */
 	atomic_t		b_lru_ref;	/* lru reclaim ref count */
 	xfs_buf_flags_t		b_flags;	/* status flags */
@@ -169,8 +165,6 @@ struct xfs_buf {
 	 * bt_lru_lock and not by b_sema
 	 */
 	struct list_head	b_lru;		/* lru list */
-	spinlock_t		b_lock;		/* internal state lock */
-	unsigned int		b_state;	/* internal state flags */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	struct xfs_perag	*b_pag;
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/3] xfs: use a lockref for the buffer reference count
  2026-01-22  5:26 buffer cache simplification v2 Christoph Hellwig
  2026-01-22  5:26 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
@ 2026-01-22  5:26 ` Christoph Hellwig
  2026-01-23 12:04   ` Carlos Maiolino
  2026-01-22  5:26 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
  2 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-22  5:26 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Dave Chinner, linux-xfs, Darrick J. Wong

The lockref structure allows incrementing/decrementing counters like
an atomic_t for the fast path, while still allowing complex slow path
operations as if the counter was protected by a lock.  The only slow
path operations that actually need to take the lock are the final
put, LRU evictions and marking a buffer stale.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/xfs_buf.c   | 79 ++++++++++++++++++----------------------------
 fs/xfs/xfs_buf.h   |  4 +--
 fs/xfs/xfs_trace.h | 10 +++---
 3 files changed, 38 insertions(+), 55 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index aacdf080e400..348c91335163 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -31,20 +31,20 @@ struct kmem_cache *xfs_buf_cache;
  *
  * xfs_buf_stale:
  *	b_sema (caller holds)
- *	  b_lock
+ *	  b_lockref.lock
  *	    lru_lock
  *
  * xfs_buf_rele:
- *	b_lock
+ *	b_lockref.lock
  *	  lru_lock
  *
  * xfs_buftarg_drain_rele
  *	lru_lock
- *	  b_lock (trylock due to inversion)
+ *	  b_lockref.lock (trylock due to inversion)
  *
  * xfs_buftarg_isolate
  *	lru_lock
- *	  b_lock (trylock due to inversion)
+ *	  b_lockref.lock (trylock due to inversion)
  */
 
 static void xfs_buf_submit(struct xfs_buf *bp);
@@ -78,11 +78,11 @@ xfs_buf_stale(
 	 */
 	bp->b_flags &= ~_XBF_DELWRI_Q;
 
-	spin_lock(&bp->b_lock);
+	spin_lock(&bp->b_lockref.lock);
 	atomic_set(&bp->b_lru_ref, 0);
-	if (bp->b_hold >= 0)
+	if (!__lockref_is_dead(&bp->b_lockref))
 		list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
-	spin_unlock(&bp->b_lock);
+	spin_unlock(&bp->b_lockref.lock);
 }
 
 static void
@@ -274,10 +274,8 @@ xfs_buf_alloc(
 	 * inserting into the hash table are safe (and will have to wait for
 	 * the unlock to do anything non-trivial).
 	 */
-	bp->b_hold = 1;
+	lockref_init(&bp->b_lockref);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
-
-	spin_lock_init(&bp->b_lock);
 	atomic_set(&bp->b_lru_ref, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_lru);
@@ -434,20 +432,6 @@ xfs_buf_find_lock(
 	return 0;
 }
 
-static bool
-xfs_buf_try_hold(
-	struct xfs_buf		*bp)
-{
-	spin_lock(&bp->b_lock);
-	if (bp->b_hold == -1) {
-		spin_unlock(&bp->b_lock);
-		return false;
-	}
-	bp->b_hold++;
-	spin_unlock(&bp->b_lock);
-	return true;
-}
-
 static inline int
 xfs_buf_lookup(
 	struct xfs_buf_cache	*bch,
@@ -460,7 +444,7 @@ xfs_buf_lookup(
 
 	rcu_read_lock();
 	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
-	if (!bp || !xfs_buf_try_hold(bp)) {
+	if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
 		rcu_read_unlock();
 		return -ENOENT;
 	}
@@ -511,7 +495,7 @@ xfs_buf_find_insert(
 		error = PTR_ERR(bp);
 		goto out_free_buf;
 	}
-	if (bp && xfs_buf_try_hold(bp)) {
+	if (bp && lockref_get_not_dead(&bp->b_lockref)) {
 		/* found an existing buffer */
 		rcu_read_unlock();
 		error = xfs_buf_find_lock(bp, flags);
@@ -853,16 +837,14 @@ xfs_buf_hold(
 {
 	trace_xfs_buf_hold(bp, _RET_IP_);
 
-	spin_lock(&bp->b_lock);
-	bp->b_hold++;
-	spin_unlock(&bp->b_lock);
+	lockref_get(&bp->b_lockref);
 }
 
 static void
 xfs_buf_destroy(
 	struct xfs_buf		*bp)
 {
-	ASSERT(bp->b_hold < 0);
+	ASSERT(__lockref_is_dead(&bp->b_lockref));
 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 
 	if (!xfs_buf_is_uncached(bp)) {
@@ -888,19 +870,20 @@ xfs_buf_rele(
 {
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	spin_lock(&bp->b_lock);
-	if (!--bp->b_hold) {
+	if (lockref_put_or_lock(&bp->b_lockref))
+		return;
+	if (!--bp->b_lockref.count) {
 		if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
 			goto kill;
 		list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
 	}
-	spin_unlock(&bp->b_lock);
+	spin_unlock(&bp->b_lockref.lock);
 	return;
 
 kill:
-	bp->b_hold = -1;
+	lockref_mark_dead(&bp->b_lockref);
 	list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
-	spin_unlock(&bp->b_lock);
+	spin_unlock(&bp->b_lockref.lock);
 
 	xfs_buf_destroy(bp);
 }
@@ -1471,18 +1454,18 @@ xfs_buftarg_drain_rele(
 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
 	struct list_head	*dispose = arg;
 
-	if (!spin_trylock(&bp->b_lock))
+	if (!spin_trylock(&bp->b_lockref.lock))
 		return LRU_SKIP;
-	if (bp->b_hold > 0) {
+	if (bp->b_lockref.count > 0) {
 		/* need to wait, so skip it this pass */
-		spin_unlock(&bp->b_lock);
+		spin_unlock(&bp->b_lockref.lock);
 		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
 		return LRU_SKIP;
 	}
 
-	bp->b_hold = -1;
+	lockref_mark_dead(&bp->b_lockref);
 	list_lru_isolate_move(lru, item, dispose);
-	spin_unlock(&bp->b_lock);
+	spin_unlock(&bp->b_lockref.lock);
 	return LRU_REMOVED;
 }
 
@@ -1564,10 +1547,10 @@ xfs_buftarg_isolate(
 	struct list_head	*dispose = arg;
 
 	/*
-	 * We are inverting the lru lock vs bp->b_lock order here, so use a
-	 * trylock. If we fail to get the lock, just skip the buffer.
+	 * We are inverting the lru lock vs bp->b_lockref.lock order here, so
+	 * use a trylock. If we fail to get the lock, just skip the buffer.
 	 */
-	if (!spin_trylock(&bp->b_lock))
+	if (!spin_trylock(&bp->b_lockref.lock))
 		return LRU_SKIP;
 
 	/*
@@ -1575,9 +1558,9 @@ xfs_buftarg_isolate(
 	 * free it.  It will be added to the LRU again when the reference count
 	 * hits zero.
 	 */
-	if (bp->b_hold > 0) {
+	if (bp->b_lockref.count > 0) {
 		list_lru_isolate(lru, &bp->b_lru);
-		spin_unlock(&bp->b_lock);
+		spin_unlock(&bp->b_lockref.lock);
 		return LRU_REMOVED;
 	}
 
@@ -1587,13 +1570,13 @@ xfs_buftarg_isolate(
 	 * buffer, otherwise it gets another trip through the LRU.
 	 */
 	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
-		spin_unlock(&bp->b_lock);
+		spin_unlock(&bp->b_lockref.lock);
 		return LRU_ROTATE;
 	}
 
-	bp->b_hold = -1;
+	lockref_mark_dead(&bp->b_lockref);
 	list_lru_isolate_move(lru, item, dispose);
-	spin_unlock(&bp->b_lock);
+	spin_unlock(&bp->b_lockref.lock);
 	return LRU_REMOVED;
 }
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 1117cd9cbfb9..3a1d066e1c13 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -14,6 +14,7 @@
 #include <linux/dax.h>
 #include <linux/uio.h>
 #include <linux/list_lru.h>
+#include <linux/lockref.h>
 
 extern struct kmem_cache *xfs_buf_cache;
 
@@ -154,8 +155,7 @@ struct xfs_buf {
 
 	xfs_daddr_t		b_rhash_key;	/* buffer cache index */
 	int			b_length;	/* size of buffer in BBs */
-	spinlock_t		b_lock;		/* internal state lock */
-	unsigned int		b_hold;		/* reference count */
+	struct lockref		b_lockref;	/* refcount + lock */
 	atomic_t		b_lru_ref;	/* lru reclaim ref count */
 	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f70afbf3cb19..abf022d5ff42 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -736,7 +736,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = bp->b_hold;
+		__entry->hold = bp->b_lockref.count;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;
@@ -810,7 +810,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
 		__entry->flags = flags;
-		__entry->hold = bp->b_hold;
+		__entry->hold = bp->b_lockref.count;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->caller_ip = caller_ip;
@@ -854,7 +854,7 @@ TRACE_EVENT(xfs_buf_ioerror,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
-		__entry->hold = bp->b_hold;
+		__entry->hold = bp->b_lockref.count;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->error = error;
@@ -898,7 +898,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
 		__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
 		__entry->buf_len = bip->bli_buf->b_length;
 		__entry->buf_flags = bip->bli_buf->b_flags;
-		__entry->buf_hold = bip->bli_buf->b_hold;
+		__entry->buf_hold = bip->bli_buf->b_lockref.count;
 		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
 		__entry->buf_lockval = bip->bli_buf->b_sema.count;
 		__entry->li_flags = bip->bli_item.li_flags;
@@ -5181,7 +5181,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
 		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = bp->b_hold;
+		__entry->hold = bp->b_lockref.count;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
  2026-01-22  5:26 buffer cache simplification v2 Christoph Hellwig
  2026-01-22  5:26 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
  2026-01-22  5:26 ` [PATCH 2/3] xfs: use a lockref for the buffer reference count Christoph Hellwig
@ 2026-01-22  5:26 ` Christoph Hellwig
  2026-01-22 18:10   ` Darrick J. Wong
  2 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-22  5:26 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Dave Chinner, linux-xfs, syzbot+0391d34e801643e2809b

The per-AG buffer hashes were added when all buffer lookups took a
per-hash lock.  Since then we've made lookups entirely lockless and
removed the need for a hash-wide lock for inserts and removals as
well.  With this there is no need to shard the hash, so reduce the
used resources by using a per-buftarg hash for all buftargs.

Long after writing this initially, syzbot found a problem in the buffer
cache teardown order, which this happens to fix as well by doing the
entire buffer cache teardown in one place instead of splitting it
between destroying the buftarg and the perag structures.

Link: https://lore.kernel.org/linux-xfs/aLeUdemAZ5wmtZel@dread.disaster.area/
Reported-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Tested-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_ag.c | 13 ++---------
 fs/xfs/libxfs/xfs_ag.h |  2 --
 fs/xfs/xfs_buf.c       | 51 +++++++++++-------------------------------
 fs/xfs/xfs_buf.h       | 10 +--------
 fs/xfs/xfs_buf_mem.c   | 11 ++-------
 5 files changed, 18 insertions(+), 69 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 586918ed1cbf..a41d782e8e8c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -110,10 +110,7 @@ xfs_perag_uninit(
 	struct xfs_group	*xg)
 {
 #ifdef __KERNEL__
-	struct xfs_perag	*pag = to_perag(xg);
-
-	cancel_delayed_work_sync(&pag->pag_blockgc_work);
-	xfs_buf_cache_destroy(&pag->pag_bcache);
+	cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
 #endif
 }
 
@@ -235,10 +232,6 @@ xfs_perag_alloc(
 	INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 #endif /* __KERNEL__ */
 
-	error = xfs_buf_cache_init(&pag->pag_bcache);
-	if (error)
-		goto out_free_perag;
-
 	/*
 	 * Pre-calculated geometry
 	 */
@@ -250,12 +243,10 @@ xfs_perag_alloc(
 
 	error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
 	if (error)
-		goto out_buf_cache_destroy;
+		goto out_free_perag;
 
 	return 0;
 
-out_buf_cache_destroy:
-	xfs_buf_cache_destroy(&pag->pag_bcache);
 out_free_perag:
 	kfree(pag);
 	return error;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 1f24cfa27321..f02323416973 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -85,8 +85,6 @@ struct xfs_perag {
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
-	struct xfs_buf_cache	pag_bcache;
-
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;
 #endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 348c91335163..76eb7c5a73f1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -363,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
 	.obj_cmpfn		= _xfs_buf_obj_cmp,
 };
 
-int
-xfs_buf_cache_init(
-	struct xfs_buf_cache	*bch)
-{
-	return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
-}
-
-void
-xfs_buf_cache_destroy(
-	struct xfs_buf_cache	*bch)
-{
-	rhashtable_destroy(&bch->bc_hash);
-}
-
 static int
 xfs_buf_map_verify(
 	struct xfs_buftarg	*btp,
@@ -434,7 +420,7 @@ xfs_buf_find_lock(
 
 static inline int
 xfs_buf_lookup(
-	struct xfs_buf_cache	*bch,
+	struct xfs_buftarg	*btp,
 	struct xfs_buf_map	*map,
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp)
@@ -443,7 +429,7 @@ xfs_buf_lookup(
 	int			error;
 
 	rcu_read_lock();
-	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
+	bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
 	if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
 		rcu_read_unlock();
 		return -ENOENT;
@@ -468,7 +454,6 @@ xfs_buf_lookup(
 static int
 xfs_buf_find_insert(
 	struct xfs_buftarg	*btp,
-	struct xfs_buf_cache	*bch,
 	struct xfs_perag	*pag,
 	struct xfs_buf_map	*cmap,
 	struct xfs_buf_map	*map,
@@ -488,7 +473,7 @@ xfs_buf_find_insert(
 	new_bp->b_pag = pag;
 
 	rcu_read_lock();
-	bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
+	bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
 			&new_bp->b_rhash_head, xfs_buf_hash_params);
 	if (IS_ERR(bp)) {
 		rcu_read_unlock();
@@ -530,16 +515,6 @@ xfs_buftarg_get_pag(
 	return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
 }
 
-static inline struct xfs_buf_cache *
-xfs_buftarg_buf_cache(
-	struct xfs_buftarg		*btp,
-	struct xfs_perag		*pag)
-{
-	if (pag)
-		return &pag->pag_bcache;
-	return btp->bt_cache;
-}
-
 /*
  * Assembles a buffer covering the specified range. The code is optimised for
  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -553,7 +528,6 @@ xfs_buf_get_map(
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp)
 {
-	struct xfs_buf_cache	*bch;
 	struct xfs_perag	*pag;
 	struct xfs_buf		*bp = NULL;
 	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
@@ -570,9 +544,8 @@ xfs_buf_get_map(
 		return error;
 
 	pag = xfs_buftarg_get_pag(btp, &cmap);
-	bch = xfs_buftarg_buf_cache(btp, pag);
 
-	error = xfs_buf_lookup(bch, &cmap, flags, &bp);
+	error = xfs_buf_lookup(btp, &cmap, flags, &bp);
 	if (error && error != -ENOENT)
 		goto out_put_perag;
 
@@ -584,7 +557,7 @@ xfs_buf_get_map(
 			goto out_put_perag;
 
 		/* xfs_buf_find_insert() consumes the perag reference. */
-		error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
+		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
 				flags, &bp);
 		if (error)
 			return error;
@@ -848,11 +821,8 @@ xfs_buf_destroy(
 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 
 	if (!xfs_buf_is_uncached(bp)) {
-		struct xfs_buf_cache	*bch =
-			xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
-
-		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
-				xfs_buf_hash_params);
+		rhashtable_remove_fast(&bp->b_target->bt_hash,
+				&bp->b_rhash_head, xfs_buf_hash_params);
 
 		if (bp->b_pag)
 			xfs_perag_put(bp->b_pag);
@@ -1619,6 +1589,7 @@ xfs_destroy_buftarg(
 	ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
 	percpu_counter_destroy(&btp->bt_readahead_count);
 	list_lru_destroy(&btp->bt_lru);
+	rhashtable_destroy(&btp->bt_hash);
 }
 
 void
@@ -1713,8 +1684,10 @@ xfs_init_buftarg(
 	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
 			     DEFAULT_RATELIMIT_BURST);
 
-	if (list_lru_init(&btp->bt_lru))
+	if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
 		return -ENOMEM;
+	if (list_lru_init(&btp->bt_lru))
+		goto out_destroy_hash;
 	if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
 		goto out_destroy_lru;
 
@@ -1732,6 +1705,8 @@ xfs_init_buftarg(
 	percpu_counter_destroy(&btp->bt_readahead_count);
 out_destroy_lru:
 	list_lru_destroy(&btp->bt_lru);
+out_destroy_hash:
+	rhashtable_destroy(&btp->bt_hash);
 	return -ENOMEM;
 }
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3a1d066e1c13..bf39d89f0f6d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -69,13 +69,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_INCORE,		"INCORE" }, \
 	{ XBF_TRYLOCK,		"TRYLOCK" }
 
-struct xfs_buf_cache {
-	struct rhashtable	bc_hash;
-};
-
-int xfs_buf_cache_init(struct xfs_buf_cache *bch);
-void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
-
 /*
  * The xfs_buftarg contains 2 notions of "sector size" -
  *
@@ -113,8 +106,7 @@ struct xfs_buftarg {
 	unsigned int		bt_awu_min;
 	unsigned int		bt_awu_max;
 
-	/* built-in cache, if we're not using the perag one */
-	struct xfs_buf_cache	bt_cache[];
+	struct rhashtable	bt_hash;
 };
 
 struct xfs_buf_map {
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 0106da0a9f44..86dbec5ee203 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -58,7 +58,7 @@ xmbuf_alloc(
 	struct xfs_buftarg	*btp;
 	int			error;
 
-	btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
+	btp = kzalloc(sizeof(*btp), GFP_KERNEL);
 	if (!btp)
 		return -ENOMEM;
 
@@ -81,10 +81,6 @@ xmbuf_alloc(
 	/* ensure all writes are below EOF to avoid pagecache zeroing */
 	i_size_write(inode, inode->i_sb->s_maxbytes);
 
-	error = xfs_buf_cache_init(btp->bt_cache);
-	if (error)
-		goto out_file;
-
 	/* Initialize buffer target */
 	btp->bt_mount = mp;
 	btp->bt_dev = (dev_t)-1U;
@@ -95,15 +91,13 @@ xmbuf_alloc(
 
 	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
 	if (error)
-		goto out_bcache;
+		goto out_file;
 
 	trace_xmbuf_create(btp);
 
 	*btpp = btp;
 	return 0;
 
-out_bcache:
-	xfs_buf_cache_destroy(btp->bt_cache);
 out_file:
 	fput(file);
 out_free_btp:
@@ -122,7 +116,6 @@ xmbuf_free(
 	trace_xmbuf_free(btp);
 
 	xfs_destroy_buftarg(btp);
-	xfs_buf_cache_destroy(btp->bt_cache);
 	fput(btp->bt_file);
 	kfree(btp);
 }
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
  2026-01-22  5:26 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
@ 2026-01-22 18:10   ` Darrick J. Wong
  2026-01-23  5:36     ` Christoph Hellwig
  0 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2026-01-22 18:10 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Carlos Maiolino, Dave Chinner, linux-xfs,
	syzbot+0391d34e801643e2809b

On Thu, Jan 22, 2026 at 06:26:57AM +0100, Christoph Hellwig wrote:
> The per-AG buffer hashes were added when all buffer lookups took a
> per-hash lock.  Since then we've made lookups entirely lockless and
> removed the need for a hash-wide lock for inserts and removals as
> well.  With this there is no need to shard the hash, so reduce the
> used resources by using a per-buftarg hash for all buftargs.
> 
> Long after writing this initially, syzbot found a problem in the buffer
> cache teardown order, which this happens to fix as well by doing the
> entire buffer cache teardown in one place instead of splitting it
> between destroying the buftarg and the perag structures.
> 
> Link: https://lore.kernel.org/linux-xfs/aLeUdemAZ5wmtZel@dread.disaster.area/
> Reported-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
> Tested-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Nice cleanup!
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

Anyone want to venture forth with a fixes tag?

(Though I wonder if a less invasive fix for LTS kernels would be to make
unmount wait for xg_ref to hit zero, though I guess that runs the risk
of stuck mounts if someone leaks a xfs_group reference)

--D

> ---
>  fs/xfs/libxfs/xfs_ag.c | 13 ++---------
>  fs/xfs/libxfs/xfs_ag.h |  2 --
>  fs/xfs/xfs_buf.c       | 51 +++++++++++-------------------------------
>  fs/xfs/xfs_buf.h       | 10 +--------
>  fs/xfs/xfs_buf_mem.c   | 11 ++-------
>  5 files changed, 18 insertions(+), 69 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> index 586918ed1cbf..a41d782e8e8c 100644
> --- a/fs/xfs/libxfs/xfs_ag.c
> +++ b/fs/xfs/libxfs/xfs_ag.c
> @@ -110,10 +110,7 @@ xfs_perag_uninit(
>  	struct xfs_group	*xg)
>  {
>  #ifdef __KERNEL__
> -	struct xfs_perag	*pag = to_perag(xg);
> -
> -	cancel_delayed_work_sync(&pag->pag_blockgc_work);
> -	xfs_buf_cache_destroy(&pag->pag_bcache);
> +	cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
>  #endif
>  }
>  
> @@ -235,10 +232,6 @@ xfs_perag_alloc(
>  	INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
>  #endif /* __KERNEL__ */
>  
> -	error = xfs_buf_cache_init(&pag->pag_bcache);
> -	if (error)
> -		goto out_free_perag;
> -
>  	/*
>  	 * Pre-calculated geometry
>  	 */
> @@ -250,12 +243,10 @@ xfs_perag_alloc(
>  
>  	error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
>  	if (error)
> -		goto out_buf_cache_destroy;
> +		goto out_free_perag;
>  
>  	return 0;
>  
> -out_buf_cache_destroy:
> -	xfs_buf_cache_destroy(&pag->pag_bcache);
>  out_free_perag:
>  	kfree(pag);
>  	return error;
> diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
> index 1f24cfa27321..f02323416973 100644
> --- a/fs/xfs/libxfs/xfs_ag.h
> +++ b/fs/xfs/libxfs/xfs_ag.h
> @@ -85,8 +85,6 @@ struct xfs_perag {
>  	int		pag_ici_reclaimable;	/* reclaimable inodes */
>  	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
>  
> -	struct xfs_buf_cache	pag_bcache;
> -
>  	/* background prealloc block trimming */
>  	struct delayed_work	pag_blockgc_work;
>  #endif /* __KERNEL__ */
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 348c91335163..76eb7c5a73f1 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -363,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
>  	.obj_cmpfn		= _xfs_buf_obj_cmp,
>  };
>  
> -int
> -xfs_buf_cache_init(
> -	struct xfs_buf_cache	*bch)
> -{
> -	return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
> -}
> -
> -void
> -xfs_buf_cache_destroy(
> -	struct xfs_buf_cache	*bch)
> -{
> -	rhashtable_destroy(&bch->bc_hash);
> -}
> -
>  static int
>  xfs_buf_map_verify(
>  	struct xfs_buftarg	*btp,
> @@ -434,7 +420,7 @@ xfs_buf_find_lock(
>  
>  static inline int
>  xfs_buf_lookup(
> -	struct xfs_buf_cache	*bch,
> +	struct xfs_buftarg	*btp,
>  	struct xfs_buf_map	*map,
>  	xfs_buf_flags_t		flags,
>  	struct xfs_buf		**bpp)
> @@ -443,7 +429,7 @@ xfs_buf_lookup(
>  	int			error;
>  
>  	rcu_read_lock();
> -	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
> +	bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
>  	if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
>  		rcu_read_unlock();
>  		return -ENOENT;
> @@ -468,7 +454,6 @@ xfs_buf_lookup(
>  static int
>  xfs_buf_find_insert(
>  	struct xfs_buftarg	*btp,
> -	struct xfs_buf_cache	*bch,
>  	struct xfs_perag	*pag,
>  	struct xfs_buf_map	*cmap,
>  	struct xfs_buf_map	*map,
> @@ -488,7 +473,7 @@ xfs_buf_find_insert(
>  	new_bp->b_pag = pag;
>  
>  	rcu_read_lock();
> -	bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
> +	bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
>  			&new_bp->b_rhash_head, xfs_buf_hash_params);
>  	if (IS_ERR(bp)) {
>  		rcu_read_unlock();
> @@ -530,16 +515,6 @@ xfs_buftarg_get_pag(
>  	return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
>  }
>  
> -static inline struct xfs_buf_cache *
> -xfs_buftarg_buf_cache(
> -	struct xfs_buftarg		*btp,
> -	struct xfs_perag		*pag)
> -{
> -	if (pag)
> -		return &pag->pag_bcache;
> -	return btp->bt_cache;
> -}
> -
>  /*
>   * Assembles a buffer covering the specified range. The code is optimised for
>   * cache hits, as metadata intensive workloads will see 3 orders of magnitude
> @@ -553,7 +528,6 @@ xfs_buf_get_map(
>  	xfs_buf_flags_t		flags,
>  	struct xfs_buf		**bpp)
>  {
> -	struct xfs_buf_cache	*bch;
>  	struct xfs_perag	*pag;
>  	struct xfs_buf		*bp = NULL;
>  	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
> @@ -570,9 +544,8 @@ xfs_buf_get_map(
>  		return error;
>  
>  	pag = xfs_buftarg_get_pag(btp, &cmap);
> -	bch = xfs_buftarg_buf_cache(btp, pag);
>  
> -	error = xfs_buf_lookup(bch, &cmap, flags, &bp);
> +	error = xfs_buf_lookup(btp, &cmap, flags, &bp);
>  	if (error && error != -ENOENT)
>  		goto out_put_perag;
>  
> @@ -584,7 +557,7 @@ xfs_buf_get_map(
>  			goto out_put_perag;
>  
>  		/* xfs_buf_find_insert() consumes the perag reference. */
> -		error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
> +		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
>  				flags, &bp);
>  		if (error)
>  			return error;
> @@ -848,11 +821,8 @@ xfs_buf_destroy(
>  	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
>  
>  	if (!xfs_buf_is_uncached(bp)) {
> -		struct xfs_buf_cache	*bch =
> -			xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
> -
> -		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
> -				xfs_buf_hash_params);
> +		rhashtable_remove_fast(&bp->b_target->bt_hash,
> +				&bp->b_rhash_head, xfs_buf_hash_params);
>  
>  		if (bp->b_pag)
>  			xfs_perag_put(bp->b_pag);
> @@ -1619,6 +1589,7 @@ xfs_destroy_buftarg(
>  	ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
>  	percpu_counter_destroy(&btp->bt_readahead_count);
>  	list_lru_destroy(&btp->bt_lru);
> +	rhashtable_destroy(&btp->bt_hash);
>  }
>  
>  void
> @@ -1713,8 +1684,10 @@ xfs_init_buftarg(
>  	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
>  			     DEFAULT_RATELIMIT_BURST);
>  
> -	if (list_lru_init(&btp->bt_lru))
> +	if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
>  		return -ENOMEM;
> +	if (list_lru_init(&btp->bt_lru))
> +		goto out_destroy_hash;
>  	if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
>  		goto out_destroy_lru;
>  
> @@ -1732,6 +1705,8 @@ xfs_init_buftarg(
>  	percpu_counter_destroy(&btp->bt_readahead_count);
>  out_destroy_lru:
>  	list_lru_destroy(&btp->bt_lru);
> +out_destroy_hash:
> +	rhashtable_destroy(&btp->bt_hash);
>  	return -ENOMEM;
>  }
>  
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 3a1d066e1c13..bf39d89f0f6d 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -69,13 +69,6 @@ typedef unsigned int xfs_buf_flags_t;
>  	{ XBF_INCORE,		"INCORE" }, \
>  	{ XBF_TRYLOCK,		"TRYLOCK" }
>  
> -struct xfs_buf_cache {
> -	struct rhashtable	bc_hash;
> -};
> -
> -int xfs_buf_cache_init(struct xfs_buf_cache *bch);
> -void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
> -
>  /*
>   * The xfs_buftarg contains 2 notions of "sector size" -
>   *
> @@ -113,8 +106,7 @@ struct xfs_buftarg {
>  	unsigned int		bt_awu_min;
>  	unsigned int		bt_awu_max;
>  
> -	/* built-in cache, if we're not using the perag one */
> -	struct xfs_buf_cache	bt_cache[];
> +	struct rhashtable	bt_hash;
>  };
>  
>  struct xfs_buf_map {
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 0106da0a9f44..86dbec5ee203 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -58,7 +58,7 @@ xmbuf_alloc(
>  	struct xfs_buftarg	*btp;
>  	int			error;
>  
> -	btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
> +	btp = kzalloc(sizeof(*btp), GFP_KERNEL);
>  	if (!btp)
>  		return -ENOMEM;
>  
> @@ -81,10 +81,6 @@ xmbuf_alloc(
>  	/* ensure all writes are below EOF to avoid pagecache zeroing */
>  	i_size_write(inode, inode->i_sb->s_maxbytes);
>  
> -	error = xfs_buf_cache_init(btp->bt_cache);
> -	if (error)
> -		goto out_file;
> -
>  	/* Initialize buffer target */
>  	btp->bt_mount = mp;
>  	btp->bt_dev = (dev_t)-1U;
> @@ -95,15 +91,13 @@ xmbuf_alloc(
>  
>  	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
>  	if (error)
> -		goto out_bcache;
> +		goto out_file;
>  
>  	trace_xmbuf_create(btp);
>  
>  	*btpp = btp;
>  	return 0;
>  
> -out_bcache:
> -	xfs_buf_cache_destroy(btp->bt_cache);
>  out_file:
>  	fput(file);
>  out_free_btp:
> @@ -122,7 +116,6 @@ xmbuf_free(
>  	trace_xmbuf_free(btp);
>  
>  	xfs_destroy_buftarg(btp);
> -	xfs_buf_cache_destroy(btp->bt_cache);
>  	fput(btp->bt_file);
>  	kfree(btp);
>  }
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
  2026-01-22 18:10   ` Darrick J. Wong
@ 2026-01-23  5:36     ` Christoph Hellwig
  2026-01-23  7:01       ` Darrick J. Wong
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-23  5:36 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs,
	syzbot+0391d34e801643e2809b

On Thu, Jan 22, 2026 at 10:10:12AM -0800, Darrick J. Wong wrote:
> Anyone want to venture forth with a fixes tag?
> 
> (Though I wonder if a less invasive fix for LTS kernels would be to make
> unmount wait for xg_ref to hit zero, though I guess that runs the risk
> of stuck mounts if someone leaks a xfs_group reference)

I tried.  It probably is the patch that initially added the per-ag
rbtrees:

74f75a0cb7033918eb0fa4a50df25091ac75c16e
Author: Dave Chinner <dchinner@redhat.com>
Date:   Fri Sep 24 19:59:04 2010 +1000

    xfs: convert buffer cache hash to rbtree

but as I can't reproduce the issue locally, and getting syzbot to
verify it required horrible hackery to avoid lockdep, I can't actually
confirm it.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
  2026-01-23  5:36     ` Christoph Hellwig
@ 2026-01-23  7:01       ` Darrick J. Wong
  0 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2026-01-23  7:01 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Carlos Maiolino, Dave Chinner, linux-xfs,
	syzbot+0391d34e801643e2809b

On Fri, Jan 23, 2026 at 06:36:19AM +0100, Christoph Hellwig wrote:
> On Thu, Jan 22, 2026 at 10:10:12AM -0800, Darrick J. Wong wrote:
> > Anyone want to venture forth with a fixes tag?
> > 
> > (Though I wonder if a less invasive fix for LTS kernels would be to make
> > unmount wait for xg_ref to hit zero, though I guess that runs the risk
> > of stuck mounts if someone leaks a xfs_group reference)
> 
> I tried.  It probably is the patch that initially added the per-ag
> rbtrees:
> 
> 74f75a0cb7033918eb0fa4a50df25091ac75c16e
> Author: Dave Chinner <dchinner@redhat.com>
> Date:   Fri Sep 24 19:59:04 2010 +1000
> 
>     xfs: convert buffer cache hash to rbtree
> 
> but as I can't reproduce the issue locally, and getting syzbot to
> verify it required horrible hackery to avoid lockdep, I can't actually
> confirm it.

Yeah, I, uh, rely on syzbot reporters to test the patches because I
can't make head nor tails of the horrid reproducer code it generates.

Granted, the reproducer code is a lot less awful than it was back in
2017.

--D

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
  2026-01-22  5:26 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
@ 2026-01-23 11:55   ` Carlos Maiolino
  2026-01-23 16:01   ` Brian Foster
  1 sibling, 0 replies; 15+ messages in thread
From: Carlos Maiolino @ 2026-01-23 11:55 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Dave Chinner, linux-xfs, Darrick J. Wong

On Thu, Jan 22, 2026 at 06:26:55AM +0100, Christoph Hellwig wrote:
> Currently the buffer cache adds a reference to b_hold for buffers that
> are on the LRU.  This seems to go all the way back and allows releasing
> buffers from the LRU using xfs_buf_rele.  But it makes xfs_buf_rele
> really complicated in differs from how other LRUs are implemented in
> Linux.
> 
> Switch to not having a reference for buffers in the LRU, and use a
> separate negative hold value to mark buffers as dead.  This simplifies
> xfs_buf_rele, which now just deal with the last "real" reference,
> and prepares for using the lockref primitive.
> 

Looks fine.
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
>  fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
>  fs/xfs/xfs_buf.h |   8 +--
>  2 files changed, 54 insertions(+), 94 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index db46883991de..aacdf080e400 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -80,11 +80,8 @@ xfs_buf_stale(
>  
>  	spin_lock(&bp->b_lock);
>  	atomic_set(&bp->b_lru_ref, 0);
> -	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
> -	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
> -		bp->b_hold--;
> -
> -	ASSERT(bp->b_hold >= 1);
> +	if (bp->b_hold >= 0)
> +		list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
>  	spin_unlock(&bp->b_lock);
>  }
>  
> @@ -442,7 +439,7 @@ xfs_buf_try_hold(
>  	struct xfs_buf		*bp)
>  {
>  	spin_lock(&bp->b_lock);
> -	if (bp->b_hold == 0) {
> +	if (bp->b_hold == -1) {
>  		spin_unlock(&bp->b_lock);
>  		return false;
>  	}
> @@ -862,76 +859,24 @@ xfs_buf_hold(
>  }
>  
>  static void
> -xfs_buf_rele_uncached(
> -	struct xfs_buf		*bp)
> -{
> -	ASSERT(list_empty(&bp->b_lru));
> -
> -	spin_lock(&bp->b_lock);
> -	if (--bp->b_hold) {
> -		spin_unlock(&bp->b_lock);
> -		return;
> -	}
> -	spin_unlock(&bp->b_lock);
> -	xfs_buf_free(bp);
> -}
> -
> -static void
> -xfs_buf_rele_cached(
> +xfs_buf_destroy(
>  	struct xfs_buf		*bp)
>  {
> -	struct xfs_buftarg	*btp = bp->b_target;
> -	struct xfs_perag	*pag = bp->b_pag;
> -	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
> -	bool			freebuf = false;
> -
> -	trace_xfs_buf_rele(bp, _RET_IP_);
> -
> -	spin_lock(&bp->b_lock);
> -	ASSERT(bp->b_hold >= 1);
> -	if (bp->b_hold > 1) {
> -		bp->b_hold--;
> -		goto out_unlock;
> -	}
> +	ASSERT(bp->b_hold < 0);
> +	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
>  
> -	/* we are asked to drop the last reference */
> -	if (atomic_read(&bp->b_lru_ref)) {
> -		/*
> -		 * If the buffer is added to the LRU, keep the reference to the
> -		 * buffer for the LRU and clear the (now stale) dispose list
> -		 * state flag, else drop the reference.
> -		 */
> -		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
> -			bp->b_state &= ~XFS_BSTATE_DISPOSE;
> -		else
> -			bp->b_hold--;
> -	} else {
> -		bp->b_hold--;
> -		/*
> -		 * most of the time buffers will already be removed from the
> -		 * LRU, so optimise that case by checking for the
> -		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
> -		 * was on was the disposal list
> -		 */
> -		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
> -			list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
> -		} else {
> -			ASSERT(list_empty(&bp->b_lru));
> -		}
> +	if (!xfs_buf_is_uncached(bp)) {
> +		struct xfs_buf_cache	*bch =
> +			xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
>  
> -		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
>  		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
>  				xfs_buf_hash_params);
> -		if (pag)
> -			xfs_perag_put(pag);
> -		freebuf = true;
> -	}
>  
> -out_unlock:
> -	spin_unlock(&bp->b_lock);
> +		if (bp->b_pag)
> +			xfs_perag_put(bp->b_pag);
> +	}
>  
> -	if (freebuf)
> -		xfs_buf_free(bp);
> +	xfs_buf_free(bp);
>  }
>  
>  /*
> @@ -942,10 +887,22 @@ xfs_buf_rele(
>  	struct xfs_buf		*bp)
>  {
>  	trace_xfs_buf_rele(bp, _RET_IP_);
> -	if (xfs_buf_is_uncached(bp))
> -		xfs_buf_rele_uncached(bp);
> -	else
> -		xfs_buf_rele_cached(bp);
> +
> +	spin_lock(&bp->b_lock);
> +	if (!--bp->b_hold) {
> +		if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
> +			goto kill;
> +		list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
> +	}
> +	spin_unlock(&bp->b_lock);
> +	return;
> +
> +kill:
> +	bp->b_hold = -1;
> +	list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> +	spin_unlock(&bp->b_lock);
> +
> +	xfs_buf_destroy(bp);
>  }
>  
>  /*
> @@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
>  
>  /*
>   * To simulate an I/O failure, the buffer must be locked and held with at least
> - * three references. The LRU reference is dropped by the stale call. The buf
> - * item reference is dropped via ioend processing. The third reference is owned
> - * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
> + * two references.
> + *
> + * The buf item reference is dropped via ioend processing. The second reference
> + * is owned by the caller and is dropped on I/O completion if the buffer is
> + * XBF_ASYNC.
>   */
>  void
>  xfs_buf_ioend_fail(
> @@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
>  
>  	if (!spin_trylock(&bp->b_lock))
>  		return LRU_SKIP;
> -	if (bp->b_hold > 1) {
> +	if (bp->b_hold > 0) {
>  		/* need to wait, so skip it this pass */
>  		spin_unlock(&bp->b_lock);
>  		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
>  		return LRU_SKIP;
>  	}
>  
> -	/*
> -	 * clear the LRU reference count so the buffer doesn't get
> -	 * ignored in xfs_buf_rele().
> -	 */
> -	atomic_set(&bp->b_lru_ref, 0);
> -	bp->b_state |= XFS_BSTATE_DISPOSE;
> +	bp->b_hold = -1;
>  	list_lru_isolate_move(lru, item, dispose);
>  	spin_unlock(&bp->b_lock);
>  	return LRU_REMOVED;
> @@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
>  "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
>  					(long long)xfs_buf_daddr(bp));
>  			}
> -			xfs_buf_rele(bp);
> +			xfs_buf_destroy(bp);
>  		}
>  		if (loop++ != 0)
>  			delay(100);
> @@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
>  	struct list_head	*dispose = arg;
>  
>  	/*
> -	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
> -	 * If we fail to get the lock, just skip it.
> +	 * We are inverting the lru lock vs bp->b_lock order here, so use a
> +	 * trylock. If we fail to get the lock, just skip the buffer.
>  	 */
>  	if (!spin_trylock(&bp->b_lock))
>  		return LRU_SKIP;
> +
> +	/*
> +	 * If the buffer is in use, remove it from the LRU for now as we can't
> +	 * free it.  It will be added to the LRU again when the reference count
> +	 * hits zero.
> +	 */
> +	if (bp->b_hold > 0) {
> +		list_lru_isolate(lru, &bp->b_lru);
> +		spin_unlock(&bp->b_lock);
> +		return LRU_REMOVED;
> +	}
> +
>  	/*
>  	 * Decrement the b_lru_ref count unless the value is already
>  	 * zero. If the value is already zero, we need to reclaim the
> @@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
>  		return LRU_ROTATE;
>  	}
>  
> -	bp->b_state |= XFS_BSTATE_DISPOSE;
> +	bp->b_hold = -1;
>  	list_lru_isolate_move(lru, item, dispose);
>  	spin_unlock(&bp->b_lock);
>  	return LRU_REMOVED;
> @@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
>  		struct xfs_buf *bp;
>  		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
>  		list_del_init(&bp->b_lru);
> -		xfs_buf_rele(bp);
> +		xfs_buf_destroy(bp);
>  	}
>  
>  	return freed;
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index e25cd2a160f3..1117cd9cbfb9 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
>  	{ XBF_INCORE,		"INCORE" }, \
>  	{ XBF_TRYLOCK,		"TRYLOCK" }
>  
> -/*
> - * Internal state flags.
> - */
> -#define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
> -
>  struct xfs_buf_cache {
>  	struct rhashtable	bc_hash;
>  };
> @@ -159,6 +154,7 @@ struct xfs_buf {
>  
>  	xfs_daddr_t		b_rhash_key;	/* buffer cache index */
>  	int			b_length;	/* size of buffer in BBs */
> +	spinlock_t		b_lock;		/* internal state lock */
>  	unsigned int		b_hold;		/* reference count */
>  	atomic_t		b_lru_ref;	/* lru reclaim ref count */
>  	xfs_buf_flags_t		b_flags;	/* status flags */
> @@ -169,8 +165,6 @@ struct xfs_buf {
>  	 * bt_lru_lock and not by b_sema
>  	 */
>  	struct list_head	b_lru;		/* lru list */
> -	spinlock_t		b_lock;		/* internal state lock */
> -	unsigned int		b_state;	/* internal state flags */
>  	wait_queue_head_t	b_waiters;	/* unpin waiters */
>  	struct list_head	b_list;
>  	struct xfs_perag	*b_pag;
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] xfs: use a lockref for the buffer reference count
  2026-01-22  5:26 ` [PATCH 2/3] xfs: use a lockref for the buffer reference count Christoph Hellwig
@ 2026-01-23 12:04   ` Carlos Maiolino
  0 siblings, 0 replies; 15+ messages in thread
From: Carlos Maiolino @ 2026-01-23 12:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Dave Chinner, linux-xfs, Darrick J. Wong

On Thu, Jan 22, 2026 at 06:26:56AM +0100, Christoph Hellwig wrote:
> The lockref structure allows incrementing/decrementing counters like
> an atomic_t for the fast path, while still allowing complex slow path
> operations as if the counter was protected by a lock.  The only slow
> path operations that actually need to take the lock are the final
> put, LRU evictions and marking a buffer stale.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>

> ---
>  fs/xfs/xfs_buf.c   | 79 ++++++++++++++++++----------------------------
>  fs/xfs/xfs_buf.h   |  4 +--
>  fs/xfs/xfs_trace.h | 10 +++---
>  3 files changed, 38 insertions(+), 55 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index aacdf080e400..348c91335163 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -31,20 +31,20 @@ struct kmem_cache *xfs_buf_cache;
>   *
>   * xfs_buf_stale:
>   *	b_sema (caller holds)
> - *	  b_lock
> + *	  b_lockref.lock
>   *	    lru_lock
>   *
>   * xfs_buf_rele:
> - *	b_lock
> + *	b_lockref.lock
>   *	  lru_lock
>   *
>   * xfs_buftarg_drain_rele
>   *	lru_lock
> - *	  b_lock (trylock due to inversion)
> + *	  b_lockref.lock (trylock due to inversion)
>   *
>   * xfs_buftarg_isolate
>   *	lru_lock
> - *	  b_lock (trylock due to inversion)
> + *	  b_lockref.lock (trylock due to inversion)
>   */
>  
>  static void xfs_buf_submit(struct xfs_buf *bp);
> @@ -78,11 +78,11 @@ xfs_buf_stale(
>  	 */
>  	bp->b_flags &= ~_XBF_DELWRI_Q;
>  
> -	spin_lock(&bp->b_lock);
> +	spin_lock(&bp->b_lockref.lock);
>  	atomic_set(&bp->b_lru_ref, 0);
> -	if (bp->b_hold >= 0)
> +	if (!__lockref_is_dead(&bp->b_lockref))
>  		list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> -	spin_unlock(&bp->b_lock);
> +	spin_unlock(&bp->b_lockref.lock);
>  }
>  
>  static void
> @@ -274,10 +274,8 @@ xfs_buf_alloc(
>  	 * inserting into the hash table are safe (and will have to wait for
>  	 * the unlock to do anything non-trivial).
>  	 */
> -	bp->b_hold = 1;
> +	lockref_init(&bp->b_lockref);
>  	sema_init(&bp->b_sema, 0); /* held, no waiters */
> -
> -	spin_lock_init(&bp->b_lock);
>  	atomic_set(&bp->b_lru_ref, 1);
>  	init_completion(&bp->b_iowait);
>  	INIT_LIST_HEAD(&bp->b_lru);
> @@ -434,20 +432,6 @@ xfs_buf_find_lock(
>  	return 0;
>  }
>  
> -static bool
> -xfs_buf_try_hold(
> -	struct xfs_buf		*bp)
> -{
> -	spin_lock(&bp->b_lock);
> -	if (bp->b_hold == -1) {
> -		spin_unlock(&bp->b_lock);
> -		return false;
> -	}
> -	bp->b_hold++;
> -	spin_unlock(&bp->b_lock);
> -	return true;
> -}
> -
>  static inline int
>  xfs_buf_lookup(
>  	struct xfs_buf_cache	*bch,
> @@ -460,7 +444,7 @@ xfs_buf_lookup(
>  
>  	rcu_read_lock();
>  	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
> -	if (!bp || !xfs_buf_try_hold(bp)) {
> +	if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
>  		rcu_read_unlock();
>  		return -ENOENT;
>  	}
> @@ -511,7 +495,7 @@ xfs_buf_find_insert(
>  		error = PTR_ERR(bp);
>  		goto out_free_buf;
>  	}
> -	if (bp && xfs_buf_try_hold(bp)) {
> +	if (bp && lockref_get_not_dead(&bp->b_lockref)) {
>  		/* found an existing buffer */
>  		rcu_read_unlock();
>  		error = xfs_buf_find_lock(bp, flags);
> @@ -853,16 +837,14 @@ xfs_buf_hold(
>  {
>  	trace_xfs_buf_hold(bp, _RET_IP_);
>  
> -	spin_lock(&bp->b_lock);
> -	bp->b_hold++;
> -	spin_unlock(&bp->b_lock);
> +	lockref_get(&bp->b_lockref);
>  }
>  
>  static void
>  xfs_buf_destroy(
>  	struct xfs_buf		*bp)
>  {
> -	ASSERT(bp->b_hold < 0);
> +	ASSERT(__lockref_is_dead(&bp->b_lockref));
>  	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
>  
>  	if (!xfs_buf_is_uncached(bp)) {
> @@ -888,19 +870,20 @@ xfs_buf_rele(
>  {
>  	trace_xfs_buf_rele(bp, _RET_IP_);
>  
> -	spin_lock(&bp->b_lock);
> -	if (!--bp->b_hold) {
> +	if (lockref_put_or_lock(&bp->b_lockref))
> +		return;
> +	if (!--bp->b_lockref.count) {
>  		if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
>  			goto kill;
>  		list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
>  	}
> -	spin_unlock(&bp->b_lock);
> +	spin_unlock(&bp->b_lockref.lock);
>  	return;
>  
>  kill:
> -	bp->b_hold = -1;
> +	lockref_mark_dead(&bp->b_lockref);
>  	list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> -	spin_unlock(&bp->b_lock);
> +	spin_unlock(&bp->b_lockref.lock);
>  
>  	xfs_buf_destroy(bp);
>  }
> @@ -1471,18 +1454,18 @@ xfs_buftarg_drain_rele(
>  	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
>  	struct list_head	*dispose = arg;
>  
> -	if (!spin_trylock(&bp->b_lock))
> +	if (!spin_trylock(&bp->b_lockref.lock))
>  		return LRU_SKIP;
> -	if (bp->b_hold > 0) {
> +	if (bp->b_lockref.count > 0) {
>  		/* need to wait, so skip it this pass */
> -		spin_unlock(&bp->b_lock);
> +		spin_unlock(&bp->b_lockref.lock);
>  		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
>  		return LRU_SKIP;
>  	}
>  
> -	bp->b_hold = -1;
> +	lockref_mark_dead(&bp->b_lockref);
>  	list_lru_isolate_move(lru, item, dispose);
> -	spin_unlock(&bp->b_lock);
> +	spin_unlock(&bp->b_lockref.lock);
>  	return LRU_REMOVED;
>  }
>  
> @@ -1564,10 +1547,10 @@ xfs_buftarg_isolate(
>  	struct list_head	*dispose = arg;
>  
>  	/*
> -	 * We are inverting the lru lock vs bp->b_lock order here, so use a
> -	 * trylock. If we fail to get the lock, just skip the buffer.
> +	 * We are inverting the lru lock vs bp->b_lockref.lock order here, so
> +	 * use a trylock. If we fail to get the lock, just skip the buffer.
>  	 */
> -	if (!spin_trylock(&bp->b_lock))
> +	if (!spin_trylock(&bp->b_lockref.lock))
>  		return LRU_SKIP;
>  
>  	/*
> @@ -1575,9 +1558,9 @@ xfs_buftarg_isolate(
>  	 * free it.  It will be added to the LRU again when the reference count
>  	 * hits zero.
>  	 */
> -	if (bp->b_hold > 0) {
> +	if (bp->b_lockref.count > 0) {
>  		list_lru_isolate(lru, &bp->b_lru);
> -		spin_unlock(&bp->b_lock);
> +		spin_unlock(&bp->b_lockref.lock);
>  		return LRU_REMOVED;
>  	}
>  
> @@ -1587,13 +1570,13 @@ xfs_buftarg_isolate(
>  	 * buffer, otherwise it gets another trip through the LRU.
>  	 */
>  	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
> -		spin_unlock(&bp->b_lock);
> +		spin_unlock(&bp->b_lockref.lock);
>  		return LRU_ROTATE;
>  	}
>  
> -	bp->b_hold = -1;
> +	lockref_mark_dead(&bp->b_lockref);
>  	list_lru_isolate_move(lru, item, dispose);
> -	spin_unlock(&bp->b_lock);
> +	spin_unlock(&bp->b_lockref.lock);
>  	return LRU_REMOVED;
>  }
>  
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 1117cd9cbfb9..3a1d066e1c13 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -14,6 +14,7 @@
>  #include <linux/dax.h>
>  #include <linux/uio.h>
>  #include <linux/list_lru.h>
> +#include <linux/lockref.h>
>  
>  extern struct kmem_cache *xfs_buf_cache;
>  
> @@ -154,8 +155,7 @@ struct xfs_buf {
>  
>  	xfs_daddr_t		b_rhash_key;	/* buffer cache index */
>  	int			b_length;	/* size of buffer in BBs */
> -	spinlock_t		b_lock;		/* internal state lock */
> -	unsigned int		b_hold;		/* reference count */
> +	struct lockref		b_lockref;	/* refcount + lock */
>  	atomic_t		b_lru_ref;	/* lru reclaim ref count */
>  	xfs_buf_flags_t		b_flags;	/* status flags */
>  	struct semaphore	b_sema;		/* semaphore for lockables */
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index f70afbf3cb19..abf022d5ff42 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -736,7 +736,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
>  		__entry->dev = bp->b_target->bt_dev;
>  		__entry->bno = xfs_buf_daddr(bp);
>  		__entry->nblks = bp->b_length;
> -		__entry->hold = bp->b_hold;
> +		__entry->hold = bp->b_lockref.count;
>  		__entry->pincount = atomic_read(&bp->b_pin_count);
>  		__entry->lockval = bp->b_sema.count;
>  		__entry->flags = bp->b_flags;
> @@ -810,7 +810,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
>  		__entry->bno = xfs_buf_daddr(bp);
>  		__entry->length = bp->b_length;
>  		__entry->flags = flags;
> -		__entry->hold = bp->b_hold;
> +		__entry->hold = bp->b_lockref.count;
>  		__entry->pincount = atomic_read(&bp->b_pin_count);
>  		__entry->lockval = bp->b_sema.count;
>  		__entry->caller_ip = caller_ip;
> @@ -854,7 +854,7 @@ TRACE_EVENT(xfs_buf_ioerror,
>  		__entry->dev = bp->b_target->bt_dev;
>  		__entry->bno = xfs_buf_daddr(bp);
>  		__entry->length = bp->b_length;
> -		__entry->hold = bp->b_hold;
> +		__entry->hold = bp->b_lockref.count;
>  		__entry->pincount = atomic_read(&bp->b_pin_count);
>  		__entry->lockval = bp->b_sema.count;
>  		__entry->error = error;
> @@ -898,7 +898,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
>  		__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
>  		__entry->buf_len = bip->bli_buf->b_length;
>  		__entry->buf_flags = bip->bli_buf->b_flags;
> -		__entry->buf_hold = bip->bli_buf->b_hold;
> +		__entry->buf_hold = bip->bli_buf->b_lockref.count;
>  		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
>  		__entry->buf_lockval = bip->bli_buf->b_sema.count;
>  		__entry->li_flags = bip->bli_item.li_flags;
> @@ -5181,7 +5181,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
>  		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
>  		__entry->bno = xfs_buf_daddr(bp);
>  		__entry->nblks = bp->b_length;
> -		__entry->hold = bp->b_hold;
> +		__entry->hold = bp->b_lockref.count;
>  		__entry->pincount = atomic_read(&bp->b_pin_count);
>  		__entry->lockval = bp->b_sema.count;
>  		__entry->flags = bp->b_flags;
> -- 
> 2.47.3
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
  2026-01-22  5:26 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
  2026-01-23 11:55   ` Carlos Maiolino
@ 2026-01-23 16:01   ` Brian Foster
  1 sibling, 0 replies; 15+ messages in thread
From: Brian Foster @ 2026-01-23 16:01 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Carlos Maiolino, Dave Chinner, linux-xfs, Darrick J. Wong

On Thu, Jan 22, 2026 at 06:26:55AM +0100, Christoph Hellwig wrote:
> Currently the buffer cache adds a reference to b_hold for buffers that
> are on the LRU.  This seems to go all the way back and allows releasing
> buffers from the LRU using xfs_buf_rele.  But it makes xfs_buf_rele
> really complicated in differs from how other LRUs are implemented in
> Linux.
> 
> Switch to not having a reference for buffers in the LRU, and use a
> separate negative hold value to mark buffers as dead.  This simplifies
> xfs_buf_rele, which now just deal with the last "real" reference,
> and prepares for using the lockref primitive.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
>  fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
>  fs/xfs/xfs_buf.h |   8 +--
>  2 files changed, 54 insertions(+), 94 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index db46883991de..aacdf080e400 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -80,11 +80,8 @@ xfs_buf_stale(
>  
>  	spin_lock(&bp->b_lock);
>  	atomic_set(&bp->b_lru_ref, 0);
> -	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
> -	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
> -		bp->b_hold--;
> -
> -	ASSERT(bp->b_hold >= 1);
> +	if (bp->b_hold >= 0)
> +		list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);

I see that b_hold goes away in subsequent patches, but nonetheless it's
unsigned as of this patch, which kind of makes this look like a
potential bisect bomb. I wouldn't want to turn this into a big effort
just to fix it up mid-series, but maybe this should just change b_hold
to a signed type and call out the transient wart in the commit log?

(A quick/spot test might not hurt either if that hadn't been done.)

>  	spin_unlock(&bp->b_lock);
>  }
>  
...
> @@ -862,76 +859,24 @@ xfs_buf_hold(
>  }
>  
...
> -static void
> -xfs_buf_rele_cached(
> +xfs_buf_destroy(
>  	struct xfs_buf		*bp)
>  {
...
> +	if (!xfs_buf_is_uncached(bp)) {
> +		struct xfs_buf_cache	*bch =
> +			xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
>  
> -		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
>  		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
>  				xfs_buf_hash_params);

This looks like a subtle locking change in that we're no longer under
b_lock..? AFAICT that is fine as RCU protection is more important for
the lookup side, but it also might be worth documenting that change in
the commit log as well so it's clear that it is intentional and safe.

Brian

> -		if (pag)
> -			xfs_perag_put(pag);
> -		freebuf = true;
> -	}
>  
> -out_unlock:
> -	spin_unlock(&bp->b_lock);
> +		if (bp->b_pag)
> +			xfs_perag_put(bp->b_pag);
> +	}
>  
> -	if (freebuf)
> -		xfs_buf_free(bp);
> +	xfs_buf_free(bp);
>  }
>  
>  /*
> @@ -942,10 +887,22 @@ xfs_buf_rele(
>  	struct xfs_buf		*bp)
>  {
>  	trace_xfs_buf_rele(bp, _RET_IP_);
> -	if (xfs_buf_is_uncached(bp))
> -		xfs_buf_rele_uncached(bp);
> -	else
> -		xfs_buf_rele_cached(bp);
> +
> +	spin_lock(&bp->b_lock);
> +	if (!--bp->b_hold) {
> +		if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
> +			goto kill;
> +		list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
> +	}
> +	spin_unlock(&bp->b_lock);
> +	return;
> +
> +kill:
> +	bp->b_hold = -1;
> +	list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> +	spin_unlock(&bp->b_lock);
> +
> +	xfs_buf_destroy(bp);
>  }
>  
>  /*
> @@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
>  
>  /*
>   * To simulate an I/O failure, the buffer must be locked and held with at least
> - * three references. The LRU reference is dropped by the stale call. The buf
> - * item reference is dropped via ioend processing. The third reference is owned
> - * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
> + * two references.
> + *
> + * The buf item reference is dropped via ioend processing. The second reference
> + * is owned by the caller and is dropped on I/O completion if the buffer is
> + * XBF_ASYNC.
>   */
>  void
>  xfs_buf_ioend_fail(
> @@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
>  
>  	if (!spin_trylock(&bp->b_lock))
>  		return LRU_SKIP;
> -	if (bp->b_hold > 1) {
> +	if (bp->b_hold > 0) {
>  		/* need to wait, so skip it this pass */
>  		spin_unlock(&bp->b_lock);
>  		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
>  		return LRU_SKIP;
>  	}
>  
> -	/*
> -	 * clear the LRU reference count so the buffer doesn't get
> -	 * ignored in xfs_buf_rele().
> -	 */
> -	atomic_set(&bp->b_lru_ref, 0);
> -	bp->b_state |= XFS_BSTATE_DISPOSE;
> +	bp->b_hold = -1;
>  	list_lru_isolate_move(lru, item, dispose);
>  	spin_unlock(&bp->b_lock);
>  	return LRU_REMOVED;
> @@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
>  "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
>  					(long long)xfs_buf_daddr(bp));
>  			}
> -			xfs_buf_rele(bp);
> +			xfs_buf_destroy(bp);
>  		}
>  		if (loop++ != 0)
>  			delay(100);
> @@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
>  	struct list_head	*dispose = arg;
>  
>  	/*
> -	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
> -	 * If we fail to get the lock, just skip it.
> +	 * We are inverting the lru lock vs bp->b_lock order here, so use a
> +	 * trylock. If we fail to get the lock, just skip the buffer.
>  	 */
>  	if (!spin_trylock(&bp->b_lock))
>  		return LRU_SKIP;
> +
> +	/*
> +	 * If the buffer is in use, remove it from the LRU for now as we can't
> +	 * free it.  It will be added to the LRU again when the reference count
> +	 * hits zero.
> +	 */
> +	if (bp->b_hold > 0) {
> +		list_lru_isolate(lru, &bp->b_lru);
> +		spin_unlock(&bp->b_lock);
> +		return LRU_REMOVED;
> +	}
> +
>  	/*
>  	 * Decrement the b_lru_ref count unless the value is already
>  	 * zero. If the value is already zero, we need to reclaim the
> @@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
>  		return LRU_ROTATE;
>  	}
>  
> -	bp->b_state |= XFS_BSTATE_DISPOSE;
> +	bp->b_hold = -1;
>  	list_lru_isolate_move(lru, item, dispose);
>  	spin_unlock(&bp->b_lock);
>  	return LRU_REMOVED;
> @@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
>  		struct xfs_buf *bp;
>  		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
>  		list_del_init(&bp->b_lru);
> -		xfs_buf_rele(bp);
> +		xfs_buf_destroy(bp);
>  	}
>  
>  	return freed;
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index e25cd2a160f3..1117cd9cbfb9 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
>  	{ XBF_INCORE,		"INCORE" }, \
>  	{ XBF_TRYLOCK,		"TRYLOCK" }
>  
> -/*
> - * Internal state flags.
> - */
> -#define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
> -
>  struct xfs_buf_cache {
>  	struct rhashtable	bc_hash;
>  };
> @@ -159,6 +154,7 @@ struct xfs_buf {
>  
>  	xfs_daddr_t		b_rhash_key;	/* buffer cache index */
>  	int			b_length;	/* size of buffer in BBs */
> +	spinlock_t		b_lock;		/* internal state lock */
>  	unsigned int		b_hold;		/* reference count */
>  	atomic_t		b_lru_ref;	/* lru reclaim ref count */
>  	xfs_buf_flags_t		b_flags;	/* status flags */
> @@ -169,8 +165,6 @@ struct xfs_buf {
>  	 * bt_lru_lock and not by b_sema
>  	 */
>  	struct list_head	b_lru;		/* lru list */
> -	spinlock_t		b_lock;		/* internal state lock */
> -	unsigned int		b_state;	/* internal state flags */
>  	wait_queue_head_t	b_waiters;	/* unpin waiters */
>  	struct list_head	b_list;
>  	struct xfs_perag	*b_pag;
> -- 
> 2.47.3
> 
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
  2026-01-26  5:37 buffer cache simplification v3 Christoph Hellwig
@ 2026-01-26  5:38 ` Christoph Hellwig
  0 siblings, 0 replies; 15+ messages in thread
From: Christoph Hellwig @ 2026-01-26  5:38 UTC (permalink / raw)
  To: Carlos Maiolino
  Cc: Dave Chinner, Brian Foster, linux-xfs,
	syzbot+0391d34e801643e2809b

The per-AG buffer hashes were added when all buffer lookups took a
per-hash lock.  Since then we've made lookups entirely lockless and
removed the need for a hash-wide lock for inserts and removals as
well.  With this there is no need to shard the hash, so reduce the
resources used by using a per-buftarg hash for all buftargs.

Long after writing this initially, syzbot found a problem in the buffer
cache teardown order, which this happens to fix as well by doing the
entire buffer cache teardown in one place instead of splitting it
between destroying the buftarg and the perag structures.

Link: https://lore.kernel.org/linux-xfs/aLeUdemAZ5wmtZel@dread.disaster.area/
Reported-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Tested-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_ag.c | 13 ++---------
 fs/xfs/libxfs/xfs_ag.h |  2 --
 fs/xfs/xfs_buf.c       | 51 +++++++++++-------------------------------
 fs/xfs/xfs_buf.h       | 10 +--------
 fs/xfs/xfs_buf_mem.c   | 11 ++-------
 5 files changed, 18 insertions(+), 69 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 586918ed1cbf..a41d782e8e8c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -110,10 +110,7 @@ xfs_perag_uninit(
 	struct xfs_group	*xg)
 {
 #ifdef __KERNEL__
-	struct xfs_perag	*pag = to_perag(xg);
-
-	cancel_delayed_work_sync(&pag->pag_blockgc_work);
-	xfs_buf_cache_destroy(&pag->pag_bcache);
+	cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
 #endif
 }
 
@@ -235,10 +232,6 @@ xfs_perag_alloc(
 	INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 #endif /* __KERNEL__ */
 
-	error = xfs_buf_cache_init(&pag->pag_bcache);
-	if (error)
-		goto out_free_perag;
-
 	/*
 	 * Pre-calculated geometry
 	 */
@@ -250,12 +243,10 @@ xfs_perag_alloc(
 
 	error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
 	if (error)
-		goto out_buf_cache_destroy;
+		goto out_free_perag;
 
 	return 0;
 
-out_buf_cache_destroy:
-	xfs_buf_cache_destroy(&pag->pag_bcache);
 out_free_perag:
 	kfree(pag);
 	return error;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 1f24cfa27321..f02323416973 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -85,8 +85,6 @@ struct xfs_perag {
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
-	struct xfs_buf_cache	pag_bcache;
-
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;
 #endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 348c91335163..76eb7c5a73f1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -363,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
 	.obj_cmpfn		= _xfs_buf_obj_cmp,
 };
 
-int
-xfs_buf_cache_init(
-	struct xfs_buf_cache	*bch)
-{
-	return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
-}
-
-void
-xfs_buf_cache_destroy(
-	struct xfs_buf_cache	*bch)
-{
-	rhashtable_destroy(&bch->bc_hash);
-}
-
 static int
 xfs_buf_map_verify(
 	struct xfs_buftarg	*btp,
@@ -434,7 +420,7 @@ xfs_buf_find_lock(
 
 static inline int
 xfs_buf_lookup(
-	struct xfs_buf_cache	*bch,
+	struct xfs_buftarg	*btp,
 	struct xfs_buf_map	*map,
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp)
@@ -443,7 +429,7 @@ xfs_buf_lookup(
 	int			error;
 
 	rcu_read_lock();
-	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
+	bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
 	if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
 		rcu_read_unlock();
 		return -ENOENT;
@@ -468,7 +454,6 @@ xfs_buf_lookup(
 static int
 xfs_buf_find_insert(
 	struct xfs_buftarg	*btp,
-	struct xfs_buf_cache	*bch,
 	struct xfs_perag	*pag,
 	struct xfs_buf_map	*cmap,
 	struct xfs_buf_map	*map,
@@ -488,7 +473,7 @@ xfs_buf_find_insert(
 	new_bp->b_pag = pag;
 
 	rcu_read_lock();
-	bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
+	bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
 			&new_bp->b_rhash_head, xfs_buf_hash_params);
 	if (IS_ERR(bp)) {
 		rcu_read_unlock();
@@ -530,16 +515,6 @@ xfs_buftarg_get_pag(
 	return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
 }
 
-static inline struct xfs_buf_cache *
-xfs_buftarg_buf_cache(
-	struct xfs_buftarg		*btp,
-	struct xfs_perag		*pag)
-{
-	if (pag)
-		return &pag->pag_bcache;
-	return btp->bt_cache;
-}
-
 /*
  * Assembles a buffer covering the specified range. The code is optimised for
  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -553,7 +528,6 @@ xfs_buf_get_map(
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp)
 {
-	struct xfs_buf_cache	*bch;
 	struct xfs_perag	*pag;
 	struct xfs_buf		*bp = NULL;
 	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
@@ -570,9 +544,8 @@ xfs_buf_get_map(
 		return error;
 
 	pag = xfs_buftarg_get_pag(btp, &cmap);
-	bch = xfs_buftarg_buf_cache(btp, pag);
 
-	error = xfs_buf_lookup(bch, &cmap, flags, &bp);
+	error = xfs_buf_lookup(btp, &cmap, flags, &bp);
 	if (error && error != -ENOENT)
 		goto out_put_perag;
 
@@ -584,7 +557,7 @@ xfs_buf_get_map(
 			goto out_put_perag;
 
 		/* xfs_buf_find_insert() consumes the perag reference. */
-		error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
+		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
 				flags, &bp);
 		if (error)
 			return error;
@@ -848,11 +821,8 @@ xfs_buf_destroy(
 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 
 	if (!xfs_buf_is_uncached(bp)) {
-		struct xfs_buf_cache	*bch =
-			xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
-
-		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
-				xfs_buf_hash_params);
+		rhashtable_remove_fast(&bp->b_target->bt_hash,
+				&bp->b_rhash_head, xfs_buf_hash_params);
 
 		if (bp->b_pag)
 			xfs_perag_put(bp->b_pag);
@@ -1619,6 +1589,7 @@ xfs_destroy_buftarg(
 	ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
 	percpu_counter_destroy(&btp->bt_readahead_count);
 	list_lru_destroy(&btp->bt_lru);
+	rhashtable_destroy(&btp->bt_hash);
 }
 
 void
@@ -1713,8 +1684,10 @@ xfs_init_buftarg(
 	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
 			     DEFAULT_RATELIMIT_BURST);
 
-	if (list_lru_init(&btp->bt_lru))
+	if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
 		return -ENOMEM;
+	if (list_lru_init(&btp->bt_lru))
+		goto out_destroy_hash;
 	if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
 		goto out_destroy_lru;
 
@@ -1732,6 +1705,8 @@ xfs_init_buftarg(
 	percpu_counter_destroy(&btp->bt_readahead_count);
 out_destroy_lru:
 	list_lru_destroy(&btp->bt_lru);
+out_destroy_hash:
+	rhashtable_destroy(&btp->bt_hash);
 	return -ENOMEM;
 }
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3a1d066e1c13..bf39d89f0f6d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -69,13 +69,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_INCORE,		"INCORE" }, \
 	{ XBF_TRYLOCK,		"TRYLOCK" }
 
-struct xfs_buf_cache {
-	struct rhashtable	bc_hash;
-};
-
-int xfs_buf_cache_init(struct xfs_buf_cache *bch);
-void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
-
 /*
  * The xfs_buftarg contains 2 notions of "sector size" -
  *
@@ -113,8 +106,7 @@ struct xfs_buftarg {
 	unsigned int		bt_awu_min;
 	unsigned int		bt_awu_max;
 
-	/* built-in cache, if we're not using the perag one */
-	struct xfs_buf_cache	bt_cache[];
+	struct rhashtable	bt_hash;
 };
 
 struct xfs_buf_map {
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 0106da0a9f44..86dbec5ee203 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -58,7 +58,7 @@ xmbuf_alloc(
 	struct xfs_buftarg	*btp;
 	int			error;
 
-	btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
+	btp = kzalloc(sizeof(*btp), GFP_KERNEL);
 	if (!btp)
 		return -ENOMEM;
 
@@ -81,10 +81,6 @@ xmbuf_alloc(
 	/* ensure all writes are below EOF to avoid pagecache zeroing */
 	i_size_write(inode, inode->i_sb->s_maxbytes);
 
-	error = xfs_buf_cache_init(btp->bt_cache);
-	if (error)
-		goto out_file;
-
 	/* Initialize buffer target */
 	btp->bt_mount = mp;
 	btp->bt_dev = (dev_t)-1U;
@@ -95,15 +91,13 @@ xmbuf_alloc(
 
 	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
 	if (error)
-		goto out_bcache;
+		goto out_file;
 
 	trace_xmbuf_create(btp);
 
 	*btpp = btp;
 	return 0;
 
-out_bcache:
-	xfs_buf_cache_destroy(btp->bt_cache);
 out_file:
 	fput(file);
 out_free_btp:
@@ -122,7 +116,6 @@ xmbuf_free(
 	trace_xmbuf_free(btp);
 
 	xfs_destroy_buftarg(btp);
-	xfs_buf_cache_destroy(btp->bt_cache);
 	fput(btp->bt_file);
 	kfree(btp);
 }
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2026-01-26  5:38 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-22  5:26 buffer cache simplification v2 Christoph Hellwig
2026-01-22  5:26 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
2026-01-23 11:55   ` Carlos Maiolino
2026-01-23 16:01   ` Brian Foster
2026-01-22  5:26 ` [PATCH 2/3] xfs: use a lockref for the buffer reference count Christoph Hellwig
2026-01-23 12:04   ` Carlos Maiolino
2026-01-22  5:26 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
2026-01-22 18:10   ` Darrick J. Wong
2026-01-23  5:36     ` Christoph Hellwig
2026-01-23  7:01       ` Darrick J. Wong
  -- strict thread matches above, loose matches on Subject: below --
2026-01-26  5:37 buffer cache simplification v3 Christoph Hellwig
2026-01-26  5:38 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
2026-01-19 15:31 buffer cache simplification Christoph Hellwig
2026-01-19 15:31 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
2026-01-20  2:39   ` Darrick J. Wong
2026-01-20  7:06     ` Christoph Hellwig
2026-01-20 15:50       ` Darrick J. Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox