* [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-19 15:31 buffer cache simplification Christoph Hellwig
@ 2026-01-19 15:31 ` Christoph Hellwig
2026-01-20 2:53 ` Darrick J. Wong
0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2026-01-19 15:31 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Dave Chinner, linux-xfs
Currently the buffer cache adds a reference to b_hold for buffers that
are on the LRU. This seems to go all the way back and allows releasing
buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
really complicated and differs from how other LRUs are implemented in
Linux.
Switch to not having a reference for buffers in the LRU, and use a
separate negative hold value to mark buffers as dead. This simplifies
xfs_buf_rele, which now just deals with the last "real" reference,
and prepares for using the lockref primitive.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
fs/xfs/xfs_buf.h | 8 +--
2 files changed, 54 insertions(+), 94 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index db46883991de..aacdf080e400 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,11 +80,8 @@ xfs_buf_stale(
spin_lock(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 0);
- if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
- (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
- bp->b_hold--;
-
- ASSERT(bp->b_hold >= 1);
+ if (bp->b_hold >= 0)
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lock);
}
@@ -442,7 +439,7 @@ xfs_buf_try_hold(
struct xfs_buf *bp)
{
spin_lock(&bp->b_lock);
- if (bp->b_hold == 0) {
+ if (bp->b_hold == -1) {
spin_unlock(&bp->b_lock);
return false;
}
@@ -862,76 +859,24 @@ xfs_buf_hold(
}
static void
-xfs_buf_rele_uncached(
- struct xfs_buf *bp)
-{
- ASSERT(list_empty(&bp->b_lru));
-
- spin_lock(&bp->b_lock);
- if (--bp->b_hold) {
- spin_unlock(&bp->b_lock);
- return;
- }
- spin_unlock(&bp->b_lock);
- xfs_buf_free(bp);
-}
-
-static void
-xfs_buf_rele_cached(
+xfs_buf_destroy(
struct xfs_buf *bp)
{
- struct xfs_buftarg *btp = bp->b_target;
- struct xfs_perag *pag = bp->b_pag;
- struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
- bool freebuf = false;
-
- trace_xfs_buf_rele(bp, _RET_IP_);
-
- spin_lock(&bp->b_lock);
- ASSERT(bp->b_hold >= 1);
- if (bp->b_hold > 1) {
- bp->b_hold--;
- goto out_unlock;
- }
+ ASSERT(bp->b_hold < 0);
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* we are asked to drop the last reference */
- if (atomic_read(&bp->b_lru_ref)) {
- /*
- * If the buffer is added to the LRU, keep the reference to the
- * buffer for the LRU and clear the (now stale) dispose list
- * state flag, else drop the reference.
- */
- if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
- bp->b_state &= ~XFS_BSTATE_DISPOSE;
- else
- bp->b_hold--;
- } else {
- bp->b_hold--;
- /*
- * most of the time buffers will already be removed from the
- * LRU, so optimise that case by checking for the
- * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
- * was on was the disposal list
- */
- if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
- } else {
- ASSERT(list_empty(&bp->b_lru));
- }
+ if (!xfs_buf_is_uncached(bp)) {
+ struct xfs_buf_cache *bch =
+ xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
- ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
- if (pag)
- xfs_perag_put(pag);
- freebuf = true;
- }
-out_unlock:
- spin_unlock(&bp->b_lock);
+ if (bp->b_pag)
+ xfs_perag_put(bp->b_pag);
+ }
- if (freebuf)
- xfs_buf_free(bp);
+ xfs_buf_free(bp);
}
/*
@@ -942,10 +887,22 @@ xfs_buf_rele(
struct xfs_buf *bp)
{
trace_xfs_buf_rele(bp, _RET_IP_);
- if (xfs_buf_is_uncached(bp))
- xfs_buf_rele_uncached(bp);
- else
- xfs_buf_rele_cached(bp);
+
+ spin_lock(&bp->b_lock);
+ if (!--bp->b_hold) {
+ if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
+ goto kill;
+ list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ }
+ spin_unlock(&bp->b_lock);
+ return;
+
+kill:
+ bp->b_hold = -1;
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+
+ xfs_buf_destroy(bp);
}
/*
@@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
/*
* To simulate an I/O failure, the buffer must be locked and held with at least
- * three references. The LRU reference is dropped by the stale call. The buf
- * item reference is dropped via ioend processing. The third reference is owned
- * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ * two references.
+ *
+ * The buf item reference is dropped via ioend processing. The second reference
+ * is owned by the caller and is dropped on I/O completion if the buffer is
+ * XBF_ASYNC.
*/
void
xfs_buf_ioend_fail(
@@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
if (!spin_trylock(&bp->b_lock))
return LRU_SKIP;
- if (bp->b_hold > 1) {
+ if (bp->b_hold > 0) {
/* need to wait, so skip it this pass */
spin_unlock(&bp->b_lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- /*
- * clear the LRU reference count so the buffer doesn't get
- * ignored in xfs_buf_rele().
- */
- atomic_set(&bp->b_lru_ref, 0);
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
(long long)xfs_buf_daddr(bp));
}
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
if (loop++ != 0)
delay(100);
@@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
struct list_head *dispose = arg;
/*
- * we are inverting the lru lock/bp->b_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock vs bp->b_lock order here, so use a
+ * trylock. If we fail to get the lock, just skip the buffer.
*/
if (!spin_trylock(&bp->b_lock))
return LRU_SKIP;
+
+ /*
+ * If the buffer is in use, remove it from the LRU for now as we can't
+ * free it. It will be added to the LRU again when the reference count
+ * hits zero.
+ */
+ if (bp->b_hold > 0) {
+ list_lru_isolate(lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+ }
+
/*
* Decrement the b_lru_ref count unless the value is already
* zero. If the value is already zero, we need to reclaim the
@@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
return LRU_ROTATE;
}
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
return freed;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e25cd2a160f3..1117cd9cbfb9 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }
-/*
- * Internal state flags.
- */
-#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
-
struct xfs_buf_cache {
struct rhashtable bc_hash;
};
@@ -159,6 +154,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
+ spinlock_t b_lock; /* internal state lock */
unsigned int b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
@@ -169,8 +165,6 @@ struct xfs_buf {
* bt_lru_lock and not by b_sema
*/
struct list_head b_lru; /* lru list */
- spinlock_t b_lock; /* internal state lock */
- unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag;
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-19 15:31 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
@ 2026-01-20 2:53 ` Darrick J. Wong
2026-01-20 6:55 ` Christoph Hellwig
0 siblings, 1 reply; 14+ messages in thread
From: Darrick J. Wong @ 2026-01-20 2:53 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Dave Chinner, linux-xfs
On Mon, Jan 19, 2026 at 04:31:35PM +0100, Christoph Hellwig wrote:
> Currently the buffer cache adds a reference to b_hold for buffers that
> are on the LRU. This seems to go all the way back and allows releasing
> buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
> really complicated and differs from how other LRUs are implemented in
> Linux.
I'd noticed that.
So under the current code, a cached buffer gets created with b_hold=1
and b_lru_ref=1. Calling xfs_buf_hold can bump up b_hold. Calling
xfs_buf_rele immediately will either transfer ownership of the buf to
the lru (if it wasn't on the lru ref) or decrement b_hold.
Higher level xfs code might boost b_lru_ref for buffers that it thinks
we should try to hang on to (e.g. AG headers).
xfs_buftarg_isolate will decrement b_lru_ref unless it was already zero,
in which case it'll actually free the buffer. If xfs_buf_rele finds a
buffer with b_lru_ref==0 it'll drop b_hold and try to free the buffer if
b_hold drops to zero.
Right?
> Switch to not having a reference for buffers in the LRU, and use a
> separate negative hold value to mark buffers as dead. This simplifies
> xfs_buf_rele, which now just deals with the last "real" reference,
> and prepares for using the lockref primitive.
And now, b_hold is the number of higher-level owners of the buffer. If
that drops to zero the buffer gets put on the lru list if it hasn't
already run out of lru refs, in which case it's freed directly. If a
buffer on the lru list runs out of lru refs then it'll get freed.
b_hold < 0 means "headed for destruction".
Is that also correct?
If yes to both then I've understood what's going on here well enough to
say:
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
> fs/xfs/xfs_buf.h | 8 +--
> 2 files changed, 54 insertions(+), 94 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index db46883991de..aacdf080e400 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -80,11 +80,8 @@ xfs_buf_stale(
>
> spin_lock(&bp->b_lock);
> atomic_set(&bp->b_lru_ref, 0);
> - if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
> - (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
> - bp->b_hold--;
> -
> - ASSERT(bp->b_hold >= 1);
> + if (bp->b_hold >= 0)
> + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> spin_unlock(&bp->b_lock);
> }
>
> @@ -442,7 +439,7 @@ xfs_buf_try_hold(
> struct xfs_buf *bp)
> {
> spin_lock(&bp->b_lock);
> - if (bp->b_hold == 0) {
> + if (bp->b_hold == -1) {
> spin_unlock(&bp->b_lock);
> return false;
> }
> @@ -862,76 +859,24 @@ xfs_buf_hold(
> }
>
> static void
> -xfs_buf_rele_uncached(
> - struct xfs_buf *bp)
> -{
> - ASSERT(list_empty(&bp->b_lru));
> -
> - spin_lock(&bp->b_lock);
> - if (--bp->b_hold) {
> - spin_unlock(&bp->b_lock);
> - return;
> - }
> - spin_unlock(&bp->b_lock);
> - xfs_buf_free(bp);
> -}
> -
> -static void
> -xfs_buf_rele_cached(
> +xfs_buf_destroy(
> struct xfs_buf *bp)
> {
> - struct xfs_buftarg *btp = bp->b_target;
> - struct xfs_perag *pag = bp->b_pag;
> - struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
> - bool freebuf = false;
> -
> - trace_xfs_buf_rele(bp, _RET_IP_);
> -
> - spin_lock(&bp->b_lock);
> - ASSERT(bp->b_hold >= 1);
> - if (bp->b_hold > 1) {
> - bp->b_hold--;
> - goto out_unlock;
> - }
> + ASSERT(bp->b_hold < 0);
> + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
>
> - /* we are asked to drop the last reference */
> - if (atomic_read(&bp->b_lru_ref)) {
> - /*
> - * If the buffer is added to the LRU, keep the reference to the
> - * buffer for the LRU and clear the (now stale) dispose list
> - * state flag, else drop the reference.
> - */
> - if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
> - bp->b_state &= ~XFS_BSTATE_DISPOSE;
> - else
> - bp->b_hold--;
> - } else {
> - bp->b_hold--;
> - /*
> - * most of the time buffers will already be removed from the
> - * LRU, so optimise that case by checking for the
> - * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
> - * was on was the disposal list
> - */
> - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
> - list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
> - } else {
> - ASSERT(list_empty(&bp->b_lru));
> - }
> + if (!xfs_buf_is_uncached(bp)) {
> + struct xfs_buf_cache *bch =
> + xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
>
> - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
> rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
> xfs_buf_hash_params);
> - if (pag)
> - xfs_perag_put(pag);
> - freebuf = true;
> - }
>
> -out_unlock:
> - spin_unlock(&bp->b_lock);
> + if (bp->b_pag)
> + xfs_perag_put(bp->b_pag);
> + }
>
> - if (freebuf)
> - xfs_buf_free(bp);
> + xfs_buf_free(bp);
> }
>
> /*
> @@ -942,10 +887,22 @@ xfs_buf_rele(
> struct xfs_buf *bp)
> {
> trace_xfs_buf_rele(bp, _RET_IP_);
> - if (xfs_buf_is_uncached(bp))
> - xfs_buf_rele_uncached(bp);
> - else
> - xfs_buf_rele_cached(bp);
> +
> + spin_lock(&bp->b_lock);
> + if (!--bp->b_hold) {
> + if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
> + goto kill;
> + list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
> + }
> + spin_unlock(&bp->b_lock);
> + return;
> +
> +kill:
> + bp->b_hold = -1;
> + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> + spin_unlock(&bp->b_lock);
> +
> + xfs_buf_destroy(bp);
> }
>
> /*
> @@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
>
> /*
> * To simulate an I/O failure, the buffer must be locked and held with at least
> - * three references. The LRU reference is dropped by the stale call. The buf
> - * item reference is dropped via ioend processing. The third reference is owned
> - * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
> + * two references.
> + *
> + * The buf item reference is dropped via ioend processing. The second reference
> + * is owned by the caller and is dropped on I/O completion if the buffer is
> + * XBF_ASYNC.
> */
> void
> xfs_buf_ioend_fail(
> @@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
>
> if (!spin_trylock(&bp->b_lock))
> return LRU_SKIP;
> - if (bp->b_hold > 1) {
> + if (bp->b_hold > 0) {
> /* need to wait, so skip it this pass */
> spin_unlock(&bp->b_lock);
> trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
> return LRU_SKIP;
> }
>
> - /*
> - * clear the LRU reference count so the buffer doesn't get
> - * ignored in xfs_buf_rele().
> - */
> - atomic_set(&bp->b_lru_ref, 0);
> - bp->b_state |= XFS_BSTATE_DISPOSE;
> + bp->b_hold = -1;
> list_lru_isolate_move(lru, item, dispose);
> spin_unlock(&bp->b_lock);
> return LRU_REMOVED;
> @@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
> "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
> (long long)xfs_buf_daddr(bp));
> }
> - xfs_buf_rele(bp);
> + xfs_buf_destroy(bp);
> }
> if (loop++ != 0)
> delay(100);
> @@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
> struct list_head *dispose = arg;
>
> /*
> - * we are inverting the lru lock/bp->b_lock here, so use a trylock.
> - * If we fail to get the lock, just skip it.
> + * We are inverting the lru lock vs bp->b_lock order here, so use a
> + * trylock. If we fail to get the lock, just skip the buffer.
> */
> if (!spin_trylock(&bp->b_lock))
> return LRU_SKIP;
> +
> + /*
> + * If the buffer is in use, remove it from the LRU for now as we can't
> + * free it. It will be added to the LRU again when the reference count
> + * hits zero.
> + */
> + if (bp->b_hold > 0) {
> + list_lru_isolate(lru, &bp->b_lru);
> + spin_unlock(&bp->b_lock);
> + return LRU_REMOVED;
> + }
> +
> /*
> * Decrement the b_lru_ref count unless the value is already
> * zero. If the value is already zero, we need to reclaim the
> @@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
> return LRU_ROTATE;
> }
>
> - bp->b_state |= XFS_BSTATE_DISPOSE;
> + bp->b_hold = -1;
> list_lru_isolate_move(lru, item, dispose);
> spin_unlock(&bp->b_lock);
> return LRU_REMOVED;
> @@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
> struct xfs_buf *bp;
> bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
> list_del_init(&bp->b_lru);
> - xfs_buf_rele(bp);
> + xfs_buf_destroy(bp);
> }
>
> return freed;
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index e25cd2a160f3..1117cd9cbfb9 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
> { XBF_INCORE, "INCORE" }, \
> { XBF_TRYLOCK, "TRYLOCK" }
>
> -/*
> - * Internal state flags.
> - */
> -#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
> -
> struct xfs_buf_cache {
> struct rhashtable bc_hash;
> };
> @@ -159,6 +154,7 @@ struct xfs_buf {
>
> xfs_daddr_t b_rhash_key; /* buffer cache index */
> int b_length; /* size of buffer in BBs */
> + spinlock_t b_lock; /* internal state lock */
> unsigned int b_hold; /* reference count */
> atomic_t b_lru_ref; /* lru reclaim ref count */
> xfs_buf_flags_t b_flags; /* status flags */
> @@ -169,8 +165,6 @@ struct xfs_buf {
> * bt_lru_lock and not by b_sema
> */
> struct list_head b_lru; /* lru list */
> - spinlock_t b_lock; /* internal state lock */
> - unsigned int b_state; /* internal state flags */
> wait_queue_head_t b_waiters; /* unpin waiters */
> struct list_head b_list;
> struct xfs_perag *b_pag;
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-20 2:53 ` Darrick J. Wong
@ 2026-01-20 6:55 ` Christoph Hellwig
0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2026-01-20 6:55 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs
On Mon, Jan 19, 2026 at 06:53:15PM -0800, Darrick J. Wong wrote:
> So under the current code, a cached buffer gets created with b_hold=1
> and b_lru_ref=1. Calling xfs_buf_hold can bump up b_hold. Calling
> xfs_buf_rele immediately will either transfer ownership of the buf to
> the lru (if it wasn't on the lru ref) or decrement b_hold.
>
> Higher level xfs code might boost b_lru_ref for buffers that it thinks
> we should try to hang on to (e.g. AG headers).
We actually do this b_lru_ref boost for a lot, if not most of the
metadata buffers. It is used to manage relative eviction priority.
> xfs_buftarg_isolate will decrement b_lru_ref unless it was already zero,
> in which case it'll actually free the buffer. If xfs_buf_rele finds a
> buffer with b_lru_ref==0 it'll drop b_hold and try to free the buffer if
> b_hold drops to zero.
>
> Right?
Yes.
> > Switch to not having a reference for buffers in the LRU, and use a
> > separate negative hold value to mark buffers as dead. This simplifies
> > xfs_buf_rele, which now just deals with the last "real" reference,
> > and prepares for using the lockref primitive.
>
> And now, b_hold is the number of higher-level owners of the buffer. If
> that drops to zero the buffer gets put on the lru list if it hasn't
> already run out of lru refs, in which case it's freed directly. If a
> buffer on the lru list runs out of lru refs then it'll get freed.
> b_hold < 0 means "headed for destruction".
>
> Is that also correct?
Yes.
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-22 5:26 buffer cache simplification v2 Christoph Hellwig
@ 2026-01-22 5:26 ` Christoph Hellwig
2026-01-23 11:55 ` Carlos Maiolino
2026-01-23 16:01 ` Brian Foster
0 siblings, 2 replies; 14+ messages in thread
From: Christoph Hellwig @ 2026-01-22 5:26 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Dave Chinner, linux-xfs, Darrick J. Wong
Currently the buffer cache adds a reference to b_hold for buffers that
are on the LRU. This seems to go all the way back and allows releasing
buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
really complicated and differs from how other LRUs are implemented in
Linux.
Switch to not having a reference for buffers in the LRU, and use a
separate negative hold value to mark buffers as dead. This simplifies
xfs_buf_rele, which now just deals with the last "real" reference,
and prepares for using the lockref primitive.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
fs/xfs/xfs_buf.h | 8 +--
2 files changed, 54 insertions(+), 94 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index db46883991de..aacdf080e400 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,11 +80,8 @@ xfs_buf_stale(
spin_lock(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 0);
- if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
- (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
- bp->b_hold--;
-
- ASSERT(bp->b_hold >= 1);
+ if (bp->b_hold >= 0)
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lock);
}
@@ -442,7 +439,7 @@ xfs_buf_try_hold(
struct xfs_buf *bp)
{
spin_lock(&bp->b_lock);
- if (bp->b_hold == 0) {
+ if (bp->b_hold == -1) {
spin_unlock(&bp->b_lock);
return false;
}
@@ -862,76 +859,24 @@ xfs_buf_hold(
}
static void
-xfs_buf_rele_uncached(
- struct xfs_buf *bp)
-{
- ASSERT(list_empty(&bp->b_lru));
-
- spin_lock(&bp->b_lock);
- if (--bp->b_hold) {
- spin_unlock(&bp->b_lock);
- return;
- }
- spin_unlock(&bp->b_lock);
- xfs_buf_free(bp);
-}
-
-static void
-xfs_buf_rele_cached(
+xfs_buf_destroy(
struct xfs_buf *bp)
{
- struct xfs_buftarg *btp = bp->b_target;
- struct xfs_perag *pag = bp->b_pag;
- struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
- bool freebuf = false;
-
- trace_xfs_buf_rele(bp, _RET_IP_);
-
- spin_lock(&bp->b_lock);
- ASSERT(bp->b_hold >= 1);
- if (bp->b_hold > 1) {
- bp->b_hold--;
- goto out_unlock;
- }
+ ASSERT(bp->b_hold < 0);
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* we are asked to drop the last reference */
- if (atomic_read(&bp->b_lru_ref)) {
- /*
- * If the buffer is added to the LRU, keep the reference to the
- * buffer for the LRU and clear the (now stale) dispose list
- * state flag, else drop the reference.
- */
- if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
- bp->b_state &= ~XFS_BSTATE_DISPOSE;
- else
- bp->b_hold--;
- } else {
- bp->b_hold--;
- /*
- * most of the time buffers will already be removed from the
- * LRU, so optimise that case by checking for the
- * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
- * was on was the disposal list
- */
- if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
- } else {
- ASSERT(list_empty(&bp->b_lru));
- }
+ if (!xfs_buf_is_uncached(bp)) {
+ struct xfs_buf_cache *bch =
+ xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
- ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
- if (pag)
- xfs_perag_put(pag);
- freebuf = true;
- }
-out_unlock:
- spin_unlock(&bp->b_lock);
+ if (bp->b_pag)
+ xfs_perag_put(bp->b_pag);
+ }
- if (freebuf)
- xfs_buf_free(bp);
+ xfs_buf_free(bp);
}
/*
@@ -942,10 +887,22 @@ xfs_buf_rele(
struct xfs_buf *bp)
{
trace_xfs_buf_rele(bp, _RET_IP_);
- if (xfs_buf_is_uncached(bp))
- xfs_buf_rele_uncached(bp);
- else
- xfs_buf_rele_cached(bp);
+
+ spin_lock(&bp->b_lock);
+ if (!--bp->b_hold) {
+ if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
+ goto kill;
+ list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ }
+ spin_unlock(&bp->b_lock);
+ return;
+
+kill:
+ bp->b_hold = -1;
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+
+ xfs_buf_destroy(bp);
}
/*
@@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
/*
* To simulate an I/O failure, the buffer must be locked and held with at least
- * three references. The LRU reference is dropped by the stale call. The buf
- * item reference is dropped via ioend processing. The third reference is owned
- * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ * two references.
+ *
+ * The buf item reference is dropped via ioend processing. The second reference
+ * is owned by the caller and is dropped on I/O completion if the buffer is
+ * XBF_ASYNC.
*/
void
xfs_buf_ioend_fail(
@@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
if (!spin_trylock(&bp->b_lock))
return LRU_SKIP;
- if (bp->b_hold > 1) {
+ if (bp->b_hold > 0) {
/* need to wait, so skip it this pass */
spin_unlock(&bp->b_lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- /*
- * clear the LRU reference count so the buffer doesn't get
- * ignored in xfs_buf_rele().
- */
- atomic_set(&bp->b_lru_ref, 0);
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
(long long)xfs_buf_daddr(bp));
}
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
if (loop++ != 0)
delay(100);
@@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
struct list_head *dispose = arg;
/*
- * we are inverting the lru lock/bp->b_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock vs bp->b_lock order here, so use a
+ * trylock. If we fail to get the lock, just skip the buffer.
*/
if (!spin_trylock(&bp->b_lock))
return LRU_SKIP;
+
+ /*
+ * If the buffer is in use, remove it from the LRU for now as we can't
+ * free it. It will be added to the LRU again when the reference count
+ * hits zero.
+ */
+ if (bp->b_hold > 0) {
+ list_lru_isolate(lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+ }
+
/*
* Decrement the b_lru_ref count unless the value is already
* zero. If the value is already zero, we need to reclaim the
@@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
return LRU_ROTATE;
}
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
return freed;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e25cd2a160f3..1117cd9cbfb9 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }
-/*
- * Internal state flags.
- */
-#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
-
struct xfs_buf_cache {
struct rhashtable bc_hash;
};
@@ -159,6 +154,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
+ spinlock_t b_lock; /* internal state lock */
unsigned int b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
@@ -169,8 +165,6 @@ struct xfs_buf {
* bt_lru_lock and not by b_sema
*/
struct list_head b_lru; /* lru list */
- spinlock_t b_lock; /* internal state lock */
- unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag;
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-22 5:26 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
@ 2026-01-23 11:55 ` Carlos Maiolino
2026-01-23 16:01 ` Brian Foster
1 sibling, 0 replies; 14+ messages in thread
From: Carlos Maiolino @ 2026-01-23 11:55 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Dave Chinner, linux-xfs, Darrick J. Wong
On Thu, Jan 22, 2026 at 06:26:55AM +0100, Christoph Hellwig wrote:
> Currently the buffer cache adds a reference to b_hold for buffers that
> are on the LRU. This seems to go all the way back and allows releasing
> buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
> really complicated and differs from how other LRUs are implemented in
> Linux.
>
> Switch to not having a reference for buffers in the LRU, and use a
> separate negative hold value to mark buffers as dead. This simplifies
> xfs_buf_rele, which now just deals with the last "real" reference,
> and prepares for using the lockref primitive.
>
Looks fine.
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
> fs/xfs/xfs_buf.h | 8 +--
> 2 files changed, 54 insertions(+), 94 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index db46883991de..aacdf080e400 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -80,11 +80,8 @@ xfs_buf_stale(
>
> spin_lock(&bp->b_lock);
> atomic_set(&bp->b_lru_ref, 0);
> - if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
> - (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
> - bp->b_hold--;
> -
> - ASSERT(bp->b_hold >= 1);
> + if (bp->b_hold >= 0)
> + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> spin_unlock(&bp->b_lock);
> }
>
> @@ -442,7 +439,7 @@ xfs_buf_try_hold(
> struct xfs_buf *bp)
> {
> spin_lock(&bp->b_lock);
> - if (bp->b_hold == 0) {
> + if (bp->b_hold == -1) {
> spin_unlock(&bp->b_lock);
> return false;
> }
> @@ -862,76 +859,24 @@ xfs_buf_hold(
> }
>
> static void
> -xfs_buf_rele_uncached(
> - struct xfs_buf *bp)
> -{
> - ASSERT(list_empty(&bp->b_lru));
> -
> - spin_lock(&bp->b_lock);
> - if (--bp->b_hold) {
> - spin_unlock(&bp->b_lock);
> - return;
> - }
> - spin_unlock(&bp->b_lock);
> - xfs_buf_free(bp);
> -}
> -
> -static void
> -xfs_buf_rele_cached(
> +xfs_buf_destroy(
> struct xfs_buf *bp)
> {
> - struct xfs_buftarg *btp = bp->b_target;
> - struct xfs_perag *pag = bp->b_pag;
> - struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
> - bool freebuf = false;
> -
> - trace_xfs_buf_rele(bp, _RET_IP_);
> -
> - spin_lock(&bp->b_lock);
> - ASSERT(bp->b_hold >= 1);
> - if (bp->b_hold > 1) {
> - bp->b_hold--;
> - goto out_unlock;
> - }
> + ASSERT(bp->b_hold < 0);
> + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
>
> - /* we are asked to drop the last reference */
> - if (atomic_read(&bp->b_lru_ref)) {
> - /*
> - * If the buffer is added to the LRU, keep the reference to the
> - * buffer for the LRU and clear the (now stale) dispose list
> - * state flag, else drop the reference.
> - */
> - if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
> - bp->b_state &= ~XFS_BSTATE_DISPOSE;
> - else
> - bp->b_hold--;
> - } else {
> - bp->b_hold--;
> - /*
> - * most of the time buffers will already be removed from the
> - * LRU, so optimise that case by checking for the
> - * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
> - * was on was the disposal list
> - */
> - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
> - list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
> - } else {
> - ASSERT(list_empty(&bp->b_lru));
> - }
> + if (!xfs_buf_is_uncached(bp)) {
> + struct xfs_buf_cache *bch =
> + xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
>
> - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
> rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
> xfs_buf_hash_params);
> - if (pag)
> - xfs_perag_put(pag);
> - freebuf = true;
> - }
>
> -out_unlock:
> - spin_unlock(&bp->b_lock);
> + if (bp->b_pag)
> + xfs_perag_put(bp->b_pag);
> + }
>
> - if (freebuf)
> - xfs_buf_free(bp);
> + xfs_buf_free(bp);
> }
>
> /*
> @@ -942,10 +887,22 @@ xfs_buf_rele(
> struct xfs_buf *bp)
> {
> trace_xfs_buf_rele(bp, _RET_IP_);
> - if (xfs_buf_is_uncached(bp))
> - xfs_buf_rele_uncached(bp);
> - else
> - xfs_buf_rele_cached(bp);
> +
> + spin_lock(&bp->b_lock);
> + if (!--bp->b_hold) {
> + if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
> + goto kill;
> + list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
> + }
> + spin_unlock(&bp->b_lock);
> + return;
> +
> +kill:
> + bp->b_hold = -1;
> + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> + spin_unlock(&bp->b_lock);
> +
> + xfs_buf_destroy(bp);
> }
>
> /*
> @@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
>
> /*
> * To simulate an I/O failure, the buffer must be locked and held with at least
> - * three references. The LRU reference is dropped by the stale call. The buf
> - * item reference is dropped via ioend processing. The third reference is owned
> - * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
> + * two references.
> + *
> + * The buf item reference is dropped via ioend processing. The second reference
> + * is owned by the caller and is dropped on I/O completion if the buffer is
> + * XBF_ASYNC.
> */
> void
> xfs_buf_ioend_fail(
> @@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
>
> if (!spin_trylock(&bp->b_lock))
> return LRU_SKIP;
> - if (bp->b_hold > 1) {
> + if (bp->b_hold > 0) {
> /* need to wait, so skip it this pass */
> spin_unlock(&bp->b_lock);
> trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
> return LRU_SKIP;
> }
>
> - /*
> - * clear the LRU reference count so the buffer doesn't get
> - * ignored in xfs_buf_rele().
> - */
> - atomic_set(&bp->b_lru_ref, 0);
> - bp->b_state |= XFS_BSTATE_DISPOSE;
> + bp->b_hold = -1;
> list_lru_isolate_move(lru, item, dispose);
> spin_unlock(&bp->b_lock);
> return LRU_REMOVED;
> @@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
> "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
> (long long)xfs_buf_daddr(bp));
> }
> - xfs_buf_rele(bp);
> + xfs_buf_destroy(bp);
> }
> if (loop++ != 0)
> delay(100);
> @@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
> struct list_head *dispose = arg;
>
> /*
> - * we are inverting the lru lock/bp->b_lock here, so use a trylock.
> - * If we fail to get the lock, just skip it.
> + * We are inverting the lru lock vs bp->b_lock order here, so use a
> + * trylock. If we fail to get the lock, just skip the buffer.
> */
> if (!spin_trylock(&bp->b_lock))
> return LRU_SKIP;
> +
> + /*
> + * If the buffer is in use, remove it from the LRU for now as we can't
> + * free it. It will be added to the LRU again when the reference count
> + * hits zero.
> + */
> + if (bp->b_hold > 0) {
> + list_lru_isolate(lru, &bp->b_lru);
> + spin_unlock(&bp->b_lock);
> + return LRU_REMOVED;
> + }
> +
> /*
> * Decrement the b_lru_ref count unless the value is already
> * zero. If the value is already zero, we need to reclaim the
> @@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
> return LRU_ROTATE;
> }
>
> - bp->b_state |= XFS_BSTATE_DISPOSE;
> + bp->b_hold = -1;
> list_lru_isolate_move(lru, item, dispose);
> spin_unlock(&bp->b_lock);
> return LRU_REMOVED;
> @@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
> struct xfs_buf *bp;
> bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
> list_del_init(&bp->b_lru);
> - xfs_buf_rele(bp);
> + xfs_buf_destroy(bp);
> }
>
> return freed;
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index e25cd2a160f3..1117cd9cbfb9 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
> { XBF_INCORE, "INCORE" }, \
> { XBF_TRYLOCK, "TRYLOCK" }
>
> -/*
> - * Internal state flags.
> - */
> -#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
> -
> struct xfs_buf_cache {
> struct rhashtable bc_hash;
> };
> @@ -159,6 +154,7 @@ struct xfs_buf {
>
> xfs_daddr_t b_rhash_key; /* buffer cache index */
> int b_length; /* size of buffer in BBs */
> + spinlock_t b_lock; /* internal state lock */
> unsigned int b_hold; /* reference count */
> atomic_t b_lru_ref; /* lru reclaim ref count */
> xfs_buf_flags_t b_flags; /* status flags */
> @@ -169,8 +165,6 @@ struct xfs_buf {
> * bt_lru_lock and not by b_sema
> */
> struct list_head b_lru; /* lru list */
> - spinlock_t b_lock; /* internal state lock */
> - unsigned int b_state; /* internal state flags */
> wait_queue_head_t b_waiters; /* unpin waiters */
> struct list_head b_list;
> struct xfs_perag *b_pag;
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-22 5:26 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
2026-01-23 11:55 ` Carlos Maiolino
@ 2026-01-23 16:01 ` Brian Foster
1 sibling, 0 replies; 14+ messages in thread
From: Brian Foster @ 2026-01-23 16:01 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, linux-xfs, Darrick J. Wong
On Thu, Jan 22, 2026 at 06:26:55AM +0100, Christoph Hellwig wrote:
> Currently the buffer cache adds a reference to b_hold for buffers that
> are on the LRU. This seems to go all the way back and allows releasing
> buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
> really complicated in differs from how other LRUs are implemented in
> Linux.
>
> Switch to not having a reference for buffers in the LRU, and use a
> separate negative hold value to mark buffers as dead. This simplifies
> xfs_buf_rele, which now just deal with the last "real" reference,
> and prepares for using the lockref primitive.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
> fs/xfs/xfs_buf.h | 8 +--
> 2 files changed, 54 insertions(+), 94 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index db46883991de..aacdf080e400 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -80,11 +80,8 @@ xfs_buf_stale(
>
> spin_lock(&bp->b_lock);
> atomic_set(&bp->b_lru_ref, 0);
> - if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
> - (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
> - bp->b_hold--;
> -
> - ASSERT(bp->b_hold >= 1);
> + if (bp->b_hold >= 0)
> + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
I see that b_hold goes away in subsequent patches, but nonetheless it's
unsigned as of this patch, which kind of makes this look like a
potential bisect bomb. I wouldn't want to turn this into a big effort
just to fix it up mid-series, but maybe this should just change b_hold
to a signed type and call out the transient wart in the commit log?
(A quick/spot test might not hurt either if that hadn't been done.)
> spin_unlock(&bp->b_lock);
> }
>
...
> @@ -862,76 +859,24 @@ xfs_buf_hold(
> }
>
...
> -static void
> -xfs_buf_rele_cached(
> +xfs_buf_destroy(
> struct xfs_buf *bp)
> {
...
> + if (!xfs_buf_is_uncached(bp)) {
> + struct xfs_buf_cache *bch =
> + xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
>
> - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
> rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
> xfs_buf_hash_params);
This looks like a subtle locking change in that we're no longer under
b_lock..? AFAICT that is fine as RCU protection is more important for
the lookup side, but it also might be worth documenting that change in
the commit log as well so it's clear that it is intentional and safe.
Brian
> - if (pag)
> - xfs_perag_put(pag);
> - freebuf = true;
> - }
>
> -out_unlock:
> - spin_unlock(&bp->b_lock);
> + if (bp->b_pag)
> + xfs_perag_put(bp->b_pag);
> + }
>
> - if (freebuf)
> - xfs_buf_free(bp);
> + xfs_buf_free(bp);
> }
>
> /*
> @@ -942,10 +887,22 @@ xfs_buf_rele(
> struct xfs_buf *bp)
> {
> trace_xfs_buf_rele(bp, _RET_IP_);
> - if (xfs_buf_is_uncached(bp))
> - xfs_buf_rele_uncached(bp);
> - else
> - xfs_buf_rele_cached(bp);
> +
> + spin_lock(&bp->b_lock);
> + if (!--bp->b_hold) {
> + if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
> + goto kill;
> + list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
> + }
> + spin_unlock(&bp->b_lock);
> + return;
> +
> +kill:
> + bp->b_hold = -1;
> + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> + spin_unlock(&bp->b_lock);
> +
> + xfs_buf_destroy(bp);
> }
>
> /*
> @@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
>
> /*
> * To simulate an I/O failure, the buffer must be locked and held with at least
> - * three references. The LRU reference is dropped by the stale call. The buf
> - * item reference is dropped via ioend processing. The third reference is owned
> - * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
> + * two references.
> + *
> + * The buf item reference is dropped via ioend processing. The second reference
> + * is owned by the caller and is dropped on I/O completion if the buffer is
> + * XBF_ASYNC.
> */
> void
> xfs_buf_ioend_fail(
> @@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
>
> if (!spin_trylock(&bp->b_lock))
> return LRU_SKIP;
> - if (bp->b_hold > 1) {
> + if (bp->b_hold > 0) {
> /* need to wait, so skip it this pass */
> spin_unlock(&bp->b_lock);
> trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
> return LRU_SKIP;
> }
>
> - /*
> - * clear the LRU reference count so the buffer doesn't get
> - * ignored in xfs_buf_rele().
> - */
> - atomic_set(&bp->b_lru_ref, 0);
> - bp->b_state |= XFS_BSTATE_DISPOSE;
> + bp->b_hold = -1;
> list_lru_isolate_move(lru, item, dispose);
> spin_unlock(&bp->b_lock);
> return LRU_REMOVED;
> @@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
> "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
> (long long)xfs_buf_daddr(bp));
> }
> - xfs_buf_rele(bp);
> + xfs_buf_destroy(bp);
> }
> if (loop++ != 0)
> delay(100);
> @@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
> struct list_head *dispose = arg;
>
> /*
> - * we are inverting the lru lock/bp->b_lock here, so use a trylock.
> - * If we fail to get the lock, just skip it.
> + * We are inverting the lru lock vs bp->b_lock order here, so use a
> + * trylock. If we fail to get the lock, just skip the buffer.
> */
> if (!spin_trylock(&bp->b_lock))
> return LRU_SKIP;
> +
> + /*
> + * If the buffer is in use, remove it from the LRU for now as we can't
> + * free it. It will be added to the LRU again when the reference count
> + * hits zero.
> + */
> + if (bp->b_hold > 0) {
> + list_lru_isolate(lru, &bp->b_lru);
> + spin_unlock(&bp->b_lock);
> + return LRU_REMOVED;
> + }
> +
> /*
> * Decrement the b_lru_ref count unless the value is already
> * zero. If the value is already zero, we need to reclaim the
> @@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
> return LRU_ROTATE;
> }
>
> - bp->b_state |= XFS_BSTATE_DISPOSE;
> + bp->b_hold = -1;
> list_lru_isolate_move(lru, item, dispose);
> spin_unlock(&bp->b_lock);
> return LRU_REMOVED;
> @@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
> struct xfs_buf *bp;
> bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
> list_del_init(&bp->b_lru);
> - xfs_buf_rele(bp);
> + xfs_buf_destroy(bp);
> }
>
> return freed;
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index e25cd2a160f3..1117cd9cbfb9 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
> { XBF_INCORE, "INCORE" }, \
> { XBF_TRYLOCK, "TRYLOCK" }
>
> -/*
> - * Internal state flags.
> - */
> -#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
> -
> struct xfs_buf_cache {
> struct rhashtable bc_hash;
> };
> @@ -159,6 +154,7 @@ struct xfs_buf {
>
> xfs_daddr_t b_rhash_key; /* buffer cache index */
> int b_length; /* size of buffer in BBs */
> + spinlock_t b_lock; /* internal state lock */
> unsigned int b_hold; /* reference count */
> atomic_t b_lru_ref; /* lru reclaim ref count */
> xfs_buf_flags_t b_flags; /* status flags */
> @@ -169,8 +165,6 @@ struct xfs_buf {
> * bt_lru_lock and not by b_sema
> */
> struct list_head b_lru; /* lru list */
> - spinlock_t b_lock; /* internal state lock */
> - unsigned int b_state; /* internal state flags */
> wait_queue_head_t b_waiters; /* unpin waiters */
> struct list_head b_list;
> struct xfs_perag *b_pag;
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* buffer cache simplification v3
@ 2026-01-26 5:37 Christoph Hellwig
2026-01-26 5:38 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
` (2 more replies)
0 siblings, 3 replies; 14+ messages in thread
From: Christoph Hellwig @ 2026-01-26 5:37 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Dave Chinner, Brian Foster, linux-xfs
Hi all,
this series has a few old patches that simplify the LRU handling, and
moves back to only having a per-buftarg hash now that the buffer hash
is using the scalable rhashtable. While some of this looks like
performance work, performance and scalability are unchanged even on the
80-core dual-socket system I test this on. Besides cleaning up the
code nicely, it also happens to fix a syzkaller-reported use-after-free
during buffer shutdown, which happened incidentally because of how the
tear down of the buftarg vs the perag structures is handled.
Changes since v2:
- mark b_hold as signed in patch 1 before removing it in patch 2
- document the changed locking in xfs_buf_rele_cached.
Changes since v1:
- add more details and a link to a commit message
Diffstat:
libxfs/xfs_ag.c | 13 ---
libxfs/xfs_ag.h | 2
xfs_buf.c | 234 ++++++++++++++++++--------------------------------------
xfs_buf.h | 20 ----
xfs_buf_mem.c | 11 --
xfs_trace.h | 10 +-
6 files changed, 91 insertions(+), 199 deletions(-)
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-26 5:37 buffer cache simplification v3 Christoph Hellwig
@ 2026-01-26 5:38 ` Christoph Hellwig
2026-01-26 19:18 ` Brian Foster
2026-01-26 5:38 ` [PATCH 2/3] xfs: use a lockref for the buffer reference count Christoph Hellwig
2026-01-26 5:38 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
2 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2026-01-26 5:38 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Dave Chinner, Brian Foster, linux-xfs, Darrick J. Wong
Currently the buffer cache adds a reference to b_hold for buffers that
are on the LRU. This seems to go all the way back and allows releasing
buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
really complicated and differs from how other LRUs are implemented in
Linux.
Switch to not having a reference for buffers in the LRU, and use a
separate negative hold value to mark buffers as dead. This simplifies
xfs_buf_rele, which now just deals with the last "real" reference,
and prepares for using the lockref primitive.
This also removes the b_lock protection for removing buffers from the
buffer hash. This is the desired outcome because the rhashtable is
fully internally synchronized, and previously the lock was mostly
held out of ordering constraints in xfs_buf_rele_cached.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
fs/xfs/xfs_buf.h | 8 +--
2 files changed, 54 insertions(+), 94 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index db46883991de..aacdf080e400 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,11 +80,8 @@ xfs_buf_stale(
spin_lock(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 0);
- if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
- (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
- bp->b_hold--;
-
- ASSERT(bp->b_hold >= 1);
+ if (bp->b_hold >= 0)
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lock);
}
@@ -442,7 +439,7 @@ xfs_buf_try_hold(
struct xfs_buf *bp)
{
spin_lock(&bp->b_lock);
- if (bp->b_hold == 0) {
+ if (bp->b_hold == -1) {
spin_unlock(&bp->b_lock);
return false;
}
@@ -862,76 +859,24 @@ xfs_buf_hold(
}
static void
-xfs_buf_rele_uncached(
- struct xfs_buf *bp)
-{
- ASSERT(list_empty(&bp->b_lru));
-
- spin_lock(&bp->b_lock);
- if (--bp->b_hold) {
- spin_unlock(&bp->b_lock);
- return;
- }
- spin_unlock(&bp->b_lock);
- xfs_buf_free(bp);
-}
-
-static void
-xfs_buf_rele_cached(
+xfs_buf_destroy(
struct xfs_buf *bp)
{
- struct xfs_buftarg *btp = bp->b_target;
- struct xfs_perag *pag = bp->b_pag;
- struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
- bool freebuf = false;
-
- trace_xfs_buf_rele(bp, _RET_IP_);
-
- spin_lock(&bp->b_lock);
- ASSERT(bp->b_hold >= 1);
- if (bp->b_hold > 1) {
- bp->b_hold--;
- goto out_unlock;
- }
+ ASSERT(bp->b_hold < 0);
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* we are asked to drop the last reference */
- if (atomic_read(&bp->b_lru_ref)) {
- /*
- * If the buffer is added to the LRU, keep the reference to the
- * buffer for the LRU and clear the (now stale) dispose list
- * state flag, else drop the reference.
- */
- if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
- bp->b_state &= ~XFS_BSTATE_DISPOSE;
- else
- bp->b_hold--;
- } else {
- bp->b_hold--;
- /*
- * most of the time buffers will already be removed from the
- * LRU, so optimise that case by checking for the
- * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
- * was on was the disposal list
- */
- if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
- } else {
- ASSERT(list_empty(&bp->b_lru));
- }
+ if (!xfs_buf_is_uncached(bp)) {
+ struct xfs_buf_cache *bch =
+ xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
- ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
- if (pag)
- xfs_perag_put(pag);
- freebuf = true;
- }
-out_unlock:
- spin_unlock(&bp->b_lock);
+ if (bp->b_pag)
+ xfs_perag_put(bp->b_pag);
+ }
- if (freebuf)
- xfs_buf_free(bp);
+ xfs_buf_free(bp);
}
/*
@@ -942,10 +887,22 @@ xfs_buf_rele(
struct xfs_buf *bp)
{
trace_xfs_buf_rele(bp, _RET_IP_);
- if (xfs_buf_is_uncached(bp))
- xfs_buf_rele_uncached(bp);
- else
- xfs_buf_rele_cached(bp);
+
+ spin_lock(&bp->b_lock);
+ if (!--bp->b_hold) {
+ if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
+ goto kill;
+ list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ }
+ spin_unlock(&bp->b_lock);
+ return;
+
+kill:
+ bp->b_hold = -1;
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+
+ xfs_buf_destroy(bp);
}
/*
@@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
/*
* To simulate an I/O failure, the buffer must be locked and held with at least
- * three references. The LRU reference is dropped by the stale call. The buf
- * item reference is dropped via ioend processing. The third reference is owned
- * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ * two references.
+ *
+ * The buf item reference is dropped via ioend processing. The second reference
+ * is owned by the caller and is dropped on I/O completion if the buffer is
+ * XBF_ASYNC.
*/
void
xfs_buf_ioend_fail(
@@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
if (!spin_trylock(&bp->b_lock))
return LRU_SKIP;
- if (bp->b_hold > 1) {
+ if (bp->b_hold > 0) {
/* need to wait, so skip it this pass */
spin_unlock(&bp->b_lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- /*
- * clear the LRU reference count so the buffer doesn't get
- * ignored in xfs_buf_rele().
- */
- atomic_set(&bp->b_lru_ref, 0);
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
(long long)xfs_buf_daddr(bp));
}
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
if (loop++ != 0)
delay(100);
@@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
struct list_head *dispose = arg;
/*
- * we are inverting the lru lock/bp->b_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock vs bp->b_lock order here, so use a
+ * trylock. If we fail to get the lock, just skip the buffer.
*/
if (!spin_trylock(&bp->b_lock))
return LRU_SKIP;
+
+ /*
+ * If the buffer is in use, remove it from the LRU for now as we can't
+ * free it. It will be added to the LRU again when the reference count
+ * hits zero.
+ */
+ if (bp->b_hold > 0) {
+ list_lru_isolate(lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+ }
+
/*
* Decrement the b_lru_ref count unless the value is already
* zero. If the value is already zero, we need to reclaim the
@@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
return LRU_ROTATE;
}
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
return freed;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e25cd2a160f3..e7324d58bd96 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }
-/*
- * Internal state flags.
- */
-#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
-
struct xfs_buf_cache {
struct rhashtable bc_hash;
};
@@ -159,7 +154,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
- unsigned int b_hold; /* reference count */
+ int b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
@@ -170,7 +165,6 @@ struct xfs_buf {
*/
struct list_head b_lru; /* lru list */
spinlock_t b_lock; /* internal state lock */
- unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag;
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 2/3] xfs: use a lockref for the buffer reference count
2026-01-26 5:37 buffer cache simplification v3 Christoph Hellwig
2026-01-26 5:38 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
@ 2026-01-26 5:38 ` Christoph Hellwig
2026-01-26 5:38 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
2 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2026-01-26 5:38 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Dave Chinner, Brian Foster, linux-xfs, Darrick J. Wong
The lockref structure allows incrementing/decrementing counters like
an atomic_t for the fast path, while still allowing complex slow path
operations as if the counter was protected by a lock. The only slow
path operations that actually need to take the lock are the final
put, LRU evictions and marking a buffer stale.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/xfs/xfs_buf.c | 79 ++++++++++++++++++----------------------------
fs/xfs/xfs_buf.h | 4 +--
fs/xfs/xfs_trace.h | 10 +++---
3 files changed, 38 insertions(+), 55 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index aacdf080e400..348c91335163 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -31,20 +31,20 @@ struct kmem_cache *xfs_buf_cache;
*
* xfs_buf_stale:
* b_sema (caller holds)
- * b_lock
+ * b_lockref.lock
* lru_lock
*
* xfs_buf_rele:
- * b_lock
+ * b_lockref.lock
* lru_lock
*
* xfs_buftarg_drain_rele
* lru_lock
- * b_lock (trylock due to inversion)
+ * b_lockref.lock (trylock due to inversion)
*
* xfs_buftarg_isolate
* lru_lock
- * b_lock (trylock due to inversion)
+ * b_lockref.lock (trylock due to inversion)
*/
static void xfs_buf_submit(struct xfs_buf *bp);
@@ -78,11 +78,11 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
- spin_lock(&bp->b_lock);
+ spin_lock(&bp->b_lockref.lock);
atomic_set(&bp->b_lru_ref, 0);
- if (bp->b_hold >= 0)
+ if (!__lockref_is_dead(&bp->b_lockref))
list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
}
static void
@@ -274,10 +274,8 @@ xfs_buf_alloc(
* inserting into the hash table are safe (and will have to wait for
* the unlock to do anything non-trivial).
*/
- bp->b_hold = 1;
+ lockref_init(&bp->b_lockref);
sema_init(&bp->b_sema, 0); /* held, no waiters */
-
- spin_lock_init(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
@@ -434,20 +432,6 @@ xfs_buf_find_lock(
return 0;
}
-static bool
-xfs_buf_try_hold(
- struct xfs_buf *bp)
-{
- spin_lock(&bp->b_lock);
- if (bp->b_hold == -1) {
- spin_unlock(&bp->b_lock);
- return false;
- }
- bp->b_hold++;
- spin_unlock(&bp->b_lock);
- return true;
-}
-
static inline int
xfs_buf_lookup(
struct xfs_buf_cache *bch,
@@ -460,7 +444,7 @@ xfs_buf_lookup(
rcu_read_lock();
bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
- if (!bp || !xfs_buf_try_hold(bp)) {
+ if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
rcu_read_unlock();
return -ENOENT;
}
@@ -511,7 +495,7 @@ xfs_buf_find_insert(
error = PTR_ERR(bp);
goto out_free_buf;
}
- if (bp && xfs_buf_try_hold(bp)) {
+ if (bp && lockref_get_not_dead(&bp->b_lockref)) {
/* found an existing buffer */
rcu_read_unlock();
error = xfs_buf_find_lock(bp, flags);
@@ -853,16 +837,14 @@ xfs_buf_hold(
{
trace_xfs_buf_hold(bp, _RET_IP_);
- spin_lock(&bp->b_lock);
- bp->b_hold++;
- spin_unlock(&bp->b_lock);
+ lockref_get(&bp->b_lockref);
}
static void
xfs_buf_destroy(
struct xfs_buf *bp)
{
- ASSERT(bp->b_hold < 0);
+ ASSERT(__lockref_is_dead(&bp->b_lockref));
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
if (!xfs_buf_is_uncached(bp)) {
@@ -888,19 +870,20 @@ xfs_buf_rele(
{
trace_xfs_buf_rele(bp, _RET_IP_);
- spin_lock(&bp->b_lock);
- if (!--bp->b_hold) {
+ if (lockref_put_or_lock(&bp->b_lockref))
+ return;
+ if (!--bp->b_lockref.count) {
if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
goto kill;
list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
}
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return;
kill:
- bp->b_hold = -1;
+ lockref_mark_dead(&bp->b_lockref);
list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
xfs_buf_destroy(bp);
}
@@ -1471,18 +1454,18 @@ xfs_buftarg_drain_rele(
struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
struct list_head *dispose = arg;
- if (!spin_trylock(&bp->b_lock))
+ if (!spin_trylock(&bp->b_lockref.lock))
return LRU_SKIP;
- if (bp->b_hold > 0) {
+ if (bp->b_lockref.count > 0) {
/* need to wait, so skip it this pass */
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- bp->b_hold = -1;
+ lockref_mark_dead(&bp->b_lockref);
list_lru_isolate_move(lru, item, dispose);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
@@ -1564,10 +1547,10 @@ xfs_buftarg_isolate(
struct list_head *dispose = arg;
/*
- * We are inverting the lru lock vs bp->b_lock order here, so use a
- * trylock. If we fail to get the lock, just skip the buffer.
+ * We are inverting the lru lock vs bp->b_lockref.lock order here, so
+ * use a trylock. If we fail to get the lock, just skip the buffer.
*/
- if (!spin_trylock(&bp->b_lock))
+ if (!spin_trylock(&bp->b_lockref.lock))
return LRU_SKIP;
/*
@@ -1575,9 +1558,9 @@ xfs_buftarg_isolate(
* free it. It will be added to the LRU again when the reference count
* hits zero.
*/
- if (bp->b_hold > 0) {
+ if (bp->b_lockref.count > 0) {
list_lru_isolate(lru, &bp->b_lru);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
@@ -1587,13 +1570,13 @@ xfs_buftarg_isolate(
* buffer, otherwise it gets another trip through the LRU.
*/
if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_ROTATE;
}
- bp->b_hold = -1;
+ lockref_mark_dead(&bp->b_lockref);
list_lru_isolate_move(lru, item, dispose);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e7324d58bd96..3a1d066e1c13 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -14,6 +14,7 @@
#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
+#include <linux/lockref.h>
extern struct kmem_cache *xfs_buf_cache;
@@ -154,7 +155,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
- int b_hold; /* reference count */
+ struct lockref b_lockref; /* refcount + lock */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
@@ -164,7 +165,6 @@ struct xfs_buf {
* bt_lru_lock and not by b_sema
*/
struct list_head b_lru; /* lru list */
- spinlock_t b_lock; /* internal state lock */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f70afbf3cb19..abf022d5ff42 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -736,7 +736,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
@@ -810,7 +810,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
__entry->flags = flags;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->caller_ip = caller_ip;
@@ -854,7 +854,7 @@ TRACE_EVENT(xfs_buf_ioerror,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->error = error;
@@ -898,7 +898,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
__entry->buf_len = bip->bli_buf->b_length;
__entry->buf_flags = bip->bli_buf->b_flags;
- __entry->buf_hold = bip->bli_buf->b_hold;
+ __entry->buf_hold = bip->bli_buf->b_lockref.count;
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
__entry->buf_lockval = bip->bli_buf->b_sema.count;
__entry->li_flags = bip->bli_item.li_flags;
@@ -5181,7 +5181,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash
2026-01-26 5:37 buffer cache simplification v3 Christoph Hellwig
2026-01-26 5:38 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
2026-01-26 5:38 ` [PATCH 2/3] xfs: use a lockref for the buffer reference count Christoph Hellwig
@ 2026-01-26 5:38 ` Christoph Hellwig
2 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2026-01-26 5:38 UTC (permalink / raw)
To: Carlos Maiolino
Cc: Dave Chinner, Brian Foster, linux-xfs,
syzbot+0391d34e801643e2809b
The per-AG buffer hashes were added when all buffer lookups took a
per-hash lock. Since then we've made lookups entirely lockless and
removed the need for a hash-wide lock for inserts and removals as
well. With this there is no need to shard the hash, so reduce the
used resources by using a per-buftarg hash for all buftargs.
Long after writing this initially, syzbot found a problem in the buffer
cache teardown order, which this happens to fix as well by doing the
entire buffer cache teardown in one place instead of splitting it
between destroying the buftarg and the perag structures.
Link: https://lore.kernel.org/linux-xfs/aLeUdemAZ5wmtZel@dread.disaster.area/
Reported-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Tested-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_ag.c | 13 ++---------
fs/xfs/libxfs/xfs_ag.h | 2 --
fs/xfs/xfs_buf.c | 51 +++++++++++-------------------------------
fs/xfs/xfs_buf.h | 10 +--------
fs/xfs/xfs_buf_mem.c | 11 ++-------
5 files changed, 18 insertions(+), 69 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 586918ed1cbf..a41d782e8e8c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -110,10 +110,7 @@ xfs_perag_uninit(
struct xfs_group *xg)
{
#ifdef __KERNEL__
- struct xfs_perag *pag = to_perag(xg);
-
- cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_buf_cache_destroy(&pag->pag_bcache);
+ cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
#endif
}
@@ -235,10 +232,6 @@ xfs_perag_alloc(
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
#endif /* __KERNEL__ */
- error = xfs_buf_cache_init(&pag->pag_bcache);
- if (error)
- goto out_free_perag;
-
/*
* Pre-calculated geometry
*/
@@ -250,12 +243,10 @@ xfs_perag_alloc(
error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
if (error)
- goto out_buf_cache_destroy;
+ goto out_free_perag;
return 0;
-out_buf_cache_destroy:
- xfs_buf_cache_destroy(&pag->pag_bcache);
out_free_perag:
kfree(pag);
return error;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 1f24cfa27321..f02323416973 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -85,8 +85,6 @@ struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
- struct xfs_buf_cache pag_bcache;
-
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 348c91335163..76eb7c5a73f1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -363,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
.obj_cmpfn = _xfs_buf_obj_cmp,
};
-int
-xfs_buf_cache_init(
- struct xfs_buf_cache *bch)
-{
- return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
-}
-
-void
-xfs_buf_cache_destroy(
- struct xfs_buf_cache *bch)
-{
- rhashtable_destroy(&bch->bc_hash);
-}
-
static int
xfs_buf_map_verify(
struct xfs_buftarg *btp,
@@ -434,7 +420,7 @@ xfs_buf_find_lock(
static inline int
xfs_buf_lookup(
- struct xfs_buf_cache *bch,
+ struct xfs_buftarg *btp,
struct xfs_buf_map *map,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
@@ -443,7 +429,7 @@ xfs_buf_lookup(
int error;
rcu_read_lock();
- bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
+ bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
rcu_read_unlock();
return -ENOENT;
@@ -468,7 +454,6 @@ xfs_buf_lookup(
static int
xfs_buf_find_insert(
struct xfs_buftarg *btp,
- struct xfs_buf_cache *bch,
struct xfs_perag *pag,
struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
@@ -488,7 +473,7 @@ xfs_buf_find_insert(
new_bp->b_pag = pag;
rcu_read_lock();
- bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
+ bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
rcu_read_unlock();
@@ -530,16 +515,6 @@ xfs_buftarg_get_pag(
return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
}
-static inline struct xfs_buf_cache *
-xfs_buftarg_buf_cache(
- struct xfs_buftarg *btp,
- struct xfs_perag *pag)
-{
- if (pag)
- return &pag->pag_bcache;
- return btp->bt_cache;
-}
-
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -553,7 +528,6 @@ xfs_buf_get_map(
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
- struct xfs_buf_cache *bch;
struct xfs_perag *pag;
struct xfs_buf *bp = NULL;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
@@ -570,9 +544,8 @@ xfs_buf_get_map(
return error;
pag = xfs_buftarg_get_pag(btp, &cmap);
- bch = xfs_buftarg_buf_cache(btp, pag);
- error = xfs_buf_lookup(bch, &cmap, flags, &bp);
+ error = xfs_buf_lookup(btp, &cmap, flags, &bp);
if (error && error != -ENOENT)
goto out_put_perag;
@@ -584,7 +557,7 @@ xfs_buf_get_map(
goto out_put_perag;
/* xfs_buf_find_insert() consumes the perag reference. */
- error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
+ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
flags, &bp);
if (error)
return error;
@@ -848,11 +821,8 @@ xfs_buf_destroy(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
if (!xfs_buf_is_uncached(bp)) {
- struct xfs_buf_cache *bch =
- xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
-
- rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
- xfs_buf_hash_params);
+ rhashtable_remove_fast(&bp->b_target->bt_hash,
+ &bp->b_rhash_head, xfs_buf_hash_params);
if (bp->b_pag)
xfs_perag_put(bp->b_pag);
@@ -1619,6 +1589,7 @@ xfs_destroy_buftarg(
ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
percpu_counter_destroy(&btp->bt_readahead_count);
list_lru_destroy(&btp->bt_lru);
+ rhashtable_destroy(&btp->bt_hash);
}
void
@@ -1713,8 +1684,10 @@ xfs_init_buftarg(
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
- if (list_lru_init(&btp->bt_lru))
+ if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
return -ENOMEM;
+ if (list_lru_init(&btp->bt_lru))
+ goto out_destroy_hash;
if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
goto out_destroy_lru;
@@ -1732,6 +1705,8 @@ xfs_init_buftarg(
percpu_counter_destroy(&btp->bt_readahead_count);
out_destroy_lru:
list_lru_destroy(&btp->bt_lru);
+out_destroy_hash:
+ rhashtable_destroy(&btp->bt_hash);
return -ENOMEM;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3a1d066e1c13..bf39d89f0f6d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -69,13 +69,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }
-struct xfs_buf_cache {
- struct rhashtable bc_hash;
-};
-
-int xfs_buf_cache_init(struct xfs_buf_cache *bch);
-void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
-
/*
* The xfs_buftarg contains 2 notions of "sector size" -
*
@@ -113,8 +106,7 @@ struct xfs_buftarg {
unsigned int bt_awu_min;
unsigned int bt_awu_max;
- /* built-in cache, if we're not using the perag one */
- struct xfs_buf_cache bt_cache[];
+ struct rhashtable bt_hash;
};
struct xfs_buf_map {
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 0106da0a9f44..86dbec5ee203 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -58,7 +58,7 @@ xmbuf_alloc(
struct xfs_buftarg *btp;
int error;
- btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
+ btp = kzalloc(sizeof(*btp), GFP_KERNEL);
if (!btp)
return -ENOMEM;
@@ -81,10 +81,6 @@ xmbuf_alloc(
/* ensure all writes are below EOF to avoid pagecache zeroing */
i_size_write(inode, inode->i_sb->s_maxbytes);
- error = xfs_buf_cache_init(btp->bt_cache);
- if (error)
- goto out_file;
-
/* Initialize buffer target */
btp->bt_mount = mp;
btp->bt_dev = (dev_t)-1U;
@@ -95,15 +91,13 @@ xmbuf_alloc(
error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
if (error)
- goto out_bcache;
+ goto out_file;
trace_xmbuf_create(btp);
*btpp = btp;
return 0;
-out_bcache:
- xfs_buf_cache_destroy(btp->bt_cache);
out_file:
fput(file);
out_free_btp:
@@ -122,7 +116,6 @@ xmbuf_free(
trace_xmbuf_free(btp);
xfs_destroy_buftarg(btp);
- xfs_buf_cache_destroy(btp->bt_cache);
fput(btp->bt_file);
kfree(btp);
}
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-26 5:38 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
@ 2026-01-26 19:18 ` Brian Foster
2026-01-27 5:20 ` Christoph Hellwig
0 siblings, 1 reply; 14+ messages in thread
From: Brian Foster @ 2026-01-26 19:18 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, linux-xfs, Darrick J. Wong
On Mon, Jan 26, 2026 at 06:38:00AM +0100, Christoph Hellwig wrote:
> Currently the buffer cache adds a reference to b_hold for buffers that
> are on the LRU. This seems to go all the way back and allows releasing
> buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
> really complicated in differs from how other LRUs are implemented in
> Linux.
>
> Switch to not having a reference for buffers in the LRU, and use a
> separate negative hold value to mark buffers as dead. This simplifies
> xfs_buf_rele, which now just deals with the last "real" reference,
> and prepares for using the lockref primitive.
>
> This also removes the b_lock protection for removing buffers from the
> buffer hash. This is the desired outcome because the rhashtable is
> fully internally synchronized, and previously the lock was mostly
> held out of ordering constraints in xfs_buf_rele_cached.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Thanks for the tweaks..
> fs/xfs/xfs_buf.c | 140 ++++++++++++++++++-----------------------------
> fs/xfs/xfs_buf.h | 8 +--
> 2 files changed, 54 insertions(+), 94 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index db46883991de..aacdf080e400 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
...
> @@ -1610,11 +1564,23 @@ xfs_buftarg_isolate(
> struct list_head *dispose = arg;
>
> /*
> - * we are inverting the lru lock/bp->b_lock here, so use a trylock.
> - * If we fail to get the lock, just skip it.
> + * We are inverting the lru lock vs bp->b_lock order here, so use a
> + * trylock. If we fail to get the lock, just skip the buffer.
> */
> if (!spin_trylock(&bp->b_lock))
> return LRU_SKIP;
> +
> + /*
> + * If the buffer is in use, remove it from the LRU for now as we can't
> + * free it. It will be added to the LRU again when the reference count
> + * hits zero.
> + */
> + if (bp->b_hold > 0) {
> + list_lru_isolate(lru, &bp->b_lru);
> + spin_unlock(&bp->b_lock);
> + return LRU_REMOVED;
> + }
> +
Sorry I missed this on my first look at this, but I don't think I quite
realized why this was here. This looks like a subtle change in behavior
where a buffer that makes it onto the LRU and then is subsequently held
can no longer be cycled off the LRU by background shrinker activity.
Instead, we drop the buffer off the LRU entirely where it will no longer
be visible from ongoing shrinker activity.
AFAICT the reason for this is we no longer support the ability for the
shrinker to drop the LRU b_hold ref to indicate a buffer is fully cycled
out and can go direct to freeing when the current b_hold lifecycle ends.
Am I following that correctly?
If so, that doesn't necessarily seem like a showstopper as I'm not sure
realistically a significant amount of memory would be caught up like
this in practice, even under significant pressure. But regardless, why
not do this preventative LRU removal only after the lru ref count is
fully depleted? Wouldn't that more accurately preserve existing
behavior, or am I missing something?
Brian
> /*
> * Decrement the b_lru_ref count unless the value is already
> * zero. If the value is already zero, we need to reclaim the
> @@ -1625,7 +1591,7 @@ xfs_buftarg_isolate(
> return LRU_ROTATE;
> }
>
> - bp->b_state |= XFS_BSTATE_DISPOSE;
> + bp->b_hold = -1;
> list_lru_isolate_move(lru, item, dispose);
> spin_unlock(&bp->b_lock);
> return LRU_REMOVED;
> @@ -1647,7 +1613,7 @@ xfs_buftarg_shrink_scan(
> struct xfs_buf *bp;
> bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
> list_del_init(&bp->b_lru);
> - xfs_buf_rele(bp);
> + xfs_buf_destroy(bp);
> }
>
> return freed;
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index e25cd2a160f3..e7324d58bd96 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
> { XBF_INCORE, "INCORE" }, \
> { XBF_TRYLOCK, "TRYLOCK" }
>
> -/*
> - * Internal state flags.
> - */
> -#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
> -
> struct xfs_buf_cache {
> struct rhashtable bc_hash;
> };
> @@ -159,7 +154,7 @@ struct xfs_buf {
>
> xfs_daddr_t b_rhash_key; /* buffer cache index */
> int b_length; /* size of buffer in BBs */
> - unsigned int b_hold; /* reference count */
> + int b_hold; /* reference count */
> atomic_t b_lru_ref; /* lru reclaim ref count */
> xfs_buf_flags_t b_flags; /* status flags */
> struct semaphore b_sema; /* semaphore for lockables */
> @@ -170,7 +165,6 @@ struct xfs_buf {
> */
> struct list_head b_lru; /* lru list */
> spinlock_t b_lock; /* internal state lock */
> - unsigned int b_state; /* internal state flags */
> wait_queue_head_t b_waiters; /* unpin waiters */
> struct list_head b_list;
> struct xfs_perag *b_pag;
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-26 19:18 ` Brian Foster
@ 2026-01-27 5:20 ` Christoph Hellwig
2026-01-27 15:42 ` Brian Foster
0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2026-01-27 5:20 UTC (permalink / raw)
To: Brian Foster
Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs,
Darrick J. Wong
On Mon, Jan 26, 2026 at 02:18:42PM -0500, Brian Foster wrote:
> > + /*
> > + * If the buffer is in use, remove it from the LRU for now as we can't
> > + * free it. It will be added to the LRU again when the reference count
> > + * hits zero.
> > + */
> > + if (bp->b_hold > 0) {
> > + list_lru_isolate(lru, &bp->b_lru);
> > + spin_unlock(&bp->b_lock);
> > + return LRU_REMOVED;
> > + }
> > +
>
> Sorry I missed this on my first look at this, but I don't think I quite
> realized why this was here. This looks like a subtle change in behavior
> where a buffer that makes it onto the LRU and then is subsequently held
> can no longer be cycled off the LRU by background shrinker activity.
> Instead, we drop the buffer off the LRU entirely where it will no longer
> be visible from ongoing shrinker activity.
Yes.
> AFAICT the reason for this is we no longer support the ability for the
> shrinker to drop the LRU b_hold ref to indicate a buffer is fully cycled
> out and can go direct to freeing when the current b_hold lifecycle ends.
> Am I following that correctly?
Yes.
> If so, that doesn't necessarily seem like a showstopper as I'm not sure
> realistically a significant amount of memory would be caught up like
> this in practice, even under significant pressure. But regardless, why
> not do this preventative LRU removal only after the lru ref count is
> fully depleted? Wouldn't that more accurately preserve existing
> behavior, or am I missing something?
It would more closely resemble the current behavior, which seems wrong
and an artifact of the current reference counting. A buffer that is
in use really should not be counted down on the LRU, which really is
for unused buffers. So starting the countdown only once the buffer
is unused is the right thing to do. I should have explained this
much better, though.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-27 5:20 ` Christoph Hellwig
@ 2026-01-27 15:42 ` Brian Foster
2026-01-27 16:42 ` Christoph Hellwig
0 siblings, 1 reply; 14+ messages in thread
From: Brian Foster @ 2026-01-27 15:42 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, linux-xfs, Darrick J. Wong
On Tue, Jan 27, 2026 at 06:20:50AM +0100, Christoph Hellwig wrote:
> On Mon, Jan 26, 2026 at 02:18:42PM -0500, Brian Foster wrote:
> > > + /*
> > > + * If the buffer is in use, remove it from the LRU for now as we can't
> > > + * free it. It will be added to the LRU again when the reference count
> > > + * hits zero.
> > > + */
> > > + if (bp->b_hold > 0) {
> > > + list_lru_isolate(lru, &bp->b_lru);
> > > + spin_unlock(&bp->b_lock);
> > > + return LRU_REMOVED;
> > > + }
> > > +
> >
> > Sorry I missed this on my first look at this, but I don't think I quite
> > realized why this was here. This looks like a subtle change in behavior
> > where a buffer that makes it onto the LRU and then is subsequently held
> > can no longer be cycled off the LRU by background shrinker activity.
> > Instead, we drop the buffer off the LRU entirely where it will no longer
> > be visible from ongoing shrinker activity.
>
> Yes.
>
> > AFAICT the reason for this is we no longer support the ability for the
> > shrinker to drop the LRU b_hold ref to indicate a buffer is fully cycled
> > out and can go direct to freeing when the current b_hold lifecycle ends.
> > Am I following that correctly?
>
> Yes.
>
> > If so, that doesn't necessarily seem like a showstopper as I'm not sure
> > realistically a significant amount of memory would be caught up like
> > this in practice, even under significant pressure. But regardless, why
> > not do this preventative LRU removal only after the lru ref count is
> > fully depleted? Wouldn't that more accurately preserve existing
> > behavior, or am I missing something?
>
> It would more closely resemble the current behavior, which seems wrong
> and an artifact of the current reference counting. A buffer that is
> in use really should not be counted down on the LRU, which really is
> for unused buffers. So starting the countdown only once the buffer
> is unused is the right thing to do. I should have explained this
> much better, though.
>
Ok, but right or wrong it's been that way for quite some time (forever?)
and this sort of change probably shouldn't be buried implicitly in this
patch. For one, the LRU refcounting behavior doesn't depend on this work
because we could have always done something like skip the decrement and
rotate held buffers, but nobody has ever proposed such a change to my
knowledge. Also, ISTM this patch could fairly easily preserve existing
behavior by decrementing lru ref and deferring list removal to when the
lru ref drops to zero but the buffer is still held.
IOW, I'm not arguing for or against a change in buffer lifetime behavior
here, just that it should probably be done separately with some more
careful analysis. The secondary advantage is that if this behavior does
somehow uncover something problematic, we can bisect/revert back to
historical lifetime behavior without having to walk back these
functional changes.
Brian
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU
2026-01-27 15:42 ` Brian Foster
@ 2026-01-27 16:42 ` Christoph Hellwig
0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2026-01-27 16:42 UTC (permalink / raw)
To: Brian Foster
Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs,
Darrick J. Wong
On Tue, Jan 27, 2026 at 10:42:18AM -0500, Brian Foster wrote:
> IOW, I'm not arguing for or against a change in buffer lifetime behavior
> here, just that it should probably be done separately with some more
> careful analysis. The secondary advantage is that if this behavior does
> somehow uncover something problematic, we can bisect/revert back to
> historical lifetime behavior without having to walk back these
> functional changes.
Allright, I'll see if I can split it out.
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2026-01-27 16:42 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-01-26 5:37 buffer cache simplification v3 Christoph Hellwig
2026-01-26 5:38 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
2026-01-26 19:18 ` Brian Foster
2026-01-27 5:20 ` Christoph Hellwig
2026-01-27 15:42 ` Brian Foster
2026-01-27 16:42 ` Christoph Hellwig
2026-01-26 5:38 ` [PATCH 2/3] xfs: use a lockref for the buffer reference count Christoph Hellwig
2026-01-26 5:38 ` [PATCH 3/3] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
-- strict thread matches above, loose matches on Subject: below --
2026-01-22 5:26 buffer cache simplification v2 Christoph Hellwig
2026-01-22 5:26 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
2026-01-23 11:55 ` Carlos Maiolino
2026-01-23 16:01 ` Brian Foster
2026-01-19 15:31 buffer cache simplification Christoph Hellwig
2026-01-19 15:31 ` [PATCH 1/3] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
2026-01-20 2:53 ` Darrick J. Wong
2026-01-20 6:55 ` Christoph Hellwig
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox