* [PATCH 1/4] xfs: don't keep a reference for buffers on the LRU
2026-03-17 13:40 buffer cache simplification v5 Christoph Hellwig
@ 2026-03-17 13:40 ` Christoph Hellwig
2026-03-17 21:33 ` Dave Chinner
2026-03-18 11:44 ` Brian Foster
2026-03-17 13:40 ` [PATCH 2/4] xfs: use a lockref for the buffer reference count Christoph Hellwig
` (2 subsequent siblings)
3 siblings, 2 replies; 16+ messages in thread
From: Christoph Hellwig @ 2026-03-17 13:40 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Dave Chinner, Brian Foster, linux-xfs, Darrick J. Wong
Currently the buffer cache adds a reference to b_hold for buffers that
are on the LRU. This seems to go all the way back and allows releasing
buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
really complicated and differs from how other LRUs are implemented in
Linux.
Switch to not having a reference for buffers in the LRU, and use a
separate negative hold value to mark buffers as dead. This simplifies
xfs_buf_rele, which now just deals with the last "real" reference,
and prepares for using the lockref primitive.
This also removes the b_lock protection for removing buffers from the
buffer hash. This is the desired outcome because the rhashtable is
fully internally synchronized, and previously the lock was mostly
held out of ordering constraints in xfs_buf_rele_cached.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/xfs/xfs_buf.c | 139 ++++++++++++++++++-----------------------------
fs/xfs/xfs_buf.h | 8 +--
2 files changed, 53 insertions(+), 94 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d2f3c50d80e7..3cd37f082a69 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,11 +80,8 @@ xfs_buf_stale(
spin_lock(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 0);
- if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
- (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
- bp->b_hold--;
-
- ASSERT(bp->b_hold >= 1);
+ if (bp->b_hold >= 0)
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lock);
}
@@ -442,7 +439,7 @@ xfs_buf_try_hold(
struct xfs_buf *bp)
{
spin_lock(&bp->b_lock);
- if (bp->b_hold == 0) {
+ if (bp->b_hold == -1) {
spin_unlock(&bp->b_lock);
return false;
}
@@ -862,76 +859,24 @@ xfs_buf_hold(
}
static void
-xfs_buf_rele_uncached(
- struct xfs_buf *bp)
-{
- ASSERT(list_empty(&bp->b_lru));
-
- spin_lock(&bp->b_lock);
- if (--bp->b_hold) {
- spin_unlock(&bp->b_lock);
- return;
- }
- spin_unlock(&bp->b_lock);
- xfs_buf_free(bp);
-}
-
-static void
-xfs_buf_rele_cached(
+xfs_buf_destroy(
struct xfs_buf *bp)
{
- struct xfs_buftarg *btp = bp->b_target;
- struct xfs_perag *pag = bp->b_pag;
- struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
- bool freebuf = false;
-
- trace_xfs_buf_rele(bp, _RET_IP_);
-
- spin_lock(&bp->b_lock);
- ASSERT(bp->b_hold >= 1);
- if (bp->b_hold > 1) {
- bp->b_hold--;
- goto out_unlock;
- }
+ ASSERT(bp->b_hold < 0);
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* we are asked to drop the last reference */
- if (atomic_read(&bp->b_lru_ref)) {
- /*
- * If the buffer is added to the LRU, keep the reference to the
- * buffer for the LRU and clear the (now stale) dispose list
- * state flag, else drop the reference.
- */
- if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
- bp->b_state &= ~XFS_BSTATE_DISPOSE;
- else
- bp->b_hold--;
- } else {
- bp->b_hold--;
- /*
- * most of the time buffers will already be removed from the
- * LRU, so optimise that case by checking for the
- * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
- * was on was the disposal list
- */
- if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
- } else {
- ASSERT(list_empty(&bp->b_lru));
- }
+ if (!xfs_buf_is_uncached(bp)) {
+ struct xfs_buf_cache *bch =
+ xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
- ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
- if (pag)
- xfs_perag_put(pag);
- freebuf = true;
- }
-out_unlock:
- spin_unlock(&bp->b_lock);
+ if (bp->b_pag)
+ xfs_perag_put(bp->b_pag);
+ }
- if (freebuf)
- xfs_buf_free(bp);
+ xfs_buf_free(bp);
}
/*
@@ -942,10 +887,22 @@ xfs_buf_rele(
struct xfs_buf *bp)
{
trace_xfs_buf_rele(bp, _RET_IP_);
- if (xfs_buf_is_uncached(bp))
- xfs_buf_rele_uncached(bp);
- else
- xfs_buf_rele_cached(bp);
+
+ spin_lock(&bp->b_lock);
+ if (!--bp->b_hold) {
+ if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
+ goto kill;
+ list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ }
+ spin_unlock(&bp->b_lock);
+ return;
+
+kill:
+ bp->b_hold = -1;
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+
+ xfs_buf_destroy(bp);
}
/*
@@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
/*
* To simulate an I/O failure, the buffer must be locked and held with at least
- * three references. The LRU reference is dropped by the stale call. The buf
- * item reference is dropped via ioend processing. The third reference is owned
- * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ * two references.
+ *
+ * The buf item reference is dropped via ioend processing. The second reference
+ * is owned by the caller and is dropped on I/O completion if the buffer is
+ * XBF_ASYNC.
*/
void
xfs_buf_ioend_fail(
@@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
if (!spin_trylock(&bp->b_lock))
return LRU_SKIP;
- if (bp->b_hold > 1) {
+ if (bp->b_hold > 0) {
/* need to wait, so skip it this pass */
spin_unlock(&bp->b_lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- /*
- * clear the LRU reference count so the buffer doesn't get
- * ignored in xfs_buf_rele().
- */
- atomic_set(&bp->b_lru_ref, 0);
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
(long long)xfs_buf_daddr(bp));
}
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
if (loop++ != 0)
delay(100);
@@ -1610,11 +1564,12 @@ xfs_buftarg_isolate(
struct list_head *dispose = arg;
/*
- * we are inverting the lru lock/bp->b_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock vs bp->b_lock order here, so use a
+ * trylock. If we fail to get the lock, just skip the buffer.
*/
if (!spin_trylock(&bp->b_lock))
return LRU_SKIP;
+
/*
* Decrement the b_lru_ref count unless the value is already
* zero. If the value is already zero, we need to reclaim the
@@ -1624,8 +1579,18 @@ xfs_buftarg_isolate(
spin_unlock(&bp->b_lock);
return LRU_ROTATE;
}
+
+ /*
+ * If the buffer is in use, remove it from the LRU for now as we can't
+ * free it. It will be freed when the last reference drops.
+ */
+ if (bp->b_hold > 0) {
+ list_lru_isolate(lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+ }
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1647,7 +1612,7 @@ xfs_buftarg_shrink_scan(
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
return freed;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e25cd2a160f3..e7324d58bd96 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }
-/*
- * Internal state flags.
- */
-#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
-
struct xfs_buf_cache {
struct rhashtable bc_hash;
};
@@ -159,7 +154,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
- unsigned int b_hold; /* reference count */
+ int b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
@@ -170,7 +165,6 @@ struct xfs_buf {
*/
struct list_head b_lru; /* lru list */
spinlock_t b_lock; /* internal state lock */
- unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag;
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH 1/4] xfs: don't keep a reference for buffers on the LRU
2026-03-17 13:40 ` [PATCH 1/4] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
@ 2026-03-17 21:33 ` Dave Chinner
2026-03-18 14:38 ` Christoph Hellwig
2026-03-18 11:44 ` Brian Foster
1 sibling, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2026-03-17 21:33 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, Brian Foster, linux-xfs,
Darrick J. Wong
On Tue, Mar 17, 2026 at 02:40:52PM +0100, Christoph Hellwig wrote:
> @@ -942,10 +887,22 @@ xfs_buf_rele(
> struct xfs_buf *bp)
> {
> trace_xfs_buf_rele(bp, _RET_IP_);
> - if (xfs_buf_is_uncached(bp))
> - xfs_buf_rele_uncached(bp);
> - else
> - xfs_buf_rele_cached(bp);
> +
> + spin_lock(&bp->b_lock);
> + if (!--bp->b_hold) {
> + if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
> + goto kill;
> + list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
> + }
> + spin_unlock(&bp->b_lock);
> + return;
> +
> +kill:
> + bp->b_hold = -1;
> + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
> + spin_unlock(&bp->b_lock);
> +
> + xfs_buf_destroy(bp);
> }
The jump with the lock held is kinda nasty. The lock scope can be
straight-lined by inverting the logic like so:
spin_lock(&bp->b_lock);
if (--bp->b_hold) {
spin_unlock(&bp->b_lock);
return;
}
if (!xfs_buf_is_uncached(bp) && atomic_read(&bp->b_lru_ref)) {
list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lock);
return;
}
/* time to die */
bp->b_hold = -1;
list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lock);
xfs_buf_destroy(bp);
Otherwise the change looks OK.
-Dave.
--
Dave Chinner
dgc@kernel.org
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH 1/4] xfs: don't keep a reference for buffers on the LRU
2026-03-17 21:33 ` Dave Chinner
@ 2026-03-18 14:38 ` Christoph Hellwig
0 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2026-03-18 14:38 UTC (permalink / raw)
To: Dave Chinner
Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, Brian Foster,
linux-xfs, Darrick J. Wong
On Wed, Mar 18, 2026 at 08:33:30AM +1100, Dave Chinner wrote:
> The jump with the lock held is kinda nasty. The lock scope can be
> straight-lined by inverting the logic like so:
The current code keeps things most straight by keeping all the
non-default code in a branch. The above with more unlocks in
multiple branches feels a lot harder to follow.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] xfs: don't keep a reference for buffers on the LRU
2026-03-17 13:40 ` [PATCH 1/4] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
2026-03-17 21:33 ` Dave Chinner
@ 2026-03-18 11:44 ` Brian Foster
1 sibling, 0 replies; 16+ messages in thread
From: Brian Foster @ 2026-03-18 11:44 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, linux-xfs, Darrick J. Wong
On Tue, Mar 17, 2026 at 02:40:52PM +0100, Christoph Hellwig wrote:
> Currently the buffer cache adds a reference to b_hold for buffers that
> are on the LRU. This seems to go all the way back and allows releasing
> buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
> really complicated in differs from how other LRUs are implemented in
> Linux.
>
> Switch to not having a reference for buffers in the LRU, and use a
> separate negative hold value to mark buffers as dead. This simplifies
> xfs_buf_rele, which now just deal with the last "real" reference,
> and prepares for using the lockref primitive.
>
> This also removes the b_lock protection for removing buffers from the
> buffer hash. This is the desired outcome because the rhashtable is
> fully internally synchronized, and previously the lock was mostly
> held out of ordering constrains in xfs_buf_rele_cached.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> fs/xfs/xfs_buf.c | 139 ++++++++++++++++++-----------------------------
> fs/xfs/xfs_buf.h | 8 +--
> 2 files changed, 53 insertions(+), 94 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index d2f3c50d80e7..3cd37f082a69 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
...
> @@ -1610,11 +1564,12 @@ xfs_buftarg_isolate(
> struct list_head *dispose = arg;
>
> /*
> - * we are inverting the lru lock/bp->b_lock here, so use a trylock.
> - * If we fail to get the lock, just skip it.
> + * We are inverting the lru lock vs bp->b_lock order here, so use a
> + * trylock. If we fail to get the lock, just skip the buffer.
> */
> if (!spin_trylock(&bp->b_lock))
> return LRU_SKIP;
> +
> /*
> * Decrement the b_lru_ref count unless the value is already
> * zero. If the value is already zero, we need to reclaim the
> @@ -1624,8 +1579,18 @@ xfs_buftarg_isolate(
> spin_unlock(&bp->b_lock);
> return LRU_ROTATE;
> }
> +
(Trailing) Whitespace damage here ^, JFYI.
Otherwise LGTM. I like Dave's suggestion, but either way:
Reviewed-by: Brian Foster <bfoster@redhat.com>
> + /*
> + * If the buffer is in use, remove it from the LRU for now as we can't
> + * free it. It will be freed when the last reference drops.
> + */
> + if (bp->b_hold > 0) {
> + list_lru_isolate(lru, &bp->b_lru);
> + spin_unlock(&bp->b_lock);
> + return LRU_REMOVED;
> + }
>
> - bp->b_state |= XFS_BSTATE_DISPOSE;
> + bp->b_hold = -1;
> list_lru_isolate_move(lru, item, dispose);
> spin_unlock(&bp->b_lock);
> return LRU_REMOVED;
> @@ -1647,7 +1612,7 @@ xfs_buftarg_shrink_scan(
> struct xfs_buf *bp;
> bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
> list_del_init(&bp->b_lru);
> - xfs_buf_rele(bp);
> + xfs_buf_destroy(bp);
> }
>
> return freed;
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index e25cd2a160f3..e7324d58bd96 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
> { XBF_INCORE, "INCORE" }, \
> { XBF_TRYLOCK, "TRYLOCK" }
>
> -/*
> - * Internal state flags.
> - */
> -#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
> -
> struct xfs_buf_cache {
> struct rhashtable bc_hash;
> };
> @@ -159,7 +154,7 @@ struct xfs_buf {
>
> xfs_daddr_t b_rhash_key; /* buffer cache index */
> int b_length; /* size of buffer in BBs */
> - unsigned int b_hold; /* reference count */
> + int b_hold; /* reference count */
> atomic_t b_lru_ref; /* lru reclaim ref count */
> xfs_buf_flags_t b_flags; /* status flags */
> struct semaphore b_sema; /* semaphore for lockables */
> @@ -170,7 +165,6 @@ struct xfs_buf {
> */
> struct list_head b_lru; /* lru list */
> spinlock_t b_lock; /* internal state lock */
> - unsigned int b_state; /* internal state flags */
> wait_queue_head_t b_waiters; /* unpin waiters */
> struct list_head b_list;
> struct xfs_perag *b_pag;
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 2/4] xfs: use a lockref for the buffer reference count
2026-03-17 13:40 buffer cache simplification v5 Christoph Hellwig
2026-03-17 13:40 ` [PATCH 1/4] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
@ 2026-03-17 13:40 ` Christoph Hellwig
2026-03-17 21:53 ` Dave Chinner
2026-03-17 13:40 ` [PATCH 3/4] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
2026-03-17 13:40 ` [PATCH 4/4] xfs: don't decrement the buffer LRU count for in-use buffers Christoph Hellwig
3 siblings, 1 reply; 16+ messages in thread
From: Christoph Hellwig @ 2026-03-17 13:40 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Dave Chinner, Brian Foster, linux-xfs, Darrick J. Wong
The lockref structure allows incrementing/decrementing counters like
an atomic_t for the fast path, while still allowing complex slow path
operations as if the counter was protected by a lock. The only slow
path operations that actually need to take the lock are the final
put, LRU evictions and marking a buffer stale.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/xfs/xfs_buf.c | 79 ++++++++++++++++++----------------------------
fs/xfs/xfs_buf.h | 4 +--
fs/xfs/xfs_trace.h | 10 +++---
3 files changed, 38 insertions(+), 55 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 3cd37f082a69..d585848a6be1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -31,20 +31,20 @@ struct kmem_cache *xfs_buf_cache;
*
* xfs_buf_stale:
* b_sema (caller holds)
- * b_lock
+ * b_lockref.lock
* lru_lock
*
* xfs_buf_rele:
- * b_lock
+ * b_lockref.lock
* lru_lock
*
* xfs_buftarg_drain_rele
* lru_lock
- * b_lock (trylock due to inversion)
+ * b_lockref.lock (trylock due to inversion)
*
* xfs_buftarg_isolate
* lru_lock
- * b_lock (trylock due to inversion)
+ * b_lockref.lock (trylock due to inversion)
*/
static void xfs_buf_submit(struct xfs_buf *bp);
@@ -78,11 +78,11 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
- spin_lock(&bp->b_lock);
+ spin_lock(&bp->b_lockref.lock);
atomic_set(&bp->b_lru_ref, 0);
- if (bp->b_hold >= 0)
+ if (!__lockref_is_dead(&bp->b_lockref))
list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
}
static void
@@ -274,10 +274,8 @@ xfs_buf_alloc(
* inserting into the hash table are safe (and will have to wait for
* the unlock to do anything non-trivial).
*/
- bp->b_hold = 1;
+ lockref_init(&bp->b_lockref);
sema_init(&bp->b_sema, 0); /* held, no waiters */
-
- spin_lock_init(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
@@ -434,20 +432,6 @@ xfs_buf_find_lock(
return 0;
}
-static bool
-xfs_buf_try_hold(
- struct xfs_buf *bp)
-{
- spin_lock(&bp->b_lock);
- if (bp->b_hold == -1) {
- spin_unlock(&bp->b_lock);
- return false;
- }
- bp->b_hold++;
- spin_unlock(&bp->b_lock);
- return true;
-}
-
static inline int
xfs_buf_lookup(
struct xfs_buf_cache *bch,
@@ -460,7 +444,7 @@ xfs_buf_lookup(
rcu_read_lock();
bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
- if (!bp || !xfs_buf_try_hold(bp)) {
+ if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
rcu_read_unlock();
return -ENOENT;
}
@@ -511,7 +495,7 @@ xfs_buf_find_insert(
error = PTR_ERR(bp);
goto out_free_buf;
}
- if (bp && xfs_buf_try_hold(bp)) {
+ if (bp && lockref_get_not_dead(&bp->b_lockref)) {
/* found an existing buffer */
rcu_read_unlock();
error = xfs_buf_find_lock(bp, flags);
@@ -853,16 +837,14 @@ xfs_buf_hold(
{
trace_xfs_buf_hold(bp, _RET_IP_);
- spin_lock(&bp->b_lock);
- bp->b_hold++;
- spin_unlock(&bp->b_lock);
+ lockref_get(&bp->b_lockref);
}
static void
xfs_buf_destroy(
struct xfs_buf *bp)
{
- ASSERT(bp->b_hold < 0);
+ ASSERT(__lockref_is_dead(&bp->b_lockref));
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
if (!xfs_buf_is_uncached(bp)) {
@@ -888,19 +870,20 @@ xfs_buf_rele(
{
trace_xfs_buf_rele(bp, _RET_IP_);
- spin_lock(&bp->b_lock);
- if (!--bp->b_hold) {
+ if (lockref_put_or_lock(&bp->b_lockref))
+ return;
+ if (!--bp->b_lockref.count) {
if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
goto kill;
list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
}
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return;
kill:
- bp->b_hold = -1;
+ lockref_mark_dead(&bp->b_lockref);
list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
xfs_buf_destroy(bp);
}
@@ -1471,18 +1454,18 @@ xfs_buftarg_drain_rele(
struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
struct list_head *dispose = arg;
- if (!spin_trylock(&bp->b_lock))
+ if (!spin_trylock(&bp->b_lockref.lock))
return LRU_SKIP;
- if (bp->b_hold > 0) {
+ if (bp->b_lockref.count > 0) {
/* need to wait, so skip it this pass */
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- bp->b_hold = -1;
+ lockref_mark_dead(&bp->b_lockref);
list_lru_isolate_move(lru, item, dispose);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
@@ -1564,10 +1547,10 @@ xfs_buftarg_isolate(
struct list_head *dispose = arg;
/*
- * We are inverting the lru lock vs bp->b_lock order here, so use a
- * trylock. If we fail to get the lock, just skip the buffer.
+ * We are inverting the lru lock vs bp->b_lockref.lock order here, so
+ * use a trylock. If we fail to get the lock, just skip the buffer.
*/
- if (!spin_trylock(&bp->b_lock))
+ if (!spin_trylock(&bp->b_lockref.lock))
return LRU_SKIP;
/*
@@ -1576,7 +1559,7 @@ xfs_buftarg_isolate(
* buffer, otherwise it gets another trip through the LRU.
*/
if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_ROTATE;
}
@@ -1584,15 +1567,15 @@ xfs_buftarg_isolate(
* If the buffer is in use, remove it from the LRU for now as we can't
* free it. It will be freed when the last reference drops.
*/
- if (bp->b_hold > 0) {
+ if (bp->b_lockref.count > 0) {
list_lru_isolate(lru, &bp->b_lru);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
- bp->b_hold = -1;
+ lockref_mark_dead(&bp->b_lockref);
list_lru_isolate_move(lru, item, dispose);
- spin_unlock(&bp->b_lock);
+ spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e7324d58bd96..3a1d066e1c13 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -14,6 +14,7 @@
#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
+#include <linux/lockref.h>
extern struct kmem_cache *xfs_buf_cache;
@@ -154,7 +155,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
- int b_hold; /* reference count */
+ struct lockref b_lockref; /* refcount + lock */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
@@ -164,7 +165,6 @@ struct xfs_buf {
* bt_lru_lock and not by b_sema
*/
struct list_head b_lru; /* lru list */
- spinlock_t b_lock; /* internal state lock */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 813e5a9f57eb..c191f4b163a8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -739,7 +739,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
@@ -813,7 +813,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
__entry->flags = flags;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->caller_ip = caller_ip;
@@ -857,7 +857,7 @@ TRACE_EVENT(xfs_buf_ioerror,
__entry->dev = bp->b_target->bt_dev;
__entry->bno = xfs_buf_daddr(bp);
__entry->length = bp->b_length;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->error = error;
@@ -901,7 +901,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
__entry->buf_len = bip->bli_buf->b_length;
__entry->buf_flags = bip->bli_buf->b_flags;
- __entry->buf_hold = bip->bli_buf->b_hold;
+ __entry->buf_hold = bip->bli_buf->b_lockref.count;
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
__entry->buf_lockval = bip->bli_buf->b_sema.count;
__entry->li_flags = bip->bli_item.li_flags;
@@ -5185,7 +5185,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
__entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
- __entry->hold = bp->b_hold;
+ __entry->hold = bp->b_lockref.count;
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH 2/4] xfs: use a lockref for the buffer reference count
2026-03-17 13:40 ` [PATCH 2/4] xfs: use a lockref for the buffer reference count Christoph Hellwig
@ 2026-03-17 21:53 ` Dave Chinner
2026-03-18 14:49 ` Christoph Hellwig
0 siblings, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2026-03-17 21:53 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, Brian Foster, linux-xfs,
Darrick J. Wong
On Tue, Mar 17, 2026 at 02:40:53PM +0100, Christoph Hellwig wrote:
> The lockref structure allows incrementing/decrementing counters like
> an atomic_t for the fast path, while still allowing complex slow path
> operations as if the counter was protected by a lock. The only slow
> path operations that actually need to take the lock are the final
> put, LRU evictions and marking a buffer stale.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Seems straight forward enough, even if it is more verbose...
> @@ -853,16 +837,14 @@ xfs_buf_hold(
> {
> trace_xfs_buf_hold(bp, _RET_IP_);
>
> - spin_lock(&bp->b_lock);
> - bp->b_hold++;
> - spin_unlock(&bp->b_lock);
> + lockref_get(&bp->b_lockref);
> }
Can we make xfs_buf_hold a static inline in xfs_buf.h now? It is
called quite frequently and it's now just a one line wrapper
around the lockref code...
> @@ -1564,10 +1547,10 @@ xfs_buftarg_isolate(
> struct list_head *dispose = arg;
>
> /*
> - * We are inverting the lru lock vs bp->b_lock order here, so use a
> - * trylock. If we fail to get the lock, just skip the buffer.
> + * We are inverting the lru lock vs bp->b_lockref.lock order here, so
> + * use a trylock. If we fail to get the lock, just skip the buffer.
> */
> - if (!spin_trylock(&bp->b_lock))
> + if (!spin_trylock(&bp->b_lockref.lock))
> return LRU_SKIP;
>
> /*
You modify this comment and whitespace in the previous patch without
any code change, and then do it again in this patch with a code
change. Can you collapse that just into a single change in this
patch with the code change?
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index e7324d58bd96..3a1d066e1c13 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -14,6 +14,7 @@
> #include <linux/dax.h>
> #include <linux/uio.h>
> #include <linux/list_lru.h>
> +#include <linux/lockref.h>
>
> extern struct kmem_cache *xfs_buf_cache;
>
> @@ -154,7 +155,7 @@ struct xfs_buf {
>
> xfs_daddr_t b_rhash_key; /* buffer cache index */
> int b_length; /* size of buffer in BBs */
> - int b_hold; /* reference count */
> + struct lockref b_lockref; /* refcount + lock */
> atomic_t b_lru_ref; /* lru reclaim ref count */
IIUC, this adds a 4 byte hole to the structure - b_lockref is an aligned
u64...
> xfs_buf_flags_t b_flags; /* status flags */
And we end up with another 4 byte hole here because...
> struct semaphore b_sema; /* semaphore for lockables */
the struct semaphore also ends up being u64 aligned...
The first cacheline of the xfs_buf is explicitly
packed with the fields that a hash lookup needs to
minimise cacheline misses during lookup, so we really don't want to
screw that up by wasting space in the first cacheline.
Indeed, seeing as we've simplified the structure of the cache index
over time and are moving back to a global cache index, we probably
need to take another look at exactly where the cacheline boundary
sits and exactly what the lockless lookup fast path accesses. i.e.
make sure that lookups only access the first cacheline, and there is
nothing on that first cacheline that any other concurrent buffer
access might contend on (e.g. lru list scanning, wait queues, pure
read-only accesses, etc.)
Otherwise, the code change looks ok.
-Dave.
--
Dave Chinner
dgc@kernel.org
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH 2/4] xfs: use a lockref for the buffer reference count
2026-03-17 21:53 ` Dave Chinner
@ 2026-03-18 14:49 ` Christoph Hellwig
0 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2026-03-18 14:49 UTC (permalink / raw)
To: Dave Chinner
Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, Brian Foster,
linux-xfs, Darrick J. Wong
On Wed, Mar 18, 2026 at 08:53:13AM +1100, Dave Chinner wrote:
> > - spin_lock(&bp->b_lock);
> > - bp->b_hold++;
> > - spin_unlock(&bp->b_lock);
> > + lockref_get(&bp->b_lockref);
> > }
>
> Can we make xfs_buf_hold a static inline in xfs_buf.h now? It is
> called quite frequently and it's now just a one line wrapper
> around the lockref code...
Well, that plus the trace point. For which we'd need to pull in
xfs_trace.h into xfs_buf.h, which I've been trying to avoid.
>
> > @@ -1564,10 +1547,10 @@ xfs_buftarg_isolate(
> > struct list_head *dispose = arg;
> >
> > /*
> > - * We are inverting the lru lock vs bp->b_lock order here, so use a
> > - * trylock. If we fail to get the lock, just skip the buffer.
> > + * We are inverting the lru lock vs bp->b_lockref.lock order here, so
> > + * use a trylock. If we fail to get the lock, just skip the buffer.
> > */
> > - if (!spin_trylock(&bp->b_lock))
> > + if (!spin_trylock(&bp->b_lockref.lock))
> > return LRU_SKIP;
> >
> > /*
>
> You modify this comment and whitespace in the previous patch without
> any code change, and then do it again in this patch with a code
> change. Can you collapse that just into a single change in this
> patch with the code change?
Sure, done.
> > xfs_daddr_t b_rhash_key; /* buffer cache index */
> > int b_length; /* size of buffer in BBs */
> > - int b_hold; /* reference count */
> > + struct lockref b_lockref; /* refcount + lock */
> > atomic_t b_lru_ref; /* lru reclaim ref count */
>
> IIUC, this adds a 4 byte hole to the structure - b_lockref is an aligned
> u64...
Yes.
> The first cacheline of the xfs_buf is explicitly packed
> packed with the fields that a hash lookup needs to
> minimise cacheline misses during lookup, so we really don't want to
> screw that up by wasting space in the first cacheline.
>
> Indeed, seeing as we've simplified the structure of the cache index
> over time and are moving back to a global cache index, we probably
> need to take another look at exactly where the cacheline boundary
> sits and exactly what the lockless lookup fast path accesses. i.e.
> make sure that lookups only access the first cacheline, and there is
> nothing on that first cacheline that any other concurrent buffer
> access might contend on (e.g. lru list scanning, wait queues, pure
> read-only accesses, etc.)
With this change, the first 64-byte cache line ends after b_flags.
So even as-is it avoids having b_sema spread across cache lines (all according
So even as-is it avoid having b_sema spread cache lines (all according
to b_sema). So for now I'd like to avoid reordering things in this
series, even if I agree with the sentiment that a good analysis
here might be in order. Maybe wait until willy's series to shrink
the semaphore lands, which will change things a bit again.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 3/4] xfs: switch (back) to a per-buftarg buffer hash
2026-03-17 13:40 buffer cache simplification v5 Christoph Hellwig
2026-03-17 13:40 ` [PATCH 1/4] xfs: don't keep a reference for buffers on the LRU Christoph Hellwig
2026-03-17 13:40 ` [PATCH 2/4] xfs: use a lockref for the buffer reference count Christoph Hellwig
@ 2026-03-17 13:40 ` Christoph Hellwig
2026-03-17 22:00 ` Dave Chinner
2026-03-18 12:14 ` Brian Foster
2026-03-17 13:40 ` [PATCH 4/4] xfs: don't decrement the buffer LRU count for in-use buffers Christoph Hellwig
3 siblings, 2 replies; 16+ messages in thread
From: Christoph Hellwig @ 2026-03-17 13:40 UTC (permalink / raw)
To: Carlos Maiolino
Cc: Dave Chinner, Brian Foster, linux-xfs,
syzbot+0391d34e801643e2809b, Darrick J. Wong
The per-AG buffer hashes were added when all buffer lookups took a
per-hash lock. Since then we've made lookups entirely lockless and
removed the need for a hash-wide lock for inserts and removals as
well. With this there is no need to shard the hash, so reduce the
used resources by using a per-buftarg hash for all buftargs.
Long after writing this initially, syzbot found a problem in the buffer
cache teardown order, which this happens to fix as well by doing the
entire buffer cache teardown in one place instead of splitting it
between destroying the buftarg and the perag structures.
Link: https://lore.kernel.org/linux-xfs/aLeUdemAZ5wmtZel@dread.disaster.area/
Reported-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Tested-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_ag.c | 13 ++---------
fs/xfs/libxfs/xfs_ag.h | 2 --
fs/xfs/xfs_buf.c | 51 +++++++++++-------------------------------
fs/xfs/xfs_buf.h | 10 +--------
fs/xfs/xfs_buf_mem.c | 11 ++-------
5 files changed, 18 insertions(+), 69 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index bd8fbb40b49e..dcd2f93b6a6c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -110,10 +110,7 @@ xfs_perag_uninit(
struct xfs_group *xg)
{
#ifdef __KERNEL__
- struct xfs_perag *pag = to_perag(xg);
-
- cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_buf_cache_destroy(&pag->pag_bcache);
+ cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
#endif
}
@@ -235,10 +232,6 @@ xfs_perag_alloc(
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
#endif /* __KERNEL__ */
- error = xfs_buf_cache_init(&pag->pag_bcache);
- if (error)
- goto out_free_perag;
-
/*
* Pre-calculated geometry
*/
@@ -250,12 +243,10 @@ xfs_perag_alloc(
error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
if (error)
- goto out_buf_cache_destroy;
+ goto out_free_perag;
return 0;
-out_buf_cache_destroy:
- xfs_buf_cache_destroy(&pag->pag_bcache);
out_free_perag:
kfree(pag);
return error;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 3cd4790768ff..16a9b43a3c27 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -85,8 +85,6 @@ struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
- struct xfs_buf_cache pag_bcache;
-
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d585848a6be1..c0a4d0a37f57 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -363,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
.obj_cmpfn = _xfs_buf_obj_cmp,
};
-int
-xfs_buf_cache_init(
- struct xfs_buf_cache *bch)
-{
- return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
-}
-
-void
-xfs_buf_cache_destroy(
- struct xfs_buf_cache *bch)
-{
- rhashtable_destroy(&bch->bc_hash);
-}
-
static int
xfs_buf_map_verify(
struct xfs_buftarg *btp,
@@ -434,7 +420,7 @@ xfs_buf_find_lock(
static inline int
xfs_buf_lookup(
- struct xfs_buf_cache *bch,
+ struct xfs_buftarg *btp,
struct xfs_buf_map *map,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
@@ -443,7 +429,7 @@ xfs_buf_lookup(
int error;
rcu_read_lock();
- bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
+ bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
rcu_read_unlock();
return -ENOENT;
@@ -468,7 +454,6 @@ xfs_buf_lookup(
static int
xfs_buf_find_insert(
struct xfs_buftarg *btp,
- struct xfs_buf_cache *bch,
struct xfs_perag *pag,
struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
@@ -488,7 +473,7 @@ xfs_buf_find_insert(
new_bp->b_pag = pag;
rcu_read_lock();
- bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
+ bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
rcu_read_unlock();
@@ -530,16 +515,6 @@ xfs_buftarg_get_pag(
return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
}
-static inline struct xfs_buf_cache *
-xfs_buftarg_buf_cache(
- struct xfs_buftarg *btp,
- struct xfs_perag *pag)
-{
- if (pag)
- return &pag->pag_bcache;
- return btp->bt_cache;
-}
-
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -553,7 +528,6 @@ xfs_buf_get_map(
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
- struct xfs_buf_cache *bch;
struct xfs_perag *pag;
struct xfs_buf *bp = NULL;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
@@ -570,9 +544,8 @@ xfs_buf_get_map(
return error;
pag = xfs_buftarg_get_pag(btp, &cmap);
- bch = xfs_buftarg_buf_cache(btp, pag);
- error = xfs_buf_lookup(bch, &cmap, flags, &bp);
+ error = xfs_buf_lookup(btp, &cmap, flags, &bp);
if (error && error != -ENOENT)
goto out_put_perag;
@@ -584,7 +557,7 @@ xfs_buf_get_map(
goto out_put_perag;
/* xfs_buf_find_insert() consumes the perag reference. */
- error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
+ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
flags, &bp);
if (error)
return error;
@@ -848,11 +821,8 @@ xfs_buf_destroy(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
if (!xfs_buf_is_uncached(bp)) {
- struct xfs_buf_cache *bch =
- xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
-
- rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
- xfs_buf_hash_params);
+ rhashtable_remove_fast(&bp->b_target->bt_hash,
+ &bp->b_rhash_head, xfs_buf_hash_params);
if (bp->b_pag)
xfs_perag_put(bp->b_pag);
@@ -1618,6 +1588,7 @@ xfs_destroy_buftarg(
ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
percpu_counter_destroy(&btp->bt_readahead_count);
list_lru_destroy(&btp->bt_lru);
+ rhashtable_destroy(&btp->bt_hash);
}
void
@@ -1712,8 +1683,10 @@ xfs_init_buftarg(
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
- if (list_lru_init(&btp->bt_lru))
+ if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
return -ENOMEM;
+ if (list_lru_init(&btp->bt_lru))
+ goto out_destroy_hash;
if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
goto out_destroy_lru;
@@ -1731,6 +1704,8 @@ xfs_init_buftarg(
percpu_counter_destroy(&btp->bt_readahead_count);
out_destroy_lru:
list_lru_destroy(&btp->bt_lru);
+out_destroy_hash:
+ rhashtable_destroy(&btp->bt_hash);
return -ENOMEM;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3a1d066e1c13..bf39d89f0f6d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -69,13 +69,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }
-struct xfs_buf_cache {
- struct rhashtable bc_hash;
-};
-
-int xfs_buf_cache_init(struct xfs_buf_cache *bch);
-void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
-
/*
* The xfs_buftarg contains 2 notions of "sector size" -
*
@@ -113,8 +106,7 @@ struct xfs_buftarg {
unsigned int bt_awu_min;
unsigned int bt_awu_max;
- /* built-in cache, if we're not using the perag one */
- struct xfs_buf_cache bt_cache[];
+ struct rhashtable bt_hash;
};
struct xfs_buf_map {
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index b0b3696bf599..b2fd7276b131 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -58,7 +58,7 @@ xmbuf_alloc(
struct xfs_buftarg *btp;
int error;
- btp = kzalloc_flex(*btp, bt_cache, 1);
+ btp = kzalloc_obj(*btp);
if (!btp)
return -ENOMEM;
@@ -81,10 +81,6 @@ xmbuf_alloc(
/* ensure all writes are below EOF to avoid pagecache zeroing */
i_size_write(inode, inode->i_sb->s_maxbytes);
- error = xfs_buf_cache_init(btp->bt_cache);
- if (error)
- goto out_file;
-
/* Initialize buffer target */
btp->bt_mount = mp;
btp->bt_dev = (dev_t)-1U;
@@ -95,15 +91,13 @@ xmbuf_alloc(
error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
if (error)
- goto out_bcache;
+ goto out_file;
trace_xmbuf_create(btp);
*btpp = btp;
return 0;
-out_bcache:
- xfs_buf_cache_destroy(btp->bt_cache);
out_file:
fput(file);
out_free_btp:
@@ -122,7 +116,6 @@ xmbuf_free(
trace_xmbuf_free(btp);
xfs_destroy_buftarg(btp);
- xfs_buf_cache_destroy(btp->bt_cache);
fput(btp->bt_file);
kfree(btp);
}
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH 3/4] xfs: switch (back) to a per-buftarg buffer hash
2026-03-17 13:40 ` [PATCH 3/4] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
@ 2026-03-17 22:00 ` Dave Chinner
2026-03-18 12:14 ` Brian Foster
1 sibling, 0 replies; 16+ messages in thread
From: Dave Chinner @ 2026-03-17 22:00 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, Brian Foster, linux-xfs,
syzbot+0391d34e801643e2809b, Darrick J. Wong
On Tue, Mar 17, 2026 at 02:40:54PM +0100, Christoph Hellwig wrote:
> The per-AG buffer hashes were added when all buffer lookups took a
> per-hash look. Since then we've made lookups entirely lockless and
> removed the need for a hash-wide lock for inserts and removals as
> well. With this there is no need to sharding the hash, so reduce the
> used resources by using a per-buftarg hash for all buftargs.
>
> Long after writing this initially, syzbot found a problem in the buffer
> cache teardown order, which this happens to fix as well by doing the
> entire buffer cache teardown in one places instead of splitting it
> between destroying the buftarg and the perag structures.
>
> Link: https://lore.kernel.org/linux-xfs/aLeUdemAZ5wmtZel@dread.disaster.area/
> Reported-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> Tested-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/libxfs/xfs_ag.c | 13 ++---------
> fs/xfs/libxfs/xfs_ag.h | 2 --
> fs/xfs/xfs_buf.c | 51 +++++++++++-------------------------------
> fs/xfs/xfs_buf.h | 10 +--------
> fs/xfs/xfs_buf_mem.c | 11 ++-------
> 5 files changed, 18 insertions(+), 69 deletions(-)
Looks fine from a logic POV - the LRU life cycle now sits inside the
hash table life cycle.
I'd also suggest that the minimum size of the rhashtable should be
increased - it will always have a higher minimum population as a
global table than a set of per-ag tables. We should try to avoid
thrashing on resizing when the filesystem is mostly idle and/or
under memory pressure, so I think a larger min size should be
specified along with this globalisation change...
-Dave.
--
Dave Chinner
dgc@kernel.org
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 3/4] xfs: switch (back) to a per-buftarg buffer hash
2026-03-17 13:40 ` [PATCH 3/4] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
2026-03-17 22:00 ` Dave Chinner
@ 2026-03-18 12:14 ` Brian Foster
1 sibling, 0 replies; 16+ messages in thread
From: Brian Foster @ 2026-03-18 12:14 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, linux-xfs,
syzbot+0391d34e801643e2809b, Darrick J. Wong
On Tue, Mar 17, 2026 at 02:40:54PM +0100, Christoph Hellwig wrote:
> The per-AG buffer hashes were added when all buffer lookups took a
> per-hash look. Since then we've made lookups entirely lockless and
> removed the need for a hash-wide lock for inserts and removals as
> well. With this there is no need to sharding the hash, so reduce the
> used resources by using a per-buftarg hash for all buftargs.
>
> Long after writing this initially, syzbot found a problem in the buffer
> cache teardown order, which this happens to fix as well by doing the
> entire buffer cache teardown in one places instead of splitting it
> between destroying the buftarg and the perag structures.
>
> Link: https://lore.kernel.org/linux-xfs/aLeUdemAZ5wmtZel@dread.disaster.area/
> Reported-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> Tested-by: syzbot+0391d34e801643e2809b@syzkaller.appspotmail.com
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
Modulo Dave's comments:
Reviewed-by: Brian Foster <bfoster@redhat.com>
> fs/xfs/libxfs/xfs_ag.c | 13 ++---------
> fs/xfs/libxfs/xfs_ag.h | 2 --
> fs/xfs/xfs_buf.c | 51 +++++++++++-------------------------------
> fs/xfs/xfs_buf.h | 10 +--------
> fs/xfs/xfs_buf_mem.c | 11 ++-------
> 5 files changed, 18 insertions(+), 69 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> index bd8fbb40b49e..dcd2f93b6a6c 100644
> --- a/fs/xfs/libxfs/xfs_ag.c
> +++ b/fs/xfs/libxfs/xfs_ag.c
> @@ -110,10 +110,7 @@ xfs_perag_uninit(
> struct xfs_group *xg)
> {
> #ifdef __KERNEL__
> - struct xfs_perag *pag = to_perag(xg);
> -
> - cancel_delayed_work_sync(&pag->pag_blockgc_work);
> - xfs_buf_cache_destroy(&pag->pag_bcache);
> + cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work);
> #endif
> }
>
> @@ -235,10 +232,6 @@ xfs_perag_alloc(
> INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
> #endif /* __KERNEL__ */
>
> - error = xfs_buf_cache_init(&pag->pag_bcache);
> - if (error)
> - goto out_free_perag;
> -
> /*
> * Pre-calculated geometry
> */
> @@ -250,12 +243,10 @@ xfs_perag_alloc(
>
> error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
> if (error)
> - goto out_buf_cache_destroy;
> + goto out_free_perag;
>
> return 0;
>
> -out_buf_cache_destroy:
> - xfs_buf_cache_destroy(&pag->pag_bcache);
> out_free_perag:
> kfree(pag);
> return error;
> diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
> index 3cd4790768ff..16a9b43a3c27 100644
> --- a/fs/xfs/libxfs/xfs_ag.h
> +++ b/fs/xfs/libxfs/xfs_ag.h
> @@ -85,8 +85,6 @@ struct xfs_perag {
> int pag_ici_reclaimable; /* reclaimable inodes */
> unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
>
> - struct xfs_buf_cache pag_bcache;
> -
> /* background prealloc block trimming */
> struct delayed_work pag_blockgc_work;
> #endif /* __KERNEL__ */
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index d585848a6be1..c0a4d0a37f57 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -363,20 +363,6 @@ static const struct rhashtable_params xfs_buf_hash_params = {
> .obj_cmpfn = _xfs_buf_obj_cmp,
> };
>
> -int
> -xfs_buf_cache_init(
> - struct xfs_buf_cache *bch)
> -{
> - return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
> -}
> -
> -void
> -xfs_buf_cache_destroy(
> - struct xfs_buf_cache *bch)
> -{
> - rhashtable_destroy(&bch->bc_hash);
> -}
> -
> static int
> xfs_buf_map_verify(
> struct xfs_buftarg *btp,
> @@ -434,7 +420,7 @@ xfs_buf_find_lock(
>
> static inline int
> xfs_buf_lookup(
> - struct xfs_buf_cache *bch,
> + struct xfs_buftarg *btp,
> struct xfs_buf_map *map,
> xfs_buf_flags_t flags,
> struct xfs_buf **bpp)
> @@ -443,7 +429,7 @@ xfs_buf_lookup(
> int error;
>
> rcu_read_lock();
> - bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
> + bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params);
> if (!bp || !lockref_get_not_dead(&bp->b_lockref)) {
> rcu_read_unlock();
> return -ENOENT;
> @@ -468,7 +454,6 @@ xfs_buf_lookup(
> static int
> xfs_buf_find_insert(
> struct xfs_buftarg *btp,
> - struct xfs_buf_cache *bch,
> struct xfs_perag *pag,
> struct xfs_buf_map *cmap,
> struct xfs_buf_map *map,
> @@ -488,7 +473,7 @@ xfs_buf_find_insert(
> new_bp->b_pag = pag;
>
> rcu_read_lock();
> - bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
> + bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
> &new_bp->b_rhash_head, xfs_buf_hash_params);
> if (IS_ERR(bp)) {
> rcu_read_unlock();
> @@ -530,16 +515,6 @@ xfs_buftarg_get_pag(
> return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
> }
>
> -static inline struct xfs_buf_cache *
> -xfs_buftarg_buf_cache(
> - struct xfs_buftarg *btp,
> - struct xfs_perag *pag)
> -{
> - if (pag)
> - return &pag->pag_bcache;
> - return btp->bt_cache;
> -}
> -
> /*
> * Assembles a buffer covering the specified range. The code is optimised for
> * cache hits, as metadata intensive workloads will see 3 orders of magnitude
> @@ -553,7 +528,6 @@ xfs_buf_get_map(
> xfs_buf_flags_t flags,
> struct xfs_buf **bpp)
> {
> - struct xfs_buf_cache *bch;
> struct xfs_perag *pag;
> struct xfs_buf *bp = NULL;
> struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
> @@ -570,9 +544,8 @@ xfs_buf_get_map(
> return error;
>
> pag = xfs_buftarg_get_pag(btp, &cmap);
> - bch = xfs_buftarg_buf_cache(btp, pag);
>
> - error = xfs_buf_lookup(bch, &cmap, flags, &bp);
> + error = xfs_buf_lookup(btp, &cmap, flags, &bp);
> if (error && error != -ENOENT)
> goto out_put_perag;
>
> @@ -584,7 +557,7 @@ xfs_buf_get_map(
> goto out_put_perag;
>
> /* xfs_buf_find_insert() consumes the perag reference. */
> - error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
> + error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
> flags, &bp);
> if (error)
> return error;
> @@ -848,11 +821,8 @@ xfs_buf_destroy(
> ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
>
> if (!xfs_buf_is_uncached(bp)) {
> - struct xfs_buf_cache *bch =
> - xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
> -
> - rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
> - xfs_buf_hash_params);
> + rhashtable_remove_fast(&bp->b_target->bt_hash,
> + &bp->b_rhash_head, xfs_buf_hash_params);
>
> if (bp->b_pag)
> xfs_perag_put(bp->b_pag);
> @@ -1618,6 +1588,7 @@ xfs_destroy_buftarg(
> ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
> percpu_counter_destroy(&btp->bt_readahead_count);
> list_lru_destroy(&btp->bt_lru);
> + rhashtable_destroy(&btp->bt_hash);
> }
>
> void
> @@ -1712,8 +1683,10 @@ xfs_init_buftarg(
> ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
> DEFAULT_RATELIMIT_BURST);
>
> - if (list_lru_init(&btp->bt_lru))
> + if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params))
> return -ENOMEM;
> + if (list_lru_init(&btp->bt_lru))
> + goto out_destroy_hash;
> if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
> goto out_destroy_lru;
>
> @@ -1731,6 +1704,8 @@ xfs_init_buftarg(
> percpu_counter_destroy(&btp->bt_readahead_count);
> out_destroy_lru:
> list_lru_destroy(&btp->bt_lru);
> +out_destroy_hash:
> + rhashtable_destroy(&btp->bt_hash);
> return -ENOMEM;
> }
>
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 3a1d066e1c13..bf39d89f0f6d 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -69,13 +69,6 @@ typedef unsigned int xfs_buf_flags_t;
> { XBF_INCORE, "INCORE" }, \
> { XBF_TRYLOCK, "TRYLOCK" }
>
> -struct xfs_buf_cache {
> - struct rhashtable bc_hash;
> -};
> -
> -int xfs_buf_cache_init(struct xfs_buf_cache *bch);
> -void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
> -
> /*
> * The xfs_buftarg contains 2 notions of "sector size" -
> *
> @@ -113,8 +106,7 @@ struct xfs_buftarg {
> unsigned int bt_awu_min;
> unsigned int bt_awu_max;
>
> - /* built-in cache, if we're not using the perag one */
> - struct xfs_buf_cache bt_cache[];
> + struct rhashtable bt_hash;
> };
>
> struct xfs_buf_map {
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index b0b3696bf599..b2fd7276b131 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -58,7 +58,7 @@ xmbuf_alloc(
> struct xfs_buftarg *btp;
> int error;
>
> - btp = kzalloc_flex(*btp, bt_cache, 1);
> + btp = kzalloc_obj(*btp);
> if (!btp)
> return -ENOMEM;
>
> @@ -81,10 +81,6 @@ xmbuf_alloc(
> /* ensure all writes are below EOF to avoid pagecache zeroing */
> i_size_write(inode, inode->i_sb->s_maxbytes);
>
> - error = xfs_buf_cache_init(btp->bt_cache);
> - if (error)
> - goto out_file;
> -
> /* Initialize buffer target */
> btp->bt_mount = mp;
> btp->bt_dev = (dev_t)-1U;
> @@ -95,15 +91,13 @@ xmbuf_alloc(
>
> error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
> if (error)
> - goto out_bcache;
> + goto out_file;
>
> trace_xmbuf_create(btp);
>
> *btpp = btp;
> return 0;
>
> -out_bcache:
> - xfs_buf_cache_destroy(btp->bt_cache);
> out_file:
> fput(file);
> out_free_btp:
> @@ -122,7 +116,6 @@ xmbuf_free(
> trace_xmbuf_free(btp);
>
> xfs_destroy_buftarg(btp);
> - xfs_buf_cache_destroy(btp->bt_cache);
> fput(btp->bt_file);
> kfree(btp);
> }
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 4/4] xfs: don't decrement the buffer LRU count for in-use buffers
2026-03-17 13:40 buffer cache simplification v5 Christoph Hellwig
` (2 preceding siblings ...)
2026-03-17 13:40 ` [PATCH 3/4] xfs: switch (back) to a per-buftarg buffer hash Christoph Hellwig
@ 2026-03-17 13:40 ` Christoph Hellwig
2026-03-17 22:06 ` Dave Chinner
2026-03-18 11:45 ` Brian Foster
3 siblings, 2 replies; 16+ messages in thread
From: Christoph Hellwig @ 2026-03-17 13:40 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Dave Chinner, Brian Foster, linux-xfs, Darrick J. Wong
XFS buffers are added to the LRU when they are unused, but are only
removed from the LRU lazily when the LRU list scan finds a used buffer.
So far this only happens when the LRU counter hits 0, which is suboptimal
as buffers that were added to the LRU, but are in use again still consume
LRU scanning resources and are aged while actually in use.
Fix this by checking for in-use buffers and removing them from the LRU
before decrementing the LRU counter.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/xfs/xfs_buf.c | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c0a4d0a37f57..8ba9b74339a7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1523,6 +1523,18 @@ xfs_buftarg_isolate(
if (!spin_trylock(&bp->b_lockref.lock))
return LRU_SKIP;
+ /*
+ * If the buffer is in use, remove it from the LRU for now. We can't
+ * free it while someone is using it, and we should also not count
+ * eviction passed for it, just as if it hadn't been added to the LRU
+ * yet.
+ */
+ if (bp->b_lockref.count > 0) {
+ list_lru_isolate(lru, &bp->b_lru);
+ spin_unlock(&bp->b_lockref.lock);
+ return LRU_REMOVED;
+ }
+
/*
* Decrement the b_lru_ref count unless the value is already
* zero. If the value is already zero, we need to reclaim the
@@ -1533,16 +1545,6 @@ xfs_buftarg_isolate(
return LRU_ROTATE;
}
- /*
- * If the buffer is in use, remove it from the LRU for now as we can't
- * free it. It will be freed when the last reference drops.
- */
- if (bp->b_lockref.count > 0) {
- list_lru_isolate(lru, &bp->b_lru);
- spin_unlock(&bp->b_lockref.lock);
- return LRU_REMOVED;
- }
-
lockref_mark_dead(&bp->b_lockref);
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lockref.lock);
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH 4/4] xfs: don't decrement the buffer LRU count for in-use buffers
2026-03-17 13:40 ` [PATCH 4/4] xfs: don't decrement the buffer LRU count for in-use buffers Christoph Hellwig
@ 2026-03-17 22:06 ` Dave Chinner
2026-03-18 11:47 ` Brian Foster
2026-03-18 11:45 ` Brian Foster
1 sibling, 1 reply; 16+ messages in thread
From: Dave Chinner @ 2026-03-17 22:06 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, Brian Foster, linux-xfs,
Darrick J. Wong
On Tue, Mar 17, 2026 at 02:40:55PM +0100, Christoph Hellwig wrote:
> XFS buffers are added to the LRU when they are unused, but are only
> removed from the LRU lazily when the LRU list scan finds a used buffer.
> So far this only happen when the LRU counter hits 0, which is suboptimal
> as buffers that were added to the LRU, but are in use again still consume
> LRU scanning resources and are aged while actually in use.
>
> Fix this by checking for in-use buffers and removing the from the LRU
> before decrementing the LRU counter.
So I wondered why you put the "in use" check where you did a couple
of patches ago - it didn't make a lot of sense to me because 'in use
buffers' typically reset the lru count to their maximum at the same
time they lookup, lock and take a reference to the buffer. Then the
lock and reference are dropped at the same time when we are done
with the buffer, so it never made sense to decrement the LRU count
whilst buffers are locked and referenced...
Can you just fold this back into the original patch that introduces
this check?
-Dave.
--
Dave Chinner
dgc@kernel.org
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 4/4] xfs: don't decrement the buffer LRU count for in-use buffers
2026-03-17 22:06 ` Dave Chinner
@ 2026-03-18 11:47 ` Brian Foster
0 siblings, 0 replies; 16+ messages in thread
From: Brian Foster @ 2026-03-18 11:47 UTC (permalink / raw)
To: Dave Chinner
Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs,
Darrick J. Wong
On Wed, Mar 18, 2026 at 09:06:42AM +1100, Dave Chinner wrote:
> On Tue, Mar 17, 2026 at 02:40:55PM +0100, Christoph Hellwig wrote:
> > XFS buffers are added to the LRU when they are unused, but are only
> > removed from the LRU lazily when the LRU list scan finds a used buffer.
> > So far this only happen when the LRU counter hits 0, which is suboptimal
> > as buffers that were added to the LRU, but are in use again still consume
> > LRU scanning resources and are aged while actually in use.
> >
> > Fix this by checking for in-use buffers and removing the from the LRU
> > before decrementing the LRU counter.
>
> So I wondered why you put the "in use" check where you did a couple
> of patches ago - it didn't make a lot of sense to me because 'in use
> buffers' typically reset the lru count to their maximum at the same
> time they lookup, lock and take a reference to the buffer. The the
> lock and reference are dropped at the same time when we are done
> with the buffer, so it never made sense to decrement the LRU count
> whilst buffers are locked and referenced...
>
> Can you just fold this back into the original patch that introduces
> this check?
>
I explicitly asked Christoph to split this out in an earlier version
because it unnecessarily combines logical changes. The in-use check in
the first patch has nothing to do with how LRU references are tracked or
set, but rather exists because the LRU itself no longer has a hold on
the buffer. Therefore it cannot maintain its own hold while a used
buffer may go on to sit on the dispose list for some period of time.
Instead it has to either free the buffer or not, so the in-use check
leaves the fate of the buffer to the holder(s).
AFAICT this is effectively similar to how existing code works. Whereas
this patch changes how the LRU refcount is managed by always removing
in-use buffers. Note that I'm not arguing whether current behavior makes
sense, just that it's a change in behavior because regardless of whether
holders reset b_lru_ref, nothing totally prevents an LRU resident buffer
from cycling out while in use. This makes it easier to separate
functional rework from buffer LRU behavior change on the off chance that
this leads to an issue down the road and somebody needs to bisect
through these changes.
Brian
> -Dave.
> --
> Dave Chinner
> dgc@kernel.org
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 4/4] xfs: don't decrement the buffer LRU count for in-use buffers
2026-03-17 13:40 ` [PATCH 4/4] xfs: don't decrement the buffer LRU count for in-use buffers Christoph Hellwig
2026-03-17 22:06 ` Dave Chinner
@ 2026-03-18 11:45 ` Brian Foster
1 sibling, 0 replies; 16+ messages in thread
From: Brian Foster @ 2026-03-18 11:45 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Carlos Maiolino, Dave Chinner, linux-xfs, Darrick J. Wong
On Tue, Mar 17, 2026 at 02:40:55PM +0100, Christoph Hellwig wrote:
> XFS buffers are added to the LRU when they are unused, but are only
> removed from the LRU lazily when the LRU list scan finds a used buffer.
> So far this only happen when the LRU counter hits 0, which is suboptimal
> as buffers that were added to the LRU, but are in use again still consume
> LRU scanning resources and are aged while actually in use.
>
> Fix this by checking for in-use buffers and removing the from the LRU
> before decrementing the LRU counter.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
Reviewed-by: Brian Foster <bfoster@redhat.com>
> fs/xfs/xfs_buf.c | 22 ++++++++++++----------
> 1 file changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index c0a4d0a37f57..8ba9b74339a7 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1523,6 +1523,18 @@ xfs_buftarg_isolate(
> if (!spin_trylock(&bp->b_lockref.lock))
> return LRU_SKIP;
>
> + /*
> + * If the buffer is in use, remove it from the LRU for now. We can't
> + * free it while someone is using it, and we should also not count
> + * eviction passed for it, just as if it hadn't been added to the LRU
> + * yet.
> + */
> + if (bp->b_lockref.count > 0) {
> + list_lru_isolate(lru, &bp->b_lru);
> + spin_unlock(&bp->b_lockref.lock);
> + return LRU_REMOVED;
> + }
> +
> /*
> * Decrement the b_lru_ref count unless the value is already
> * zero. If the value is already zero, we need to reclaim the
> @@ -1533,16 +1545,6 @@ xfs_buftarg_isolate(
> return LRU_ROTATE;
> }
>
> - /*
> - * If the buffer is in use, remove it from the LRU for now as we can't
> - * free it. It will be freed when the last reference drops.
> - */
> - if (bp->b_lockref.count > 0) {
> - list_lru_isolate(lru, &bp->b_lru);
> - spin_unlock(&bp->b_lockref.lock);
> - return LRU_REMOVED;
> - }
> -
> lockref_mark_dead(&bp->b_lockref);
> list_lru_isolate_move(lru, item, dispose);
> spin_unlock(&bp->b_lockref.lock);
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 1/4] xfs: don't keep a reference for buffers on the LRU
2026-03-23 7:50 buffer cache simplification v6 Christoph Hellwig
@ 2026-03-23 7:50 ` Christoph Hellwig
0 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2026-03-23 7:50 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Dave Chinner, Brian Foster, linux-xfs, Darrick J. Wong
Currently the buffer cache adds a reference to b_hold for buffers that
are on the LRU. This seems to go all the way back and allows releasing
buffers from the LRU using xfs_buf_rele. But it makes xfs_buf_rele
really complicated and differs from how other LRUs are implemented in
Linux.
Switch to not having a reference for buffers in the LRU, and use a
separate negative hold value to mark buffers as dead. This simplifies
xfs_buf_rele, which now just deals with the last "real" reference,
and prepares for using the lockref primitive.
This also removes the b_lock protection for removing buffers from the
buffer hash. This is the desired outcome because the rhashtable is
fully internally synchronized, and previously the lock was mostly
held out of ordering constraints in xfs_buf_rele_cached.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/xfs/xfs_buf.c | 134 +++++++++++++++++------------------------------
fs/xfs/xfs_buf.h | 8 +--
2 files changed, 50 insertions(+), 92 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d2f3c50d80e7..61e393ac4952 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,11 +80,8 @@ xfs_buf_stale(
spin_lock(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 0);
- if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
- (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
- bp->b_hold--;
-
- ASSERT(bp->b_hold >= 1);
+ if (bp->b_hold >= 0)
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lock);
}
@@ -442,7 +439,7 @@ xfs_buf_try_hold(
struct xfs_buf *bp)
{
spin_lock(&bp->b_lock);
- if (bp->b_hold == 0) {
+ if (bp->b_hold == -1) {
spin_unlock(&bp->b_lock);
return false;
}
@@ -862,76 +859,24 @@ xfs_buf_hold(
}
static void
-xfs_buf_rele_uncached(
- struct xfs_buf *bp)
-{
- ASSERT(list_empty(&bp->b_lru));
-
- spin_lock(&bp->b_lock);
- if (--bp->b_hold) {
- spin_unlock(&bp->b_lock);
- return;
- }
- spin_unlock(&bp->b_lock);
- xfs_buf_free(bp);
-}
-
-static void
-xfs_buf_rele_cached(
+xfs_buf_destroy(
struct xfs_buf *bp)
{
- struct xfs_buftarg *btp = bp->b_target;
- struct xfs_perag *pag = bp->b_pag;
- struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
- bool freebuf = false;
-
- trace_xfs_buf_rele(bp, _RET_IP_);
-
- spin_lock(&bp->b_lock);
- ASSERT(bp->b_hold >= 1);
- if (bp->b_hold > 1) {
- bp->b_hold--;
- goto out_unlock;
- }
+ ASSERT(bp->b_hold < 0);
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* we are asked to drop the last reference */
- if (atomic_read(&bp->b_lru_ref)) {
- /*
- * If the buffer is added to the LRU, keep the reference to the
- * buffer for the LRU and clear the (now stale) dispose list
- * state flag, else drop the reference.
- */
- if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
- bp->b_state &= ~XFS_BSTATE_DISPOSE;
- else
- bp->b_hold--;
- } else {
- bp->b_hold--;
- /*
- * most of the time buffers will already be removed from the
- * LRU, so optimise that case by checking for the
- * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
- * was on was the disposal list
- */
- if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
- } else {
- ASSERT(list_empty(&bp->b_lru));
- }
+ if (!xfs_buf_is_uncached(bp)) {
+ struct xfs_buf_cache *bch =
+ xfs_buftarg_buf_cache(bp->b_target, bp->b_pag);
- ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
- if (pag)
- xfs_perag_put(pag);
- freebuf = true;
- }
-out_unlock:
- spin_unlock(&bp->b_lock);
+ if (bp->b_pag)
+ xfs_perag_put(bp->b_pag);
+ }
- if (freebuf)
- xfs_buf_free(bp);
+ xfs_buf_free(bp);
}
/*
@@ -942,10 +887,22 @@ xfs_buf_rele(
struct xfs_buf *bp)
{
trace_xfs_buf_rele(bp, _RET_IP_);
- if (xfs_buf_is_uncached(bp))
- xfs_buf_rele_uncached(bp);
- else
- xfs_buf_rele_cached(bp);
+
+ spin_lock(&bp->b_lock);
+ if (!--bp->b_hold) {
+ if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref))
+ goto kill;
+ list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ }
+ spin_unlock(&bp->b_lock);
+ return;
+
+kill:
+ bp->b_hold = -1;
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+
+ xfs_buf_destroy(bp);
}
/*
@@ -1254,9 +1211,11 @@ xfs_buf_ioerror_alert(
/*
* To simulate an I/O failure, the buffer must be locked and held with at least
- * three references. The LRU reference is dropped by the stale call. The buf
- * item reference is dropped via ioend processing. The third reference is owned
- * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ * two references.
+ *
+ * The buf item reference is dropped via ioend processing. The second reference
+ * is owned by the caller and is dropped on I/O completion if the buffer is
+ * XBF_ASYNC.
*/
void
xfs_buf_ioend_fail(
@@ -1514,19 +1473,14 @@ xfs_buftarg_drain_rele(
if (!spin_trylock(&bp->b_lock))
return LRU_SKIP;
- if (bp->b_hold > 1) {
+ if (bp->b_hold > 0) {
/* need to wait, so skip it this pass */
spin_unlock(&bp->b_lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- /*
- * clear the LRU reference count so the buffer doesn't get
- * ignored in xfs_buf_rele().
- */
- atomic_set(&bp->b_lru_ref, 0);
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1581,7 +1535,7 @@ xfs_buftarg_drain(
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
(long long)xfs_buf_daddr(bp));
}
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
if (loop++ != 0)
delay(100);
@@ -1625,7 +1579,17 @@ xfs_buftarg_isolate(
return LRU_ROTATE;
}
- bp->b_state |= XFS_BSTATE_DISPOSE;
+ /*
+ * If the buffer is in use, remove it from the LRU for now as we can't
+ * free it. It will be freed when the last reference drops.
+ */
+ if (bp->b_hold > 0) {
+ list_lru_isolate(lru, &bp->b_lru);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+ }
+
+ bp->b_hold = -1;
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
@@ -1647,7 +1611,7 @@ xfs_buftarg_shrink_scan(
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
- xfs_buf_rele(bp);
+ xfs_buf_destroy(bp);
}
return freed;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e25cd2a160f3..e7324d58bd96 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -68,11 +68,6 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }
-/*
- * Internal state flags.
- */
-#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
-
struct xfs_buf_cache {
struct rhashtable bc_hash;
};
@@ -159,7 +154,7 @@ struct xfs_buf {
xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
- unsigned int b_hold; /* reference count */
+ int b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
@@ -170,7 +165,6 @@ struct xfs_buf {
*/
struct list_head b_lru; /* lru list */
spinlock_t b_lock; /* internal state lock */
- unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag;
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread