public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* buffer cache simplifications
@ 2025-02-17  9:31 Christoph Hellwig
  2025-02-17  9:31 ` [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O Christoph Hellwig
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-17  9:31 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, linux-xfs

Hi all,

this series reduces some superfluous work done in the buffer cache.  Most
notably an extra workqueue context switch for synchronous I/O, and
tracking of in-flight I/O for buffers where that is not needed.

Diffstat:
 xfs_buf.c         |  181 ++++++++++++++++++------------------------------------
 xfs_buf.h         |    7 --
 xfs_buf_mem.c     |    2 
 xfs_log_recover.c |    2 
 xfs_mount.c       |    7 --
 xfs_rtalloc.c     |    2 
 xfs_trace.h       |    1 
 7 files changed, 70 insertions(+), 132 deletions(-)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O
  2025-02-17  9:31 buffer cache simplifications Christoph Hellwig
@ 2025-02-17  9:31 ` Christoph Hellwig
  2025-02-18 20:05   ` Darrick J. Wong
  2025-02-17  9:31 ` [PATCH 2/4] xfs: decouple buffer readahead from the normal buffer read path Christoph Hellwig
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-17  9:31 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, linux-xfs

Currently all metadata I/O completions happen in the m_buf_workqueue
workqueue.  But for synchronous I/O (i.e. all buffer reads) there is no
need for that, as there always is a caller in process context that is
waiting for the I/O.  Factor out the guts of xfs_buf_ioend into a
separate helper and call it from xfs_buf_iowait to avoid an extra
context switch to the workqueue.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 15bb790359f8..050f2c2f6a40 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1345,6 +1345,7 @@ xfs_buf_ioend_handle_error(
 resubmit:
 	xfs_buf_ioerror(bp, 0);
 	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+	reinit_completion(&bp->b_iowait);
 	xfs_buf_submit(bp);
 	return true;
 out_stale:
@@ -1355,8 +1356,8 @@ xfs_buf_ioend_handle_error(
 	return false;
 }
 
-static void
-xfs_buf_ioend(
+static bool
+__xfs_buf_ioend(
 	struct xfs_buf	*bp)
 {
 	trace_xfs_buf_iodone(bp, _RET_IP_);
@@ -1376,7 +1377,7 @@ xfs_buf_ioend(
 		}
 
 		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
-			return;
+			return false;
 
 		/* clear the retry state */
 		bp->b_last_error = 0;
@@ -1397,7 +1398,15 @@ xfs_buf_ioend(
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
 			 _XBF_LOGRECOVERY);
+	return true;
+}
 
+static void
+xfs_buf_ioend(
+	struct xfs_buf	*bp)
+{
+	if (!__xfs_buf_ioend(bp))
+		return;
 	if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
 	else
@@ -1411,15 +1420,8 @@ xfs_buf_ioend_work(
 	struct xfs_buf		*bp =
 		container_of(work, struct xfs_buf, b_ioend_work);
 
-	xfs_buf_ioend(bp);
-}
-
-static void
-xfs_buf_ioend_async(
-	struct xfs_buf	*bp)
-{
-	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
-	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+	if (__xfs_buf_ioend(bp))
+		xfs_buf_relse(bp);
 }
 
 void
@@ -1491,7 +1493,13 @@ xfs_buf_bio_end_io(
 		 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
 		xfs_buf_ioerror(bp, -EIO);
 
-	xfs_buf_ioend_async(bp);
+	if (bp->b_flags & XBF_ASYNC) {
+		INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
+		queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+	} else {
+		complete(&bp->b_iowait);
+	}
+
 	bio_put(bio);
 }
 
@@ -1568,9 +1576,11 @@ xfs_buf_iowait(
 {
 	ASSERT(!(bp->b_flags & XBF_ASYNC));
 
-	trace_xfs_buf_iowait(bp, _RET_IP_);
-	wait_for_completion(&bp->b_iowait);
-	trace_xfs_buf_iowait_done(bp, _RET_IP_);
+	do {
+		trace_xfs_buf_iowait(bp, _RET_IP_);
+		wait_for_completion(&bp->b_iowait);
+		trace_xfs_buf_iowait_done(bp, _RET_IP_);
+	} while (!__xfs_buf_ioend(bp));
 
 	return bp->b_error;
 }
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/4] xfs: decouple buffer readahead from the normal buffer read path
  2025-02-17  9:31 buffer cache simplifications Christoph Hellwig
  2025-02-17  9:31 ` [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O Christoph Hellwig
@ 2025-02-17  9:31 ` Christoph Hellwig
  2025-02-18 20:10   ` Darrick J. Wong
  2025-02-17  9:31 ` [PATCH 3/4] xfs: remove most in-flight buffer accounting Christoph Hellwig
  2025-02-17  9:31 ` [PATCH 4/4] xfs: remove the XBF_STALE check from xfs_buf_rele_cached Christoph Hellwig
  3 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-17  9:31 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, linux-xfs

xfs_buf_readahead_map is the only caller of xfs_buf_read_map and thus
_xfs_buf_read that is not synchronous.  Split it from xfs_buf_read_map
so that the asynchronous path is self-contained and the now purely
synchronous xfs_buf_read_map / _xfs_buf_read implementation can be
simplified.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c         | 41 ++++++++++++++++++++--------------------
 fs/xfs/xfs_buf.h         |  2 +-
 fs/xfs/xfs_log_recover.c |  2 +-
 fs/xfs/xfs_trace.h       |  1 +
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 050f2c2f6a40..52fb85c42e94 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -794,18 +794,13 @@ xfs_buf_get_map(
 
 int
 _xfs_buf_read(
-	struct xfs_buf		*bp,
-	xfs_buf_flags_t		flags)
+	struct xfs_buf		*bp)
 {
-	ASSERT(!(flags & XBF_WRITE));
 	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
 
 	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
-	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
-
+	bp->b_flags |= XBF_READ;
 	xfs_buf_submit(bp);
-	if (flags & XBF_ASYNC)
-		return 0;
 	return xfs_buf_iowait(bp);
 }
 
@@ -857,6 +852,8 @@ xfs_buf_read_map(
 	struct xfs_buf		*bp;
 	int			error;
 
+	ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD)));
+
 	flags |= XBF_READ;
 	*bpp = NULL;
 
@@ -870,21 +867,11 @@ xfs_buf_read_map(
 		/* Initiate the buffer read and wait. */
 		XFS_STATS_INC(target->bt_mount, xb_get_read);
 		bp->b_ops = ops;
-		error = _xfs_buf_read(bp, flags);
-
-		/* Readahead iodone already dropped the buffer, so exit. */
-		if (flags & XBF_ASYNC)
-			return 0;
+		error = _xfs_buf_read(bp);
 	} else {
 		/* Buffer already read; all we need to do is check it. */
 		error = xfs_buf_reverify(bp, ops);
 
-		/* Readahead already finished; drop the buffer and exit. */
-		if (flags & XBF_ASYNC) {
-			xfs_buf_relse(bp);
-			return 0;
-		}
-
 		/* We do not want read in the flags */
 		bp->b_flags &= ~XBF_READ;
 		ASSERT(bp->b_ops != NULL || ops == NULL);
@@ -936,6 +923,7 @@ xfs_buf_readahead_map(
 	int			nmaps,
 	const struct xfs_buf_ops *ops)
 {
+	const xfs_buf_flags_t	flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD;
 	struct xfs_buf		*bp;
 
 	/*
@@ -945,9 +933,20 @@ xfs_buf_readahead_map(
 	if (xfs_buftarg_is_mem(target))
 		return;
 
-	xfs_buf_read_map(target, map, nmaps,
-		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
-		     __this_address);
+	if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp))
+		return;
+	trace_xfs_buf_readahead(bp, 0, _RET_IP_);
+
+	if (bp->b_flags & XBF_DONE) {
+		xfs_buf_reverify(bp, ops);
+		xfs_buf_relse(bp);
+		return;
+	}
+	XFS_STATS_INC(target->bt_mount, xb_get_read);
+	bp->b_ops = ops;
+	bp->b_flags &= ~(XBF_WRITE | XBF_DONE);
+	bp->b_flags |= flags;
+	xfs_buf_submit(bp);
 }
 
 /*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3b4ed42e11c0..2e747555ad3f 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -291,7 +291,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
 int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
 		size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
 		const struct xfs_buf_ops *ops);
-int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
+int _xfs_buf_read(struct xfs_buf *bp);
 void xfs_buf_hold(struct xfs_buf *bp);
 
 /* Releasing Buffers */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b3c27dbccce8..2f76531842f8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3380,7 +3380,7 @@ xlog_do_recover(
 	 */
 	xfs_buf_lock(bp);
 	xfs_buf_hold(bp);
-	error = _xfs_buf_read(bp, XBF_READ);
+	error = _xfs_buf_read(bp);
 	if (error) {
 		if (!xlog_is_shutdown(log)) {
 			xfs_buf_ioerror_alert(bp, __this_address);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index b29462363b81..bfc2f1249022 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -593,6 +593,7 @@ DEFINE_EVENT(xfs_buf_flags_class, name, \
 DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
 DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
 DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_readahead);
 
 TRACE_EVENT(xfs_buf_ioerror,
 	TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip),
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/4] xfs: remove most in-flight buffer accounting
  2025-02-17  9:31 buffer cache simplifications Christoph Hellwig
  2025-02-17  9:31 ` [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O Christoph Hellwig
  2025-02-17  9:31 ` [PATCH 2/4] xfs: decouple buffer readahead from the normal buffer read path Christoph Hellwig
@ 2025-02-17  9:31 ` Christoph Hellwig
  2025-02-18 20:23   ` Darrick J. Wong
  2025-02-17  9:31 ` [PATCH 4/4] xfs: remove the XBF_STALE check from xfs_buf_rele_cached Christoph Hellwig
  3 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-17  9:31 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, linux-xfs

The buffer cache keeps a bt_io_count per-CPU counter to track all
in-flight I/O, which is used to ensure no I/O is in flight when
unmounting the file system.

For most I/O we already keep track of inflight I/O at higher levels:

 - for synchronous I/O (xfs_buf_read/xfs_bwrite/xfs_buf_delwri_submit),
   the caller has a reference and waits for I/O completions using
   xfs_buf_iowait
 - for xfs_buf_delwri_submit the only caller (AIL writeback) tracks the
   log items that the buffer is attached to

This leaves only xfs_buf_readahead_map as a submitter of
asynchronous I/O that is not tracked by anything else.  Replace the
bt_io_count per-cpu counter with a more specific bt_readahead_count
counter that only tracks readahead I/O.  This allows simply incrementing
it when submitting readahead I/O and decrementing it when it completes,
and thus simplifies xfs_buf_rele and removes the need for the
XBF_NO_IOACCT flag and the XFS_BSTATE_IN_FLIGHT buffer state.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c     | 90 ++++++++------------------------------------
 fs/xfs/xfs_buf.h     |  5 +--
 fs/xfs/xfs_buf_mem.c |  2 +-
 fs/xfs/xfs_mount.c   |  7 +---
 fs/xfs/xfs_rtalloc.c |  2 +-
 5 files changed, 20 insertions(+), 86 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 52fb85c42e94..f8efdee3c8b4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -29,11 +29,6 @@ struct kmem_cache *xfs_buf_cache;
 /*
  * Locking orders
  *
- * xfs_buf_ioacct_inc:
- * xfs_buf_ioacct_dec:
- *	b_sema (caller holds)
- *	  b_lock
- *
  * xfs_buf_stale:
  *	b_sema (caller holds)
  *	  b_lock
@@ -81,51 +76,6 @@ xfs_buf_vmap_len(
 	return (bp->b_page_count * PAGE_SIZE);
 }
 
-/*
- * Bump the I/O in flight count on the buftarg if we haven't yet done so for
- * this buffer. The count is incremented once per buffer (per hold cycle)
- * because the corresponding decrement is deferred to buffer release. Buffers
- * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
- * tracking adds unnecessary overhead. This is used for sychronization purposes
- * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
- * in-flight buffers.
- *
- * Buffers that are never released (e.g., superblock, iclog buffers) must set
- * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
- * never reaches zero and unmount hangs indefinitely.
- */
-static inline void
-xfs_buf_ioacct_inc(
-	struct xfs_buf	*bp)
-{
-	if (bp->b_flags & XBF_NO_IOACCT)
-		return;
-
-	ASSERT(bp->b_flags & XBF_ASYNC);
-	spin_lock(&bp->b_lock);
-	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
-		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
-		percpu_counter_inc(&bp->b_target->bt_io_count);
-	}
-	spin_unlock(&bp->b_lock);
-}
-
-/*
- * Clear the in-flight state on a buffer about to be released to the LRU or
- * freed and unaccount from the buftarg.
- */
-static inline void
-__xfs_buf_ioacct_dec(
-	struct xfs_buf	*bp)
-{
-	lockdep_assert_held(&bp->b_lock);
-
-	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
-		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
-		percpu_counter_dec(&bp->b_target->bt_io_count);
-	}
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -156,8 +106,6 @@ xfs_buf_stale(
 	 * status now to preserve accounting consistency.
 	 */
 	spin_lock(&bp->b_lock);
-	__xfs_buf_ioacct_dec(bp);
-
 	atomic_set(&bp->b_lru_ref, 0);
 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
 	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
@@ -946,6 +894,7 @@ xfs_buf_readahead_map(
 	bp->b_ops = ops;
 	bp->b_flags &= ~(XBF_WRITE | XBF_DONE);
 	bp->b_flags |= flags;
+	percpu_counter_inc(&target->bt_readahead_count);
 	xfs_buf_submit(bp);
 }
 
@@ -1002,10 +951,12 @@ xfs_buf_get_uncached(
 	struct xfs_buf		*bp;
 	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
 
+	/* there are currently no valid flags for xfs_buf_get_uncached */
+	ASSERT(flags == 0);
+
 	*bpp = NULL;
 
-	/* flags might contain irrelevant bits, pass only what we care about */
-	error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
+	error = _xfs_buf_alloc(target, &map, 1, flags, &bp);
 	if (error)
 		return error;
 
@@ -1059,7 +1010,6 @@ xfs_buf_rele_uncached(
 		spin_unlock(&bp->b_lock);
 		return;
 	}
-	__xfs_buf_ioacct_dec(bp);
 	spin_unlock(&bp->b_lock);
 	xfs_buf_free(bp);
 }
@@ -1078,19 +1028,11 @@ xfs_buf_rele_cached(
 	spin_lock(&bp->b_lock);
 	ASSERT(bp->b_hold >= 1);
 	if (bp->b_hold > 1) {
-		/*
-		 * Drop the in-flight state if the buffer is already on the LRU
-		 * and it holds the only reference. This is racy because we
-		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
-		 * ensures the decrement occurs only once per-buf.
-		 */
-		if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
-			__xfs_buf_ioacct_dec(bp);
+		bp->b_hold--;
 		goto out_unlock;
 	}
 
 	/* we are asked to drop the last reference */
-	__xfs_buf_ioacct_dec(bp);
 	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
 		/*
 		 * If the buffer is added to the LRU, keep the reference to the
@@ -1369,6 +1311,8 @@ __xfs_buf_ioend(
 			bp->b_ops->verify_read(bp);
 		if (!bp->b_error)
 			bp->b_flags |= XBF_DONE;
+		if (bp->b_flags & XBF_READ_AHEAD)
+			percpu_counter_dec(&bp->b_target->bt_readahead_count);
 	} else {
 		if (!bp->b_error) {
 			bp->b_flags &= ~XBF_WRITE_FAIL;
@@ -1657,9 +1601,6 @@ xfs_buf_submit(
 	 */
 	bp->b_error = 0;
 
-	if (bp->b_flags & XBF_ASYNC)
-		xfs_buf_ioacct_inc(bp);
-
 	if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) {
 		xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE);
 		xfs_buf_ioend(bp);
@@ -1785,9 +1726,8 @@ xfs_buftarg_wait(
 	struct xfs_buftarg	*btp)
 {
 	/*
-	 * First wait on the buftarg I/O count for all in-flight buffers to be
-	 * released. This is critical as new buffers do not make the LRU until
-	 * they are released.
+	 * First wait for all in-flight readahead buffers to be released.  This is
+	 * critical as new buffers do not make the LRU until they are released.
 	 *
 	 * Next, flush the buffer workqueue to ensure all completion processing
 	 * has finished. Just waiting on buffer locks is not sufficient for
@@ -1796,7 +1736,7 @@ xfs_buftarg_wait(
 	 * all reference counts have been dropped before we start walking the
 	 * LRU list.
 	 */
-	while (percpu_counter_sum(&btp->bt_io_count))
+	while (percpu_counter_sum(&btp->bt_readahead_count))
 		delay(100);
 	flush_workqueue(btp->bt_mount->m_buf_workqueue);
 }
@@ -1913,8 +1853,8 @@ xfs_destroy_buftarg(
 	struct xfs_buftarg	*btp)
 {
 	shrinker_free(btp->bt_shrinker);
-	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
-	percpu_counter_destroy(&btp->bt_io_count);
+	ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
+	percpu_counter_destroy(&btp->bt_readahead_count);
 	list_lru_destroy(&btp->bt_lru);
 }
 
@@ -1968,7 +1908,7 @@ xfs_init_buftarg(
 
 	if (list_lru_init(&btp->bt_lru))
 		return -ENOMEM;
-	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+	if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
 		goto out_destroy_lru;
 
 	btp->bt_shrinker =
@@ -1982,7 +1922,7 @@ xfs_init_buftarg(
 	return 0;
 
 out_destroy_io_count:
-	percpu_counter_destroy(&btp->bt_io_count);
+	percpu_counter_destroy(&btp->bt_readahead_count);
 out_destroy_lru:
 	list_lru_destroy(&btp->bt_lru);
 	return -ENOMEM;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 2e747555ad3f..80e06eecaf56 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -27,7 +27,6 @@ struct xfs_buf;
 #define XBF_READ	 (1u << 0) /* buffer intended for reading from device */
 #define XBF_WRITE	 (1u << 1) /* buffer intended for writing to device */
 #define XBF_READ_AHEAD	 (1u << 2) /* asynchronous read-ahead */
-#define XBF_NO_IOACCT	 (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
 #define XBF_ASYNC	 (1u << 4) /* initiator will not wait for completion */
 #define XBF_DONE	 (1u << 5) /* all pages in the buffer uptodate */
 #define XBF_STALE	 (1u << 6) /* buffer has been staled, do not find it */
@@ -58,7 +57,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_READ,		"READ" }, \
 	{ XBF_WRITE,		"WRITE" }, \
 	{ XBF_READ_AHEAD,	"READ_AHEAD" }, \
-	{ XBF_NO_IOACCT,	"NO_IOACCT" }, \
 	{ XBF_ASYNC,		"ASYNC" }, \
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_STALE,		"STALE" }, \
@@ -77,7 +75,6 @@ typedef unsigned int xfs_buf_flags_t;
  * Internal state flags.
  */
 #define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
-#define XFS_BSTATE_IN_FLIGHT	 (1 << 1)	/* I/O in flight */
 
 struct xfs_buf_cache {
 	struct rhashtable	bc_hash;
@@ -116,7 +113,7 @@ struct xfs_buftarg {
 	struct shrinker		*bt_shrinker;
 	struct list_lru		bt_lru;
 
-	struct percpu_counter	bt_io_count;
+	struct percpu_counter	bt_readahead_count;
 	struct ratelimit_state	bt_ioerror_rl;
 
 	/* Atomic write unit values */
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 07bebbfb16ee..5b64a2b3b113 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -117,7 +117,7 @@ xmbuf_free(
 	struct xfs_buftarg	*btp)
 {
 	ASSERT(xfs_buftarg_is_mem(btp));
-	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+	ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
 
 	trace_xmbuf_free(btp);
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 477c5262cf91..b69356582b86 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -181,14 +181,11 @@ xfs_readsb(
 
 	/*
 	 * Allocate a (locked) buffer to hold the superblock. This will be kept
-	 * around at all times to optimize access to the superblock. Therefore,
-	 * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
-	 * elevated.
+	 * around at all times to optimize access to the superblock.
 	 */
 reread:
 	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-				      BTOBB(sector_size), XBF_NO_IOACCT, &bp,
-				      buf_ops);
+				      BTOBB(sector_size), 0, &bp, buf_ops);
 	if (error) {
 		if (loud)
 			xfs_warn(mp, "SB validate failed with error %d.", error);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index d8e6d073d64d..57bef567e011 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1407,7 +1407,7 @@ xfs_rtmount_readsb(
 
 	/* m_blkbb_log is not set up yet */
 	error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR,
-			mp->m_sb.sb_blocksize >> BBSHIFT, XBF_NO_IOACCT, &bp,
+			mp->m_sb.sb_blocksize >> BBSHIFT, 0, &bp,
 			&xfs_rtsb_buf_ops);
 	if (error) {
 		xfs_warn(mp, "rt sb validate failed with error %d.", error);
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/4] xfs: remove the XBF_STALE check from xfs_buf_rele_cached
  2025-02-17  9:31 buffer cache simplifications Christoph Hellwig
                   ` (2 preceding siblings ...)
  2025-02-17  9:31 ` [PATCH 3/4] xfs: remove most in-flight buffer accounting Christoph Hellwig
@ 2025-02-17  9:31 ` Christoph Hellwig
  2025-02-18 20:23   ` Darrick J. Wong
  3 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-17  9:31 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, linux-xfs

xfs_buf_stale already sets b_lru_ref to 0, and thus prevents the buffer
from moving to the LRU.  Remove the duplicate check.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f8efdee3c8b4..cf88b25fe3c5 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -99,12 +99,6 @@ xfs_buf_stale(
 	 */
 	bp->b_flags &= ~_XBF_DELWRI_Q;
 
-	/*
-	 * Once the buffer is marked stale and unlocked, a subsequent lookup
-	 * could reset b_flags. There is no guarantee that the buffer is
-	 * unaccounted (released to LRU) before that occurs. Drop in-flight
-	 * status now to preserve accounting consistency.
-	 */
 	spin_lock(&bp->b_lock);
 	atomic_set(&bp->b_lru_ref, 0);
 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@ -1033,7 +1027,7 @@ xfs_buf_rele_cached(
 	}
 
 	/* we are asked to drop the last reference */
-	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+	if (atomic_read(&bp->b_lru_ref)) {
 		/*
 		 * If the buffer is added to the LRU, keep the reference to the
 		 * buffer for the LRU and clear the (now stale) dispose list
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O
  2025-02-17  9:31 ` [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O Christoph Hellwig
@ 2025-02-18 20:05   ` Darrick J. Wong
  2025-02-19  5:32     ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Darrick J. Wong @ 2025-02-18 20:05 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, linux-xfs

On Mon, Feb 17, 2025 at 10:31:26AM +0100, Christoph Hellwig wrote:
> Currently all metadata I/O completions happen in the m_buf_workqueue
> workqueue.  But for synchronous I/O (i.e. all buffer reads) there is no
> need for that, as there always is a called in process context that is
> waiting for the I/O.  Factor out the guts of xfs_buf_ioend into a
> separate helper and call it from xfs_buf_iowait to avoid a double
> an extra context switch to the workqueue.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_buf.c | 42 ++++++++++++++++++++++++++----------------
>  1 file changed, 26 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 15bb790359f8..050f2c2f6a40 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1345,6 +1345,7 @@ xfs_buf_ioend_handle_error(
>  resubmit:
>  	xfs_buf_ioerror(bp, 0);
>  	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
> +	reinit_completion(&bp->b_iowait);
>  	xfs_buf_submit(bp);
>  	return true;
>  out_stale:
> @@ -1355,8 +1356,8 @@ xfs_buf_ioend_handle_error(
>  	return false;
>  }
>  
> -static void
> -xfs_buf_ioend(
> +static bool
> +__xfs_buf_ioend(

What does the return value here indicate?  I /think/ it's true if the IO
is really complete, or false if we're going to retry?  But maybe it
actually means true if the bp reference is still live?  A comment would
be really helpful here.

--D

>  	struct xfs_buf	*bp)
>  {
>  	trace_xfs_buf_iodone(bp, _RET_IP_);
> @@ -1376,7 +1377,7 @@ xfs_buf_ioend(
>  		}
>  
>  		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
> -			return;
> +			return false;
>  
>  		/* clear the retry state */
>  		bp->b_last_error = 0;
> @@ -1397,7 +1398,15 @@ xfs_buf_ioend(
>  
>  	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
>  			 _XBF_LOGRECOVERY);
> +	return true;
> +}
>  
> +static void
> +xfs_buf_ioend(
> +	struct xfs_buf	*bp)
> +{
> +	if (!__xfs_buf_ioend(bp))
> +		return;
>  	if (bp->b_flags & XBF_ASYNC)
>  		xfs_buf_relse(bp);
>  	else
> @@ -1411,15 +1420,8 @@ xfs_buf_ioend_work(
>  	struct xfs_buf		*bp =
>  		container_of(work, struct xfs_buf, b_ioend_work);
>  
> -	xfs_buf_ioend(bp);
> -}
> -
> -static void
> -xfs_buf_ioend_async(
> -	struct xfs_buf	*bp)
> -{
> -	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
> -	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
> +	if (__xfs_buf_ioend(bp))
> +		xfs_buf_relse(bp);
>  }
>  
>  void
> @@ -1491,7 +1493,13 @@ xfs_buf_bio_end_io(
>  		 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
>  		xfs_buf_ioerror(bp, -EIO);
>  
> -	xfs_buf_ioend_async(bp);
> +	if (bp->b_flags & XBF_ASYNC) {
> +		INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
> +		queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
> +	} else {
> +		complete(&bp->b_iowait);
> +	}
> +
>  	bio_put(bio);
>  }
>  
> @@ -1568,9 +1576,11 @@ xfs_buf_iowait(
>  {
>  	ASSERT(!(bp->b_flags & XBF_ASYNC));
>  
> -	trace_xfs_buf_iowait(bp, _RET_IP_);
> -	wait_for_completion(&bp->b_iowait);
> -	trace_xfs_buf_iowait_done(bp, _RET_IP_);
> +	do {
> +		trace_xfs_buf_iowait(bp, _RET_IP_);
> +		wait_for_completion(&bp->b_iowait);
> +		trace_xfs_buf_iowait_done(bp, _RET_IP_);
> +	} while (!__xfs_buf_ioend(bp));
>  
>  	return bp->b_error;
>  }
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/4] xfs: decouple buffer readahead from the normal buffer read path
  2025-02-17  9:31 ` [PATCH 2/4] xfs: decouple buffer readahead from the normal buffer read path Christoph Hellwig
@ 2025-02-18 20:10   ` Darrick J. Wong
  2025-02-19  5:32     ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Darrick J. Wong @ 2025-02-18 20:10 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, linux-xfs

On Mon, Feb 17, 2025 at 10:31:27AM +0100, Christoph Hellwig wrote:
> xfs_buf_readahead_map is the only caller of xfs_buf_read_map and thus
> _xfs_buf_read that is not synchronous.  Split it from xfs_buf_read_map
> so that the asynchronous path is self-contained and the now purely
> synchronous xfs_buf_read_map / _xfs_buf_read implementation can be
> simplified.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Thanks for adding the ASSERT as a guardrail against misuse of
xfs_buf_read.

This appears to be a straightforward split, so
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_buf.c         | 41 ++++++++++++++++++++--------------------
>  fs/xfs/xfs_buf.h         |  2 +-
>  fs/xfs/xfs_log_recover.c |  2 +-
>  fs/xfs/xfs_trace.h       |  1 +
>  4 files changed, 23 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 050f2c2f6a40..52fb85c42e94 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -794,18 +794,13 @@ xfs_buf_get_map(
>  
>  int
>  _xfs_buf_read(
> -	struct xfs_buf		*bp,
> -	xfs_buf_flags_t		flags)
> +	struct xfs_buf		*bp)
>  {
> -	ASSERT(!(flags & XBF_WRITE));
>  	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
>  
>  	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
> -	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
> -
> +	bp->b_flags |= XBF_READ;
>  	xfs_buf_submit(bp);
> -	if (flags & XBF_ASYNC)
> -		return 0;
>  	return xfs_buf_iowait(bp);
>  }
>  
> @@ -857,6 +852,8 @@ xfs_buf_read_map(
>  	struct xfs_buf		*bp;
>  	int			error;
>  
> +	ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD)));
> +
>  	flags |= XBF_READ;
>  	*bpp = NULL;
>  
> @@ -870,21 +867,11 @@ xfs_buf_read_map(
>  		/* Initiate the buffer read and wait. */
>  		XFS_STATS_INC(target->bt_mount, xb_get_read);
>  		bp->b_ops = ops;
> -		error = _xfs_buf_read(bp, flags);
> -
> -		/* Readahead iodone already dropped the buffer, so exit. */
> -		if (flags & XBF_ASYNC)
> -			return 0;
> +		error = _xfs_buf_read(bp);
>  	} else {
>  		/* Buffer already read; all we need to do is check it. */
>  		error = xfs_buf_reverify(bp, ops);
>  
> -		/* Readahead already finished; drop the buffer and exit. */
> -		if (flags & XBF_ASYNC) {
> -			xfs_buf_relse(bp);
> -			return 0;
> -		}
> -
>  		/* We do not want read in the flags */
>  		bp->b_flags &= ~XBF_READ;
>  		ASSERT(bp->b_ops != NULL || ops == NULL);
> @@ -936,6 +923,7 @@ xfs_buf_readahead_map(
>  	int			nmaps,
>  	const struct xfs_buf_ops *ops)
>  {
> +	const xfs_buf_flags_t	flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD;
>  	struct xfs_buf		*bp;
>  
>  	/*
> @@ -945,9 +933,20 @@ xfs_buf_readahead_map(
>  	if (xfs_buftarg_is_mem(target))
>  		return;
>  
> -	xfs_buf_read_map(target, map, nmaps,
> -		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
> -		     __this_address);
> +	if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp))
> +		return;
> +	trace_xfs_buf_readahead(bp, 0, _RET_IP_);
> +
> +	if (bp->b_flags & XBF_DONE) {
> +		xfs_buf_reverify(bp, ops);
> +		xfs_buf_relse(bp);
> +		return;
> +	}
> +	XFS_STATS_INC(target->bt_mount, xb_get_read);
> +	bp->b_ops = ops;
> +	bp->b_flags &= ~(XBF_WRITE | XBF_DONE);
> +	bp->b_flags |= flags;
> +	xfs_buf_submit(bp);
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 3b4ed42e11c0..2e747555ad3f 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -291,7 +291,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
>  int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
>  		size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
>  		const struct xfs_buf_ops *ops);
> -int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
> +int _xfs_buf_read(struct xfs_buf *bp);
>  void xfs_buf_hold(struct xfs_buf *bp);
>  
>  /* Releasing Buffers */
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index b3c27dbccce8..2f76531842f8 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -3380,7 +3380,7 @@ xlog_do_recover(
>  	 */
>  	xfs_buf_lock(bp);
>  	xfs_buf_hold(bp);
> -	error = _xfs_buf_read(bp, XBF_READ);
> +	error = _xfs_buf_read(bp);
>  	if (error) {
>  		if (!xlog_is_shutdown(log)) {
>  			xfs_buf_ioerror_alert(bp, __this_address);
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index b29462363b81..bfc2f1249022 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -593,6 +593,7 @@ DEFINE_EVENT(xfs_buf_flags_class, name, \
>  DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
>  DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
>  DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
> +DEFINE_BUF_FLAGS_EVENT(xfs_buf_readahead);
>  
>  TRACE_EVENT(xfs_buf_ioerror,
>  	TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip),
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/4] xfs: remove most in-flight buffer accounting
  2025-02-17  9:31 ` [PATCH 3/4] xfs: remove most in-flight buffer accounting Christoph Hellwig
@ 2025-02-18 20:23   ` Darrick J. Wong
  2025-02-19  5:35     ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Darrick J. Wong @ 2025-02-18 20:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, linux-xfs

On Mon, Feb 17, 2025 at 10:31:28AM +0100, Christoph Hellwig wrote:
> The buffer cache keeps a bt_io_count per-CPU counter to track all
> in-flight I/O, which is used to ensure no I/O is in flight when
> unmounting the file system.
> 
> For most I/O we already keep track of inflight I/O at higher levels:
> 
>  - for synchronous I/O (xfs_buf_read/xfs_bwrite/xfs_buf_delwri_submit),
>    the caller has a reference and waits for I/O completions using
>    xfs_buf_iowait
>  - for xfs_buf_delwri_submit the only caller (AIL writeback) tracks the

Do you mean xfs_buf_delwri_submit_nowait here?

>    log items that the buffer attached to
> 
> This leaves only xfs_buf_readahead_map as a submitter of
> asynchronous I/O that is not tracked by anything else.  Replace the
> bt_io_count per-cpu counter with a more specific bt_readahead_count
> counter only tracking readahead I/O.  This allows simply incrementing
> it when submitting readahead I/O and decrementing it when it completes,
> and thus simplify xfs_buf_rele and remove the need for the
> XBF_NO_IOACCT flag and the XFS_BSTATE_IN_FLIGHT buffer state.

IOWs, only asynchronous readahead needs an explicit counter in the
xfs_buf to prevent unmount because:

0. Anything done in mount/unmount/freeze holds s_umount
1. Buffer reads done on behalf of a file hold the file open and pin the
   mount
2. Dirty buffers have log items, and we can't unmount until those are
   dealt with
3. Fsck holds an open fd and hence pins the mount
4. Unmount blocks until background gc finishes

Right?  I almost wonder if you could just have a percpu counter in the
xfs_mount but that sounds pretty hot.

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_buf.c     | 90 ++++++++------------------------------------
>  fs/xfs/xfs_buf.h     |  5 +--
>  fs/xfs/xfs_buf_mem.c |  2 +-
>  fs/xfs/xfs_mount.c   |  7 +---
>  fs/xfs/xfs_rtalloc.c |  2 +-
>  5 files changed, 20 insertions(+), 86 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 52fb85c42e94..f8efdee3c8b4 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -29,11 +29,6 @@ struct kmem_cache *xfs_buf_cache;
>  /*
>   * Locking orders
>   *
> - * xfs_buf_ioacct_inc:
> - * xfs_buf_ioacct_dec:
> - *	b_sema (caller holds)
> - *	  b_lock
> - *
>   * xfs_buf_stale:
>   *	b_sema (caller holds)
>   *	  b_lock
> @@ -81,51 +76,6 @@ xfs_buf_vmap_len(
>  	return (bp->b_page_count * PAGE_SIZE);
>  }
>  
> -/*
> - * Bump the I/O in flight count on the buftarg if we haven't yet done so for
> - * this buffer. The count is incremented once per buffer (per hold cycle)
> - * because the corresponding decrement is deferred to buffer release. Buffers
> - * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
> - * tracking adds unnecessary overhead. This is used for sychronization purposes
> - * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
> - * in-flight buffers.
> - *
> - * Buffers that are never released (e.g., superblock, iclog buffers) must set
> - * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
> - * never reaches zero and unmount hangs indefinitely.
> - */
> -static inline void
> -xfs_buf_ioacct_inc(
> -	struct xfs_buf	*bp)
> -{
> -	if (bp->b_flags & XBF_NO_IOACCT)
> -		return;
> -
> -	ASSERT(bp->b_flags & XBF_ASYNC);
> -	spin_lock(&bp->b_lock);
> -	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
> -		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
> -		percpu_counter_inc(&bp->b_target->bt_io_count);
> -	}
> -	spin_unlock(&bp->b_lock);
> -}
> -
> -/*
> - * Clear the in-flight state on a buffer about to be released to the LRU or
> - * freed and unaccount from the buftarg.
> - */
> -static inline void
> -__xfs_buf_ioacct_dec(
> -	struct xfs_buf	*bp)
> -{
> -	lockdep_assert_held(&bp->b_lock);
> -
> -	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
> -		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
> -		percpu_counter_dec(&bp->b_target->bt_io_count);
> -	}
> -}
> -
>  /*
>   * When we mark a buffer stale, we remove the buffer from the LRU and clear the
>   * b_lru_ref count so that the buffer is freed immediately when the buffer
> @@ -156,8 +106,6 @@ xfs_buf_stale(
>  	 * status now to preserve accounting consistency.
>  	 */
>  	spin_lock(&bp->b_lock);
> -	__xfs_buf_ioacct_dec(bp);
> -
>  	atomic_set(&bp->b_lru_ref, 0);
>  	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
>  	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
> @@ -946,6 +894,7 @@ xfs_buf_readahead_map(
>  	bp->b_ops = ops;
>  	bp->b_flags &= ~(XBF_WRITE | XBF_DONE);
>  	bp->b_flags |= flags;
> +	percpu_counter_inc(&target->bt_readahead_count);
>  	xfs_buf_submit(bp);
>  }
>  
> @@ -1002,10 +951,12 @@ xfs_buf_get_uncached(
>  	struct xfs_buf		*bp;
>  	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
>  
> +	/* there are currently no valid flags for xfs_buf_get_uncached */
> +	ASSERT(flags == 0);

Can we just get rid of flags then?  AFAICT nobody uses it either here or
in xfsprogs, and in fact I think there's a nasty bug in the userspace
rtsb code:

	error = -libxfs_buf_get_uncached(mp->m_rtdev_targp,
			XFS_FSB_TO_BB(mp, 1), XFS_RTSB_DADDR, &rtsb_bp);

(Note: the second to last argument is flags, not daddr.)

Will send a patch for xfsprogs fixing that.

--D

> +
>  	*bpp = NULL;
>  
> -	/* flags might contain irrelevant bits, pass only what we care about */
> -	error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
> +	error = _xfs_buf_alloc(target, &map, 1, flags, &bp);
>  	if (error)
>  		return error;
>  
> @@ -1059,7 +1010,6 @@ xfs_buf_rele_uncached(
>  		spin_unlock(&bp->b_lock);
>  		return;
>  	}
> -	__xfs_buf_ioacct_dec(bp);
>  	spin_unlock(&bp->b_lock);
>  	xfs_buf_free(bp);
>  }
> @@ -1078,19 +1028,11 @@ xfs_buf_rele_cached(
>  	spin_lock(&bp->b_lock);
>  	ASSERT(bp->b_hold >= 1);
>  	if (bp->b_hold > 1) {
> -		/*
> -		 * Drop the in-flight state if the buffer is already on the LRU
> -		 * and it holds the only reference. This is racy because we
> -		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
> -		 * ensures the decrement occurs only once per-buf.
> -		 */
> -		if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
> -			__xfs_buf_ioacct_dec(bp);
> +		bp->b_hold--;
>  		goto out_unlock;
>  	}
>  
>  	/* we are asked to drop the last reference */
> -	__xfs_buf_ioacct_dec(bp);
>  	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
>  		/*
>  		 * If the buffer is added to the LRU, keep the reference to the
> @@ -1369,6 +1311,8 @@ __xfs_buf_ioend(
>  			bp->b_ops->verify_read(bp);
>  		if (!bp->b_error)
>  			bp->b_flags |= XBF_DONE;
> +		if (bp->b_flags & XBF_READ_AHEAD)
> +			percpu_counter_dec(&bp->b_target->bt_readahead_count);
>  	} else {
>  		if (!bp->b_error) {
>  			bp->b_flags &= ~XBF_WRITE_FAIL;
> @@ -1657,9 +1601,6 @@ xfs_buf_submit(
>  	 */
>  	bp->b_error = 0;
>  
> -	if (bp->b_flags & XBF_ASYNC)
> -		xfs_buf_ioacct_inc(bp);
> -
>  	if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) {
>  		xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE);
>  		xfs_buf_ioend(bp);
> @@ -1785,9 +1726,8 @@ xfs_buftarg_wait(
>  	struct xfs_buftarg	*btp)
>  {
>  	/*
> -	 * First wait on the buftarg I/O count for all in-flight buffers to be
> -	 * released. This is critical as new buffers do not make the LRU until
> -	 * they are released.
> +	 * First wait for all in-flight readahead buffers to be released.  This is
> +	 * critical as new buffers do not make the LRU until they are released.
>  	 *
>  	 * Next, flush the buffer workqueue to ensure all completion processing
>  	 * has finished. Just waiting on buffer locks is not sufficient for
> @@ -1796,7 +1736,7 @@ xfs_buftarg_wait(
>  	 * all reference counts have been dropped before we start walking the
>  	 * LRU list.
>  	 */
> -	while (percpu_counter_sum(&btp->bt_io_count))
> +	while (percpu_counter_sum(&btp->bt_readahead_count))
>  		delay(100);
>  	flush_workqueue(btp->bt_mount->m_buf_workqueue);
>  }
> @@ -1913,8 +1853,8 @@ xfs_destroy_buftarg(
>  	struct xfs_buftarg	*btp)
>  {
>  	shrinker_free(btp->bt_shrinker);
> -	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
> -	percpu_counter_destroy(&btp->bt_io_count);
> +	ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
> +	percpu_counter_destroy(&btp->bt_readahead_count);
>  	list_lru_destroy(&btp->bt_lru);
>  }
>  
> @@ -1968,7 +1908,7 @@ xfs_init_buftarg(
>  
>  	if (list_lru_init(&btp->bt_lru))
>  		return -ENOMEM;
> -	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
> +	if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
>  		goto out_destroy_lru;
>  
>  	btp->bt_shrinker =
> @@ -1982,7 +1922,7 @@ xfs_init_buftarg(
>  	return 0;
>  
>  out_destroy_io_count:
> -	percpu_counter_destroy(&btp->bt_io_count);
> +	percpu_counter_destroy(&btp->bt_readahead_count);
>  out_destroy_lru:
>  	list_lru_destroy(&btp->bt_lru);
>  	return -ENOMEM;
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 2e747555ad3f..80e06eecaf56 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -27,7 +27,6 @@ struct xfs_buf;
>  #define XBF_READ	 (1u << 0) /* buffer intended for reading from device */
>  #define XBF_WRITE	 (1u << 1) /* buffer intended for writing to device */
>  #define XBF_READ_AHEAD	 (1u << 2) /* asynchronous read-ahead */
> -#define XBF_NO_IOACCT	 (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
>  #define XBF_ASYNC	 (1u << 4) /* initiator will not wait for completion */
>  #define XBF_DONE	 (1u << 5) /* all pages in the buffer uptodate */
>  #define XBF_STALE	 (1u << 6) /* buffer has been staled, do not find it */
> @@ -58,7 +57,6 @@ typedef unsigned int xfs_buf_flags_t;
>  	{ XBF_READ,		"READ" }, \
>  	{ XBF_WRITE,		"WRITE" }, \
>  	{ XBF_READ_AHEAD,	"READ_AHEAD" }, \
> -	{ XBF_NO_IOACCT,	"NO_IOACCT" }, \
>  	{ XBF_ASYNC,		"ASYNC" }, \
>  	{ XBF_DONE,		"DONE" }, \
>  	{ XBF_STALE,		"STALE" }, \
> @@ -77,7 +75,6 @@ typedef unsigned int xfs_buf_flags_t;
>   * Internal state flags.
>   */
>  #define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
> -#define XFS_BSTATE_IN_FLIGHT	 (1 << 1)	/* I/O in flight */
>  
>  struct xfs_buf_cache {
>  	struct rhashtable	bc_hash;
> @@ -116,7 +113,7 @@ struct xfs_buftarg {
>  	struct shrinker		*bt_shrinker;
>  	struct list_lru		bt_lru;
>  
> -	struct percpu_counter	bt_io_count;
> +	struct percpu_counter	bt_readahead_count;
>  	struct ratelimit_state	bt_ioerror_rl;
>  
>  	/* Atomic write unit values */
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 07bebbfb16ee..5b64a2b3b113 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -117,7 +117,7 @@ xmbuf_free(
>  	struct xfs_buftarg	*btp)
>  {
>  	ASSERT(xfs_buftarg_is_mem(btp));
> -	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
> +	ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
>  
>  	trace_xmbuf_free(btp);
>  
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 477c5262cf91..b69356582b86 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -181,14 +181,11 @@ xfs_readsb(
>  
>  	/*
>  	 * Allocate a (locked) buffer to hold the superblock. This will be kept
> -	 * around at all times to optimize access to the superblock. Therefore,
> -	 * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
> -	 * elevated.
> +	 * around at all times to optimize access to the superblock.
>  	 */
>  reread:
>  	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
> -				      BTOBB(sector_size), XBF_NO_IOACCT, &bp,
> -				      buf_ops);
> +				      BTOBB(sector_size), 0, &bp, buf_ops);
>  	if (error) {
>  		if (loud)
>  			xfs_warn(mp, "SB validate failed with error %d.", error);
> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> index d8e6d073d64d..57bef567e011 100644
> --- a/fs/xfs/xfs_rtalloc.c
> +++ b/fs/xfs/xfs_rtalloc.c
> @@ -1407,7 +1407,7 @@ xfs_rtmount_readsb(
>  
>  	/* m_blkbb_log is not set up yet */
>  	error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR,
> -			mp->m_sb.sb_blocksize >> BBSHIFT, XBF_NO_IOACCT, &bp,
> +			mp->m_sb.sb_blocksize >> BBSHIFT, 0, &bp,
>  			&xfs_rtsb_buf_ops);
>  	if (error) {
>  		xfs_warn(mp, "rt sb validate failed with error %d.", error);
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] xfs: remove the XBF_STALE check from xfs_buf_rele_cached
  2025-02-17  9:31 ` [PATCH 4/4] xfs: remove the XBF_STALE check from xfs_buf_rele_cached Christoph Hellwig
@ 2025-02-18 20:23   ` Darrick J. Wong
  0 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2025-02-18 20:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, linux-xfs

On Mon, Feb 17, 2025 at 10:31:29AM +0100, Christoph Hellwig wrote:
> xfs_buf_stale already set b_lru_ref to 0, and thus prevents the buffer
> from moving to the LRU.  Remove the duplicate check.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

I'd long wondered why that was necessary...
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_buf.c | 8 +-------
>  1 file changed, 1 insertion(+), 7 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index f8efdee3c8b4..cf88b25fe3c5 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -99,12 +99,6 @@ xfs_buf_stale(
>  	 */
>  	bp->b_flags &= ~_XBF_DELWRI_Q;
>  
> -	/*
> -	 * Once the buffer is marked stale and unlocked, a subsequent lookup
> -	 * could reset b_flags. There is no guarantee that the buffer is
> -	 * unaccounted (released to LRU) before that occurs. Drop in-flight
> -	 * status now to preserve accounting consistency.
> -	 */
>  	spin_lock(&bp->b_lock);
>  	atomic_set(&bp->b_lru_ref, 0);
>  	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
> @@ -1033,7 +1027,7 @@ xfs_buf_rele_cached(
>  	}
>  
>  	/* we are asked to drop the last reference */
> -	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
> +	if (atomic_read(&bp->b_lru_ref)) {
>  		/*
>  		 * If the buffer is added to the LRU, keep the reference to the
>  		 * buffer for the LRU and clear the (now stale) dispose list
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O
  2025-02-18 20:05   ` Darrick J. Wong
@ 2025-02-19  5:32     ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-19  5:32 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, Carlos Maiolino, linux-xfs

On Tue, Feb 18, 2025 at 12:05:51PM -0800, Darrick J. Wong wrote:
> What does the return value here indicate?  I /think/ it's true if the IO
> is really complete, or false if we're going to retry?

Yes.

> But maybe it
> actually means true if the bp reference is still live?  A comment would
> be really helpful here.

Sure, I'll add one.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/4] xfs: decouple buffer readahead from the normal buffer read path
  2025-02-18 20:10   ` Darrick J. Wong
@ 2025-02-19  5:32     ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-19  5:32 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, Carlos Maiolino, linux-xfs

On Tue, Feb 18, 2025 at 12:10:29PM -0800, Darrick J. Wong wrote:
> On Mon, Feb 17, 2025 at 10:31:27AM +0100, Christoph Hellwig wrote:
> > xfs_buf_readahead_map is the only caller of xfs_buf_read_map and thus
> > _xfs_buf_read that is not synchronous.  Split it from xfs_buf_read_map
> > so that the asynchronous path is self-contained and the now purely
> > synchronous xfs_buf_read_map / _xfs_buf_read implementation can be
> > simplified.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Thanks for adding the ASSERT as a guardrail against misuse of
> xfs_buf_read.

I was just going to remove the argument, but that would clash with the
refactoring in the zoned series.  But I plan to send that later.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/4] xfs: remove most in-flight buffer accounting
  2025-02-18 20:23   ` Darrick J. Wong
@ 2025-02-19  5:35     ` Christoph Hellwig
  2025-02-19  5:37       ` Darrick J. Wong
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-19  5:35 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, Carlos Maiolino, linux-xfs

On Tue, Feb 18, 2025 at 12:23:27PM -0800, Darrick J. Wong wrote:
> On Mon, Feb 17, 2025 at 10:31:28AM +0100, Christoph Hellwig wrote:
> > The buffer cache keeps a bt_io_count per-CPU counter to track all
> > in-flight I/O, which is used to ensure no I/O is in flight when
> > unmounting the file system.
> > 
> > For most I/O we already keep track of inflight I/O at higher levels:
> > 
> >  - for synchronous I/O (xfs_buf_read/xfs_bwrite/xfs_buf_delwri_submit),
> >    the caller has a reference and waits for I/O completions using
> >    xfs_buf_iowait
> >  - for xfs_buf_delwri_submit the only caller (AIL writeback) tracks the
> 
> Do you mean xfs_buf_delwri_submit_nowait here?

Yes.

> IOWs, only asynchronous readahead needs an explicit counter in the
> xfs_buf to prevent unmount because:
> 
> 0. Anything done in mount/unmount/freeze holds s_umount
> 1. Buffer reads done on behalf of a file hold the file open and pin the
>    mount
> 2. Dirty buffers have log items, and we can't unmount until those are
>    dealt with
> 3. Fsck holds an open fd and hence pins the mount
> 4. Unmount blocks until background gc finishes
> 
> Right?

Yes.

> I almost wonder if you could just have a percpu counter in the
> xfs_mount but that sounds pretty hot.

Well, that would remove the nice xfs_buftarg_wait() abstraction.
Given that we don't even allocate an extra buftarg unless we use
it that doesn't seem very useful.

> > +	/* there are currently no valid flags for xfs_buf_get_uncached */
> > +	ASSERT(flags == 0);
> 
> Can we just get rid of flags then?  AFAICT nobody uses it either here or
> in xfsprogs, and in fact I think there's a nasty bug in the userspace
> rtsb code:

See my reply to the last patch: I actually have a patch to remove it,
but it conflicts with the zoned series.  So for now I'll defer it until
that is merged.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/4] xfs: remove most in-flight buffer accounting
  2025-02-19  5:35     ` Christoph Hellwig
@ 2025-02-19  5:37       ` Darrick J. Wong
  0 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2025-02-19  5:37 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, linux-xfs

On Wed, Feb 19, 2025 at 06:35:27AM +0100, Christoph Hellwig wrote:
> On Tue, Feb 18, 2025 at 12:23:27PM -0800, Darrick J. Wong wrote:
> > On Mon, Feb 17, 2025 at 10:31:28AM +0100, Christoph Hellwig wrote:
> > > The buffer cache keeps a bt_io_count per-CPU counter to track all
> > > in-flight I/O, which is used to ensure no I/O is in flight when
> > > unmounting the file system.
> > > 
> > > For most I/O we already keep track of inflight I/O at higher levels:
> > > 
> > >  - for synchronous I/O (xfs_buf_read/xfs_bwrite/xfs_buf_delwri_submit),
> > >    the caller has a reference and waits for I/O completions using
> > >    xfs_buf_iowait
> > >  - for xfs_buf_delwri_submit the only caller (AIL writeback) tracks the
> > 
> > Do you mean xfs_buf_delwri_submit_nowait here?
> 
> Yes.
> 
> > IOWs, only asynchronous readahead needs an explicit counter in the
> > xfs_buf to prevent unmount because:
> > 
> > 0. Anything done in mount/unmount/freeze holds s_umount
> > 1. Buffer reads done on behalf of a file hold the file open and pin the
> >    mount
> > 2. Dirty buffers have log items, and we can't unmount until those are
> >    dealt with
> > 3. Fsck holds an open fd and hence pins the mount
> > 4. Unmount blocks until background gc finishes
> > 
> > Right?
> 
> Yes.
> 
> > I almost wonder if you could just have a percpu counter in the
> > xfs_mount but that sounds pretty hot.
> 
> Well, that would remove the nice xfs_buftarg_wait() abstraction.
> Given that we don't even allocate an extra buftarg unless we use
> it that doesn't seem very useful.

Heheh.  Anyway, the changes look good to me so with that cleared up
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

> > > +	/* there are currently no valid flags for xfs_buf_get_uncached */
> > > +	ASSERT(flags == 0);
> > 
> > Can we just get rid of flags then?  AFAICT nobody uses it either here or
> > in xfsprogs, and in fact I think there's a nasty bug in the userspace
> > rtsb code:
> 
> See my reply to the last patch: I actually have a patch to remove it,
> but it conflicts with the zoned series.  So for now I'll defer it until
> that is merged.

<nod>

--D

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O
  2025-02-24 15:11 buffer cache simplifications v2 Christoph Hellwig
@ 2025-02-24 15:11 ` Christoph Hellwig
  2025-02-24 19:13   ` Darrick J. Wong
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-24 15:11 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, linux-xfs

Currently all metadata I/O completions happen in the m_buf_workqueue
workqueue.  But for synchronous I/O (i.e. all buffer reads) there is no
need for that, as there always is a caller in process context that is
waiting for the I/O.  Factor out the guts of xfs_buf_ioend into a
separate helper and call it from xfs_buf_iowait to avoid an extra
context switch to the workqueue.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 15bb790359f8..821aa85e2ce5 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1345,6 +1345,7 @@ xfs_buf_ioend_handle_error(
 resubmit:
 	xfs_buf_ioerror(bp, 0);
 	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+	reinit_completion(&bp->b_iowait);
 	xfs_buf_submit(bp);
 	return true;
 out_stale:
@@ -1355,8 +1356,9 @@ xfs_buf_ioend_handle_error(
 	return false;
 }
 
-static void
-xfs_buf_ioend(
+/* returns false if the caller needs to resubmit the I/O, else false */
+static bool
+__xfs_buf_ioend(
 	struct xfs_buf	*bp)
 {
 	trace_xfs_buf_iodone(bp, _RET_IP_);
@@ -1376,7 +1378,7 @@ xfs_buf_ioend(
 		}
 
 		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
-			return;
+			return false;
 
 		/* clear the retry state */
 		bp->b_last_error = 0;
@@ -1397,7 +1399,15 @@ xfs_buf_ioend(
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
 			 _XBF_LOGRECOVERY);
+	return true;
+}
 
+static void
+xfs_buf_ioend(
+	struct xfs_buf	*bp)
+{
+	if (!__xfs_buf_ioend(bp))
+		return;
 	if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
 	else
@@ -1411,15 +1421,8 @@ xfs_buf_ioend_work(
 	struct xfs_buf		*bp =
 		container_of(work, struct xfs_buf, b_ioend_work);
 
-	xfs_buf_ioend(bp);
-}
-
-static void
-xfs_buf_ioend_async(
-	struct xfs_buf	*bp)
-{
-	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
-	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+	if (__xfs_buf_ioend(bp))
+		xfs_buf_relse(bp);
 }
 
 void
@@ -1491,7 +1494,13 @@ xfs_buf_bio_end_io(
 		 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
 		xfs_buf_ioerror(bp, -EIO);
 
-	xfs_buf_ioend_async(bp);
+	if (bp->b_flags & XBF_ASYNC) {
+		INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
+		queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+	} else {
+		complete(&bp->b_iowait);
+	}
+
 	bio_put(bio);
 }
 
@@ -1568,9 +1577,11 @@ xfs_buf_iowait(
 {
 	ASSERT(!(bp->b_flags & XBF_ASYNC));
 
-	trace_xfs_buf_iowait(bp, _RET_IP_);
-	wait_for_completion(&bp->b_iowait);
-	trace_xfs_buf_iowait_done(bp, _RET_IP_);
+	do {
+		trace_xfs_buf_iowait(bp, _RET_IP_);
+		wait_for_completion(&bp->b_iowait);
+		trace_xfs_buf_iowait_done(bp, _RET_IP_);
+	} while (!__xfs_buf_ioend(bp));
 
 	return bp->b_error;
 }
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O
  2025-02-24 15:11 ` [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O Christoph Hellwig
@ 2025-02-24 19:13   ` Darrick J. Wong
  0 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2025-02-24 19:13 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, linux-xfs

On Mon, Feb 24, 2025 at 07:11:35AM -0800, Christoph Hellwig wrote:
> Currently all metadata I/O completions happen in the m_buf_workqueue
> workqueue.  But for synchronous I/O (i.e. all buffer reads) there is no
> need for that, as there always is a caller in process context that is
> waiting for the I/O.  Factor out the guts of xfs_buf_ioend into a
> separate helper and call it from xfs_buf_iowait to avoid an extra
> context switch to the workqueue.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_buf.c | 43 +++++++++++++++++++++++++++----------------
>  1 file changed, 27 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 15bb790359f8..821aa85e2ce5 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1345,6 +1345,7 @@ xfs_buf_ioend_handle_error(
>  resubmit:
>  	xfs_buf_ioerror(bp, 0);
>  	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
> +	reinit_completion(&bp->b_iowait);
>  	xfs_buf_submit(bp);
>  	return true;
>  out_stale:
> @@ -1355,8 +1356,9 @@ xfs_buf_ioend_handle_error(
>  	return false;
>  }
>  
> -static void
> -xfs_buf_ioend(
> +/* returns false if the caller needs to resubmit the I/O, else false */

"...else true" ?

--D

> +static bool
> +__xfs_buf_ioend(
>  	struct xfs_buf	*bp)
>  {
>  	trace_xfs_buf_iodone(bp, _RET_IP_);
> @@ -1376,7 +1378,7 @@ xfs_buf_ioend(
>  		}
>  
>  		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
> -			return;
> +			return false;
>  
>  		/* clear the retry state */
>  		bp->b_last_error = 0;
> @@ -1397,7 +1399,15 @@ xfs_buf_ioend(
>  
>  	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
>  			 _XBF_LOGRECOVERY);
> +	return true;
> +}
>  
> +static void
> +xfs_buf_ioend(
> +	struct xfs_buf	*bp)
> +{
> +	if (!__xfs_buf_ioend(bp))
> +		return;
>  	if (bp->b_flags & XBF_ASYNC)
>  		xfs_buf_relse(bp);
>  	else
> @@ -1411,15 +1421,8 @@ xfs_buf_ioend_work(
>  	struct xfs_buf		*bp =
>  		container_of(work, struct xfs_buf, b_ioend_work);
>  
> -	xfs_buf_ioend(bp);
> -}
> -
> -static void
> -xfs_buf_ioend_async(
> -	struct xfs_buf	*bp)
> -{
> -	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
> -	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
> +	if (__xfs_buf_ioend(bp))
> +		xfs_buf_relse(bp);
>  }
>  
>  void
> @@ -1491,7 +1494,13 @@ xfs_buf_bio_end_io(
>  		 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
>  		xfs_buf_ioerror(bp, -EIO);
>  
> -	xfs_buf_ioend_async(bp);
> +	if (bp->b_flags & XBF_ASYNC) {
> +		INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
> +		queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
> +	} else {
> +		complete(&bp->b_iowait);
> +	}
> +
>  	bio_put(bio);
>  }
>  
> @@ -1568,9 +1577,11 @@ xfs_buf_iowait(
>  {
>  	ASSERT(!(bp->b_flags & XBF_ASYNC));
>  
> -	trace_xfs_buf_iowait(bp, _RET_IP_);
> -	wait_for_completion(&bp->b_iowait);
> -	trace_xfs_buf_iowait_done(bp, _RET_IP_);
> +	do {
> +		trace_xfs_buf_iowait(bp, _RET_IP_);
> +		wait_for_completion(&bp->b_iowait);
> +		trace_xfs_buf_iowait_done(bp, _RET_IP_);
> +	} while (!__xfs_buf_ioend(bp));
>  
>  	return bp->b_error;
>  }
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O
  2025-02-24 23:48 buffer cache simplifications v3 Christoph Hellwig
@ 2025-02-24 23:48 ` Christoph Hellwig
  2025-02-25  5:37   ` Darrick J. Wong
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2025-02-24 23:48 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, linux-xfs, Dave Chinner

Currently all metadata I/O completions happen in the m_buf_workqueue
workqueue.  But for synchronous I/O (i.e. all buffer reads) there is no
need for that, as there always is a caller in process context that is
waiting for the I/O.  Factor out the guts of xfs_buf_ioend into a
separate helper and call it from xfs_buf_iowait to avoid an extra
context switch to the workqueue.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_buf.c | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 15bb790359f8..dfc1849b3314 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1345,6 +1345,7 @@ xfs_buf_ioend_handle_error(
 resubmit:
 	xfs_buf_ioerror(bp, 0);
 	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+	reinit_completion(&bp->b_iowait);
 	xfs_buf_submit(bp);
 	return true;
 out_stale:
@@ -1355,8 +1356,9 @@ xfs_buf_ioend_handle_error(
 	return false;
 }
 
-static void
-xfs_buf_ioend(
+/* returns false if the caller needs to resubmit the I/O, else true */
+static bool
+__xfs_buf_ioend(
 	struct xfs_buf	*bp)
 {
 	trace_xfs_buf_iodone(bp, _RET_IP_);
@@ -1376,7 +1378,7 @@ xfs_buf_ioend(
 		}
 
 		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
-			return;
+			return false;
 
 		/* clear the retry state */
 		bp->b_last_error = 0;
@@ -1397,7 +1399,15 @@ xfs_buf_ioend(
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
 			 _XBF_LOGRECOVERY);
+	return true;
+}
 
+static void
+xfs_buf_ioend(
+	struct xfs_buf	*bp)
+{
+	if (!__xfs_buf_ioend(bp))
+		return;
 	if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
 	else
@@ -1411,15 +1421,8 @@ xfs_buf_ioend_work(
 	struct xfs_buf		*bp =
 		container_of(work, struct xfs_buf, b_ioend_work);
 
-	xfs_buf_ioend(bp);
-}
-
-static void
-xfs_buf_ioend_async(
-	struct xfs_buf	*bp)
-{
-	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
-	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+	if (__xfs_buf_ioend(bp))
+		xfs_buf_relse(bp);
 }
 
 void
@@ -1491,7 +1494,13 @@ xfs_buf_bio_end_io(
 		 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
 		xfs_buf_ioerror(bp, -EIO);
 
-	xfs_buf_ioend_async(bp);
+	if (bp->b_flags & XBF_ASYNC) {
+		INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
+		queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+	} else {
+		complete(&bp->b_iowait);
+	}
+
 	bio_put(bio);
 }
 
@@ -1568,9 +1577,11 @@ xfs_buf_iowait(
 {
 	ASSERT(!(bp->b_flags & XBF_ASYNC));
 
-	trace_xfs_buf_iowait(bp, _RET_IP_);
-	wait_for_completion(&bp->b_iowait);
-	trace_xfs_buf_iowait_done(bp, _RET_IP_);
+	do {
+		trace_xfs_buf_iowait(bp, _RET_IP_);
+		wait_for_completion(&bp->b_iowait);
+		trace_xfs_buf_iowait_done(bp, _RET_IP_);
+	} while (!__xfs_buf_ioend(bp));
 
 	return bp->b_error;
 }
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O
  2025-02-24 23:48 ` [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O Christoph Hellwig
@ 2025-02-25  5:37   ` Darrick J. Wong
  0 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2025-02-25  5:37 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, linux-xfs, Dave Chinner

On Mon, Feb 24, 2025 at 03:48:52PM -0800, Christoph Hellwig wrote:
> Currently all metadata I/O completions happen in the m_buf_workqueue
> workqueue.  But for synchronous I/O (i.e. all buffer reads) there is no
> need for that, as there always is a caller in process context that is
> waiting for the I/O.  Factor out the guts of xfs_buf_ioend into a
> separate helper and call it from xfs_buf_iowait to avoid
> an extra context switch to the workqueue.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>

Looks good now,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_buf.c | 43 +++++++++++++++++++++++++++----------------
>  1 file changed, 27 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 15bb790359f8..dfc1849b3314 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1345,6 +1345,7 @@ xfs_buf_ioend_handle_error(
>  resubmit:
>  	xfs_buf_ioerror(bp, 0);
>  	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
> +	reinit_completion(&bp->b_iowait);
>  	xfs_buf_submit(bp);
>  	return true;
>  out_stale:
> @@ -1355,8 +1356,9 @@ xfs_buf_ioend_handle_error(
>  	return false;
>  }
>  
> -static void
> -xfs_buf_ioend(
> +/* returns false if the caller needs to resubmit the I/O, else true */
> +static bool
> +__xfs_buf_ioend(
>  	struct xfs_buf	*bp)
>  {
>  	trace_xfs_buf_iodone(bp, _RET_IP_);
> @@ -1376,7 +1378,7 @@ xfs_buf_ioend(
>  		}
>  
>  		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
> -			return;
> +			return false;
>  
>  		/* clear the retry state */
>  		bp->b_last_error = 0;
> @@ -1397,7 +1399,15 @@ xfs_buf_ioend(
>  
>  	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
>  			 _XBF_LOGRECOVERY);
> +	return true;
> +}
>  
> +static void
> +xfs_buf_ioend(
> +	struct xfs_buf	*bp)
> +{
> +	if (!__xfs_buf_ioend(bp))
> +		return;
>  	if (bp->b_flags & XBF_ASYNC)
>  		xfs_buf_relse(bp);
>  	else
> @@ -1411,15 +1421,8 @@ xfs_buf_ioend_work(
>  	struct xfs_buf		*bp =
>  		container_of(work, struct xfs_buf, b_ioend_work);
>  
> -	xfs_buf_ioend(bp);
> -}
> -
> -static void
> -xfs_buf_ioend_async(
> -	struct xfs_buf	*bp)
> -{
> -	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
> -	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
> +	if (__xfs_buf_ioend(bp))
> +		xfs_buf_relse(bp);
>  }
>  
>  void
> @@ -1491,7 +1494,13 @@ xfs_buf_bio_end_io(
>  		 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
>  		xfs_buf_ioerror(bp, -EIO);
>  
> -	xfs_buf_ioend_async(bp);
> +	if (bp->b_flags & XBF_ASYNC) {
> +		INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
> +		queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
> +	} else {
> +		complete(&bp->b_iowait);
> +	}
> +
>  	bio_put(bio);
>  }
>  
> @@ -1568,9 +1577,11 @@ xfs_buf_iowait(
>  {
>  	ASSERT(!(bp->b_flags & XBF_ASYNC));
>  
> -	trace_xfs_buf_iowait(bp, _RET_IP_);
> -	wait_for_completion(&bp->b_iowait);
> -	trace_xfs_buf_iowait_done(bp, _RET_IP_);
> +	do {
> +		trace_xfs_buf_iowait(bp, _RET_IP_);
> +		wait_for_completion(&bp->b_iowait);
> +		trace_xfs_buf_iowait_done(bp, _RET_IP_);
> +	} while (!__xfs_buf_ioend(bp));
>  
>  	return bp->b_error;
>  }
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2025-02-25  5:37 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-02-17  9:31 buffer cache simplifications Christoph Hellwig
2025-02-17  9:31 ` [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O Christoph Hellwig
2025-02-18 20:05   ` Darrick J. Wong
2025-02-19  5:32     ` Christoph Hellwig
2025-02-17  9:31 ` [PATCH 2/4] xfs: decouple buffer readahead from the normal buffer read path Christoph Hellwig
2025-02-18 20:10   ` Darrick J. Wong
2025-02-19  5:32     ` Christoph Hellwig
2025-02-17  9:31 ` [PATCH 3/4] xfs: remove most in-flight buffer accounting Christoph Hellwig
2025-02-18 20:23   ` Darrick J. Wong
2025-02-19  5:35     ` Christoph Hellwig
2025-02-19  5:37       ` Darrick J. Wong
2025-02-17  9:31 ` [PATCH 4/4] xfs: remove the XBF_STALE check from xfs_buf_rele_cached Christoph Hellwig
2025-02-18 20:23   ` Darrick J. Wong
  -- strict thread matches above, loose matches on Subject: below --
2025-02-24 15:11 buffer cache simplifications v2 Christoph Hellwig
2025-02-24 15:11 ` [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O Christoph Hellwig
2025-02-24 19:13   ` Darrick J. Wong
2025-02-24 23:48 buffer cache simplifications v3 Christoph Hellwig
2025-02-24 23:48 ` [PATCH 1/4] xfs: reduce context switches for synchronous buffered I/O Christoph Hellwig
2025-02-25  5:37   ` Darrick J. Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox