From: "Darrick J. Wong" <djwong@kernel.org>
To: Christoph Hellwig <hch@lst.de>
Cc: Carlos Maiolino <cem@kernel.org>, linux-xfs@vger.kernel.org
Subject: Re: [PATCH 09/15] xfs: simplify buffer I/O submission
Date: Mon, 6 Jan 2025 22:42:24 -0800 [thread overview]
Message-ID: <20250107064224.GA6174@frogsfrogsfrogs> (raw)
In-Reply-To: <20250106095613.847700-10-hch@lst.de>
On Mon, Jan 06, 2025 at 10:54:46AM +0100, Christoph Hellwig wrote:
> The code in _xfs_buf_ioapply is unnecessarily complicated because it
> doesn't take advantage of modern bio features.
>
> Simplify it by making use of bio splitting and chaining, that is build
> a single bio for the pages in the buffer using a simple loop, and then
> split that bio on the map boundaries for discontiguous multi-FSB buffers
> and chain the split bios to the main one so that there is only a single
> I/O completion.
>
> This not only simplifies the code to build the buffer, but also removes
> the need for the b_io_remaining field as buffer ownership is granted
> to the bio on submit of the final bio with no chance for a completion
> before that as well as the b_io_error field that is now superfluous
> because there always is exactly one completion.
So I guess b_io_remaining was the count of in-flight bios? And making a
chain of bios basically just moves all that to the bio layer, so all
xfs_buf needs to know is whether or not a bio is in progress, right?
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_buf.c | 207 ++++++++++++++---------------------------------
> fs/xfs/xfs_buf.h | 2 -
> 2 files changed, 60 insertions(+), 149 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index e886605b5721..094f16457998 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1364,13 +1364,6 @@ xfs_buf_ioend(
> {
> trace_xfs_buf_iodone(bp, _RET_IP_);
>
> - /*
> - * Pull in IO completion errors now. We are guaranteed to be running
> - * single threaded, so we don't need the lock to read b_io_error.
> - */
> - if (!bp->b_error && bp->b_io_error)
> - xfs_buf_ioerror(bp, bp->b_io_error);
> -
> if (bp->b_flags & XBF_READ) {
> if (!bp->b_error && bp->b_ops)
> bp->b_ops->verify_read(bp);
> @@ -1494,118 +1487,26 @@ static void
> xfs_buf_bio_end_io(
> struct bio *bio)
> {
> - struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
> + struct xfs_buf *bp = bio->bi_private;
>
> - if (!bio->bi_status &&
> - (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
> - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
> - bio->bi_status = BLK_STS_IOERR;
> -
> - /*
> - * don't overwrite existing errors - otherwise we can lose errors on
> - * buffers that require multiple bios to complete.
> - */
> - if (bio->bi_status) {
> - int error = blk_status_to_errno(bio->bi_status);
> -
> - cmpxchg(&bp->b_io_error, 0, error);
> - }
> + if (bio->bi_status)
> + xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
> + else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
> + XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
> + xfs_buf_ioerror(bp, -EIO);
>
> if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
> invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
>
> - if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
> - xfs_buf_ioend_async(bp);
> + xfs_buf_ioend_async(bp);
> bio_put(bio);
> }
>
> -static void
> -xfs_buf_ioapply_map(
> - struct xfs_buf *bp,
> - int map,
> - int *buf_offset,
> - int *count,
> - blk_opf_t op)
> -{
> - int page_index;
> - unsigned int total_nr_pages = bp->b_page_count;
> - int nr_pages;
> - struct bio *bio;
> - sector_t sector = bp->b_maps[map].bm_bn;
> - int size;
> - int offset;
> -
> - /* skip the pages in the buffer before the start offset */
> - page_index = 0;
> - offset = *buf_offset;
> - while (offset >= PAGE_SIZE) {
> - page_index++;
> - offset -= PAGE_SIZE;
> - }
> -
> - /*
> - * Limit the IO size to the length of the current vector, and update the
> - * remaining IO count for the next time around.
> - */
> - size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
> - *count -= size;
> - *buf_offset += size;
> -
> -next_chunk:
> - atomic_inc(&bp->b_io_remaining);
> - nr_pages = bio_max_segs(total_nr_pages);
> -
> - bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
> - bio->bi_iter.bi_sector = sector;
> - bio->bi_end_io = xfs_buf_bio_end_io;
> - bio->bi_private = bp;
> -
> - for (; size && nr_pages; nr_pages--, page_index++) {
> - int rbytes, nbytes = PAGE_SIZE - offset;
> -
> - if (nbytes > size)
> - nbytes = size;
> -
> - rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
> - offset);
> - if (rbytes < nbytes)
> - break;
> -
> - offset = 0;
> - sector += BTOBB(nbytes);
> - size -= nbytes;
> - total_nr_pages--;
> - }
> -
> - if (likely(bio->bi_iter.bi_size)) {
> - if (xfs_buf_is_vmapped(bp)) {
> - flush_kernel_vmap_range(bp->b_addr,
> - xfs_buf_vmap_len(bp));
> - }
> - submit_bio(bio);
> - if (size)
> - goto next_chunk;
> - } else {
> - /*
> - * This is guaranteed not to be the last io reference count
> - * because the caller (xfs_buf_submit) holds a count itself.
> - */
> - atomic_dec(&bp->b_io_remaining);
> - xfs_buf_ioerror(bp, -EIO);
> - bio_put(bio);
> - }
> -
> -}
> -
> -STATIC void
> -_xfs_buf_ioapply(
> - struct xfs_buf *bp)
> +static inline blk_opf_t
> +xfs_buf_bio_op(
> + struct xfs_buf *bp)
> {
> - struct blk_plug plug;
> - blk_opf_t op;
> - int offset;
> - int size;
> - int i;
> + blk_opf_t op;
>
> if (bp->b_flags & XBF_WRITE) {
> op = REQ_OP_WRITE;
> @@ -1615,25 +1516,52 @@ _xfs_buf_ioapply(
> op |= REQ_RAHEAD;
> }
>
> - /* we only use the buffer cache for meta-data */
> - op |= REQ_META;
> + return op | REQ_META;
> +}
> +
> +static void
> +xfs_buf_submit_bio(
> + struct xfs_buf *bp)
> +{
> + unsigned int size = BBTOB(bp->b_length);
> + unsigned int map = 0, p;
> + struct blk_plug plug;
> + struct bio *bio;
> +
> + bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> + xfs_buf_bio_op(bp), GFP_NOIO);
> + bio->bi_private = bp;
> + bio->bi_end_io = xfs_buf_bio_end_io;
> +
> + if (bp->b_flags & _XBF_KMEM) {
> + __bio_add_page(bio, virt_to_page(bp->b_addr), size,
> + bp->b_offset);
> + } else {
> + for (p = 0; p < bp->b_page_count; p++)
> + __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> + bio->bi_iter.bi_size = size; /* limit to the actual size used */
> +
> + if (xfs_buf_is_vmapped(bp))
> + flush_kernel_vmap_range(bp->b_addr,
> + xfs_buf_vmap_len(bp));
> + }
>
> /*
> - * Walk all the vectors issuing IO on them. Set up the initial offset
> - * into the buffer and the desired IO size before we start -
> - * _xfs_buf_ioapply_vec() will modify them appropriately for each
> - * subsequent call.
> + * If there is more than one map segment, split the original bio to
> + * submit a separate bio for each discontiguous range.
> */
> - offset = bp->b_offset;
> - size = BBTOB(bp->b_length);
> blk_start_plug(&plug);
> - for (i = 0; i < bp->b_map_count; i++) {
> - xfs_buf_ioapply_map(bp, i, &offset, &size, op);
> - if (bp->b_error)
> - break;
> - if (size <= 0)
> - break; /* all done */
Eerrugh, I spent quite a while trying to wrap my head around the old
code when I was writing the in-memory xfs_buf support. This is much
less weird to look at...
> + for (map = 0; map < bp->b_map_count - 1; map++) {
...but why isn't this "map < bp->b_map_count"?
--D
> + struct bio *split;
> +
> + split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS,
> + &fs_bio_set);
> + split->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
> + bio_chain(split, bio);
> + submit_bio(split);
> }
> + bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
> + submit_bio(bio);
> blk_finish_plug(&plug);
> }
>
> @@ -1693,8 +1621,6 @@ static int
> xfs_buf_submit(
> struct xfs_buf *bp)
> {
> - int error = 0;
> -
> trace_xfs_buf_submit(bp, _RET_IP_);
>
> ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
> @@ -1735,14 +1661,7 @@ xfs_buf_submit(
> * left over from previous use of the buffer (e.g. failed readahead).
> */
> bp->b_error = 0;
> - bp->b_io_error = 0;
>
> - /*
> - * Set the count to 1 initially, this will stop an I/O completion
> - * callout which happens before we have started all the I/O from calling
> - * xfs_buf_ioend too early.
> - */
> - atomic_set(&bp->b_io_remaining, 1);
> if (bp->b_flags & XBF_ASYNC)
> xfs_buf_ioacct_inc(bp);
>
> @@ -1755,28 +1674,22 @@ xfs_buf_submit(
> if (xfs_buftarg_is_mem(bp->b_target))
> goto done;
>
> - _xfs_buf_ioapply(bp);
> + xfs_buf_submit_bio(bp);
> + goto rele;
>
> done:
> - /*
> - * If _xfs_buf_ioapply failed, we can get back here with only the IO
> - * reference we took above. If we drop it to zero, run completion so
> - * that we don't return to the caller with completion still pending.
> - */
> - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
> - if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
> - xfs_buf_ioend(bp);
> - else
> - xfs_buf_ioend_async(bp);
> - }
> -
> + if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
> + xfs_buf_ioend(bp);
> + else
> + xfs_buf_ioend_async(bp);
> +rele:
> /*
> * Release the hold that keeps the buffer referenced for the entire
> * I/O. Note that if the buffer is async, it is not safe to reference
> * after this release.
> */
> xfs_buf_rele(bp);
> - return error;
> + return 0;
> }
>
> void *
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index da80399c7457..c53d27439ff2 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -184,7 +184,6 @@ struct xfs_buf {
> struct list_head b_lru; /* lru list */
> spinlock_t b_lock; /* internal state lock */
> unsigned int b_state; /* internal state flags */
> - int b_io_error; /* internal IO error state */
> wait_queue_head_t b_waiters; /* unpin waiters */
> struct list_head b_list;
> struct xfs_perag *b_pag;
> @@ -202,7 +201,6 @@ struct xfs_buf {
> struct xfs_buf_map __b_map; /* inline compound buffer map */
> int b_map_count;
> atomic_t b_pin_count; /* pin count */
> - atomic_t b_io_remaining; /* #outstanding I/O requests */
> unsigned int b_page_count; /* size of page array */
> unsigned int b_offset; /* page offset of b_addr,
> only for _XBF_KMEM buffers */
> --
> 2.45.2
>
>
next prev parent reply other threads:[~2025-01-07 6:42 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-01-06 9:54 buffer cache cleanups Christoph Hellwig
2025-01-06 9:54 ` [PATCH 01/15] xfs: fix a double completion for buffers on in-memory targets Christoph Hellwig
2025-01-07 2:00 ` Darrick J. Wong
2025-01-07 6:05 ` Christoph Hellwig
2025-01-06 9:54 ` [PATCH 02/15] xfs: remove the incorrect comment above xfs_buf_free_maps Christoph Hellwig
2025-01-07 2:00 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 03/15] xfs: remove the incorrect comment about the b_pag field Christoph Hellwig
2025-01-07 2:01 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 04/15] xfs: move xfs_buf_iowait out of (__)xfs_buf_submit Christoph Hellwig
2025-01-07 2:02 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 05/15] xfs: simplify xfs_buf_delwri_pushbuf Christoph Hellwig
2025-01-07 2:08 ` Darrick J. Wong
2025-01-07 6:06 ` Christoph Hellwig
2025-01-13 7:12 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 06/15] xfs: remove xfs_buf_delwri_submit_buffers Christoph Hellwig
2025-01-07 6:31 ` Darrick J. Wong
2025-01-07 6:33 ` Christoph Hellwig
2025-01-06 9:54 ` [PATCH 07/15] xfs: move write verification out of _xfs_buf_ioapply Christoph Hellwig
2025-01-07 6:33 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 08/15] xfs: move in-memory buftarg handling " Christoph Hellwig
2025-01-07 6:34 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 09/15] xfs: simplify buffer I/O submission Christoph Hellwig
2025-01-07 6:42 ` Darrick J. Wong [this message]
2025-01-07 6:46 ` Christoph Hellwig
2025-01-07 6:57 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 10/15] xfs: move invalidate_kernel_vmap_range to xfs_buf_ioend Christoph Hellwig
2025-01-07 6:42 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 11/15] xfs: remove the extra buffer reference in xfs_buf_submit Christoph Hellwig
2025-01-13 7:13 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 12/15] xfs: always complete the buffer inline " Christoph Hellwig
2025-01-07 6:46 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 13/15] xfs: simplify xfsaild_resubmit_item Christoph Hellwig
2025-01-07 6:49 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 14/15] xfs: move b_li_list based retry handling to common code Christoph Hellwig
2025-01-07 6:55 ` Darrick J. Wong
2025-01-07 7:03 ` Christoph Hellwig
2025-01-13 7:18 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 15/15] xfs: add a b_iodone callback to struct xfs_buf Christoph Hellwig
2025-01-07 6:58 ` Darrick J. Wong
-- strict thread matches above, loose matches on Subject: below --
2025-01-13 14:12 buffer cache cleanups v2 Christoph Hellwig
2025-01-13 14:12 ` [PATCH 09/15] xfs: simplify buffer I/O submission Christoph Hellwig
2025-01-13 18:15 ` Darrick J. Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250107064224.GA6174@frogsfrogsfrogs \
--to=djwong@kernel.org \
--cc=cem@kernel.org \
--cc=hch@lst.de \
--cc=linux-xfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.