From: "Darrick J. Wong" <djwong@kernel.org>
To: Christoph Hellwig <hch@lst.de>
Cc: Carlos Maiolino <cem@kernel.org>, linux-xfs@vger.kernel.org
Subject: Re: [PATCH 09/15] xfs: simplify buffer I/O submission
Date: Mon, 6 Jan 2025 22:42:24 -0800 [thread overview]
Message-ID: <20250107064224.GA6174@frogsfrogsfrogs> (raw)
In-Reply-To: <20250106095613.847700-10-hch@lst.de>
On Mon, Jan 06, 2025 at 10:54:46AM +0100, Christoph Hellwig wrote:
> The code in _xfs_buf_ioapply is unnecessarily complicated because it
> doesn't take advantage of modern bio features.
>
> Simplify it by making use of bio splitting and chaining; that is, build
> a single bio for the pages in the buffer using a simple loop, and then
> split that bio on the map boundaries for discontiguous multi-FSB buffers
> and chain the split bios to the main one so that there is only a single
> I/O completion.
>
> This not only simplifies the code to build the buffer, but also removes
> the need for the b_io_remaining field as buffer ownership is granted
> to the bio on submit of the final bio with no chance for a completion
> before that as well as the b_io_error field that is now superfluous
> because there always is exactly one completion.
So I guess b_io_remaining was the count of in-flight bios? And making a
chain of bios basically just moves all that to the bio layer, so all
xfs_buf needs to know is whether or not a bio is in progress, right?
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_buf.c | 207 ++++++++++++++---------------------------------
> fs/xfs/xfs_buf.h | 2 -
> 2 files changed, 60 insertions(+), 149 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index e886605b5721..094f16457998 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1364,13 +1364,6 @@ xfs_buf_ioend(
> {
> trace_xfs_buf_iodone(bp, _RET_IP_);
>
> - /*
> - * Pull in IO completion errors now. We are guaranteed to be running
> - * single threaded, so we don't need the lock to read b_io_error.
> - */
> - if (!bp->b_error && bp->b_io_error)
> - xfs_buf_ioerror(bp, bp->b_io_error);
> -
> if (bp->b_flags & XBF_READ) {
> if (!bp->b_error && bp->b_ops)
> bp->b_ops->verify_read(bp);
> @@ -1494,118 +1487,26 @@ static void
> xfs_buf_bio_end_io(
> struct bio *bio)
> {
> - struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
> + struct xfs_buf *bp = bio->bi_private;
>
> - if (!bio->bi_status &&
> - (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
> - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
> - bio->bi_status = BLK_STS_IOERR;
> -
> - /*
> - * don't overwrite existing errors - otherwise we can lose errors on
> - * buffers that require multiple bios to complete.
> - */
> - if (bio->bi_status) {
> - int error = blk_status_to_errno(bio->bi_status);
> -
> - cmpxchg(&bp->b_io_error, 0, error);
> - }
> + if (bio->bi_status)
> + xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
> + else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
> + XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
> + xfs_buf_ioerror(bp, -EIO);
>
> if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
> invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
>
> - if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
> - xfs_buf_ioend_async(bp);
> + xfs_buf_ioend_async(bp);
> bio_put(bio);
> }
>
> -static void
> -xfs_buf_ioapply_map(
> - struct xfs_buf *bp,
> - int map,
> - int *buf_offset,
> - int *count,
> - blk_opf_t op)
> -{
> - int page_index;
> - unsigned int total_nr_pages = bp->b_page_count;
> - int nr_pages;
> - struct bio *bio;
> - sector_t sector = bp->b_maps[map].bm_bn;
> - int size;
> - int offset;
> -
> - /* skip the pages in the buffer before the start offset */
> - page_index = 0;
> - offset = *buf_offset;
> - while (offset >= PAGE_SIZE) {
> - page_index++;
> - offset -= PAGE_SIZE;
> - }
> -
> - /*
> - * Limit the IO size to the length of the current vector, and update the
> - * remaining IO count for the next time around.
> - */
> - size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
> - *count -= size;
> - *buf_offset += size;
> -
> -next_chunk:
> - atomic_inc(&bp->b_io_remaining);
> - nr_pages = bio_max_segs(total_nr_pages);
> -
> - bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
> - bio->bi_iter.bi_sector = sector;
> - bio->bi_end_io = xfs_buf_bio_end_io;
> - bio->bi_private = bp;
> -
> - for (; size && nr_pages; nr_pages--, page_index++) {
> - int rbytes, nbytes = PAGE_SIZE - offset;
> -
> - if (nbytes > size)
> - nbytes = size;
> -
> - rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
> - offset);
> - if (rbytes < nbytes)
> - break;
> -
> - offset = 0;
> - sector += BTOBB(nbytes);
> - size -= nbytes;
> - total_nr_pages--;
> - }
> -
> - if (likely(bio->bi_iter.bi_size)) {
> - if (xfs_buf_is_vmapped(bp)) {
> - flush_kernel_vmap_range(bp->b_addr,
> - xfs_buf_vmap_len(bp));
> - }
> - submit_bio(bio);
> - if (size)
> - goto next_chunk;
> - } else {
> - /*
> - * This is guaranteed not to be the last io reference count
> - * because the caller (xfs_buf_submit) holds a count itself.
> - */
> - atomic_dec(&bp->b_io_remaining);
> - xfs_buf_ioerror(bp, -EIO);
> - bio_put(bio);
> - }
> -
> -}
> -
> -STATIC void
> -_xfs_buf_ioapply(
> - struct xfs_buf *bp)
> +static inline blk_opf_t
> +xfs_buf_bio_op(
> + struct xfs_buf *bp)
> {
> - struct blk_plug plug;
> - blk_opf_t op;
> - int offset;
> - int size;
> - int i;
> + blk_opf_t op;
>
> if (bp->b_flags & XBF_WRITE) {
> op = REQ_OP_WRITE;
> @@ -1615,25 +1516,52 @@ _xfs_buf_ioapply(
> op |= REQ_RAHEAD;
> }
>
> - /* we only use the buffer cache for meta-data */
> - op |= REQ_META;
> + return op | REQ_META;
> +}
> +
> +static void
> +xfs_buf_submit_bio(
> + struct xfs_buf *bp)
> +{
> + unsigned int size = BBTOB(bp->b_length);
> + unsigned int map = 0, p;
> + struct blk_plug plug;
> + struct bio *bio;
> +
> + bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> + xfs_buf_bio_op(bp), GFP_NOIO);
> + bio->bi_private = bp;
> + bio->bi_end_io = xfs_buf_bio_end_io;
> +
> + if (bp->b_flags & _XBF_KMEM) {
> + __bio_add_page(bio, virt_to_page(bp->b_addr), size,
> + bp->b_offset);
> + } else {
> + for (p = 0; p < bp->b_page_count; p++)
> + __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> + bio->bi_iter.bi_size = size; /* limit to the actual size used */
> +
> + if (xfs_buf_is_vmapped(bp))
> + flush_kernel_vmap_range(bp->b_addr,
> + xfs_buf_vmap_len(bp));
> + }
>
> /*
> - * Walk all the vectors issuing IO on them. Set up the initial offset
> - * into the buffer and the desired IO size before we start -
> - * _xfs_buf_ioapply_vec() will modify them appropriately for each
> - * subsequent call.
> + * If there is more than one map segment, split the original bio to
> + * submit a separate bio for each discontiguous range.
> */
> - offset = bp->b_offset;
> - size = BBTOB(bp->b_length);
> blk_start_plug(&plug);
> - for (i = 0; i < bp->b_map_count; i++) {
> - xfs_buf_ioapply_map(bp, i, &offset, &size, op);
> - if (bp->b_error)
> - break;
> - if (size <= 0)
> - break; /* all done */
Eerrugh, I spent quite a while trying to wrap my head around the old
code when I was writing the in-memory xfs_buf support. This is much
less weird to look at...
> + for (map = 0; map < bp->b_map_count - 1; map++) {
...but why isn't this "map < bp->b_map_count"?
--D
> + struct bio *split;
> +
> + split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS,
> + &fs_bio_set);
> + split->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
> + bio_chain(split, bio);
> + submit_bio(split);
> }
> + bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
> + submit_bio(bio);
> blk_finish_plug(&plug);
> }
>
> @@ -1693,8 +1621,6 @@ static int
> xfs_buf_submit(
> struct xfs_buf *bp)
> {
> - int error = 0;
> -
> trace_xfs_buf_submit(bp, _RET_IP_);
>
> ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
> @@ -1735,14 +1661,7 @@ xfs_buf_submit(
> * left over from previous use of the buffer (e.g. failed readahead).
> */
> bp->b_error = 0;
> - bp->b_io_error = 0;
>
> - /*
> - * Set the count to 1 initially, this will stop an I/O completion
> - * callout which happens before we have started all the I/O from calling
> - * xfs_buf_ioend too early.
> - */
> - atomic_set(&bp->b_io_remaining, 1);
> if (bp->b_flags & XBF_ASYNC)
> xfs_buf_ioacct_inc(bp);
>
> @@ -1755,28 +1674,22 @@ xfs_buf_submit(
> if (xfs_buftarg_is_mem(bp->b_target))
> goto done;
>
> - _xfs_buf_ioapply(bp);
> + xfs_buf_submit_bio(bp);
> + goto rele;
>
> done:
> - /*
> - * If _xfs_buf_ioapply failed, we can get back here with only the IO
> - * reference we took above. If we drop it to zero, run completion so
> - * that we don't return to the caller with completion still pending.
> - */
> - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
> - if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
> - xfs_buf_ioend(bp);
> - else
> - xfs_buf_ioend_async(bp);
> - }
> -
> + if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
> + xfs_buf_ioend(bp);
> + else
> + xfs_buf_ioend_async(bp);
> +rele:
> /*
> * Release the hold that keeps the buffer referenced for the entire
> * I/O. Note that if the buffer is async, it is not safe to reference
> * after this release.
> */
> xfs_buf_rele(bp);
> - return error;
> + return 0;
> }
>
> void *
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index da80399c7457..c53d27439ff2 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -184,7 +184,6 @@ struct xfs_buf {
> struct list_head b_lru; /* lru list */
> spinlock_t b_lock; /* internal state lock */
> unsigned int b_state; /* internal state flags */
> - int b_io_error; /* internal IO error state */
> wait_queue_head_t b_waiters; /* unpin waiters */
> struct list_head b_list;
> struct xfs_perag *b_pag;
> @@ -202,7 +201,6 @@ struct xfs_buf {
> struct xfs_buf_map __b_map; /* inline compound buffer map */
> int b_map_count;
> atomic_t b_pin_count; /* pin count */
> - atomic_t b_io_remaining; /* #outstanding I/O requests */
> unsigned int b_page_count; /* size of page array */
> unsigned int b_offset; /* page offset of b_addr,
> only for _XBF_KMEM buffers */
> --
> 2.45.2
>
>
next prev parent reply other threads:[~2025-01-07 6:42 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-01-06 9:54 buffer cache cleanups Christoph Hellwig
2025-01-06 9:54 ` [PATCH 01/15] xfs: fix a double completion for buffers on in-memory targets Christoph Hellwig
2025-01-07 2:00 ` Darrick J. Wong
2025-01-07 6:05 ` Christoph Hellwig
2025-01-06 9:54 ` [PATCH 02/15] xfs: remove the incorrect comment above xfs_buf_free_maps Christoph Hellwig
2025-01-07 2:00 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 03/15] xfs: remove the incorrect comment about the b_pag field Christoph Hellwig
2025-01-07 2:01 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 04/15] xfs: move xfs_buf_iowait out of (__)xfs_buf_submit Christoph Hellwig
2025-01-07 2:02 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 05/15] xfs: simplify xfs_buf_delwri_pushbuf Christoph Hellwig
2025-01-07 2:08 ` Darrick J. Wong
2025-01-07 6:06 ` Christoph Hellwig
2025-01-13 7:12 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 06/15] xfs: remove xfs_buf_delwri_submit_buffers Christoph Hellwig
2025-01-07 6:31 ` Darrick J. Wong
2025-01-07 6:33 ` Christoph Hellwig
2025-01-06 9:54 ` [PATCH 07/15] xfs: move write verification out of _xfs_buf_ioapply Christoph Hellwig
2025-01-07 6:33 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 08/15] xfs: move in-memory buftarg handling " Christoph Hellwig
2025-01-07 6:34 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 09/15] xfs: simplify buffer I/O submission Christoph Hellwig
2025-01-07 6:42 ` Darrick J. Wong [this message]
2025-01-07 6:46 ` Christoph Hellwig
2025-01-07 6:57 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 10/15] xfs: move invalidate_kernel_vmap_range to xfs_buf_ioend Christoph Hellwig
2025-01-07 6:42 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 11/15] xfs: remove the extra buffer reference in xfs_buf_submit Christoph Hellwig
2025-01-13 7:13 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 12/15] xfs: always complete the buffer inline " Christoph Hellwig
2025-01-07 6:46 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 13/15] xfs: simplify xfsaild_resubmit_item Christoph Hellwig
2025-01-07 6:49 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 14/15] xfs: move b_li_list based retry handling to common code Christoph Hellwig
2025-01-07 6:55 ` Darrick J. Wong
2025-01-07 7:03 ` Christoph Hellwig
2025-01-13 7:18 ` Darrick J. Wong
2025-01-06 9:54 ` [PATCH 15/15] xfs: add a b_iodone callback to struct xfs_buf Christoph Hellwig
2025-01-07 6:58 ` Darrick J. Wong
-- strict thread matches above, loose matches on Subject: below --
2025-01-13 14:12 buffer cache cleanups v2 Christoph Hellwig
2025-01-13 14:12 ` [PATCH 09/15] xfs: simplify buffer I/O submission Christoph Hellwig
2025-01-13 18:15 ` Darrick J. Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250107064224.GA6174@frogsfrogsfrogs \
--to=djwong@kernel.org \
--cc=cem@kernel.org \
--cc=hch@lst.de \
--cc=linux-xfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox