From: "Darrick J. Wong" <djwong@kernel.org>
To: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@kernel.dk>, Carlos Maiolino <cem@kernel.org>,
Damien Le Moal <dlemoal@kernel.org>,
Hans Holmberg <hans.holmberg@wdc.com>,
Keith Busch <kbusch@kernel.org>,
linux-block@vger.kernel.org, linux-xfs@vger.kernel.org,
Carlos Maiolino <cmaiolino@redhat.com>
Subject: Re: [PATCH 3/3] xfs: rework zone GC buffer management
Date: Wed, 14 Jan 2026 11:59:47 -0800 [thread overview]
Message-ID: <20260114195947.GI15551@frogsfrogsfrogs> (raw)
In-Reply-To: <20260114130651.3439765-4-hch@lst.de>
On Wed, Jan 14, 2026 at 02:06:43PM +0100, Christoph Hellwig wrote:
> The double buffering where just one scratch area is used at a time does
> not efficiently use the available memory. It was originally implemented
> when GC I/O could happen out of order, but that was removed before
> upstream submission to avoid fragmentation. Now that all GC I/Os are
> processed in order, just use a number of buffers as a simple ring buffer.
>
> For a synthetic benchmark that fills 256MiB HDD zones and punches out
> holes to free half the space this leads to a decrease of GC time by
> a little more than 25%.
>
> Thanks to Hans Holmberg <hans.holmberg@wdc.com> for testing and
> benchmarking.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
> Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Huh. I guess I never touched this patch at all???
Ok, so it looks like we're swapping out the garbage collection scheme's
two independent (but not really independent) scratchpads for what
amounts to a ring buffer built from a handful of discontiguous folios.
Now when gc wants to move some data, we compute the amount of scratch
space we need, and bump scratch_head by that many bytes. Then we attach
the corresponding folios
to a read bio and submit it. When that finishes, we call bio_reuse with
REQ_OP_WRITE and submit that. When the write finishes we do the "remap
if mapping hasn't changed" dance to update the metadata, and advance the
tail by however many bytes of scratch space we've stopped using.
If I got that right, then
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_zone_gc.c | 106 ++++++++++++++++++++++++-------------------
> 1 file changed, 59 insertions(+), 47 deletions(-)
>
> diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
> index c9a3df6a5289..ba4f8e011e36 100644
> --- a/fs/xfs/xfs_zone_gc.c
> +++ b/fs/xfs/xfs_zone_gc.c
> @@ -50,23 +50,11 @@
> */
>
> /*
> - * Size of each GC scratch pad. This is also the upper bound for each
> - * GC I/O, which helps to keep latency down.
> + * Size of each GC scratch allocation, and the number of buffers.
> */
> -#define XFS_GC_CHUNK_SIZE SZ_1M
> -
> -/*
> - * Scratchpad data to read GCed data into.
> - *
> - * The offset member tracks where the next allocation starts, and freed tracks
> - * the amount of space that is not used anymore.
> - */
> -#define XFS_ZONE_GC_NR_SCRATCH 2
> -struct xfs_zone_scratch {
> - struct folio *folio;
> - unsigned int offset;
> - unsigned int freed;
> -};
> +#define XFS_GC_BUF_SIZE SZ_1M
> +#define XFS_GC_NR_BUFS 2
> +static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
>
> /*
> * Chunk that is read and written for each GC operation.
> @@ -141,10 +129,14 @@ struct xfs_zone_gc_data {
> struct bio_set bio_set;
>
> /*
> - * Scratchpad used, and index to indicated which one is used.
> + * Scratchpad to buffer GC data, organized as a ring buffer over
> + * discontiguous folios. scratch_head is where the buffer is filled,
> + * and scratch_tail tracks the buffer space freed.
> */
> - struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
> - unsigned int scratch_idx;
> + struct folio *scratch_folios[XFS_GC_NR_BUFS];
> + unsigned int scratch_size;
> + unsigned int scratch_head;
> + unsigned int scratch_tail;
>
> /*
> * List of bios currently being read, written and reset.
> @@ -210,20 +202,16 @@ xfs_zone_gc_data_alloc(
> if (!data->iter.recs)
> goto out_free_data;
>
> - /*
> - * We actually only need a single bio_vec. It would be nice to have
> - * a flag that only allocates the inline bvecs and not the separate
> - * bvec pool.
> - */
> if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
> BIOSET_NEED_BVECS))
> goto out_free_recs;
> - for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
> - data->scratch[i].folio =
> - folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
> - if (!data->scratch[i].folio)
> + for (i = 0; i < XFS_GC_NR_BUFS; i++) {
> + data->scratch_folios[i] =
> + folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
> + if (!data->scratch_folios[i])
> goto out_free_scratch;
> }
> + data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
> INIT_LIST_HEAD(&data->reading);
> INIT_LIST_HEAD(&data->writing);
> INIT_LIST_HEAD(&data->resetting);
> @@ -232,7 +220,7 @@ xfs_zone_gc_data_alloc(
>
> out_free_scratch:
> while (--i >= 0)
> - folio_put(data->scratch[i].folio);
> + folio_put(data->scratch_folios[i]);
> bioset_exit(&data->bio_set);
> out_free_recs:
> kfree(data->iter.recs);
> @@ -247,8 +235,8 @@ xfs_zone_gc_data_free(
> {
> int i;
>
> - for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
> - folio_put(data->scratch[i].folio);
> + for (i = 0; i < XFS_GC_NR_BUFS; i++)
> + folio_put(data->scratch_folios[i]);
> bioset_exit(&data->bio_set);
> kfree(data->iter.recs);
> kfree(data);
> @@ -590,7 +578,12 @@ static unsigned int
> xfs_zone_gc_scratch_available(
> struct xfs_zone_gc_data *data)
> {
> - return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
> + if (!data->scratch_tail)
> + return data->scratch_size - data->scratch_head;
> +
> + if (!data->scratch_head)
> + return data->scratch_tail;
> + return (data->scratch_size - data->scratch_head) + data->scratch_tail;
> }
>
> static bool
> @@ -664,6 +657,28 @@ xfs_zone_gc_alloc_blocks(
> return oz;
> }
>
> +static void
> +xfs_zone_gc_add_data(
> + struct xfs_gc_bio *chunk)
> +{
> + struct xfs_zone_gc_data *data = chunk->data;
> + unsigned int len = chunk->len;
> + unsigned int off = data->scratch_head;
> +
> + do {
> + unsigned int this_off = off % XFS_GC_BUF_SIZE;
> + unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
> +
> + bio_add_folio_nofail(&chunk->bio,
> + data->scratch_folios[off / XFS_GC_BUF_SIZE],
> + this_len, this_off);
> + len -= this_len;
> + off += this_len;
> + if (off == data->scratch_size)
> + off = 0;
> + } while (len);
> +}
> +
> static bool
> xfs_zone_gc_start_chunk(
> struct xfs_zone_gc_data *data)
> @@ -677,6 +692,7 @@ xfs_zone_gc_start_chunk(
> struct xfs_inode *ip;
> struct bio *bio;
> xfs_daddr_t daddr;
> + unsigned int len;
> bool is_seq;
>
> if (xfs_is_shutdown(mp))
> @@ -691,17 +707,19 @@ xfs_zone_gc_start_chunk(
> return false;
> }
>
> - bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
> + len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
> + bio = bio_alloc_bioset(bdev,
> + min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
> + REQ_OP_READ, GFP_NOFS, &data->bio_set);
>
> chunk = container_of(bio, struct xfs_gc_bio, bio);
> chunk->ip = ip;
> chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
> - chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
> + chunk->len = len;
> chunk->old_startblock =
> xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
> chunk->new_daddr = daddr;
> chunk->is_seq = is_seq;
> - chunk->scratch = &data->scratch[data->scratch_idx];
> chunk->data = data;
> chunk->oz = oz;
> chunk->victim_rtg = iter->victim_rtg;
> @@ -710,13 +728,9 @@ xfs_zone_gc_start_chunk(
>
> bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
> bio->bi_end_io = xfs_zone_gc_end_io;
> - bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
> - chunk->scratch->offset);
> - chunk->scratch->offset += chunk->len;
> - if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
> - data->scratch_idx =
> - (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
> - }
> + xfs_zone_gc_add_data(chunk);
> + data->scratch_head = (data->scratch_head + len) % data->scratch_size;
> +
> WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
> list_add_tail(&chunk->entry, &data->reading);
> xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
> @@ -834,6 +848,7 @@ xfs_zone_gc_finish_chunk(
> struct xfs_gc_bio *chunk)
> {
> uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> + struct xfs_zone_gc_data *data = chunk->data;
> struct xfs_inode *ip = chunk->ip;
> struct xfs_mount *mp = ip->i_mount;
> int error;
> @@ -845,11 +860,8 @@ xfs_zone_gc_finish_chunk(
> return;
> }
>
> - chunk->scratch->freed += chunk->len;
> - if (chunk->scratch->freed == chunk->scratch->offset) {
> - chunk->scratch->offset = 0;
> - chunk->scratch->freed = 0;
> - }
> + data->scratch_tail =
> + (data->scratch_tail + chunk->len) % data->scratch_size;
>
> /*
> * Cycle through the iolock and wait for direct I/O and layouts to
> --
> 2.47.3
>
>
next prev parent reply other threads:[~2026-01-14 19:59 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-14 13:06 improve zoned XFS GC buffer management v4 Christoph Hellwig
2026-01-14 13:06 ` [PATCH 1/3] block: add a bio_reuse helper Christoph Hellwig
2026-01-14 13:06 ` [PATCH 2/3] xfs: use bio_reuse in the zone GC code Christoph Hellwig
2026-01-14 19:50 ` Darrick J. Wong
2026-01-14 13:06 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
2026-01-14 19:59 ` Darrick J. Wong [this message]
2026-01-15 6:12 ` Christoph Hellwig
2026-01-25 0:03 ` Chris Mason
2026-01-26 20:30 ` Keith Busch
2026-01-27 6:53 ` Christoph Hellwig
2026-01-21 12:21 ` improve zoned XFS GC buffer management v4 Carlos Maiolino
-- strict thread matches above, loose matches on Subject: below --
2026-01-13 7:19 improve zoned XFS GC buffer management v3 Christoph Hellwig
2026-01-13 7:19 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
2026-01-06 7:58 improve zoned XFS GC buffer management v2 Christoph Hellwig
2026-01-06 7:58 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
2026-01-09 12:24 ` Carlos Maiolino
2025-12-18 6:31 improve zoned XFS " Christoph Hellwig
2025-12-18 6:31 ` [PATCH 3/3] xfs: rework zone " Christoph Hellwig
2025-12-18 7:12 ` Damien Le Moal
2025-12-19 8:06 ` Hans Holmberg
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260114195947.GI15551@frogsfrogsfrogs \
--to=djwong@kernel.org \
--cc=axboe@kernel.dk \
--cc=cem@kernel.org \
--cc=cmaiolino@redhat.com \
--cc=dlemoal@kernel.org \
--cc=hans.holmberg@wdc.com \
--cc=hch@lst.de \
--cc=kbusch@kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-xfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox