* [PATCH 3/3] xfs: rework zone GC buffer management
2025-12-18 6:31 improve zoned XFS " Christoph Hellwig
@ 2025-12-18 6:31 ` Christoph Hellwig
2025-12-18 7:12 ` Damien Le Moal
2025-12-19 8:06 ` Hans Holmberg
0 siblings, 2 replies; 17+ messages in thread
From: Christoph Hellwig @ 2025-12-18 6:31 UTC (permalink / raw)
To: Jens Axboe, Carlos Maiolino
Cc: Damien Le Moal, Hans Holmberg, linux-block, linux-xfs
The double buffering where just one scratch area is used at a time does
not efficiently use the available memory. It was originally implemented
when GC I/O could happen out of order, but that was removed before
upstream submission to avoid fragmentation. Now that all GC I/Os are
processed in order, just use a number of buffers as a simple ring buffer.
For a synthetic benchmark that fills 256MiB HDD zones and punches out
holes to free half the space this leads to a decrease of GC time by
a little more than 25%.
Thanks to Hans Holmberg <hans.holmberg@wdc.com> for testing and
benchmarking.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_zone_gc.c | 106 ++++++++++++++++++++++++-------------------
1 file changed, 59 insertions(+), 47 deletions(-)
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index f3dec13a620c..90f70473c3d5 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -50,23 +50,11 @@
*/
/*
- * Size of each GC scratch pad. This is also the upper bound for each
- * GC I/O, which helps to keep latency down.
+ * Size of each GC scratch allocation, and the number of buffers.
*/
-#define XFS_GC_CHUNK_SIZE SZ_1M
-
-/*
- * Scratchpad data to read GCed data into.
- *
- * The offset member tracks where the next allocation starts, and freed tracks
- * the amount of space that is not used anymore.
- */
-#define XFS_ZONE_GC_NR_SCRATCH 2
-struct xfs_zone_scratch {
- struct folio *folio;
- unsigned int offset;
- unsigned int freed;
-};
+#define XFS_GC_BUF_SIZE SZ_1M
+#define XFS_GC_NR_BUFS 2
+static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
/*
* Chunk that is read and written for each GC operation.
@@ -141,10 +129,14 @@ struct xfs_zone_gc_data {
struct bio_set bio_set;
/*
- * Scratchpad used, and index to indicated which one is used.
+ * Scratchpad to buffer GC data, organized as a ring buffer over
+ * discontiguous folios. scratch_head is where the buffer is filled,
+ * and scratch_tail tracks the buffer space freed.
*/
- struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
- unsigned int scratch_idx;
+ struct folio *scratch_folios[XFS_GC_NR_BUFS];
+ unsigned int scratch_size;
+ unsigned int scratch_head;
+ unsigned int scratch_tail;
/*
* List of bios currently being read, written and reset.
@@ -210,20 +202,16 @@ xfs_zone_gc_data_alloc(
if (!data->iter.recs)
goto out_free_data;
- /*
- * We actually only need a single bio_vec. It would be nice to have
- * a flag that only allocates the inline bvecs and not the separate
- * bvec pool.
- */
if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
BIOSET_NEED_BVECS))
goto out_free_recs;
- for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
- data->scratch[i].folio =
- folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
- if (!data->scratch[i].folio)
+ for (i = 0; i < XFS_GC_NR_BUFS; i++) {
+ data->scratch_folios[i] =
+ folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
+ if (!data->scratch_folios[i])
goto out_free_scratch;
}
+ data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
INIT_LIST_HEAD(&data->reading);
INIT_LIST_HEAD(&data->writing);
INIT_LIST_HEAD(&data->resetting);
@@ -232,7 +220,7 @@ xfs_zone_gc_data_alloc(
out_free_scratch:
while (--i >= 0)
- folio_put(data->scratch[i].folio);
+ folio_put(data->scratch_folios[i]);
bioset_exit(&data->bio_set);
out_free_recs:
kfree(data->iter.recs);
@@ -247,8 +235,8 @@ xfs_zone_gc_data_free(
{
int i;
- for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
- folio_put(data->scratch[i].folio);
+ for (i = 0; i < XFS_GC_NR_BUFS; i++)
+ folio_put(data->scratch_folios[i]);
bioset_exit(&data->bio_set);
kfree(data->iter.recs);
kfree(data);
@@ -591,7 +579,12 @@ static unsigned int
xfs_zone_gc_scratch_available(
struct xfs_zone_gc_data *data)
{
- return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+ if (!data->scratch_tail)
+ return data->scratch_size - data->scratch_head;
+
+ if (!data->scratch_head)
+ return data->scratch_tail;
+ return (data->scratch_size - data->scratch_head) + data->scratch_tail;
}
static bool
@@ -665,6 +658,28 @@ xfs_zone_gc_alloc_blocks(
return oz;
}
+static void
+xfs_zone_gc_add_data(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ unsigned int len = chunk->len;
+ unsigned int off = data->scratch_head;
+
+ do {
+ unsigned int this_off = off % XFS_GC_BUF_SIZE;
+ unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
+
+ bio_add_folio_nofail(&chunk->bio,
+ data->scratch_folios[off / XFS_GC_BUF_SIZE],
+ this_len, this_off);
+ len -= this_len;
+ off += this_len;
+ if (off == data->scratch_size)
+ off = 0;
+ } while (len);
+}
+
static bool
xfs_zone_gc_start_chunk(
struct xfs_zone_gc_data *data)
@@ -678,6 +693,7 @@ xfs_zone_gc_start_chunk(
struct xfs_inode *ip;
struct bio *bio;
xfs_daddr_t daddr;
+ unsigned int len;
bool is_seq;
if (xfs_is_shutdown(mp))
@@ -692,17 +708,19 @@ xfs_zone_gc_start_chunk(
return false;
}
- bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+ len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ bio = bio_alloc_bioset(bdev,
+ min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
+ REQ_OP_READ, GFP_NOFS, &data->bio_set);
chunk = container_of(bio, struct xfs_gc_bio, bio);
chunk->ip = ip;
chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
- chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ chunk->len = len;
chunk->old_startblock =
xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
chunk->new_daddr = daddr;
chunk->is_seq = is_seq;
- chunk->scratch = &data->scratch[data->scratch_idx];
chunk->data = data;
chunk->oz = oz;
chunk->victim_rtg = iter->victim_rtg;
@@ -711,13 +729,9 @@ xfs_zone_gc_start_chunk(
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
- bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
- chunk->scratch->offset);
- chunk->scratch->offset += chunk->len;
- if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
- data->scratch_idx =
- (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
- }
+ xfs_zone_gc_add_data(chunk);
+ data->scratch_head = (data->scratch_head + len) % data->scratch_size;
+
WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&chunk->entry, &data->reading);
xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
@@ -835,6 +849,7 @@ xfs_zone_gc_finish_chunk(
struct xfs_gc_bio *chunk)
{
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_zone_gc_data *data = chunk->data;
struct xfs_inode *ip = chunk->ip;
struct xfs_mount *mp = ip->i_mount;
int error;
@@ -846,11 +861,8 @@ xfs_zone_gc_finish_chunk(
return;
}
- chunk->scratch->freed += chunk->len;
- if (chunk->scratch->freed == chunk->scratch->offset) {
- chunk->scratch->offset = 0;
- chunk->scratch->freed = 0;
- }
+ data->scratch_tail =
+ (data->scratch_tail + chunk->len) % data->scratch_size;
/*
* Cycle through the iolock and wait for direct I/O and layouts to
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH 3/3] xfs: rework zone GC buffer management
2025-12-18 6:31 ` [PATCH 3/3] xfs: rework zone " Christoph Hellwig
@ 2025-12-18 7:12 ` Damien Le Moal
2025-12-19 8:06 ` Hans Holmberg
1 sibling, 0 replies; 17+ messages in thread
From: Damien Le Moal @ 2025-12-18 7:12 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe, Carlos Maiolino
Cc: Hans Holmberg, linux-block, linux-xfs
On 12/18/25 15:31, Christoph Hellwig wrote:
> The double buffering where just one scratch area is used at a time does
> not efficiently use the available memory. It was originally implemented
> when GC I/O could happen out of order, but that was removed before
> upstream submission to avoid fragmentation. Now that all GC I/Os are
> processed in order, just use a number of buffers as a simple ring buffer.
>
> For a synthetic benchmark that fills 256MiB HDD zones and punches out
> holes to free half the space this leads to a decrease of GC time by
> a little more than 25%.
>
> Thanks to Hans Holmberg <hans.holmberg@wdc.com> for testing and
> benchmarking.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks good.
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 3/3] xfs: rework zone GC buffer management
2025-12-18 6:31 ` [PATCH 3/3] xfs: rework zone " Christoph Hellwig
2025-12-18 7:12 ` Damien Le Moal
@ 2025-12-19 8:06 ` Hans Holmberg
1 sibling, 0 replies; 17+ messages in thread
From: Hans Holmberg @ 2025-12-19 8:06 UTC (permalink / raw)
To: hch, Jens Axboe, Carlos Maiolino
Cc: Damien Le Moal, linux-block@vger.kernel.org,
linux-xfs@vger.kernel.org
On 18/12/2025 07:32, Christoph Hellwig wrote:
> The double buffering where just one scratch area is used at a time does
> not efficiently use the available memory. It was originally implemented
> when GC I/O could happen out of order, but that was removed before
> upstream submission to avoid fragmentation. Now that all GC I/Os are
> processed in order, just use a number of buffers as a simple ring buffer.
>
> For a synthetic benchmark that fills 256MiB HDD zones and punches out
> holes to free half the space this leads to a decrease of GC time by
> a little more than 25%.
>
> Thanks to Hans Holmberg <hans.holmberg@wdc.com> for testing and
> benchmarking.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_zone_gc.c | 106 ++++++++++++++++++++++++-------------------
> 1 file changed, 59 insertions(+), 47 deletions(-)
>
> diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
> index f3dec13a620c..90f70473c3d5 100644
> --- a/fs/xfs/xfs_zone_gc.c
> +++ b/fs/xfs/xfs_zone_gc.c
> @@ -50,23 +50,11 @@
> */
>
> /*
> - * Size of each GC scratch pad. This is also the upper bound for each
> - * GC I/O, which helps to keep latency down.
> + * Size of each GC scratch allocation, and the number of buffers.
> */
> -#define XFS_GC_CHUNK_SIZE SZ_1M
> -
> -/*
> - * Scratchpad data to read GCed data into.
> - *
> - * The offset member tracks where the next allocation starts, and freed tracks
> - * the amount of space that is not used anymore.
> - */
> -#define XFS_ZONE_GC_NR_SCRATCH 2
> -struct xfs_zone_scratch {
> - struct folio *folio;
> - unsigned int offset;
> - unsigned int freed;
> -};
> +#define XFS_GC_BUF_SIZE SZ_1M
> +#define XFS_GC_NR_BUFS 2
> +static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
>
> /*
> * Chunk that is read and written for each GC operation.
> @@ -141,10 +129,14 @@ struct xfs_zone_gc_data {
> struct bio_set bio_set;
>
> /*
> - * Scratchpad used, and index to indicated which one is used.
> + * Scratchpad to buffer GC data, organized as a ring buffer over
> + * discontiguous folios. scratch_head is where the buffer is filled,
> + * and scratch_tail tracks the buffer space freed.
> */
> - struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
> - unsigned int scratch_idx;
> + struct folio *scratch_folios[XFS_GC_NR_BUFS];
> + unsigned int scratch_size;
> + unsigned int scratch_head;
> + unsigned int scratch_tail;
>
> /*
> * List of bios currently being read, written and reset.
> @@ -210,20 +202,16 @@ xfs_zone_gc_data_alloc(
> if (!data->iter.recs)
> goto out_free_data;
>
> - /*
> - * We actually only need a single bio_vec. It would be nice to have
> - * a flag that only allocates the inline bvecs and not the separate
> - * bvec pool.
> - */
> if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
> BIOSET_NEED_BVECS))
> goto out_free_recs;
> - for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
> - data->scratch[i].folio =
> - folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
> - if (!data->scratch[i].folio)
> + for (i = 0; i < XFS_GC_NR_BUFS; i++) {
> + data->scratch_folios[i] =
> + folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
> + if (!data->scratch_folios[i])
> goto out_free_scratch;
> }
> + data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
> INIT_LIST_HEAD(&data->reading);
> INIT_LIST_HEAD(&data->writing);
> INIT_LIST_HEAD(&data->resetting);
> @@ -232,7 +220,7 @@ xfs_zone_gc_data_alloc(
>
> out_free_scratch:
> while (--i >= 0)
> - folio_put(data->scratch[i].folio);
> + folio_put(data->scratch_folios[i]);
> bioset_exit(&data->bio_set);
> out_free_recs:
> kfree(data->iter.recs);
> @@ -247,8 +235,8 @@ xfs_zone_gc_data_free(
> {
> int i;
>
> - for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
> - folio_put(data->scratch[i].folio);
> + for (i = 0; i < XFS_GC_NR_BUFS; i++)
> + folio_put(data->scratch_folios[i]);
> bioset_exit(&data->bio_set);
> kfree(data->iter.recs);
> kfree(data);
> @@ -591,7 +579,12 @@ static unsigned int
> xfs_zone_gc_scratch_available(
> struct xfs_zone_gc_data *data)
> {
> - return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
> + if (!data->scratch_tail)
> + return data->scratch_size - data->scratch_head;
> +
> + if (!data->scratch_head)
> + return data->scratch_tail;
> + return (data->scratch_size - data->scratch_head) + data->scratch_tail;
> }
>
> static bool
> @@ -665,6 +658,28 @@ xfs_zone_gc_alloc_blocks(
> return oz;
> }
>
> +static void
> +xfs_zone_gc_add_data(
> + struct xfs_gc_bio *chunk)
> +{
> + struct xfs_zone_gc_data *data = chunk->data;
> + unsigned int len = chunk->len;
> + unsigned int off = data->scratch_head;
> +
> + do {
> + unsigned int this_off = off % XFS_GC_BUF_SIZE;
> + unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
> +
> + bio_add_folio_nofail(&chunk->bio,
> + data->scratch_folios[off / XFS_GC_BUF_SIZE],
> + this_len, this_off);
> + len -= this_len;
> + off += this_len;
> + if (off == data->scratch_size)
> + off = 0;
> + } while (len);
> +}
> +
> static bool
> xfs_zone_gc_start_chunk(
> struct xfs_zone_gc_data *data)
> @@ -678,6 +693,7 @@ xfs_zone_gc_start_chunk(
> struct xfs_inode *ip;
> struct bio *bio;
> xfs_daddr_t daddr;
> + unsigned int len;
> bool is_seq;
>
> if (xfs_is_shutdown(mp))
> @@ -692,17 +708,19 @@ xfs_zone_gc_start_chunk(
> return false;
> }
>
> - bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
> + len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
> + bio = bio_alloc_bioset(bdev,
> + min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
> + REQ_OP_READ, GFP_NOFS, &data->bio_set);
>
> chunk = container_of(bio, struct xfs_gc_bio, bio);
> chunk->ip = ip;
> chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
> - chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
> + chunk->len = len;
> chunk->old_startblock =
> xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
> chunk->new_daddr = daddr;
> chunk->is_seq = is_seq;
> - chunk->scratch = &data->scratch[data->scratch_idx];
> chunk->data = data;
> chunk->oz = oz;
> chunk->victim_rtg = iter->victim_rtg;
> @@ -711,13 +729,9 @@ xfs_zone_gc_start_chunk(
>
> bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
> bio->bi_end_io = xfs_zone_gc_end_io;
> - bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
> - chunk->scratch->offset);
> - chunk->scratch->offset += chunk->len;
> - if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
> - data->scratch_idx =
> - (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
> - }
> + xfs_zone_gc_add_data(chunk);
> + data->scratch_head = (data->scratch_head + len) % data->scratch_size;
> +
> WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
> list_add_tail(&chunk->entry, &data->reading);
> xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
> @@ -835,6 +849,7 @@ xfs_zone_gc_finish_chunk(
> struct xfs_gc_bio *chunk)
> {
> uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> + struct xfs_zone_gc_data *data = chunk->data;
> struct xfs_inode *ip = chunk->ip;
> struct xfs_mount *mp = ip->i_mount;
> int error;
> @@ -846,11 +861,8 @@ xfs_zone_gc_finish_chunk(
> return;
> }
>
> - chunk->scratch->freed += chunk->len;
> - if (chunk->scratch->freed == chunk->scratch->offset) {
> - chunk->scratch->offset = 0;
> - chunk->scratch->freed = 0;
> - }
> + data->scratch_tail =
> + (data->scratch_tail + chunk->len) % data->scratch_size;
>
> /*
> * Cycle through the iolock and wait for direct I/O and layouts to
Looks good,
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 3/3] xfs: rework zone GC buffer management
2026-01-06 7:58 improve zoned XFS GC buffer management v2 Christoph Hellwig
@ 2026-01-06 7:58 ` Christoph Hellwig
2026-01-09 12:24 ` Carlos Maiolino
0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2026-01-06 7:58 UTC (permalink / raw)
To: Jens Axboe, Carlos Maiolino
Cc: Damien Le Moal, Hans Holmberg, linux-block, linux-xfs
The double buffering where just one scratch area is used at a time does
not efficiently use the available memory. It was originally implemented
when GC I/O could happen out of order, but that was removed before
upstream submission to avoid fragmentation. Now that all GC I/Os are
processed in order, just use a number of buffers as a simple ring buffer.
For a synthetic benchmark that fills 256MiB HDD zones and punches out
holes to free half the space this leads to a decrease of GC time by
a little more than 25%.
Thanks to Hans Holmberg <hans.holmberg@wdc.com> for testing and
benchmarking.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
---
fs/xfs/xfs_zone_gc.c | 106 ++++++++++++++++++++++++-------------------
1 file changed, 59 insertions(+), 47 deletions(-)
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index 40408b1132e0..960cf79d484e 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -50,23 +50,11 @@
*/
/*
- * Size of each GC scratch pad. This is also the upper bound for each
- * GC I/O, which helps to keep latency down.
+ * Size of each GC scratch allocation, and the number of buffers.
*/
-#define XFS_GC_CHUNK_SIZE SZ_1M
-
-/*
- * Scratchpad data to read GCed data into.
- *
- * The offset member tracks where the next allocation starts, and freed tracks
- * the amount of space that is not used anymore.
- */
-#define XFS_ZONE_GC_NR_SCRATCH 2
-struct xfs_zone_scratch {
- struct folio *folio;
- unsigned int offset;
- unsigned int freed;
-};
+#define XFS_GC_BUF_SIZE SZ_1M
+#define XFS_GC_NR_BUFS 2
+static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
/*
* Chunk that is read and written for each GC operation.
@@ -141,10 +129,14 @@ struct xfs_zone_gc_data {
struct bio_set bio_set;
/*
- * Scratchpad used, and index to indicated which one is used.
+ * Scratchpad to buffer GC data, organized as a ring buffer over
+ * discontiguous folios. scratch_head is where the buffer is filled,
+ * and scratch_tail tracks the buffer space freed.
*/
- struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
- unsigned int scratch_idx;
+ struct folio *scratch_folios[XFS_GC_NR_BUFS];
+ unsigned int scratch_size;
+ unsigned int scratch_head;
+ unsigned int scratch_tail;
/*
* List of bios currently being read, written and reset.
@@ -210,20 +202,16 @@ xfs_zone_gc_data_alloc(
if (!data->iter.recs)
goto out_free_data;
- /*
- * We actually only need a single bio_vec. It would be nice to have
- * a flag that only allocates the inline bvecs and not the separate
- * bvec pool.
- */
if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
BIOSET_NEED_BVECS))
goto out_free_recs;
- for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
- data->scratch[i].folio =
- folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
- if (!data->scratch[i].folio)
+ for (i = 0; i < XFS_GC_NR_BUFS; i++) {
+ data->scratch_folios[i] =
+ folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
+ if (!data->scratch_folios[i])
goto out_free_scratch;
}
+ data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
INIT_LIST_HEAD(&data->reading);
INIT_LIST_HEAD(&data->writing);
INIT_LIST_HEAD(&data->resetting);
@@ -232,7 +220,7 @@ xfs_zone_gc_data_alloc(
out_free_scratch:
while (--i >= 0)
- folio_put(data->scratch[i].folio);
+ folio_put(data->scratch_folios[i]);
bioset_exit(&data->bio_set);
out_free_recs:
kfree(data->iter.recs);
@@ -247,8 +235,8 @@ xfs_zone_gc_data_free(
{
int i;
- for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
- folio_put(data->scratch[i].folio);
+ for (i = 0; i < XFS_GC_NR_BUFS; i++)
+ folio_put(data->scratch_folios[i]);
bioset_exit(&data->bio_set);
kfree(data->iter.recs);
kfree(data);
@@ -590,7 +578,12 @@ static unsigned int
xfs_zone_gc_scratch_available(
struct xfs_zone_gc_data *data)
{
- return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+ if (!data->scratch_tail)
+ return data->scratch_size - data->scratch_head;
+
+ if (!data->scratch_head)
+ return data->scratch_tail;
+ return (data->scratch_size - data->scratch_head) + data->scratch_tail;
}
static bool
@@ -664,6 +657,28 @@ xfs_zone_gc_alloc_blocks(
return oz;
}
+static void
+xfs_zone_gc_add_data(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ unsigned int len = chunk->len;
+ unsigned int off = data->scratch_head;
+
+ do {
+ unsigned int this_off = off % XFS_GC_BUF_SIZE;
+ unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
+
+ bio_add_folio_nofail(&chunk->bio,
+ data->scratch_folios[off / XFS_GC_BUF_SIZE],
+ this_len, this_off);
+ len -= this_len;
+ off += this_len;
+ if (off == data->scratch_size)
+ off = 0;
+ } while (len);
+}
+
static bool
xfs_zone_gc_start_chunk(
struct xfs_zone_gc_data *data)
@@ -677,6 +692,7 @@ xfs_zone_gc_start_chunk(
struct xfs_inode *ip;
struct bio *bio;
xfs_daddr_t daddr;
+ unsigned int len;
bool is_seq;
if (xfs_is_shutdown(mp))
@@ -691,17 +707,19 @@ xfs_zone_gc_start_chunk(
return false;
}
- bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+ len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ bio = bio_alloc_bioset(bdev,
+ min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
+ REQ_OP_READ, GFP_NOFS, &data->bio_set);
chunk = container_of(bio, struct xfs_gc_bio, bio);
chunk->ip = ip;
chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
- chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ chunk->len = len;
chunk->old_startblock =
xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
chunk->new_daddr = daddr;
chunk->is_seq = is_seq;
- chunk->scratch = &data->scratch[data->scratch_idx];
chunk->data = data;
chunk->oz = oz;
chunk->victim_rtg = iter->victim_rtg;
@@ -710,13 +728,9 @@ xfs_zone_gc_start_chunk(
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
- bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
- chunk->scratch->offset);
- chunk->scratch->offset += chunk->len;
- if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
- data->scratch_idx =
- (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
- }
+ xfs_zone_gc_add_data(chunk);
+ data->scratch_head = (data->scratch_head + len) % data->scratch_size;
+
WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&chunk->entry, &data->reading);
xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
@@ -834,6 +848,7 @@ xfs_zone_gc_finish_chunk(
struct xfs_gc_bio *chunk)
{
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_zone_gc_data *data = chunk->data;
struct xfs_inode *ip = chunk->ip;
struct xfs_mount *mp = ip->i_mount;
int error;
@@ -845,11 +860,8 @@ xfs_zone_gc_finish_chunk(
return;
}
- chunk->scratch->freed += chunk->len;
- if (chunk->scratch->freed == chunk->scratch->offset) {
- chunk->scratch->offset = 0;
- chunk->scratch->freed = 0;
- }
+ data->scratch_tail =
+ (data->scratch_tail + chunk->len) % data->scratch_size;
/*
* Cycle through the iolock and wait for direct I/O and layouts to
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH 3/3] xfs: rework zone GC buffer management
2026-01-06 7:58 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
@ 2026-01-09 12:24 ` Carlos Maiolino
0 siblings, 0 replies; 17+ messages in thread
From: Carlos Maiolino @ 2026-01-09 12:24 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jens Axboe, Damien Le Moal, Hans Holmberg, linux-block, linux-xfs
On Tue, Jan 06, 2026 at 08:58:54AM +0100, Christoph Hellwig wrote:
> The double buffering where just one scratch area is used at a time does
> not efficiently use the available memory. It was originally implemented
> when GC I/O could happen out of order, but that was removed before
> upstream submission to avoid fragmentation. Now that all GC I/Os are
> processed in order, just use a number of buffers as a simple ring buffer.
>
> For a synthetic benchmark that fills 256MiB HDD zones and punches out
> holes to free half the space this leads to a decrease of GC time by
> a little more than 25%.
>
> Thanks to Hans Holmberg <hans.holmberg@wdc.com> for testing and
> benchmarking.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
> Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
> ---
> fs/xfs/xfs_zone_gc.c | 106 ++++++++++++++++++++++++-------------------
> 1 file changed, 59 insertions(+), 47 deletions(-)
>
> diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
> index 40408b1132e0..960cf79d484e 100644
> --- a/fs/xfs/xfs_zone_gc.c
> +++ b/fs/xfs/xfs_zone_gc.c
> @@ -50,23 +50,11 @@
> */
>
> /*
> - * Size of each GC scratch pad. This is also the upper bound for each
> - * GC I/O, which helps to keep latency down.
> + * Size of each GC scratch allocation, and the number of buffers.
> */
> -#define XFS_GC_CHUNK_SIZE SZ_1M
> -
> -/*
> - * Scratchpad data to read GCed data into.
> - *
> - * The offset member tracks where the next allocation starts, and freed tracks
> - * the amount of space that is not used anymore.
> - */
> -#define XFS_ZONE_GC_NR_SCRATCH 2
> -struct xfs_zone_scratch {
> - struct folio *folio;
> - unsigned int offset;
> - unsigned int freed;
> -};
> +#define XFS_GC_BUF_SIZE SZ_1M
> +#define XFS_GC_NR_BUFS 2
> +static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
>
> /*
> * Chunk that is read and written for each GC operation.
> @@ -141,10 +129,14 @@ struct xfs_zone_gc_data {
> struct bio_set bio_set;
>
> /*
> - * Scratchpad used, and index to indicated which one is used.
> + * Scratchpad to buffer GC data, organized as a ring buffer over
> + * discontiguous folios. scratch_head is where the buffer is filled,
> + * and scratch_tail tracks the buffer space freed.
> */
> - struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
> - unsigned int scratch_idx;
> + struct folio *scratch_folios[XFS_GC_NR_BUFS];
> + unsigned int scratch_size;
> + unsigned int scratch_head;
> + unsigned int scratch_tail;
>
> /*
> * List of bios currently being read, written and reset.
> @@ -210,20 +202,16 @@ xfs_zone_gc_data_alloc(
> if (!data->iter.recs)
> goto out_free_data;
>
> - /*
> - * We actually only need a single bio_vec. It would be nice to have
> - * a flag that only allocates the inline bvecs and not the separate
> - * bvec pool.
> - */
> if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
> BIOSET_NEED_BVECS))
> goto out_free_recs;
> - for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
> - data->scratch[i].folio =
> - folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
> - if (!data->scratch[i].folio)
> + for (i = 0; i < XFS_GC_NR_BUFS; i++) {
> + data->scratch_folios[i] =
> + folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
> + if (!data->scratch_folios[i])
> goto out_free_scratch;
> }
> + data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
> INIT_LIST_HEAD(&data->reading);
> INIT_LIST_HEAD(&data->writing);
> INIT_LIST_HEAD(&data->resetting);
> @@ -232,7 +220,7 @@ xfs_zone_gc_data_alloc(
>
> out_free_scratch:
> while (--i >= 0)
> - folio_put(data->scratch[i].folio);
> + folio_put(data->scratch_folios[i]);
> bioset_exit(&data->bio_set);
> out_free_recs:
> kfree(data->iter.recs);
> @@ -247,8 +235,8 @@ xfs_zone_gc_data_free(
> {
> int i;
>
> - for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
> - folio_put(data->scratch[i].folio);
> + for (i = 0; i < XFS_GC_NR_BUFS; i++)
> + folio_put(data->scratch_folios[i]);
> bioset_exit(&data->bio_set);
> kfree(data->iter.recs);
> kfree(data);
> @@ -590,7 +578,12 @@ static unsigned int
> xfs_zone_gc_scratch_available(
> struct xfs_zone_gc_data *data)
> {
> - return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
> + if (!data->scratch_tail)
> + return data->scratch_size - data->scratch_head;
> +
> + if (!data->scratch_head)
> + return data->scratch_tail;
> + return (data->scratch_size - data->scratch_head) + data->scratch_tail;
> }
>
> static bool
> @@ -664,6 +657,28 @@ xfs_zone_gc_alloc_blocks(
> return oz;
> }
>
> +static void
> +xfs_zone_gc_add_data(
> + struct xfs_gc_bio *chunk)
> +{
> + struct xfs_zone_gc_data *data = chunk->data;
> + unsigned int len = chunk->len;
> + unsigned int off = data->scratch_head;
> +
> + do {
> + unsigned int this_off = off % XFS_GC_BUF_SIZE;
> + unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
> +
> + bio_add_folio_nofail(&chunk->bio,
> + data->scratch_folios[off / XFS_GC_BUF_SIZE],
> + this_len, this_off);
> + len -= this_len;
> + off += this_len;
> + if (off == data->scratch_size)
> + off = 0;
> + } while (len);
> +}
> +
> static bool
> xfs_zone_gc_start_chunk(
> struct xfs_zone_gc_data *data)
> @@ -677,6 +692,7 @@ xfs_zone_gc_start_chunk(
> struct xfs_inode *ip;
> struct bio *bio;
> xfs_daddr_t daddr;
> + unsigned int len;
> bool is_seq;
>
> if (xfs_is_shutdown(mp))
> @@ -691,17 +707,19 @@ xfs_zone_gc_start_chunk(
> return false;
> }
>
> - bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
> + len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
> + bio = bio_alloc_bioset(bdev,
> + min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
> + REQ_OP_READ, GFP_NOFS, &data->bio_set);
>
> chunk = container_of(bio, struct xfs_gc_bio, bio);
> chunk->ip = ip;
> chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
> - chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
> + chunk->len = len;
> chunk->old_startblock =
> xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
> chunk->new_daddr = daddr;
> chunk->is_seq = is_seq;
> - chunk->scratch = &data->scratch[data->scratch_idx];
> chunk->data = data;
> chunk->oz = oz;
> chunk->victim_rtg = iter->victim_rtg;
> @@ -710,13 +728,9 @@ xfs_zone_gc_start_chunk(
>
> bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
> bio->bi_end_io = xfs_zone_gc_end_io;
> - bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
> - chunk->scratch->offset);
> - chunk->scratch->offset += chunk->len;
> - if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
> - data->scratch_idx =
> - (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
> - }
> + xfs_zone_gc_add_data(chunk);
> + data->scratch_head = (data->scratch_head + len) % data->scratch_size;
> +
> WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
> list_add_tail(&chunk->entry, &data->reading);
> xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
> @@ -834,6 +848,7 @@ xfs_zone_gc_finish_chunk(
> struct xfs_gc_bio *chunk)
> {
> uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> + struct xfs_zone_gc_data *data = chunk->data;
> struct xfs_inode *ip = chunk->ip;
> struct xfs_mount *mp = ip->i_mount;
> int error;
> @@ -845,11 +860,8 @@ xfs_zone_gc_finish_chunk(
> return;
> }
>
> - chunk->scratch->freed += chunk->len;
> - if (chunk->scratch->freed == chunk->scratch->offset) {
> - chunk->scratch->offset = 0;
> - chunk->scratch->freed = 0;
> - }
> + data->scratch_tail =
> + (data->scratch_tail + chunk->len) % data->scratch_size;
>
> /*
> * Cycle through the iolock and wait for direct I/O and layouts to
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 3/3] xfs: rework zone GC buffer management
2026-01-13 7:19 improve zoned XFS GC buffer management v3 Christoph Hellwig
@ 2026-01-13 7:19 ` Christoph Hellwig
0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2026-01-13 7:19 UTC (permalink / raw)
To: Jens Axboe, Carlos Maiolino
Cc: Damien Le Moal, Hans Holmberg, linux-block, linux-xfs,
Carlos Maiolino
The double buffering where just one scratch area is used at a time does
not efficiently use the available memory. It was originally implemented
when GC I/O could happen out of order, but that was removed before
upstream submission to avoid fragmentation. Now that all GC I/Os are
processed in order, just use a number of buffers as a simple ring buffer.
For a synthetic benchmark that fills 256MiB HDD zones and punches out
holes to free half the space this leads to a decrease of GC time by
a little more than 25%.
Thanks to Hans Holmberg <hans.holmberg@wdc.com> for testing and
benchmarking.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
---
fs/xfs/xfs_zone_gc.c | 106 ++++++++++++++++++++++++-------------------
1 file changed, 59 insertions(+), 47 deletions(-)
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index 40408b1132e0..960cf79d484e 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -50,23 +50,11 @@
*/
/*
- * Size of each GC scratch pad. This is also the upper bound for each
- * GC I/O, which helps to keep latency down.
+ * Size of each GC scratch allocation, and the number of buffers.
*/
-#define XFS_GC_CHUNK_SIZE SZ_1M
-
-/*
- * Scratchpad data to read GCed data into.
- *
- * The offset member tracks where the next allocation starts, and freed tracks
- * the amount of space that is not used anymore.
- */
-#define XFS_ZONE_GC_NR_SCRATCH 2
-struct xfs_zone_scratch {
- struct folio *folio;
- unsigned int offset;
- unsigned int freed;
-};
+#define XFS_GC_BUF_SIZE SZ_1M
+#define XFS_GC_NR_BUFS 2
+static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
/*
* Chunk that is read and written for each GC operation.
@@ -141,10 +129,14 @@ struct xfs_zone_gc_data {
struct bio_set bio_set;
/*
- * Scratchpad used, and index to indicated which one is used.
+ * Scratchpad to buffer GC data, organized as a ring buffer over
+ * discontiguous folios. scratch_head is where the buffer is filled,
+ * and scratch_tail tracks the buffer space freed.
*/
- struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
- unsigned int scratch_idx;
+ struct folio *scratch_folios[XFS_GC_NR_BUFS];
+ unsigned int scratch_size;
+ unsigned int scratch_head;
+ unsigned int scratch_tail;
/*
* List of bios currently being read, written and reset.
@@ -210,20 +202,16 @@ xfs_zone_gc_data_alloc(
if (!data->iter.recs)
goto out_free_data;
- /*
- * We actually only need a single bio_vec. It would be nice to have
- * a flag that only allocates the inline bvecs and not the separate
- * bvec pool.
- */
if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
BIOSET_NEED_BVECS))
goto out_free_recs;
- for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
- data->scratch[i].folio =
- folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
- if (!data->scratch[i].folio)
+ for (i = 0; i < XFS_GC_NR_BUFS; i++) {
+ data->scratch_folios[i] =
+ folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
+ if (!data->scratch_folios[i])
goto out_free_scratch;
}
+ data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
INIT_LIST_HEAD(&data->reading);
INIT_LIST_HEAD(&data->writing);
INIT_LIST_HEAD(&data->resetting);
@@ -232,7 +220,7 @@ xfs_zone_gc_data_alloc(
out_free_scratch:
while (--i >= 0)
- folio_put(data->scratch[i].folio);
+ folio_put(data->scratch_folios[i]);
bioset_exit(&data->bio_set);
out_free_recs:
kfree(data->iter.recs);
@@ -247,8 +235,8 @@ xfs_zone_gc_data_free(
{
int i;
- for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
- folio_put(data->scratch[i].folio);
+ for (i = 0; i < XFS_GC_NR_BUFS; i++)
+ folio_put(data->scratch_folios[i]);
bioset_exit(&data->bio_set);
kfree(data->iter.recs);
kfree(data);
@@ -590,7 +578,12 @@ static unsigned int
xfs_zone_gc_scratch_available(
struct xfs_zone_gc_data *data)
{
- return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+ if (!data->scratch_tail)
+ return data->scratch_size - data->scratch_head;
+
+ if (!data->scratch_head)
+ return data->scratch_tail;
+ return (data->scratch_size - data->scratch_head) + data->scratch_tail;
}
static bool
@@ -664,6 +657,28 @@ xfs_zone_gc_alloc_blocks(
return oz;
}
+static void
+xfs_zone_gc_add_data(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ unsigned int len = chunk->len;
+ unsigned int off = data->scratch_head;
+
+ do {
+ unsigned int this_off = off % XFS_GC_BUF_SIZE;
+ unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
+
+ bio_add_folio_nofail(&chunk->bio,
+ data->scratch_folios[off / XFS_GC_BUF_SIZE],
+ this_len, this_off);
+ len -= this_len;
+ off += this_len;
+ if (off == data->scratch_size)
+ off = 0;
+ } while (len);
+}
+
static bool
xfs_zone_gc_start_chunk(
struct xfs_zone_gc_data *data)
@@ -677,6 +692,7 @@ xfs_zone_gc_start_chunk(
struct xfs_inode *ip;
struct bio *bio;
xfs_daddr_t daddr;
+ unsigned int len;
bool is_seq;
if (xfs_is_shutdown(mp))
@@ -691,17 +707,19 @@ xfs_zone_gc_start_chunk(
return false;
}
- bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+ len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ bio = bio_alloc_bioset(bdev,
+ min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
+ REQ_OP_READ, GFP_NOFS, &data->bio_set);
chunk = container_of(bio, struct xfs_gc_bio, bio);
chunk->ip = ip;
chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
- chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ chunk->len = len;
chunk->old_startblock =
xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
chunk->new_daddr = daddr;
chunk->is_seq = is_seq;
- chunk->scratch = &data->scratch[data->scratch_idx];
chunk->data = data;
chunk->oz = oz;
chunk->victim_rtg = iter->victim_rtg;
@@ -710,13 +728,9 @@ xfs_zone_gc_start_chunk(
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
- bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
- chunk->scratch->offset);
- chunk->scratch->offset += chunk->len;
- if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
- data->scratch_idx =
- (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
- }
+ xfs_zone_gc_add_data(chunk);
+ data->scratch_head = (data->scratch_head + len) % data->scratch_size;
+
WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&chunk->entry, &data->reading);
xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
@@ -834,6 +848,7 @@ xfs_zone_gc_finish_chunk(
struct xfs_gc_bio *chunk)
{
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_zone_gc_data *data = chunk->data;
struct xfs_inode *ip = chunk->ip;
struct xfs_mount *mp = ip->i_mount;
int error;
@@ -845,11 +860,8 @@ xfs_zone_gc_finish_chunk(
return;
}
- chunk->scratch->freed += chunk->len;
- if (chunk->scratch->freed == chunk->scratch->offset) {
- chunk->scratch->offset = 0;
- chunk->scratch->freed = 0;
- }
+ data->scratch_tail =
+ (data->scratch_tail + chunk->len) % data->scratch_size;
/*
* Cycle through the iolock and wait for direct I/O and layouts to
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread
* improve zoned XFS GC buffer management v4
@ 2026-01-14 13:06 Christoph Hellwig
2026-01-14 13:06 ` [PATCH 1/3] block: add a bio_reuse helper Christoph Hellwig
` (3 more replies)
0 siblings, 4 replies; 17+ messages in thread
From: Christoph Hellwig @ 2026-01-14 13:06 UTC (permalink / raw)
To: Jens Axboe, Carlos Maiolino
Cc: Damien Le Moal, Hans Holmberg, Keith Busch, linux-block,
linux-xfs
Hi all,
the zoned XFS GC code currently uses a weird bank switching strategy to
manage the scratch buffer. The reason for that was that I/O could be
proceed out of order in the early days, but that actually changed before
the code was upstreamed to avoid fragmentation. This replaced the logic
with a simple ring buffer, which makes the buffer space utilization much
more efficient.
Before that, two patches make the reuse of the bios a lot less fragile,
and for that we need a new block layer helper that should eventually also
be useful in other places.
Changes since v3:
- pass an op to bio_reuse and fix GC on conventional zones/devices
Changes since v2:
- fix a commit message typo
- fix a missing : in the bio_reuse kerneldoc comment
- warn about bios with integrity and crypto context, and add some
documentation about this
Changes since v1:
- assert that the reused bio is not cloned, and update the comments for
bio_reuse a bit
Diffstat:
block/bio.c | 34 +++++++++++++++
fs/xfs/xfs_zone_gc.c | 113 +++++++++++++++++++++++++++------------------------
include/linux/bio.h | 1
3 files changed, 95 insertions(+), 53 deletions(-)
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 1/3] block: add a bio_reuse helper
2026-01-14 13:06 improve zoned XFS GC buffer management v4 Christoph Hellwig
@ 2026-01-14 13:06 ` Christoph Hellwig
2026-01-14 13:06 ` [PATCH 2/3] xfs: use bio_reuse in the zone GC code Christoph Hellwig
` (2 subsequent siblings)
3 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2026-01-14 13:06 UTC (permalink / raw)
To: Jens Axboe, Carlos Maiolino
Cc: Damien Le Moal, Hans Holmberg, Keith Busch, linux-block,
linux-xfs, Carlos Maiolino, Darrick J. Wong
Add a helper to allow an existing bio to be resubmitted without
having to re-add the payload.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
block/bio.c | 34 ++++++++++++++++++++++++++++++++++
include/linux/bio.h | 1 +
2 files changed, 35 insertions(+)
diff --git a/block/bio.c b/block/bio.c
index e726c0e280a8..40f690985bfb 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -311,6 +311,40 @@ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
}
EXPORT_SYMBOL(bio_reset);
+/**
+ * bio_reuse - reuse a bio with the payload left intact
+ * @bio: bio to reuse
+ * @opf: operation and flags for the next I/O
+ *
+ * Allow reusing an existing bio for another operation with all set up
+ * fields including the payload, device and end_io handler left intact.
+ *
+ * Typically used when @bio is first used to read data which is then written
+ * to another location without modification. @bio must not be in-flight and
+ * owned by the caller. Can't be used for cloned bios.
+ *
+ * Note: Can't be used when @bio has integrity or blk-crypto contexts for now.
+ * Feel free to add that support when you need it, though.
+ */
+void bio_reuse(struct bio *bio, blk_opf_t opf)
+{
+ unsigned short vcnt = bio->bi_vcnt, i;
+ bio_end_io_t *end_io = bio->bi_end_io;
+ void *private = bio->bi_private;
+
+ WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+ WARN_ON_ONCE(bio_integrity(bio));
+ WARN_ON_ONCE(bio_has_crypt_ctx(bio));
+
+ bio_reset(bio, bio->bi_bdev, opf);
+ for (i = 0; i < vcnt; i++)
+ bio->bi_iter.bi_size += bio->bi_io_vec[i].bv_len;
+ bio->bi_vcnt = vcnt;
+ bio->bi_private = private;
+ bio->bi_end_io = end_io;
+}
+EXPORT_SYMBOL_GPL(bio_reuse);
+
static struct bio *__bio_chain_endio(struct bio *bio)
{
struct bio *parent = bio->bi_private;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ad2d57908c1c..8a0b9baa2d0e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -414,6 +414,7 @@ static inline void bio_init_inline(struct bio *bio, struct block_device *bdev,
}
extern void bio_uninit(struct bio *);
void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf);
+void bio_reuse(struct bio *bio, blk_opf_t opf);
void bio_chain(struct bio *, struct bio *);
int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len,
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PATCH 2/3] xfs: use bio_reuse in the zone GC code
2026-01-14 13:06 improve zoned XFS GC buffer management v4 Christoph Hellwig
2026-01-14 13:06 ` [PATCH 1/3] block: add a bio_reuse helper Christoph Hellwig
@ 2026-01-14 13:06 ` Christoph Hellwig
2026-01-14 19:50 ` Darrick J. Wong
2026-01-14 13:06 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
2026-01-21 12:21 ` improve zoned XFS GC buffer management v4 Carlos Maiolino
3 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2026-01-14 13:06 UTC (permalink / raw)
To: Jens Axboe, Carlos Maiolino
Cc: Damien Le Moal, Hans Holmberg, Keith Busch, linux-block,
linux-xfs, Carlos Maiolino
Replace our somewhat fragile code to reuse the bio, which caused a
regression in the past, with the block layer bio_reuse helper.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
---
fs/xfs/xfs_zone_gc.c | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index 3c52cc1497d4..c9a3df6a5289 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -811,8 +811,6 @@ xfs_zone_gc_write_chunk(
{
struct xfs_zone_gc_data *data = chunk->data;
struct xfs_mount *mp = chunk->ip->i_mount;
- phys_addr_t bvec_paddr =
- bvec_phys(bio_first_bvec_all(&chunk->bio));
struct xfs_gc_bio *split_chunk;
if (chunk->bio.bi_status)
@@ -825,10 +823,7 @@ xfs_zone_gc_write_chunk(
WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_move_tail(&chunk->entry, &data->writing);
- bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
- bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
- offset_in_folio(chunk->scratch->folio, bvec_paddr));
-
+ bio_reuse(&chunk->bio, REQ_OP_WRITE);
while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
xfs_zone_gc_submit_write(data, split_chunk);
xfs_zone_gc_submit_write(data, chunk);
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PATCH 3/3] xfs: rework zone GC buffer management
2026-01-14 13:06 improve zoned XFS GC buffer management v4 Christoph Hellwig
2026-01-14 13:06 ` [PATCH 1/3] block: add a bio_reuse helper Christoph Hellwig
2026-01-14 13:06 ` [PATCH 2/3] xfs: use bio_reuse in the zone GC code Christoph Hellwig
@ 2026-01-14 13:06 ` Christoph Hellwig
2026-01-14 19:59 ` Darrick J. Wong
2026-01-25 0:03 ` Chris Mason
2026-01-21 12:21 ` improve zoned XFS GC buffer management v4 Carlos Maiolino
3 siblings, 2 replies; 17+ messages in thread
From: Christoph Hellwig @ 2026-01-14 13:06 UTC (permalink / raw)
To: Jens Axboe, Carlos Maiolino
Cc: Damien Le Moal, Hans Holmberg, Keith Busch, linux-block,
linux-xfs, Carlos Maiolino
The double buffering where just one scratch area is used at a time does
not efficiently use the available memory. It was originally implemented
when GC I/O could happen out of order, but that was removed before
upstream submission to avoid fragmentation. Now that all GC I/Os are
processed in order, just use a number of buffers as a simple ring buffer.
For a synthetic benchmark that fills 256MiB HDD zones and punches out
holes to free half the space this leads to a decrease of GC time by
a little more than 25%.
Thanks to Hans Holmberg <hans.holmberg@wdc.com> for testing and
benchmarking.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
---
fs/xfs/xfs_zone_gc.c | 106 ++++++++++++++++++++++++-------------------
1 file changed, 59 insertions(+), 47 deletions(-)
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index c9a3df6a5289..ba4f8e011e36 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -50,23 +50,11 @@
*/
/*
- * Size of each GC scratch pad. This is also the upper bound for each
- * GC I/O, which helps to keep latency down.
+ * Size of each GC scratch allocation, and the number of buffers.
*/
-#define XFS_GC_CHUNK_SIZE SZ_1M
-
-/*
- * Scratchpad data to read GCed data into.
- *
- * The offset member tracks where the next allocation starts, and freed tracks
- * the amount of space that is not used anymore.
- */
-#define XFS_ZONE_GC_NR_SCRATCH 2
-struct xfs_zone_scratch {
- struct folio *folio;
- unsigned int offset;
- unsigned int freed;
-};
+#define XFS_GC_BUF_SIZE SZ_1M
+#define XFS_GC_NR_BUFS 2
+static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
/*
* Chunk that is read and written for each GC operation.
@@ -141,10 +129,14 @@ struct xfs_zone_gc_data {
struct bio_set bio_set;
/*
- * Scratchpad used, and index to indicated which one is used.
+ * Scratchpad to buffer GC data, organized as a ring buffer over
+ * discontiguous folios. scratch_head is where the buffer is filled,
+ * and scratch_tail tracks the buffer space freed.
*/
- struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
- unsigned int scratch_idx;
+ struct folio *scratch_folios[XFS_GC_NR_BUFS];
+ unsigned int scratch_size;
+ unsigned int scratch_head;
+ unsigned int scratch_tail;
/*
* List of bios currently being read, written and reset.
@@ -210,20 +202,16 @@ xfs_zone_gc_data_alloc(
if (!data->iter.recs)
goto out_free_data;
- /*
- * We actually only need a single bio_vec. It would be nice to have
- * a flag that only allocates the inline bvecs and not the separate
- * bvec pool.
- */
if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
BIOSET_NEED_BVECS))
goto out_free_recs;
- for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
- data->scratch[i].folio =
- folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
- if (!data->scratch[i].folio)
+ for (i = 0; i < XFS_GC_NR_BUFS; i++) {
+ data->scratch_folios[i] =
+ folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
+ if (!data->scratch_folios[i])
goto out_free_scratch;
}
+ data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
INIT_LIST_HEAD(&data->reading);
INIT_LIST_HEAD(&data->writing);
INIT_LIST_HEAD(&data->resetting);
@@ -232,7 +220,7 @@ xfs_zone_gc_data_alloc(
out_free_scratch:
while (--i >= 0)
- folio_put(data->scratch[i].folio);
+ folio_put(data->scratch_folios[i]);
bioset_exit(&data->bio_set);
out_free_recs:
kfree(data->iter.recs);
@@ -247,8 +235,8 @@ xfs_zone_gc_data_free(
{
int i;
- for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
- folio_put(data->scratch[i].folio);
+ for (i = 0; i < XFS_GC_NR_BUFS; i++)
+ folio_put(data->scratch_folios[i]);
bioset_exit(&data->bio_set);
kfree(data->iter.recs);
kfree(data);
@@ -590,7 +578,12 @@ static unsigned int
xfs_zone_gc_scratch_available(
struct xfs_zone_gc_data *data)
{
- return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+ if (!data->scratch_tail)
+ return data->scratch_size - data->scratch_head;
+
+ if (!data->scratch_head)
+ return data->scratch_tail;
+ return (data->scratch_size - data->scratch_head) + data->scratch_tail;
}
static bool
@@ -664,6 +657,28 @@ xfs_zone_gc_alloc_blocks(
return oz;
}
+static void
+xfs_zone_gc_add_data(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ unsigned int len = chunk->len;
+ unsigned int off = data->scratch_head;
+
+ do {
+ unsigned int this_off = off % XFS_GC_BUF_SIZE;
+ unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
+
+ bio_add_folio_nofail(&chunk->bio,
+ data->scratch_folios[off / XFS_GC_BUF_SIZE],
+ this_len, this_off);
+ len -= this_len;
+ off += this_len;
+ if (off == data->scratch_size)
+ off = 0;
+ } while (len);
+}
+
static bool
xfs_zone_gc_start_chunk(
struct xfs_zone_gc_data *data)
@@ -677,6 +692,7 @@ xfs_zone_gc_start_chunk(
struct xfs_inode *ip;
struct bio *bio;
xfs_daddr_t daddr;
+ unsigned int len;
bool is_seq;
if (xfs_is_shutdown(mp))
@@ -691,17 +707,19 @@ xfs_zone_gc_start_chunk(
return false;
}
- bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+ len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ bio = bio_alloc_bioset(bdev,
+ min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
+ REQ_OP_READ, GFP_NOFS, &data->bio_set);
chunk = container_of(bio, struct xfs_gc_bio, bio);
chunk->ip = ip;
chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
- chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ chunk->len = len;
chunk->old_startblock =
xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
chunk->new_daddr = daddr;
chunk->is_seq = is_seq;
- chunk->scratch = &data->scratch[data->scratch_idx];
chunk->data = data;
chunk->oz = oz;
chunk->victim_rtg = iter->victim_rtg;
@@ -710,13 +728,9 @@ xfs_zone_gc_start_chunk(
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
- bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
- chunk->scratch->offset);
- chunk->scratch->offset += chunk->len;
- if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
- data->scratch_idx =
- (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
- }
+ xfs_zone_gc_add_data(chunk);
+ data->scratch_head = (data->scratch_head + len) % data->scratch_size;
+
WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&chunk->entry, &data->reading);
xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
@@ -834,6 +848,7 @@ xfs_zone_gc_finish_chunk(
struct xfs_gc_bio *chunk)
{
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_zone_gc_data *data = chunk->data;
struct xfs_inode *ip = chunk->ip;
struct xfs_mount *mp = ip->i_mount;
int error;
@@ -845,11 +860,8 @@ xfs_zone_gc_finish_chunk(
return;
}
- chunk->scratch->freed += chunk->len;
- if (chunk->scratch->freed == chunk->scratch->offset) {
- chunk->scratch->offset = 0;
- chunk->scratch->freed = 0;
- }
+ data->scratch_tail =
+ (data->scratch_tail + chunk->len) % data->scratch_size;
/*
* Cycle through the iolock and wait for direct I/O and layouts to
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH 2/3] xfs: use bio_reuse in the zone GC code
2026-01-14 13:06 ` [PATCH 2/3] xfs: use bio_reuse in the zone GC code Christoph Hellwig
@ 2026-01-14 19:50 ` Darrick J. Wong
0 siblings, 0 replies; 17+ messages in thread
From: Darrick J. Wong @ 2026-01-14 19:50 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jens Axboe, Carlos Maiolino, Damien Le Moal, Hans Holmberg,
Keith Busch, linux-block, linux-xfs, Carlos Maiolino
On Wed, Jan 14, 2026 at 02:06:42PM +0100, Christoph Hellwig wrote:
> Replace our somewhat fragile code to reuse the bio, which caused a
> regression in the past with the block layer bio_reuse helper.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
> Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Ok, you're right, that does make more sense to pass REQ_OP to bio_reuse.
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_zone_gc.c | 7 +------
> 1 file changed, 1 insertion(+), 6 deletions(-)
>
> diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
> index 3c52cc1497d4..c9a3df6a5289 100644
> --- a/fs/xfs/xfs_zone_gc.c
> +++ b/fs/xfs/xfs_zone_gc.c
> @@ -811,8 +811,6 @@ xfs_zone_gc_write_chunk(
> {
> struct xfs_zone_gc_data *data = chunk->data;
> struct xfs_mount *mp = chunk->ip->i_mount;
> - phys_addr_t bvec_paddr =
> - bvec_phys(bio_first_bvec_all(&chunk->bio));
> struct xfs_gc_bio *split_chunk;
>
> if (chunk->bio.bi_status)
> @@ -825,10 +823,7 @@ xfs_zone_gc_write_chunk(
> WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
> list_move_tail(&chunk->entry, &data->writing);
>
> - bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
> - bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
> - offset_in_folio(chunk->scratch->folio, bvec_paddr));
> -
> + bio_reuse(&chunk->bio, REQ_OP_WRITE);
> while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
> xfs_zone_gc_submit_write(data, split_chunk);
> xfs_zone_gc_submit_write(data, chunk);
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 3/3] xfs: rework zone GC buffer management
2026-01-14 13:06 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
@ 2026-01-14 19:59 ` Darrick J. Wong
2026-01-15 6:12 ` Christoph Hellwig
2026-01-25 0:03 ` Chris Mason
1 sibling, 1 reply; 17+ messages in thread
From: Darrick J. Wong @ 2026-01-14 19:59 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jens Axboe, Carlos Maiolino, Damien Le Moal, Hans Holmberg,
Keith Busch, linux-block, linux-xfs, Carlos Maiolino
On Wed, Jan 14, 2026 at 02:06:43PM +0100, Christoph Hellwig wrote:
> The double buffering where just one scratch area is used at a time does
> not efficiently use the available memory. It was originally implemented
> when GC I/O could happen out of order, but that was removed before
> upstream submission to avoid fragmentation. Now that all GC I/Os are
> processed in order, just use a number of buffers as a simple ring buffer.
>
> For a synthetic benchmark that fills 256MiB HDD zones and punches out
> holes to free half the space this leads to a decrease of GC time by
> a little more than 25%.
>
> Thanks to Hans Holmberg <hans.holmberg@wdc.com> for testing and
> benchmarking.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
> Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Huh. I guess I never touched this patch at all???
Ok so it looks like we're switching out garbage collecting with two
independent (but not) scratchpads for what amounts to a ring buffer
consisting of a bunch of discontig folios.
Now when gc wants to move some data, we compute the number of folios we
need, and bump scratch_head by that amount. Then we attach those folios
to a read bio and submit it. When that finishes, we call bio_reuse with
REQ_OP_WRITE and submit that. When the write finishes we do the "remap
if mapping hasn't changed" dance to update the metadata, and advance the
tail by however many folios we've stopped using.
If I got that right, then
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_zone_gc.c | 106 ++++++++++++++++++++++++-------------------
> 1 file changed, 59 insertions(+), 47 deletions(-)
>
> diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
> index c9a3df6a5289..ba4f8e011e36 100644
> --- a/fs/xfs/xfs_zone_gc.c
> +++ b/fs/xfs/xfs_zone_gc.c
> @@ -50,23 +50,11 @@
> */
>
> /*
> - * Size of each GC scratch pad. This is also the upper bound for each
> - * GC I/O, which helps to keep latency down.
> + * Size of each GC scratch allocation, and the number of buffers.
> */
> -#define XFS_GC_CHUNK_SIZE SZ_1M
> -
> -/*
> - * Scratchpad data to read GCed data into.
> - *
> - * The offset member tracks where the next allocation starts, and freed tracks
> - * the amount of space that is not used anymore.
> - */
> -#define XFS_ZONE_GC_NR_SCRATCH 2
> -struct xfs_zone_scratch {
> - struct folio *folio;
> - unsigned int offset;
> - unsigned int freed;
> -};
> +#define XFS_GC_BUF_SIZE SZ_1M
> +#define XFS_GC_NR_BUFS 2
> +static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
>
> /*
> * Chunk that is read and written for each GC operation.
> @@ -141,10 +129,14 @@ struct xfs_zone_gc_data {
> struct bio_set bio_set;
>
> /*
> - * Scratchpad used, and index to indicated which one is used.
> + * Scratchpad to buffer GC data, organized as a ring buffer over
> + * discontiguous folios. scratch_head is where the buffer is filled,
> + * and scratch_tail tracks the buffer space freed.
> */
> - struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
> - unsigned int scratch_idx;
> + struct folio *scratch_folios[XFS_GC_NR_BUFS];
> + unsigned int scratch_size;
> + unsigned int scratch_head;
> + unsigned int scratch_tail;
>
> /*
> * List of bios currently being read, written and reset.
> @@ -210,20 +202,16 @@ xfs_zone_gc_data_alloc(
> if (!data->iter.recs)
> goto out_free_data;
>
> - /*
> - * We actually only need a single bio_vec. It would be nice to have
> - * a flag that only allocates the inline bvecs and not the separate
> - * bvec pool.
> - */
> if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
> BIOSET_NEED_BVECS))
> goto out_free_recs;
> - for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
> - data->scratch[i].folio =
> - folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
> - if (!data->scratch[i].folio)
> + for (i = 0; i < XFS_GC_NR_BUFS; i++) {
> + data->scratch_folios[i] =
> + folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
> + if (!data->scratch_folios[i])
> goto out_free_scratch;
> }
> + data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
> INIT_LIST_HEAD(&data->reading);
> INIT_LIST_HEAD(&data->writing);
> INIT_LIST_HEAD(&data->resetting);
> @@ -232,7 +220,7 @@ xfs_zone_gc_data_alloc(
>
> out_free_scratch:
> while (--i >= 0)
> - folio_put(data->scratch[i].folio);
> + folio_put(data->scratch_folios[i]);
> bioset_exit(&data->bio_set);
> out_free_recs:
> kfree(data->iter.recs);
> @@ -247,8 +235,8 @@ xfs_zone_gc_data_free(
> {
> int i;
>
> - for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
> - folio_put(data->scratch[i].folio);
> + for (i = 0; i < XFS_GC_NR_BUFS; i++)
> + folio_put(data->scratch_folios[i]);
> bioset_exit(&data->bio_set);
> kfree(data->iter.recs);
> kfree(data);
> @@ -590,7 +578,12 @@ static unsigned int
> xfs_zone_gc_scratch_available(
> struct xfs_zone_gc_data *data)
> {
> - return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
> + if (!data->scratch_tail)
> + return data->scratch_size - data->scratch_head;
> +
> + if (!data->scratch_head)
> + return data->scratch_tail;
> + return (data->scratch_size - data->scratch_head) + data->scratch_tail;
> }
>
> static bool
> @@ -664,6 +657,28 @@ xfs_zone_gc_alloc_blocks(
> return oz;
> }
>
> +static void
> +xfs_zone_gc_add_data(
> + struct xfs_gc_bio *chunk)
> +{
> + struct xfs_zone_gc_data *data = chunk->data;
> + unsigned int len = chunk->len;
> + unsigned int off = data->scratch_head;
> +
> + do {
> + unsigned int this_off = off % XFS_GC_BUF_SIZE;
> + unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
> +
> + bio_add_folio_nofail(&chunk->bio,
> + data->scratch_folios[off / XFS_GC_BUF_SIZE],
> + this_len, this_off);
> + len -= this_len;
> + off += this_len;
> + if (off == data->scratch_size)
> + off = 0;
> + } while (len);
> +}
> +
> static bool
> xfs_zone_gc_start_chunk(
> struct xfs_zone_gc_data *data)
> @@ -677,6 +692,7 @@ xfs_zone_gc_start_chunk(
> struct xfs_inode *ip;
> struct bio *bio;
> xfs_daddr_t daddr;
> + unsigned int len;
> bool is_seq;
>
> if (xfs_is_shutdown(mp))
> @@ -691,17 +707,19 @@ xfs_zone_gc_start_chunk(
> return false;
> }
>
> - bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
> + len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
> + bio = bio_alloc_bioset(bdev,
> + min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
> + REQ_OP_READ, GFP_NOFS, &data->bio_set);
>
> chunk = container_of(bio, struct xfs_gc_bio, bio);
> chunk->ip = ip;
> chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
> - chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
> + chunk->len = len;
> chunk->old_startblock =
> xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
> chunk->new_daddr = daddr;
> chunk->is_seq = is_seq;
> - chunk->scratch = &data->scratch[data->scratch_idx];
> chunk->data = data;
> chunk->oz = oz;
> chunk->victim_rtg = iter->victim_rtg;
> @@ -710,13 +728,9 @@ xfs_zone_gc_start_chunk(
>
> bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
> bio->bi_end_io = xfs_zone_gc_end_io;
> - bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
> - chunk->scratch->offset);
> - chunk->scratch->offset += chunk->len;
> - if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
> - data->scratch_idx =
> - (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
> - }
> + xfs_zone_gc_add_data(chunk);
> + data->scratch_head = (data->scratch_head + len) % data->scratch_size;
> +
> WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
> list_add_tail(&chunk->entry, &data->reading);
> xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
> @@ -834,6 +848,7 @@ xfs_zone_gc_finish_chunk(
> struct xfs_gc_bio *chunk)
> {
> uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> + struct xfs_zone_gc_data *data = chunk->data;
> struct xfs_inode *ip = chunk->ip;
> struct xfs_mount *mp = ip->i_mount;
> int error;
> @@ -845,11 +860,8 @@ xfs_zone_gc_finish_chunk(
> return;
> }
>
> - chunk->scratch->freed += chunk->len;
> - if (chunk->scratch->freed == chunk->scratch->offset) {
> - chunk->scratch->offset = 0;
> - chunk->scratch->freed = 0;
> - }
> + data->scratch_tail =
> + (data->scratch_tail + chunk->len) % data->scratch_size;
>
> /*
> * Cycle through the iolock and wait for direct I/O and layouts to
> --
> 2.47.3
>
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 3/3] xfs: rework zone GC buffer management
2026-01-14 19:59 ` Darrick J. Wong
@ 2026-01-15 6:12 ` Christoph Hellwig
0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2026-01-15 6:12 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Jens Axboe, Carlos Maiolino, Damien Le Moal,
Hans Holmberg, Keith Busch, linux-block, linux-xfs,
Carlos Maiolino
On Wed, Jan 14, 2026 at 11:59:47AM -0800, Darrick J. Wong wrote:
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
> > Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
> > Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
>
> Huh. I guess I never touched this patch at all???
Not yet :)
> Ok so it looks like we're switching out garbage collecting with two
> independent (but not) scratchpads for what amounts to a ring buffer
> consisting of a bunch of discontig folios.
Yeah.
> Now when gc wants to move some data, we compute the number of folios we
> need, and bump scratch_head by that amount. Then we attach those folios
> to a read bio and submit it. When that finishes, we call bio_reuse with
> REQ_OP_WRITE and submit that. When the write finishes we do the "remap
> if mapping hasn't changed" dance to update the metadata, and advance the
> tail by however many folios we've stopped using.
>
> If I got that right, then
You did, thanks!
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: improve zoned XFS GC buffer management v4
2026-01-14 13:06 improve zoned XFS GC buffer management v4 Christoph Hellwig
` (2 preceding siblings ...)
2026-01-14 13:06 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
@ 2026-01-21 12:21 ` Carlos Maiolino
3 siblings, 0 replies; 17+ messages in thread
From: Carlos Maiolino @ 2026-01-21 12:21 UTC (permalink / raw)
To: Jens Axboe, Christoph Hellwig
Cc: Damien Le Moal, Hans Holmberg, Keith Busch, linux-block,
linux-xfs
On Wed, 14 Jan 2026 14:06:40 +0100, Christoph Hellwig wrote:
> the zoned XFS GC code currently uses a weird bank switching strategy to
> manage the scratch buffer. The reason for that was that I/O could be
> proceed out of order in the early days, but that actually changed before
> the code was upstreamed to avoid fragmentation. This replaced the logic
> with a simple ring buffer, which makes the buffer space utilization much
> more efficient.
>
> [...]
Applied to for-next, thanks!
[1/3] block: add a bio_reuse helper
commit: 7ca44303f9f6160a2f87ae3d5d2326d9127cd61c
[2/3] xfs: use bio_reuse in the zone GC code
commit: 0506d32f7c52e41f6e8db7c337e0ce6374c6ffbb
[3/3] xfs: rework zone GC buffer management
commit: 102f444b57b35e41b04a5c8192fcdacb467c9161
Best regards,
--
Carlos Maiolino <cem@kernel.org>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 3/3] xfs: rework zone GC buffer management
2026-01-14 13:06 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
2026-01-14 19:59 ` Darrick J. Wong
@ 2026-01-25 0:03 ` Chris Mason
2026-01-26 20:30 ` Keith Busch
1 sibling, 1 reply; 17+ messages in thread
From: Chris Mason @ 2026-01-25 0:03 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Chris Mason, Jens Axboe, Carlos Maiolino, Damien Le Moal,
Hans Holmberg, Keith Busch, linux-block, linux-xfs,
Carlos Maiolino
On Wed, 14 Jan 2026 14:06:43 +0100 Christoph Hellwig <hch@lst.de> wrote:
> The double buffering where just one scratch area is used at a time does
> not efficiently use the available memory. It was originally implemented
> when GC I/O could happen out of order, but that was removed before
> upstream submission to avoid fragmentation. Now that all GC I/Os are
> processed in order, just use a number of buffers as a simple ring buffer.
>
> For a synthetic benchmark that fills 256MiB HDD zones and punches out
> holes to free half the space this leads to a decrease of GC time by
> a little more than 25%.
Hi everyone,
I'm pretty low confidence on this, but it was flagged when I ran a bunch of
linux-next commits through the AI review prompts. I'm curious if the AI is
on to something or if I need to adjust things:
> diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
> --- a/fs/xfs/xfs_zone_gc.c
> +++ b/fs/xfs/xfs_zone_gc.c
[ ... ]
> @@ -590,7 +578,12 @@ xfs_zone_gc_ensure_target(
> xfs_zone_gc_scratch_available(
> struct xfs_zone_gc_data *data)
> {
> - return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
> + if (!data->scratch_tail)
> + return data->scratch_size - data->scratch_head;
> +
> + if (!data->scratch_head)
> + return data->scratch_tail;
> + return (data->scratch_size - data->scratch_head) + data->scratch_tail;
> }
Can this function correctly distinguish between an empty and full ring
buffer?
When scratch_head wraps back to 0 and scratch_tail is also 0 (because no
I/O has completed yet), the first condition returns scratch_size (2MB),
indicating the buffer is empty. But if the buffer just filled completely
and head wrapped to 0, the buffer is actually full, and the available
space should be 0.
Consider this sequence in xfs_zone_gc_handle_work():
- Initial: head=0, tail=0, available=2MB (correct, empty)
- After 1MB chunk: head=1MB, tail=0, available=1MB (correct)
- After another 1MB chunk: head=0 (wrapped), tail=0, available=2MB (wrong)
The buffer is full but the function reports it as empty. The while loop
in xfs_zone_gc_handle_work() calling xfs_zone_gc_start_chunk() would then
continue adding more data, overwriting the buffer contents from the first
chunk before its I/O completes.
[ ... ]
> @@ -710,13 +728,9 @@ xfs_zone_gc_start_chunk(
>
> bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
> bio->bi_end_io = xfs_zone_gc_end_io;
> - bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
> - chunk->scratch->offset);
> - chunk->scratch->offset += chunk->len;
> - if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
> - data->scratch_idx =
> - (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
> - }
> + xfs_zone_gc_add_data(chunk);
> + data->scratch_head = (data->scratch_head + len) % data->scratch_size;
This advances scratch_head with modulo arithmetic, which is correct for
wrap-around. However, if head catches up to tail (buffer full), the code
does not detect this condition.
[ ... ]
> @@ -845,11 +860,8 @@ xfs_zone_gc_finish_chunk(
> return;
> }
>
> - chunk->scratch->freed += chunk->len;
> - if (chunk->scratch->freed == chunk->scratch->offset) {
> - chunk->scratch->offset = 0;
> - chunk->scratch->freed = 0;
> - }
> + data->scratch_tail =
> + (data->scratch_tail + chunk->len) % data->scratch_size;
And scratch_tail advances here when I/O completes. The classic ring
buffer empty/full ambiguity applies: when head == tail, is the buffer
empty or full? The xfs_zone_gc_scratch_available() function assumes
head=0 and tail=0 always means empty, but after wrapping it could mean
full.
A common solution is to track a separate count, or to sacrifice one slot
so head == tail always means empty, and head == tail-1 means full.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 3/3] xfs: rework zone GC buffer management
2026-01-25 0:03 ` Chris Mason
@ 2026-01-26 20:30 ` Keith Busch
2026-01-27 6:53 ` Christoph Hellwig
0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2026-01-26 20:30 UTC (permalink / raw)
To: Chris Mason
Cc: Christoph Hellwig, Jens Axboe, Carlos Maiolino, Damien Le Moal,
Hans Holmberg, linux-block, linux-xfs, Carlos Maiolino
On Sat, Jan 24, 2026 at 04:03:04PM -0800, Chris Mason wrote:
> On Wed, 14 Jan 2026 14:06:43 +0100 Christoph Hellwig <hch@lst.de> wrote:
> > @@ -590,7 +578,12 @@ xfs_zone_gc_ensure_target(
> > xfs_zone_gc_scratch_available(
> > struct xfs_zone_gc_data *data)
> > {
> > - return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
> > + if (!data->scratch_tail)
> > + return data->scratch_size - data->scratch_head;
> > +
> > + if (!data->scratch_head)
> > + return data->scratch_tail;
> > + return (data->scratch_size - data->scratch_head) + data->scratch_tail;
> > }
>
> Can this function correctly distinguish between an empty and full ring
> buffer?
>
> When scratch_head wraps back to 0 and scratch_tail is also 0 (because no
> I/O has completed yet), the first condition returns scratch_size (2MB),
> indicating the buffer is empty. But if the buffer just filled completely
> and head wrapped to 0, the buffer is actually full, and the available
> space should be 0.
>
> Consider this sequence in xfs_zone_gc_handle_work():
> - Initial: head=0, tail=0, available=2MB (correct, empty)
> - After 1MB chunk: head=1MB, tail=0, available=1MB (correct)
> - After another 1MB chunk: head=0 (wrapped), tail=0, available=2MB (wrong)
>
> The buffer is full but the function reports it as empty. The while loop
> in xfs_zone_gc_handle_work() calling xfs_zone_gc_start_chunk() would then
> continue adding more data, overwriting the buffer contents from the first
> chunk before its I/O completes.
I think you're right that ring wrap can't distinguish full vs. empty here.
> A common solution is to track a separate count, or to sacrifice one slot
> so head == tail always means empty, and head == tail-1 means full.
The buffer size is a power of two, so I suggest just letting scratch_head
and scratch_tail only increment without modulo, relying on the unsigned
int wrapping. We can get the actual offset by masking the head with the
scratch size.
---
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index ba4f8e011e36c..7d2bd0dc8b322 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -578,12 +578,7 @@ static unsigned int
xfs_zone_gc_scratch_available(
struct xfs_zone_gc_data *data)
{
- if (!data->scratch_tail)
- return data->scratch_size - data->scratch_head;
-
- if (!data->scratch_head)
- return data->scratch_tail;
- return (data->scratch_size - data->scratch_head) + data->scratch_tail;
+ return data->scratch_size - (data->scratch_head - data->scratch_tail);
}
static bool
@@ -663,7 +658,7 @@ xfs_zone_gc_add_data(
{
struct xfs_zone_gc_data *data = chunk->data;
unsigned int len = chunk->len;
- unsigned int off = data->scratch_head;
+ unsigned int off = data->scratch_head & (data->scratch_size - 1);
do {
unsigned int this_off = off % XFS_GC_BUF_SIZE;
@@ -729,7 +724,7 @@ xfs_zone_gc_start_chunk(
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
xfs_zone_gc_add_data(chunk);
- data->scratch_head = (data->scratch_head + len) % data->scratch_size;
+ data->scratch_head = data->scratch_head + len;
WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&chunk->entry, &data->reading);
@@ -860,8 +855,7 @@ xfs_zone_gc_finish_chunk(
return;
}
- data->scratch_tail =
- (data->scratch_tail + chunk->len) % data->scratch_size;
+ data->scratch_tail = data->scratch_tail + chunk->len;
/*
* Cycle through the iolock and wait for direct I/O and layouts to
--
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH 3/3] xfs: rework zone GC buffer management
2026-01-26 20:30 ` Keith Busch
@ 2026-01-27 6:53 ` Christoph Hellwig
0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2026-01-27 6:53 UTC (permalink / raw)
To: Keith Busch
Cc: Chris Mason, Christoph Hellwig, Jens Axboe, Carlos Maiolino,
Damien Le Moal, Hans Holmberg, linux-block, linux-xfs,
Carlos Maiolino
On Mon, Jan 26, 2026 at 01:30:54PM -0700, Keith Busch wrote:
> I think you're right that ring wrap can't distinguish full vs. empty here.
>
> > A common solution is to track a separate count, or to sacrifice one slot
> > so head == tail always means empty, and head == tail-1 means full.
>
> The buffer size is a power of two, so I suggest just let scratch_head
> and scratch_tail only increment without modulo, and rely on the unsigned
> int wrapping. We can get the actual offset by masking the head with the
> scratch size.
I actually added a available tracking member yesterday. It turns out
this simplified the code quite a bit, but so does your version. I've
kicked off QA on it, which I'd have to redo for this version.
https://git.infradead.org/?p=users/hch/xfs.git;a=shortlog;h=refs/heads/xfs-zoned-fixes
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2026-01-27 6:53 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-14 13:06 improve zoned XFS GC buffer management v4 Christoph Hellwig
2026-01-14 13:06 ` [PATCH 1/3] block: add a bio_reuse helper Christoph Hellwig
2026-01-14 13:06 ` [PATCH 2/3] xfs: use bio_reuse in the zone GC code Christoph Hellwig
2026-01-14 19:50 ` Darrick J. Wong
2026-01-14 13:06 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
2026-01-14 19:59 ` Darrick J. Wong
2026-01-15 6:12 ` Christoph Hellwig
2026-01-25 0:03 ` Chris Mason
2026-01-26 20:30 ` Keith Busch
2026-01-27 6:53 ` Christoph Hellwig
2026-01-21 12:21 ` improve zoned XFS GC buffer management v4 Carlos Maiolino
-- strict thread matches above, loose matches on Subject: below --
2026-01-13 7:19 improve zoned XFS GC buffer management v3 Christoph Hellwig
2026-01-13 7:19 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
2026-01-06 7:58 improve zoned XFS GC buffer management v2 Christoph Hellwig
2026-01-06 7:58 ` [PATCH 3/3] xfs: rework zone GC buffer management Christoph Hellwig
2026-01-09 12:24 ` Carlos Maiolino
2025-12-18 6:31 improve zoned XFS " Christoph Hellwig
2025-12-18 6:31 ` [PATCH 3/3] xfs: rework zone " Christoph Hellwig
2025-12-18 7:12 ` Damien Le Moal
2025-12-19 8:06 ` Hans Holmberg
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox