public inbox for linux-nvme@lists.infradead.org
 help / color / mirror / Atom feed
From: Christoph Hellwig <hch@infradead.org>
To: Keith Busch <kbusch@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>,
	Keith Busch <kbusch@meta.com>,
	linux-block@vger.kernel.org, linux-nvme@lists.infradead.org
Subject: Re: [PATCH 2/5] block: add support for copy offload
Date: Fri, 23 May 2025 06:37:33 -0700	[thread overview]
Message-ID: <aDB6Hdp9ZQ1gX5gr@infradead.org> (raw)
In-Reply-To: <aDB3lSQRLxjDHTSE@kbusch-mbp>

On Fri, May 23, 2025 at 07:26:45AM -0600, Keith Busch wrote:
> > Urrgg.  Please don't overload the bio_vec. We've been working hard to
> > generalize it and share the data structures with more users in the
> > block layer. 
> 
> Darn, this part of the proposal is really the core concept of this patch
> set that everything builds around. It's what allows submitting
> arbitrarily large sized copy requests and letting the block layer
> efficiently split a bio to the queue limits later.

Well, you can still do that without overloading the bio_bvec by just
making bi_io_vec in the bio itself a union.

> 
> > If having a bio for each source range is too much overhead
> > for your user case (but I'd like to numbers for that), we'll need to
> > find a way to do that without overloading the actual bio_vec structure.
> 
> Getting good numbers might be a problem in the near term. The current
> generation of devices I have access to that can do copy offload don't
> have asic support for it, so it is instrumented entirely in firmware.
> The performance is currently underwhelming, but I expect next generation
> to be much better.

I meant numbers for the all in one bio vs multiple bios approach.
For hardware I think the main benefit is to not use host dram
bandwidth.

Anyway, below is a patch to wire it up to the XFS garbage collection
daemon.  It survives the xfstests test cases for GC when run on a
conventional device, but otherwise I've not done much testing with it.

It shows two things, though:

 - right now there is no block layer merging, and we always see single
   range bios.  That is really annoying, and fixing the fs code to
   submit multiple ranges in one go would be really annoying, as
   extent-based completions hang off the bio completions.  So I'd
   really like block layer merges similar to what the old
   multi-bio code or the discard code do.
 - copy also needs to be handled by the zoned write plugs
 - bio_add_copy_src not updating bi_size is unexpected and annoying :)

diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index 8c541ca71872..e7dfdbbcf126 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -158,6 +158,8 @@ struct xfs_zone_gc_data {
 	 * Iterator for the victim zone.
 	 */
 	struct xfs_zone_gc_iter		iter;
+
+	bool				can_copy;
 };
 
 /*
@@ -212,12 +214,19 @@ xfs_zone_gc_data_alloc(
 	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
 			BIOSET_NEED_BVECS))
 		goto out_free_recs;
-	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
-		data->scratch[i].folio =
-			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
-		if (!data->scratch[i].folio)
-			goto out_free_scratch;
+
+	if (bdev_copy_sectors(mp->m_rtdev_targp->bt_bdev)) {
+		xfs_info(mp, "using copy offload");
+		data->can_copy = true;
+	} else {
+		for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
+			data->scratch[i].folio = folio_alloc(GFP_KERNEL,
+					get_order(XFS_GC_CHUNK_SIZE));
+			if (!data->scratch[i].folio)
+				goto out_free_scratch;
+		}
 	}
+
 	INIT_LIST_HEAD(&data->reading);
 	INIT_LIST_HEAD(&data->writing);
 	INIT_LIST_HEAD(&data->resetting);
@@ -241,8 +250,10 @@ xfs_zone_gc_data_free(
 {
 	int			i;
 
-	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
-		folio_put(data->scratch[i].folio);
+	if (!data->can_copy) {
+		for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
+			folio_put(data->scratch[i].folio);
+	}
 	bioset_exit(&data->bio_set);
 	kfree(data->iter.recs);
 	kfree(data);
@@ -589,6 +600,8 @@ static unsigned int
 xfs_zone_gc_scratch_available(
 	struct xfs_zone_gc_data	*data)
 {
+	if (data->can_copy)
+		return UINT_MAX;
 	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
 }
 
@@ -690,7 +703,10 @@ xfs_zone_gc_start_chunk(
 		return false;
 	}
 
-	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+	bio = bio_alloc_bioset(bdev, 1,
+			data->can_copy ? REQ_OP_COPY : REQ_OP_READ,
+			GFP_NOFS, &data->bio_set);
+	bio->bi_end_io = xfs_zone_gc_end_io;
 
 	chunk = container_of(bio, struct xfs_gc_bio, bio);
 	chunk->ip = ip;
@@ -700,21 +716,38 @@ xfs_zone_gc_start_chunk(
 		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
 	chunk->new_daddr = daddr;
 	chunk->is_seq = is_seq;
-	chunk->scratch = &data->scratch[data->scratch_idx];
 	chunk->data = data;
 	chunk->oz = oz;
 
-	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
-	bio->bi_end_io = xfs_zone_gc_end_io;
-	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
-			chunk->scratch->offset);
-	chunk->scratch->offset += chunk->len;
-	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
-		data->scratch_idx =
-			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
+	if (data->can_copy) {
+		struct bio_vec src = {
+			.bv_sector =
+				xfs_rtb_to_daddr(mp, chunk->old_startblock),
+			.bv_sectors = BTOBB(chunk->len),
+		};
+
+		bio_add_copy_src(bio, &src);
+		bio->bi_iter.bi_sector = daddr;
+		bio->bi_iter.bi_size = chunk->len;
+
+		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+		list_add_tail(&chunk->entry, &data->writing);
+	} else {
+		chunk->scratch = &data->scratch[data->scratch_idx];
+
+		bio->bi_iter.bi_sector =
+			xfs_rtb_to_daddr(mp, chunk->old_startblock);
+		bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+				chunk->scratch->offset);
+		chunk->scratch->offset += chunk->len;
+		if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
+			data->scratch_idx =
+				(data->scratch_idx + 1) %
+					XFS_ZONE_GC_NR_SCRATCH;
+		}
+		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+		list_add_tail(&chunk->entry, &data->reading);
 	}
-	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
-	list_add_tail(&chunk->entry, &data->reading);
 	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
 
 	submit_bio(bio);
@@ -839,10 +872,12 @@ xfs_zone_gc_finish_chunk(
 		return;
 	}
 
-	chunk->scratch->freed += chunk->len;
-	if (chunk->scratch->freed == chunk->scratch->offset) {
-		chunk->scratch->offset = 0;
-		chunk->scratch->freed = 0;
+	if (!chunk->data->can_copy) {
+		chunk->scratch->freed += chunk->len;
+		if (chunk->scratch->freed == chunk->scratch->offset) {
+			chunk->scratch->offset = 0;
+			chunk->scratch->freed = 0;
+		}
 	}
 
 	/*


  reply	other threads:[~2025-05-23 14:42 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-21 22:31 [PATCH 0/5] block: another block copy offload Keith Busch
2025-05-21 22:31 ` [PATCH 1/5] block: new sector copy api Keith Busch
2025-05-22 10:02   ` Hannes Reinecke
2025-05-22 16:43     ` Keith Busch
2025-05-22 19:22   ` Bart Van Assche
2025-05-22 20:04     ` Keith Busch
2025-05-23 12:45   ` Christoph Hellwig
2025-05-23 17:02     ` Keith Busch
2025-05-26  5:18       ` Christoph Hellwig
2025-05-27 17:45         ` Keith Busch
2025-05-28  7:46           ` Christoph Hellwig
2025-05-28 22:41             ` Keith Busch
2025-06-02  4:58               ` Christoph Hellwig
2025-05-21 22:31 ` [PATCH 2/5] block: add support for copy offload Keith Busch
2025-05-22 13:49   ` Hannes Reinecke
2025-05-23 12:46   ` Christoph Hellwig
2025-05-23 13:26     ` Keith Busch
2025-05-23 13:37       ` Christoph Hellwig [this message]
2025-05-23 13:48         ` Keith Busch
2025-05-26  5:22           ` Christoph Hellwig
2025-05-27 21:33         ` Keith Busch
2025-05-28  7:47           ` Christoph Hellwig
2025-05-21 22:31 ` [PATCH 3/5] nvme: " Keith Busch
2025-05-22  0:47   ` Caleb Sander Mateos
2025-05-22  0:51     ` Caleb Sander Mateos
2025-05-22  3:23       ` Keith Busch
2025-05-22  3:41         ` Caleb Sander Mateos
2025-05-22  4:29           ` Keith Busch
2025-05-22 14:16             ` Caleb Sander Mateos
2025-05-23 12:49             ` Christoph Hellwig
2025-05-23 12:48           ` Christoph Hellwig
2025-05-22 13:54   ` Hannes Reinecke
2025-05-23 12:50     ` Christoph Hellwig
2025-05-23 14:22       ` Caleb Sander Mateos
2025-06-09  9:29   ` Niklas Cassel
2025-05-21 22:31 ` [PATCH 4/5] block: add support for vectored copies Keith Busch
2025-05-22 13:58   ` Hannes Reinecke
2025-05-22 16:36     ` Keith Busch
2025-05-21 22:31 ` [PATCH 5/5] nvmet: implement copy support for bdev backed target Keith Busch
2025-05-22 13:59   ` Hannes Reinecke
2025-05-23 13:18   ` Christoph Hellwig
2025-05-23 14:00     ` Keith Busch
2025-05-23 14:02       ` Christoph Hellwig
2025-05-22 15:52 ` [PATCH 0/5] block: another block copy offload Bart Van Assche
2025-05-23 12:53   ` Christoph Hellwig
2025-07-03 14:47 ` Niklas Cassel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aDB6Hdp9ZQ1gX5gr@infradead.org \
    --to=hch@infradead.org \
    --cc=kbusch@kernel.org \
    --cc=kbusch@meta.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox