public inbox for linux-block@vger.kernel.org
 help / color / mirror / Atom feed
From: Bart Van Assche <bvanassche@acm.org>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, linux-scsi@vger.kernel.org,
	linux-nvme@lists.infradead.org, Christoph Hellwig <hch@lst.de>,
	Nitesh Shetty <nj.shetty@samsung.com>,
	Bart Van Assche <bvanassche@acm.org>
Subject: [PATCH 03/12] block: Introduce blkdev_copy_offload()
Date: Fri, 24 Apr 2026 15:41:52 -0700	[thread overview]
Message-ID: <20260424224201.1949243-4-bvanassche@acm.org> (raw)
In-Reply-To: <20260424224201.1949243-1-bvanassche@acm.org>

Introduce blkdev_copy_offload() for performing copy offloading. This
function implements the algorithm explained in the description of the
previous patch. If the input parameters exceed what can be supported by a
single copy offload operation, multiple copy offload operations are
submitted.
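
A minimal usage sketch (hypothetical caller, not part of this patch) that
copies one megabyte synchronously, assuming the blk_copy_params interface
introduced below. Positions and lengths are in bytes and must be multiples
of the logical block size; src_bdev and dst_bdev are assumed to be open
block devices:

	struct blk_copy_seg in_seg = { .pos = 0, .len = SZ_1M };
	struct blk_copy_seg out_seg = { .pos = 0, .len = SZ_1M };
	struct blk_copy_params params = {
		.in_bdev = src_bdev,	/* source block device (assumed) */
		.in_segs = &in_seg,
		.in_nseg = 1,
		.out_bdev = dst_bdev,	/* destination block device (assumed) */
		.out_segs = &out_seg,
		.out_nseg = 1,
		/* end_io == NULL: copy synchronously. */
	};
	int ret = blkdev_copy_offload(&params);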

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/Makefile            |   2 +-
 block/blk-copy.c          | 366 ++++++++++++++++++++++++++++++++++++++
 include/linux/blk_types.h |  40 +++++
 include/linux/blkdev.h    |   1 +
 4 files changed, 408 insertions(+), 1 deletion(-)
 create mode 100644 block/blk-copy.c

diff --git a/block/Makefile b/block/Makefile
index 7dce2e44276c..d99e8d4fda7d 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@
 obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
-			blk-mq-tag.o blk-mq-dma.o blk-stat.o \
+			blk-mq-tag.o blk-mq-dma.o blk-stat.o blk-copy.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
 			disk-events.o blk-ia-ranges.o early-lookup.o
diff --git a/block/blk-copy.c b/block/blk-copy.c
new file mode 100644
index 000000000000..8ac8879442f7
--- /dev/null
+++ b/block/blk-copy.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Offloaded and onloaded data copying support.
+ */
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/blk-copy.h>
+#include <linux/blk-mq.h>
+
+/* End all bios in the @ctx->bios list with status @ctx->status. */
+static void blkdev_end_bios(struct bio_copy_offload_ctx *ctx)
+{
+	struct bio *bio, *next;
+
+	bio = ctx->bios;
+	ctx->bios = NULL;
+	for (; bio; bio = next) {
+		next = bio->bi_next;
+		bio->bi_status = ctx->status;
+		bio_endio(bio);
+	}
+}
+
+/*
+ * Called after LBA translation has finished for all bios associated with
+ * copy context @ctx.
+ */
+static void blkdev_translation_complete(struct bio_copy_offload_ctx *ctx)
+{
+	struct module *owner = NULL;
+	struct bio *bio;
+
+	WARN_ON_ONCE(ctx->phase != BLKDEV_TRANSLATE_LBAS);
+	ctx->phase = BLKDEV_COPY;
+
+	/* Check whether all bios are associated with the same block driver. */
+	for (bio = ctx->bios; bio; bio = bio->bi_next) {
+		if (!owner) {
+			owner = bio->bi_bdev->bd_disk->fops->owner;
+		} else if (owner != bio->bi_bdev->bd_disk->fops->owner) {
+			ctx->status = BLK_STS_INVAL;
+			break;
+		}
+	}
+
+	/* Remove the first bio from the bio list and submit or end it. */
+	bio = ctx->bios;
+	ctx->bios = bio->bi_next;
+	bio->bi_next = NULL;
+	if (ctx->biotail == bio)
+		ctx->biotail = NULL;
+	if (ctx->status == BLK_STS_OK)
+		submit_bio(bio);
+	else
+		bio_endio(bio);
+}
+
+/* REQ_OP_COPY_* completion handler. */
+static void blkdev_req_op_copy_done(struct bio *bio)
+{
+	struct bio_copy_offload_ctx *ctx = bio->bi_copy_ctx;
+	struct blk_copy_params *params = ctx->params;
+	blk_status_t status;
+
+	switch (ctx->phase) {
+	case BLKDEV_TRANSLATE_LBAS:
+		scoped_guard(spinlock_irqsave, &ctx->lock)
+			if (!ctx->status)
+				ctx->status = bio->bi_status;
+		break;
+	case BLKDEV_COPY:
+		status = ctx->status ?: bio->bi_status;
+		ctx->phase = BLKDEV_COPY_DONE;
+		blkdev_end_bios(ctx);
+		kfree(ctx);
+		scoped_guard(spinlock_irqsave, &params->lock) {
+			if (!params->status)
+				params->status = status;
+		}
+		if (atomic_dec_and_test(&params->copy_ctx_count))
+			params->end_io(params);
+		break;
+	case BLKDEV_COPY_DONE:
+		break;
+	}
+}
+
+/*
+ * Check that all segment positions and lengths are a multiple of both the
+ * source and the destination logical block size. Verify that the total input
+ * and output lengths match and store the number of bytes to copy in *@len.
+ */
+static int blkdev_copy_check_params(const struct blk_copy_params *params,
+				    loff_t *len)
+{
+	const unsigned int mask =
+		max(bdev_logical_block_size(params->in_bdev),
+		    bdev_logical_block_size(params->out_bdev)) - 1;
+	loff_t in_len = 0, out_len = 0;
+	unsigned int i;
+
+	for (i = 0; i < params->in_nseg; i++) {
+		if ((params->in_segs[i].pos | params->in_segs[i].len) & mask)
+			return -EINVAL;
+		in_len += params->in_segs[i].len;
+	}
+
+	for (i = 0; i < params->out_nseg; i++) {
+		if ((params->out_segs[i].pos | params->out_segs[i].len) & mask)
+			return -EINVAL;
+		out_len += params->out_segs[i].len;
+	}
+
+	if (in_len != out_len)
+		return -EINVAL;
+
+	*len = in_len;
+
+	return 0;
+}
+
+/*
+ * Calculate the number of bytes in at most max_copy_src_segments input
+ * segments, starting at input segment @in_idx.
+ */
+static loff_t blk_max_src_len(const struct blk_copy_params *params,
+			      unsigned int in_idx)
+{
+	u16 max_src_segments =
+		params->in_bdev->bd_queue->limits.max_copy_src_segments;
+	unsigned int max_i = min(params->in_nseg, in_idx + max_src_segments);
+	loff_t len = 0;
+
+	for (u32 i = in_idx; i < max_i; i++)
+		len += params->in_segs[i].len;
+
+	return len;
+}
+
+/*
+ * Calculate the number of bytes in at most max_copy_dst_segments output
+ * segments, starting at output segment @out_idx.
+ */
+static loff_t blk_max_dst_len(const struct blk_copy_params *params,
+			      unsigned int out_idx)
+{
+	u16 max_dst_segments =
+		params->out_bdev->bd_queue->limits.max_copy_dst_segments;
+	unsigned int max_i = min(params->out_nseg, out_idx + max_dst_segments);
+	loff_t len = 0;
+
+	for (u32 i = out_idx; i < max_i; i++)
+		len += params->out_segs[i].len;
+
+	return len;
+}
+
+struct blkdev_copy_sync_ctx {
+	struct completion compl;
+	blk_status_t status;
+};
+
+static void blkdev_end_copy_sync(const struct blk_copy_params *params)
+{
+	struct blkdev_copy_sync_ctx *ctx = params->private;
+
+	/* Propagate the copy status to the synchronous caller. */
+	ctx->status = params->status;
+	complete(&ctx->compl);
+}
+
+static int blkdev_copy_sync(struct blk_copy_params *params)
+{
+	struct blkdev_copy_sync_ctx ctx = {
+		.compl = COMPLETION_INITIALIZER_ONSTACK(ctx.compl),
+	};
+	int ret;
+
+	WARN_ON_ONCE(params->end_io || params->private);
+	params->end_io = blkdev_end_copy_sync;
+	params->private = &ctx;
+
+	ret = blkdev_copy_offload(params);
+	if (ret && ret != -EIOCBQUEUED)
+		return ret;
+
+	wait_for_completion(&ctx.compl);
+	return blk_status_to_errno(ctx.status);
+}
+
+/**
+ * blkdev_copy_chunk() - submit a single copy offload operation
+ * @params: Copy offload input parameters.
+ * @in_idx: Input segment index where copying starts. Updated on return.
+ * @out_idx: Output segment index where copying starts. Updated on return.
+ * @in_offset: Byte offset into input segment @in_idx. Updated on return.
+ * @out_offset: Byte offset into output segment @out_idx. Updated on return.
+ * @chunk: Maximum number of bytes to copy.
+ *
+ * Return: the number of bytes covered by the submitted copy operation or a
+ *	negative error number.
+ */
+static loff_t blkdev_copy_chunk(struct blk_copy_params *params, u32 *in_idx,
+				u32 *out_idx, loff_t *in_offset,
+				loff_t *out_offset, loff_t chunk)
+{
+	struct bio_copy_offload_ctx *ctx;
+	u32 bio_count;
+
+	ctx = kzalloc_obj(*ctx);
+	if (!ctx)
+		return -ENOMEM;
+
+	spin_lock_init(&ctx->lock);
+	ctx->params = params;
+	ctx->phase = BLKDEV_TRANSLATE_LBAS;
+	ctx->translation_complete = blkdev_translation_complete;
+	/*
+	 * Initialized to one to prevent ctx->translation_complete() from
+	 * being called before bio submission has finished.
+	 */
+	ctx->bio_count = 1;
+
+	WARN_ON_ONCE(chunk <= 0);
+	chunk = min(chunk, blk_max_src_len(params, *in_idx) - *in_offset);
+	WARN_ON_ONCE(chunk <= 0);
+	chunk = min(chunk, blk_max_dst_len(params, *out_idx) - *out_offset);
+	WARN_ON_ONCE(chunk <= 0);
+	ctx->len = chunk;
+	for (loff_t bytes, remaining_in = chunk; remaining_in > 0;
+	     remaining_in -= bytes) {
+		struct bio *src_bio;
+
+		src_bio = bio_alloc(params->in_bdev, 0, REQ_OP_COPY_SRC,
+				    GFP_NOIO);
+		if (!src_bio) {
+			if (remaining_in == chunk)
+				goto free_ctx;
+			else
+				goto enomem;
+		}
+		/* Account for this copy context once, for its first bio. */
+		if (remaining_in == chunk)
+			atomic_inc(&params->copy_ctx_count);
+		scoped_guard(spinlock_irqsave, &ctx->lock)
+			ctx->bio_count++;
+		bytes = min(remaining_in, params->in_segs[*in_idx].len -
+			    *in_offset);
+		src_bio->bi_iter.bi_size = bytes;
+		src_bio->bi_iter.bi_sector = (params->in_segs[*in_idx].pos +
+					      *in_offset) >> SECTOR_SHIFT;
+		src_bio->bi_copy_ctx = ctx;
+		src_bio->bi_end_io = blkdev_req_op_copy_done;
+		*in_offset += bytes;
+		if (*in_offset >= params->in_segs[*in_idx].len) {
+			*in_offset -= params->in_segs[*in_idx].len;
+			(*in_idx)++;
+		}
+		submit_bio(src_bio);
+	}
+	for (loff_t bytes, remaining_out = chunk; remaining_out;
+	     remaining_out -= bytes) {
+		struct bio *dst_bio;
+
+		dst_bio = bio_alloc(params->out_bdev, 0, REQ_OP_COPY_DST,
+				    GFP_NOIO);
+		if (!dst_bio)
+			goto enomem;
+		scoped_guard(spinlock_irqsave, &ctx->lock)
+			ctx->bio_count++;
+		bytes = min(remaining_out, params->out_segs[*out_idx].len -
+			    *out_offset);
+		dst_bio->bi_iter.bi_size = bytes;
+		dst_bio->bi_iter.bi_sector = (params->out_segs[*out_idx].pos +
+					      *out_offset) >> SECTOR_SHIFT;
+		dst_bio->bi_copy_ctx = ctx;
+		dst_bio->bi_end_io = blkdev_req_op_copy_done;
+		*out_offset += bytes;
+		if (*out_offset >= params->out_segs[*out_idx].len) {
+			*out_offset -= params->out_segs[*out_idx].len;
+			(*out_idx)++;
+		}
+		submit_bio(dst_bio);
+	}
+
+dec_bio_count:
+	scoped_guard(spinlock_irqsave, &ctx->lock)
+		bio_count = --ctx->bio_count;
+	if (bio_count == 0)
+		ctx->translation_complete(ctx);
+	return chunk;
+
+enomem:
+	scoped_guard(spinlock_irqsave, &ctx->lock)
+		if (!ctx->status)
+			ctx->status = BLK_STS_RESOURCE;
+	chunk = -ENOMEM;
+	goto dec_bio_count;
+
+free_ctx:
+	kfree(ctx);
+	return -ENOMEM;
+}
+
+/**
+ * blkdev_copy_offload() - copy data and offload copying if possible.
+ * @params: Source and destination block device, data ranges and completion
+ *	callback.
+ *
+ * If @params->end_io != NULL, data is copied asynchronously. If @params->end_io
+ * == NULL, this function only returns after copying has finished.
+ *
+ * Return: 0 upon success; -EIOCBQUEUED if the completion callback function will
+ *	be called or has already been called; -EOPNOTSUPP if copy offloading is
+ *	not supported by the block device or if the source or destination
+ *	address ranges span more than one dm device.
+ */
+int blkdev_copy_offload(struct blk_copy_params *params)
+{
+	loff_t in_offset = 0, out_offset = 0;
+	u32 in_idx = 0, out_idx = 0;
+	loff_t len, chunk, max_chunk;
+	int ret;
+
+	might_sleep();
+
+	if (!params->end_io)
+		return blkdev_copy_sync(params);
+
+	spin_lock_init(&params->lock);
+
+	if (!bdev_max_copy_sectors(params->in_bdev) ||
+	    !bdev_max_copy_sectors(params->out_bdev))
+		return -EOPNOTSUPP;
+
+	ret = blkdev_copy_check_params(params, &len);
+	if (ret)
+		return ret;
+
+	params->len = len;
+
+	max_chunk = (u64)min(bdev_max_copy_sectors(params->in_bdev),
+			     bdev_max_copy_sectors(params->out_bdev))
+		    << SECTOR_SHIFT;
+
+	atomic_set(&params->copy_ctx_count, 1);
+
+	for (loff_t offset = 0; offset < len; offset += chunk) {
+		chunk = min(len - offset, max_chunk);
+		chunk = blkdev_copy_chunk(params, &in_idx, &out_idx, &in_offset,
+					  &out_offset, chunk);
+		if (chunk >= 0)
+			continue;
+		/* Record the error and stop submitting further chunks. */
+		scoped_guard(spinlock_irqsave, &params->lock)
+			if (!params->status)
+				params->status = errno_to_blk_status(chunk);
+		break;
+	}
+
+	if (atomic_dec_and_test(&params->copy_ctx_count))
+		params->end_io(params);
+
+	return -EIOCBQUEUED;
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_offload);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4e448e810b87..27a0f92fc2cb 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -535,4 +535,44 @@ struct blk_rq_stat {
 	u64 batch;
 };
 
+/* A single input or output segment descriptor. @pos and @len are in bytes. */
+struct blk_copy_seg {
+	loff_t pos;
+	loff_t len;
+};
+
+/**
+ * struct blk_copy_params - input parameters and internal parameters for copy
+ *	operations.
+ * @in_bdev: Input block device.
+ * @in_segs: Input data ranges. Positions and lengths are in bytes.
+ * @in_nseg: Number of elements in @in_segs.
+ * @out_bdev: Output block device.
+ * @out_segs: Output data ranges. Positions and lengths are in bytes.
+ * @out_nseg: Number of elements in @out_segs.
+ * @end_io: Called after copying data has finished. If %NULL, data is copied
+ *	synchronously instead of asynchronously.
+ * @private: May be used by @end_io. Not used by the block layer itself.
+ * @len: Total number of bytes to copy. Set by blkdev_copy_offload() or
+ *	blkdev_copy_onload().
+ * @copy_ctx_count: Number of in-flight copy contexts associated with copy
+ *	offload operations.
+ * @lock: Protects @status updates.
+ * @status: I/O completion status.
+ */
+struct blk_copy_params {
+	struct block_device *in_bdev;
+	struct blk_copy_seg *in_segs;
+	unsigned int in_nseg;
+	struct block_device *out_bdev;
+	struct blk_copy_seg *out_segs;
+	unsigned int out_nseg;
+	void (*end_io)(const struct blk_copy_params *params);
+	void *private;
+	loff_t len;
+	atomic_t copy_ctx_count;
+	spinlock_t lock;
+	blk_status_t status;
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8ae64cc0546f..fea296150cda 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1283,6 +1283,7 @@ void __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
 int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp);
+int blkdev_copy_offload(struct blk_copy_params *params);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */

  parent reply	other threads:[~2026-04-24 22:42 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
2026-04-24 22:41 ` [PATCH 01/12] block: Introduce queue limits for " Bart Van Assche
2026-04-24 22:41 ` [PATCH 02/12] block: Add the REQ_OP_COPY_{SRC,DST} operations Bart Van Assche
2026-04-24 22:41 ` Bart Van Assche [this message]
2026-04-24 22:41 ` [PATCH 04/12] block: Add an onloaded copy implementation Bart Van Assche
2026-04-24 22:41 ` [PATCH 05/12] block: Introduce accessor functions for copy offload bios Bart Van Assche
2026-04-24 22:41 ` [PATCH 06/12] fs/read_write: Generalize generic_copy_file_checks() Bart Van Assche
2026-04-24 22:41 ` [PATCH 07/12] fs, block: Add copy_file_range() support for block devices Bart Van Assche
2026-04-24 22:41 ` [PATCH 08/12] nvme: Add copy offloading support Bart Van Assche
2026-04-24 22:41 ` [PATCH 09/12] nvmet: Support the Copy command Bart Van Assche
2026-04-24 22:41 ` [PATCH 10/12] dm: Add support for copy offloading Bart Van Assche
2026-04-24 22:42 ` [PATCH 11/12] dm-linear: Enable " Bart Van Assche
2026-04-24 22:42 ` [PATCH 12/12] null_blk: Add support for REQ_OP_COPY_* Bart Van Assche

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260424224201.1949243-4-bvanassche@acm.org \
    --to=bvanassche@acm.org \
    --cc=axboe@kernel.dk \
    --cc=hch@lst.de \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=nj.shetty@samsung.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox