All of lore.kernel.org
 help / color / mirror / Atom feed
From: Bart Van Assche <bvanassche@acm.org>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, linux-scsi@vger.kernel.org,
	linux-nvme@lists.infradead.org, Christoph Hellwig <hch@lst.de>,
	Nitesh Shetty <nj.shetty@samsung.com>,
	Bart Van Assche <bvanassche@acm.org>
Subject: [PATCH 03/12] block: Introduce blkdev_copy_offload()
Date: Fri, 24 Apr 2026 15:41:52 -0700	[thread overview]
Message-ID: <20260424224201.1949243-4-bvanassche@acm.org> (raw)
In-Reply-To: <20260424224201.1949243-1-bvanassche@acm.org>

Introduce blkdev_copy_offload() for performing copy offloading. This
function implements the algorithm explained in the description of the
previous patch. If the input parameters exceed what can be supported
with a single copy offload operation, multiple copy offload operations
are submitted.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/Makefile            |   2 +-
 block/blk-copy.c          | 355 ++++++++++++++++++++++++++++++++++++++
 include/linux/blk_types.h |  40 +++++
 include/linux/blkdev.h    |   1 +
 4 files changed, 397 insertions(+), 1 deletion(-)
 create mode 100644 block/blk-copy.c

diff --git a/block/Makefile b/block/Makefile
index 7dce2e44276c..d99e8d4fda7d 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@
 obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
-			blk-mq-tag.o blk-mq-dma.o blk-stat.o \
+			blk-mq-tag.o blk-mq-dma.o blk-stat.o blk-copy.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
 			disk-events.o blk-ia-ranges.o early-lookup.o
diff --git a/block/blk-copy.c b/block/blk-copy.c
new file mode 100644
index 000000000000..8ac8879442f7
--- /dev/null
+++ b/block/blk-copy.c
@@ -0,0 +1,355 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Offloaded and onloaded data copying support.
+ */
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/blk-copy.h>
+#include <linux/blk-mq.h>
+#include <linux/overflow.h>
+
+/*
+ * Detach the complete @ctx->bios list from @ctx and end every bio on it. Each
+ * bio gets @ctx->status as its status before bio_endio() is called.
+ */
+static void blkdev_end_bios(struct bio_copy_offload_ctx *ctx)
+{
+	struct bio *pending = ctx->bios;
+
+	ctx->bios = NULL;
+	while (pending) {
+		struct bio *cur = pending;
+
+		/* Fetch the successor before bio_endio() is called on @cur. */
+		pending = cur->bi_next;
+		cur->bi_status = ctx->status;
+		bio_endio(cur);
+	}
+}
+
+/*
+ * Called after LBA translation finished for all bios associated with copy
+ * context @ctx. Switches @ctx to the BLKDEV_COPY phase, verifies that all bios
+ * on the @ctx->bios list belong to the same block driver module and then
+ * either submits the first bio of that list or, if @ctx->status is not
+ * BLK_STS_OK, ends that bio right away.
+ *
+ * NOTE(review): @ctx->bios is dereferenced unconditionally, so this function
+ * relies on the list holding at least one bio - confirm that this is
+ * guaranteed on all paths, including the error paths.
+ */
+static void blkdev_translation_complete(struct bio_copy_offload_ctx *ctx)
+{
+	struct bio *first = ctx->bios;
+	struct module *owner;
+	struct bio *bio;
+
+	WARN_ON_ONCE(ctx->phase != BLKDEV_TRANSLATE_LBAS);
+	ctx->phase = BLKDEV_COPY;
+
+	/*
+	 * Check whether all bios are associated with the same block driver.
+	 * The owner of the first bio is the reference value. A NULL owner is a
+	 * legitimate value (THIS_MODULE is NULL for built-in code) and hence
+	 * must not be interpreted as "no reference owner recorded yet".
+	 * Two different built-in drivers still cannot be told apart this way.
+	 */
+	owner = first->bi_bdev->bd_disk->fops->owner;
+	for (bio = first->bi_next; bio; bio = bio->bi_next) {
+		if (bio->bi_bdev->bd_disk->fops->owner != owner) {
+			ctx->status = BLK_STS_INVAL;
+			break;
+		}
+	}
+
+	/* Remove the first bio from the bio list and submit it. */
+	ctx->bios = first->bi_next;
+	first->bi_next = NULL;
+	if (ctx->biotail == first)
+		ctx->biotail = NULL;
+	if (ctx->status == BLK_STS_OK)
+		submit_bio(first);
+	else
+		bio_endio(first);
+}
+
+/*
+ * REQ_OP_COPY_* completion handler. The action depends on @ctx->phase:
+ * - BLKDEV_TRANSLATE_LBAS: remember the first non-zero bio status in
+ *   @ctx->status.
+ * - BLKDEV_COPY: the copy operation has finished. End the bios that are still
+ *   on the @ctx->bios list, free @ctx, propagate the first error into
+ *   @params->status and call @params->end_io() if this was the last
+ *   outstanding reference on @params->copy_ctx_count.
+ * - BLKDEV_COPY_DONE: nothing to do. blkdev_end_bios() calls bio_endio() while
+ *   @ctx is in this phase.
+ *
+ * NOTE(review): this handler does not drop a reference on @bio - confirm
+ * where the bios allocated by blkdev_copy_chunk() are released.
+ */
+static void blkdev_req_op_copy_done(struct bio *bio)
+{
+	struct bio_copy_offload_ctx *ctx = bio->bi_copy_ctx;
+	struct blk_copy_params *params = ctx->params;
+	blk_status_t status;
+
+	switch (ctx->phase) {
+	case BLKDEV_TRANSLATE_LBAS:
+		scoped_guard(spinlock_irqsave, &ctx->lock)
+			if (!ctx->status)
+				ctx->status = bio->bi_status;
+		break;
+	case BLKDEV_COPY:
+		/*
+		 * An error recorded in @ctx takes precedence. Otherwise use
+		 * the status of the bio that carried the copy operation such
+		 * that a failure reported via @bio->bi_status is not lost.
+		 */
+		status = ctx->status;
+		if (!status)
+			status = bio->bi_status;
+		ctx->status = status;
+		ctx->phase = BLKDEV_COPY_DONE;
+		blkdev_end_bios(ctx);
+		/* @params has been read above; @ctx is not used past here. */
+		kfree(ctx);
+		scoped_guard(spinlock_irqsave, &params->lock) {
+			if (!params->status)
+				params->status = status;
+		}
+		if (atomic_dec_and_test(&params->copy_ctx_count))
+			params->end_io(params);
+		break;
+	case BLKDEV_COPY_DONE:
+		break;
+	}
+}
+
+/*
+ * Validate the @nseg segment descriptors in @segs and store the sum of their
+ * lengths in *@total. A segment is rejected if its position or length is
+ * negative, if either is not aligned to @mask + 1, or if the end of the
+ * segment or the running total does not fit in a loff_t.
+ *
+ * Returns 0 upon success and -EINVAL otherwise.
+ */
+static int blkdev_copy_check_segs(const struct blk_copy_seg *segs,
+				  unsigned int nseg, unsigned int mask,
+				  loff_t *total)
+{
+	loff_t sum = 0, end;
+
+	for (unsigned int i = 0; i < nseg; i++) {
+		const struct blk_copy_seg *seg = &segs[i];
+
+		/* loff_t is signed; aligned negative values pass the mask test. */
+		if (seg->pos < 0 || seg->len < 0)
+			return -EINVAL;
+		if ((seg->pos | seg->len) & mask)
+			return -EINVAL;
+		if (check_add_overflow(seg->pos, seg->len, &end) ||
+		    check_add_overflow(sum, seg->len, &sum))
+			return -EINVAL;
+	}
+
+	*total = sum;
+
+	return 0;
+}
+
+/*
+ * Check that all LBA offsets are aligned with both the source and the destination
+ * logical block sizes and that no position or length is negative. Compare input
+ * and output length. Store the number of bytes to be transferred in *@len.
+ *
+ * Returns 0 upon success and -EINVAL if any check fails.
+ */
+static int blkdev_copy_check_params(const struct blk_copy_params *params,
+				    loff_t *len)
+{
+	const unsigned int mask =
+		max(bdev_logical_block_size(params->in_bdev),
+		    bdev_logical_block_size(params->out_bdev)) - 1;
+	loff_t in_len, out_len;
+	int ret;
+
+	ret = blkdev_copy_check_segs(params->in_segs, params->in_nseg, mask,
+				     &in_len);
+	if (ret)
+		return ret;
+
+	ret = blkdev_copy_check_segs(params->out_segs, params->out_nseg, mask,
+				     &out_len);
+	if (ret)
+		return ret;
+
+	if (in_len != out_len)
+		return -EINVAL;
+
+	*len = in_len;
+
+	return 0;
+}
+
+/*
+ * Sum the lengths, in bytes, of the input segments that fit in one copy
+ * operation: at most max_copy_src_segments segments, beginning with input
+ * segment @in_idx and never going past the last input segment.
+ */
+static loff_t blk_max_src_len(const struct blk_copy_params *params,
+			      unsigned int in_idx)
+{
+	const struct blk_copy_seg *segs = params->in_segs;
+	uint16_t seg_limit =
+		params->in_bdev->bd_queue->limits.max_copy_src_segments;
+	unsigned int end = min(params->in_nseg, in_idx + seg_limit);
+	unsigned int idx = in_idx;
+	loff_t total = 0;
+
+	while (idx < end)
+		total += segs[idx++].len;
+
+	return total;
+}
+
+/*
+ * Sum the lengths, in bytes, of the output segments that fit in one copy
+ * operation: at most max_copy_dst_segments segments, beginning with output
+ * segment @out_idx and never going past the last output segment.
+ */
+static loff_t blk_max_dst_len(const struct blk_copy_params *params,
+			      unsigned int out_idx)
+{
+	const struct blk_copy_seg *segs = params->out_segs;
+	uint16_t seg_limit =
+		params->out_bdev->bd_queue->limits.max_copy_dst_segments;
+	unsigned int end = min(params->out_nseg, out_idx + seg_limit);
+	unsigned int idx = out_idx;
+	loff_t total = 0;
+
+	while (idx < end)
+		total += segs[idx++].len;
+
+	return total;
+}
+
+/*
+ * On-stack context of a synchronous copy operation.
+ * @compl: Completed by blkdev_end_copy_sync().
+ * @status: Copy of @params->status, taken when the copy operation finished.
+ */
+struct blkdev_copy_sync_ctx {
+	struct completion compl;
+	blk_status_t status;
+};
+
+/*
+ * Completion callback installed by blkdev_copy_sync(). Hands the final status
+ * of the copy operation to the waiter and wakes it up.
+ */
+static void blkdev_end_copy_sync(const struct blk_copy_params *params)
+{
+	struct blkdev_copy_sync_ctx *ctx = params->private;
+
+	/* Store the status before complete(): the waiter reads it afterwards. */
+	ctx->status = params->status;
+	complete(&ctx->compl);
+}
+
+/*
+ * Perform the copy operation described by @params synchronously: install a
+ * completion callback, start the copy operation via blkdev_copy_offload() and
+ * wait until the callback has been called. @params->end_io and
+ * @params->private are expected to be NULL upon entry and are NULL again when
+ * this function returns.
+ *
+ * Returns the error reported by blkdev_copy_offload() if the copy operation
+ * could not be started, and otherwise the completion status converted into an
+ * error number (0 upon success).
+ */
+static int blkdev_copy_sync(struct blk_copy_params *params)
+{
+	struct blkdev_copy_sync_ctx ctx = {
+		.compl = COMPLETION_INITIALIZER_ONSTACK(ctx.compl),
+	};
+	int ret;
+
+	WARN_ON_ONCE(params->end_io || params->private);
+	params->end_io = blkdev_end_copy_sync;
+	params->private = &ctx;
+
+	ret = blkdev_copy_offload(params);
+	if (!ret || ret == -EIOCBQUEUED) {
+		wait_for_completion(&ctx.compl);
+		ret = blk_status_to_errno(ctx.status);
+	}
+
+	/* Do not leave a pointer to the on-stack context behind in @params. */
+	params->end_io = NULL;
+	params->private = NULL;
+
+	return ret;
+}
+
+/**
+ * blkdev_copy_chunk() - submit a single copy offload operation
+ * @params: Copy offload input parameters.
+ * @in_idx: Index of the input segment from where to start copying. Updated to
+ *	the index of the first input segment that has not been consumed
+ *	completely.
+ * @out_idx: Index of the output segment to where to start copying. Updated to
+ *	the index of the first output segment that has not been consumed
+ *	completely.
+ * @in_offset: Offset in bytes from the start of input segment @in_idx. Updated
+ *	together with @in_idx.
+ * @out_offset: Offset in bytes from the start of output segment @out_idx.
+ *	Updated together with @out_idx.
+ * @chunk: Maximum number of bytes to copy.
+ *
+ * @chunk is reduced to what blk_max_src_len() and blk_max_dst_len() allow for
+ * the current position. One REQ_OP_COPY_SRC bio is submitted per input segment
+ * (or remainder thereof) and one REQ_OP_COPY_DST bio per output segment (or
+ * remainder thereof). All these bios share one newly allocated copy context.
+ *
+ * Returns: the number of bytes covered by the submitted copy operation or a
+ *	negative error number.
+ */
+static loff_t blkdev_copy_chunk(struct blk_copy_params *params, u32 *in_idx,
+				u32 *out_idx, loff_t *in_offset,
+				loff_t *out_offset, loff_t chunk)
+{
+	struct bio_copy_offload_ctx *ctx;
+	u32 bio_count;
+
+	ctx = kzalloc_obj(*ctx);
+	if (!ctx)
+		return -ENOMEM;
+
+	spin_lock_init(&ctx->lock);
+	ctx->params = params;
+	ctx->phase = BLKDEV_TRANSLATE_LBAS;
+	ctx->translation_complete = blkdev_translation_complete;
+	/*
+	 * Initialized to one to prevent that ctx->translation_complete() is
+	 * called before bio submission has finished.
+	 */
+	ctx->bio_count = 1;
+
+	/*
+	 * NOTE(review): a non-positive @chunk only triggers a warning;
+	 * execution continues regardless - confirm that this cannot happen.
+	 */
+	WARN_ON_ONCE(chunk <= 0);
+	chunk = min(chunk, blk_max_src_len(params, *in_idx) - *in_offset);
+	WARN_ON_ONCE(chunk <= 0);
+	chunk = min(chunk, blk_max_dst_len(params, *out_idx) - *out_offset);
+	WARN_ON_ONCE(chunk <= 0);
+	ctx->len = chunk;
+
+	/*
+	 * Account this copy context exactly once in @params->copy_ctx_count.
+	 * The completion handler drops a single reference per context, no
+	 * matter how many bios are associated with that context.
+	 */
+	atomic_inc(&params->copy_ctx_count);
+
+	for (loff_t bytes, remaining_in = chunk; remaining_in > 0;
+	     remaining_in -= bytes) {
+		struct bio *src_bio;
+
+		src_bio = bio_alloc(params->in_bdev, 0, REQ_OP_COPY_SRC,
+				    GFP_NOIO);
+		if (!src_bio) {
+			/* Nothing has been submitted yet: undo everything. */
+			if (remaining_in == chunk)
+				goto free_ctx;
+			goto enomem;
+		}
+		scoped_guard(spinlock_irqsave, &ctx->lock)
+			ctx->bio_count++;
+		bytes = min(remaining_in, params->in_segs[*in_idx].len -
+			    *in_offset);
+		src_bio->bi_iter.bi_size = bytes;
+		src_bio->bi_iter.bi_sector = (params->in_segs[*in_idx].pos +
+					      *in_offset) >> SECTOR_SHIFT;
+		src_bio->bi_copy_ctx = ctx;
+		src_bio->bi_end_io = blkdev_req_op_copy_done;
+		/* Advance to the next input segment once this one is used up. */
+		*in_offset += bytes;
+		if (*in_offset >= params->in_segs[*in_idx].len) {
+			*in_offset -= params->in_segs[*in_idx].len;
+			(*in_idx)++;
+		}
+		submit_bio(src_bio);
+	}
+	for (loff_t bytes, remaining_out = chunk; remaining_out > 0;
+	     remaining_out -= bytes) {
+		struct bio *dst_bio;
+
+		dst_bio = bio_alloc(params->out_bdev, 0, REQ_OP_COPY_DST,
+				    GFP_NOIO);
+		if (!dst_bio)
+			goto enomem;
+		scoped_guard(spinlock_irqsave, &ctx->lock)
+			ctx->bio_count++;
+		bytes = min(remaining_out, params->out_segs[*out_idx].len -
+			    *out_offset);
+		dst_bio->bi_iter.bi_size = bytes;
+		dst_bio->bi_iter.bi_sector = (params->out_segs[*out_idx].pos +
+					      *out_offset) >> SECTOR_SHIFT;
+		dst_bio->bi_copy_ctx = ctx;
+		dst_bio->bi_end_io = blkdev_req_op_copy_done;
+		/* Advance to the next output segment once this one is used up. */
+		*out_offset += bytes;
+		if (*out_offset >= params->out_segs[*out_idx].len) {
+			*out_offset -= params->out_segs[*out_idx].len;
+			(*out_idx)++;
+		}
+		submit_bio(dst_bio);
+	}
+
+dec_bio_count:
+	/* Drop the initial count that was set before bio submission started. */
+	scoped_guard(spinlock_irqsave, &ctx->lock)
+		bio_count = --ctx->bio_count;
+	if (bio_count == 0)
+		ctx->translation_complete(ctx);
+	return chunk;
+
+enomem:
+	/* Bios have been submitted already: fail the context, keep it alive. */
+	scoped_guard(spinlock_irqsave, &ctx->lock)
+		if (!ctx->status)
+			ctx->status = BLK_STS_RESOURCE;
+	chunk = -ENOMEM;
+	goto dec_bio_count;
+
+free_ctx:
+	/*
+	 * Cannot reach zero: blkdev_copy_offload() holds its own reference
+	 * while it is submitting chunks.
+	 */
+	atomic_dec(&params->copy_ctx_count);
+	kfree(ctx);
+	return -ENOMEM;
+}
+
+/**
+ * blkdev_copy_offload() - copy data and offload copying if possible.
+ * @params: Source and destination block device, data ranges and completion
+ *	callback.
+ *
+ * If @params->end_io != NULL, data is copied asynchronously. If @params->end_io
+ * == NULL, this function only returns after data copying finished.
+ *
+ * If the data ranges exceed what fits in a single copy offload operation,
+ * multiple operations are submitted. If submitting an operation fails, no
+ * further operations are submitted and the error is recorded in
+ * @params->status.
+ *
+ * Return: In the synchronous case: 0 upon success or a negative error number.
+ *	In the asynchronous case: -EIOCBQUEUED if the completion callback
+ *	function will be called or has already been called, in which case the
+ *	result of the copy operation is available in @params->status;
+ *	-EOPNOTSUPP if copy offloading is not supported by the block device or
+ *	if the source or destination address ranges span more than one dm
+ *	device; -EINVAL if the data ranges fail validation.
+ */
+int blkdev_copy_offload(struct blk_copy_params *params)
+{
+	loff_t in_offset = 0, out_offset = 0;
+	u32 in_idx = 0, out_idx = 0;
+	loff_t len, max_chunk;
+	int ret;
+
+	might_sleep();
+
+	if (!params->end_io)
+		return blkdev_copy_sync(params);
+
+	spin_lock_init(&params->lock);
+	/* Start from a clean status; only the first error is recorded. */
+	params->status = BLK_STS_OK;
+
+	if (!bdev_max_copy_sectors(params->in_bdev) ||
+	    !bdev_max_copy_sectors(params->out_bdev))
+		return -EOPNOTSUPP;
+
+	ret = blkdev_copy_check_params(params, &len);
+	if (ret)
+		return ret;
+
+	params->len = len;
+
+	max_chunk = (u64)min(bdev_max_copy_sectors(params->in_bdev),
+			     bdev_max_copy_sectors(params->out_bdev))
+		    << SECTOR_SHIFT;
+
+	/*
+	 * Hold one reference while submitting such that @params->end_io()
+	 * cannot be called before all chunks have been submitted.
+	 */
+	atomic_set(&params->copy_ctx_count, 1);
+
+	for (loff_t offset = 0; offset < len; ) {
+		loff_t chunk = blkdev_copy_chunk(params, &in_idx, &out_idx,
+						 &in_offset, &out_offset,
+						 min(len - offset, max_chunk));
+
+		if (chunk <= 0) {
+			/*
+			 * Submission failed or made no progress. Record the
+			 * error and stop instead of adding a non-positive
+			 * value to @offset.
+			 */
+			ret = chunk ? chunk : -EINVAL;
+			scoped_guard(spinlock_irqsave, &params->lock) {
+				if (!params->status)
+					params->status =
+						errno_to_blk_status(ret);
+			}
+			break;
+		}
+		offset += chunk;
+	}
+
+	/* Drop the submission reference taken above. */
+	if (atomic_dec_and_test(&params->copy_ctx_count))
+		params->end_io(params);
+
+	return -EIOCBQUEUED;
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_offload);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4e448e810b87..27a0f92fc2cb 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -535,4 +535,44 @@ struct blk_rq_stat {
 	u64 batch;
 };
 
+/*
+ * A single input or output segment descriptor.
+ * @pos: Start of the segment as a byte offset on the block device.
+ * @len: Length of the segment in bytes.
+ */
+struct blk_copy_seg {
+	loff_t pos;
+	loff_t len;
+};
+
+/**
+ * struct blk_copy_params - input parameters and internal parameters for copy
+ *	operations.
+ * @in_bdev: Input block device.
+ * @in_segs: Input LBA ranges.
+ * @in_nseg: Number of elements in @in_segs.
+ * @out_bdev: Output block device.
+ * @out_segs: Output LBA ranges.
+ * @out_nseg: Number of elements in @out_segs.
+ * @end_io: Called after copying data finished. If %NULL, copying data happens
+ *	synchronously instead of asynchronously.
+ * @private: May be used by @end_io. Not used directly.
+ * @len: Total number of bytes to copy. Set by blkdev_copy_offload() or
+ *	blkdev_copy_onload().
+ * @copy_ctx_count: Number of in-flight copy contexts associated with copy
+ *	offload operations, plus one while blkdev_copy_offload() is submitting
+ *	copy operations. @end_io is called when this count drops to zero.
+ * @lock: Protects @status updates.
+ * @status: I/O completion status.
+ */
+struct blk_copy_params {
+	struct block_device *in_bdev;
+	struct blk_copy_seg *in_segs;
+	unsigned int in_nseg;
+	struct block_device *out_bdev;
+	struct blk_copy_seg *out_segs;
+	unsigned int out_nseg;
+	void (*end_io)(const struct blk_copy_params *params);
+	void *private;
+	loff_t len;
+	atomic_t copy_ctx_count;
+	spinlock_t lock;
+	blk_status_t status;
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8ae64cc0546f..fea296150cda 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1283,6 +1283,7 @@ void __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
 int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp);
+int blkdev_copy_offload(struct blk_copy_params *params);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */

  parent reply	other threads:[~2026-04-24 22:42 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
2026-04-24 22:41 ` [PATCH 01/12] block: Introduce queue limits for " Bart Van Assche
2026-04-24 22:41 ` [PATCH 02/12] block: Add the REQ_OP_COPY_{SRC,DST} operations Bart Van Assche
2026-04-24 22:41 ` Bart Van Assche [this message]
2026-04-24 22:41 ` [PATCH 04/12] block: Add an onloaded copy implementation Bart Van Assche
2026-04-24 22:41 ` [PATCH 05/12] block: Introduce accessor functions for copy offload bios Bart Van Assche
2026-04-24 22:41 ` [PATCH 06/12] fs/read_write: Generalize generic_copy_file_checks() Bart Van Assche
2026-04-24 22:41 ` [PATCH 07/12] fs, block: Add copy_file_range() support for block devices Bart Van Assche
2026-04-24 22:41 ` [PATCH 08/12] nvme: Add copy offloading support Bart Van Assche
2026-04-24 22:41 ` [PATCH 09/12] nvmet: Support the Copy command Bart Van Assche
2026-04-24 22:41 ` [PATCH 10/12] dm: Add support for copy offloading Bart Van Assche
2026-04-24 22:42 ` [PATCH 11/12] dm-linear: Enable " Bart Van Assche
2026-04-24 22:42 ` [PATCH 12/12] null_blk: Add support for REQ_OP_COPY_* Bart Van Assche
2026-05-22 12:00 ` [PATCH 00/12] Block storage copy offloading Shin'ichiro Kawasaki
2026-05-22 16:22   ` Bart Van Assche

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260424224201.1949243-4-bvanassche@acm.org \
    --to=bvanassche@acm.org \
    --cc=axboe@kernel.dk \
    --cc=hch@lst.de \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=nj.shetty@samsung.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.