From: Bart Van Assche <bvanassche@acm.org>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, linux-scsi@vger.kernel.org,
linux-nvme@lists.infradead.org, Christoph Hellwig <hch@lst.de>,
Nitesh Shetty <nj.shetty@samsung.com>,
Bart Van Assche <bvanassche@acm.org>
Subject: [PATCH 03/12] block: Introduce blkdev_copy_offload()
Date: Fri, 24 Apr 2026 15:41:52 -0700
Message-ID: <20260424224201.1949243-4-bvanassche@acm.org>
In-Reply-To: <20260424224201.1949243-1-bvanassche@acm.org>
Introduce blkdev_copy_offload() for performing copy offloading. This
function implements the algorithm explained in the description of the
previous patch. If the input parameters exceed what a single copy
offload operation can support, multiple copy offload operations are
submitted.
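A minimal usage sketch follows. It is illustrative only: the 1 MiB
segment layout and the in_bdev / out_bdev pointers are assumptions and
not part of this patch.

	struct blk_copy_seg in_seg = { .pos = 0, .len = SZ_1M };
	struct blk_copy_seg out_seg = { .pos = SZ_1M, .len = SZ_1M };
	struct blk_copy_params params = {
		.in_bdev  = in_bdev,	/* source block device */
		.in_segs  = &in_seg,
		.in_nseg  = 1,
		.out_bdev = out_bdev,	/* destination block device */
		.out_segs = &out_seg,
		.out_nseg = 1,
		/* .end_io == NULL: copy data synchronously. */
	};
	int ret = blkdev_copy_offload(&params);

In synchronous mode the function returns 0 or a negative error code;
with a non-NULL end_io callback it returns -EIOCBQUEUED and reports the
final status through params->status.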
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/Makefile | 2 +-
block/blk-copy.c | 355 ++++++++++++++++++++++++++++++++++++++
include/linux/blk_types.h | 40 +++++
include/linux/blkdev.h | 1 +
4 files changed, 397 insertions(+), 1 deletion(-)
create mode 100644 block/blk-copy.c
diff --git a/block/Makefile b/block/Makefile
index 7dce2e44276c..d99e8d4fda7d 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
- blk-mq-tag.o blk-mq-dma.o blk-stat.o \
+ blk-mq-tag.o blk-mq-dma.o blk-stat.o blk-copy.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
disk-events.o blk-ia-ranges.o early-lookup.o
diff --git a/block/blk-copy.c b/block/blk-copy.c
new file mode 100644
index 000000000000..8ac8879442f7
--- /dev/null
+++ b/block/blk-copy.c
@@ -0,0 +1,355 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Offloaded and onloaded data copying support.
+ */
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/blk-copy.h>
+#include <linux/blk-mq.h>
+
+/* End all bios in the @ctx->bios list with status @ctx->status. */
+static void blkdev_end_bios(struct bio_copy_offload_ctx *ctx)
+{
+ struct bio *bio, *next;
+
+ bio = ctx->bios;
+ ctx->bios = NULL;
+ for (; bio; bio = next) {
+ next = bio->bi_next;
+ bio->bi_status = ctx->status;
+ bio_endio(bio);
+ }
+}
+
+/*
+ * Called after LBA translation has finished for all bios associated with
+ * copy context @ctx.
+ */
+static void blkdev_translation_complete(struct bio_copy_offload_ctx *ctx)
+{
+ struct module *owner = NULL;
+ struct bio *bio;
+
+ WARN_ON_ONCE(ctx->phase != BLKDEV_TRANSLATE_LBAS);
+ ctx->phase = BLKDEV_COPY;
+
+ /* Check whether all bios are associated with the same block driver. */
+ for (bio = ctx->bios; bio; bio = bio->bi_next) {
+ if (!owner) {
+ owner = bio->bi_bdev->bd_disk->fops->owner;
+ } else if (owner != bio->bi_bdev->bd_disk->fops->owner) {
+ ctx->status = BLK_STS_INVAL;
+ break;
+ }
+ }
+
+ /* Remove the first bio from the bio list and submit it. */
+ bio = ctx->bios;
+ ctx->bios = bio->bi_next;
+ bio->bi_next = NULL;
+ if (ctx->biotail == bio)
+ ctx->biotail = NULL;
+ if (ctx->status == BLK_STS_OK)
+ submit_bio(bio);
+ else
+ bio_endio(bio);
+}
+
+/* REQ_OP_COPY_* completion handler. */
+static void blkdev_req_op_copy_done(struct bio *bio)
+{
+ struct bio_copy_offload_ctx *ctx = bio->bi_copy_ctx;
+ struct blk_copy_params *params = ctx->params;
+ blk_status_t status;
+
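+	/*
+	 * Translation-phase completions only record the first error; the
+	 * copy-phase completion ends the remaining bios, frees the context
+	 * and propagates the status to @params.
+	 */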
+ switch (ctx->phase) {
+ case BLKDEV_TRANSLATE_LBAS:
+ scoped_guard(spinlock_irqsave, &ctx->lock)
+ if (!ctx->status)
+ ctx->status = bio->bi_status;
+ break;
+ case BLKDEV_COPY:
+		status = ctx->status ?: bio->bi_status;
+ ctx->phase = BLKDEV_COPY_DONE;
+ blkdev_end_bios(ctx);
+ kfree(ctx);
+		scoped_guard(spinlock_irqsave, &params->lock) {
+ if (!params->status)
+ params->status = status;
+ }
+		if (atomic_dec_and_test(&params->copy_ctx_count))
+ params->end_io(params);
+ break;
+ case BLKDEV_COPY_DONE:
+ break;
+ }
+}
+
+/*
+ * Check that all segment offsets and lengths are aligned to both the source
+ * and the destination logical block sizes and that the total input and
+ * output lengths match. Store the number of bytes to be transferred in
+ * *@len.
+ */
+static int blkdev_copy_check_params(const struct blk_copy_params *params,
+ loff_t *len)
+{
+ const unsigned int mask =
+ max(bdev_logical_block_size(params->in_bdev),
+ bdev_logical_block_size(params->out_bdev)) - 1;
+ loff_t in_len = 0, out_len = 0;
+ unsigned int i;
+
+ for (i = 0; i < params->in_nseg; i++) {
+ if ((params->in_segs[i].pos | params->in_segs[i].len) & mask)
+ return -EINVAL;
+ in_len += params->in_segs[i].len;
+ }
+
+ for (i = 0; i < params->out_nseg; i++) {
+ if ((params->out_segs[i].pos | params->out_segs[i].len) & mask)
+ return -EINVAL;
+ out_len += params->out_segs[i].len;
+ }
+
+ if (in_len != out_len)
+ return -EINVAL;
+
+ *len = in_len;
+
+ return 0;
+}
+
+/*
+ * Calculate the number of bytes in the max_copy_src_segments input segments
+ * starting from input segment @in_idx.
+ */
+static loff_t blk_max_src_len(const struct blk_copy_params *params,
+ unsigned int in_idx)
+{
+ uint16_t max_src_segments =
+ params->in_bdev->bd_queue->limits.max_copy_src_segments;
+ unsigned int max_i = min(params->in_nseg, in_idx + max_src_segments);
+ loff_t len = 0;
+
+ for (uint32_t i = in_idx; i < max_i; i++)
+ len += params->in_segs[i].len;
+
+ return len;
+}
+
+/*
+ * Calculate the number of bytes in the max_copy_dst_segments output segments
+ * starting from output segment @out_idx.
+ */
+static loff_t blk_max_dst_len(const struct blk_copy_params *params,
+ unsigned int out_idx)
+{
+ uint16_t max_dst_segments =
+ params->out_bdev->bd_queue->limits.max_copy_dst_segments;
+ unsigned int max_i = min(params->out_nseg, out_idx + max_dst_segments);
+ loff_t len = 0;
+
+ for (uint32_t i = out_idx; i < max_i; i++)
+ len += params->out_segs[i].len;
+
+ return len;
+}
+
+struct blkdev_copy_sync_ctx {
+ struct completion compl;
+ blk_status_t status;
+};
+
+static void blkdev_end_copy_sync(const struct blk_copy_params *params)
+{
+	struct blkdev_copy_sync_ctx *ctx = params->private;
+
+	ctx->status = params->status;
+	complete(&ctx->compl);
+}
+
+static int blkdev_copy_sync(struct blk_copy_params *params)
+{
+ struct blkdev_copy_sync_ctx ctx = {
+ .compl = COMPLETION_INITIALIZER_ONSTACK(ctx.compl),
+ };
+ int ret;
+
+ WARN_ON_ONCE(params->end_io || params->private);
+ params->end_io = blkdev_end_copy_sync;
+ params->private = &ctx;
+
+ ret = blkdev_copy_offload(params);
+ if (ret && ret != -EIOCBQUEUED)
+ return ret;
+
+ wait_for_completion(&ctx.compl);
+ return blk_status_to_errno(ctx.status);
+}
+
+/**
+ * blkdev_copy_chunk() - submit a single copy offload operation
+ * @params: Copy offload input parameters.
+ * @in_idx: Index of the input segment from where to start copying. Updated
+ *	to reflect the copying progress.
+ * @out_idx: Index of the output segment to where to start copying. Updated
+ *	to reflect the copying progress.
+ * @in_offset: Offset in bytes from the start of input segment @in_idx.
+ *	Updated to reflect the copying progress.
+ * @out_offset: Offset in bytes from the start of output segment @out_idx.
+ *	Updated to reflect the copying progress.
+ * @chunk: Maximum number of bytes to copy.
+ *
+ * Return: the number of bytes covered by the submitted copy operation or a
+ *	negative error number.
+ */
+static loff_t blkdev_copy_chunk(struct blk_copy_params *params, u32 *in_idx,
+ u32 *out_idx, loff_t *in_offset,
+ loff_t *out_offset, loff_t chunk)
+{
+ struct bio_copy_offload_ctx *ctx;
+ u32 bio_count;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_NOIO);
+ if (!ctx)
+ return -ENOMEM;
+
+ spin_lock_init(&ctx->lock);
+ ctx->params = params;
+ ctx->phase = BLKDEV_TRANSLATE_LBAS;
+ ctx->translation_complete = blkdev_translation_complete;
+	/*
+	 * Initialize to one to prevent ctx->translation_complete() from being
+	 * called before bio submission has finished.
+	 */
+	ctx->bio_count = 1;
+	/* One atomic_dec_and_test() per context in blkdev_req_op_copy_done(). */
+	atomic_inc(&params->copy_ctx_count);
+
+ WARN_ON_ONCE(chunk <= 0);
+ chunk = min(chunk, blk_max_src_len(params, *in_idx) - *in_offset);
+ WARN_ON_ONCE(chunk <= 0);
+ chunk = min(chunk, blk_max_dst_len(params, *out_idx) - *out_offset);
+ WARN_ON_ONCE(chunk <= 0);
+ ctx->len = chunk;
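+	/*
+	 * Submit one REQ_OP_COPY_SRC bio for each input segment range covered
+	 * by the chunk.
+	 */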
+ for (loff_t bytes, remaining_in = chunk; remaining_in > 0;
+ remaining_in -= bytes) {
+ struct bio *src_bio;
+
+ src_bio = bio_alloc(params->in_bdev, 0, REQ_OP_COPY_SRC,
+ GFP_NOIO);
+ if (!src_bio) {
+ if (remaining_in == chunk)
+ goto free_ctx;
+ else
+ goto enomem;
+ }
+		scoped_guard(spinlock_irqsave, &ctx->lock)
+			ctx->bio_count++;
+ bytes = min(remaining_in, params->in_segs[*in_idx].len -
+ *in_offset);
+ src_bio->bi_iter.bi_size = bytes;
+ src_bio->bi_iter.bi_sector = (params->in_segs[*in_idx].pos +
+ *in_offset) >> SECTOR_SHIFT;
+ src_bio->bi_copy_ctx = ctx;
+ src_bio->bi_end_io = blkdev_req_op_copy_done;
+ *in_offset += bytes;
+ if (*in_offset >= params->in_segs[*in_idx].len) {
+ *in_offset -= params->in_segs[*in_idx].len;
+ (*in_idx)++;
+ }
+ submit_bio(src_bio);
+ }
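+	/*
+	 * Submit one REQ_OP_COPY_DST bio for each output segment range covered
+	 * by the chunk.
+	 */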
+	for (loff_t bytes, remaining_out = chunk; remaining_out > 0;
+	     remaining_out -= bytes) {
+ struct bio *dst_bio;
+
+ dst_bio = bio_alloc(params->out_bdev, 0, REQ_OP_COPY_DST,
+ GFP_NOIO);
+ if (!dst_bio)
+ goto enomem;
+ scoped_guard(spinlock_irqsave, &ctx->lock)
+ ctx->bio_count++;
+ bytes = min(remaining_out, params->out_segs[*out_idx].len -
+ *out_offset);
+ dst_bio->bi_iter.bi_size = bytes;
+ dst_bio->bi_iter.bi_sector = (params->out_segs[*out_idx].pos +
+ *out_offset) >> SECTOR_SHIFT;
+ dst_bio->bi_copy_ctx = ctx;
+ dst_bio->bi_end_io = blkdev_req_op_copy_done;
+ *out_offset += bytes;
+ if (*out_offset >= params->out_segs[*out_idx].len) {
+ *out_offset -= params->out_segs[*out_idx].len;
+ (*out_idx)++;
+ }
+ submit_bio(dst_bio);
+ }
+
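+	/* Drop the reference with which ctx->bio_count was initialized. */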
+dec_bio_count:
+ scoped_guard(spinlock_irqsave, &ctx->lock)
+ bio_count = --ctx->bio_count;
+ if (bio_count == 0)
+ ctx->translation_complete(ctx);
+ return chunk;
+
+enomem:
+ scoped_guard(spinlock_irqsave, &ctx->lock)
+ if (!ctx->status)
+ ctx->status = BLK_STS_RESOURCE;
+ chunk = -ENOMEM;
+ goto dec_bio_count;
+
+free_ctx:
+	atomic_dec(&params->copy_ctx_count);
+	kfree(ctx);
+	return -ENOMEM;
+}
+
+/**
+ * blkdev_copy_offload() - copy data and offload copying if possible.
+ * @params: Source and destination block device, data ranges and completion
+ *	callback.
+ *
+ * If @params->end_io != NULL, data is copied asynchronously. If
+ * @params->end_io == NULL, this function only returns after copying has
+ * finished.
+ *
+ * Return: 0 upon success (synchronous mode only); -EIOCBQUEUED if the
+ *	completion callback function will be called or has already been
+ *	called; -EOPNOTSUPP if copy offloading is not supported by the block
+ *	device or if the source or destination address ranges span more than
+ *	one dm device.
+ */
+int blkdev_copy_offload(struct blk_copy_params *params)
+{
+ loff_t in_offset = 0, out_offset = 0;
+ u32 in_idx = 0, out_idx = 0;
+ loff_t len, chunk, max_chunk;
+ int ret;
+
+ might_sleep();
+
+ if (!params->end_io)
+ return blkdev_copy_sync(params);
+
+	spin_lock_init(&params->lock);
+
+ if (!bdev_max_copy_sectors(params->in_bdev) ||
+ !bdev_max_copy_sectors(params->out_bdev))
+ return -EOPNOTSUPP;
+
+ ret = blkdev_copy_check_params(params, &len);
+ if (ret)
+ return ret;
+
+ params->len = len;
+
+ max_chunk = (u64)min(bdev_max_copy_sectors(params->in_bdev),
+ bdev_max_copy_sectors(params->out_bdev))
+ << SECTOR_SHIFT;
+
+	atomic_set(&params->copy_ctx_count, 1);
+
+	for (loff_t offset = 0; offset < len; offset += chunk) {
+		chunk = min(len - offset, max_chunk);
+		chunk = blkdev_copy_chunk(params, &in_idx, &out_idx, &in_offset,
+					  &out_offset, chunk);
+		/* Stop submitting chunks if bio or context allocation failed. */
+		if (chunk < 0) {
+			scoped_guard(spinlock_irqsave, &params->lock)
+				if (!params->status)
+					params->status = BLK_STS_RESOURCE;
+			break;
+		}
+	}
+
+	if (atomic_dec_and_test(&params->copy_ctx_count))
+ params->end_io(params);
+
+ return -EIOCBQUEUED;
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_offload);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4e448e810b87..27a0f92fc2cb 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -535,4 +535,44 @@ struct blk_rq_stat {
u64 batch;
};
+/* A single input or output segment descriptor. */
+struct blk_copy_seg {
+ loff_t pos;
+ loff_t len;
+};
+
+/**
+ * struct blk_copy_params - input parameters and internal state for copy
+ *	operations.
+ * @in_bdev: Input block device.
+ * @in_segs: Input LBA ranges.
+ * @in_nseg: Number of elements in @in_segs.
+ * @out_bdev: Output block device.
+ * @out_segs: Output LBA ranges.
+ * @out_nseg: Number of elements in @out_segs.
+ * @end_io: Called after copying has finished. If %NULL, data is copied
+ *	synchronously instead of asynchronously.
+ * @private: May be used by @end_io. Not used by the copy code itself.
+ * @len: Total number of bytes to copy. Set by blkdev_copy_offload() or
+ *	blkdev_copy_onload().
+ * @copy_ctx_count: Number of in-flight copy contexts associated with copy
+ *	offload operations.
+ * @lock: Protects @status updates.
+ * @status: I/O completion status.
+ */
+struct blk_copy_params {
+ struct block_device *in_bdev;
+ struct blk_copy_seg *in_segs;
+ unsigned int in_nseg;
+ struct block_device *out_bdev;
+ struct blk_copy_seg *out_segs;
+ unsigned int out_nseg;
+ void (*end_io)(const struct blk_copy_params *params);
+ void *private;
+ loff_t len;
+ atomic_t copy_ctx_count;
+ spinlock_t lock;
+ blk_status_t status;
+};
+
#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8ae64cc0546f..fea296150cda 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1283,6 +1283,7 @@ void __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp);
+int blkdev_copy_offload(struct blk_copy_params *params);
#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
Thread overview: 13+ messages
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
2026-04-24 22:41 ` [PATCH 01/12] block: Introduce queue limits for " Bart Van Assche
2026-04-24 22:41 ` [PATCH 02/12] block: Add the REQ_OP_COPY_{SRC,DST} operations Bart Van Assche
2026-04-24 22:41 ` Bart Van Assche [this message]
2026-04-24 22:41 ` [PATCH 04/12] block: Add an onloaded copy implementation Bart Van Assche
2026-04-24 22:41 ` [PATCH 05/12] block: Introduce accessor functions for copy offload bios Bart Van Assche
2026-04-24 22:41 ` [PATCH 06/12] fs/read_write: Generalize generic_copy_file_checks() Bart Van Assche
2026-04-24 22:41 ` [PATCH 07/12] fs, block: Add copy_file_range() support for block devices Bart Van Assche
2026-04-24 22:41 ` [PATCH 08/12] nvme: Add copy offloading support Bart Van Assche
2026-04-24 22:41 ` [PATCH 09/12] nvmet: Support the Copy command Bart Van Assche
2026-04-24 22:41 ` [PATCH 10/12] dm: Add support for copy offloading Bart Van Assche
2026-04-24 22:42 ` [PATCH 11/12] dm-linear: Enable " Bart Van Assche
2026-04-24 22:42 ` [PATCH 12/12] null_blk: Add support for REQ_OP_COPY_* Bart Van Assche