* [PATCH 01/12] block: Introduce queue limits for copy offloading
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:41 ` [PATCH 02/12] block: Add the REQ_OP_COPY_{SRC,DST} operations Bart Van Assche
` (10 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche, Kanchan Joshi, Anuj Gupta
From: Nitesh Shetty <nj.shetty@samsung.com>
Add the following request queue limits:
- max_copy_hw_sectors: the maximum number of sectors supported by the
block driver for a single offloaded copy operation.
- max_copy_src_segments: the maximum number of source segments
supported by the block driver for a single offloaded copy operation.
- max_copy_dst_segments: the maximum number of destination segments
supported by the block driver for a single offloaded copy operation.
- max_user_copy_sectors: the maximum number of sectors configured by the
user for a single offloaded copy operation.
- max_copy_sectors: the maximum number of sectors for a single
offloaded copy operation. This is the minimum of the above two
parameters.
The default value for all these new limits is zero, which means that copy
offloading is not supported unless these limits are set by the block
driver.
Make the following two limits available in sysfs:
- copy_max_bytes (RW)
- copy_max_hw_bytes (RO)
These limits will be used by the function that implements copy
offloading to decide the bio size.
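As an illustration only (hypothetical values; not part of this patch), a
block driver could advertise these limits as follows. Note that
blk_validate_copy_limits() requires that the three driver-set limits are
either all nonzero or all zero:

    struct queue_limits lim = queue_limits_start_update(q);
    int err;

    /* hypothetical: 4 MiB per copy, 8 source and 1 destination segment */
    lim.max_copy_hw_sectors = (4 << 20) >> SECTOR_SHIFT;
    lim.max_copy_src_segments = 8;
    lim.max_copy_dst_segments = 1;
    err = queue_limits_commit_update(q, &lim);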
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
[ bvanassche: Added max_copy_{src,dst}_segments limits. Introduced
blk_validate_copy_limits(). Introduced BLK_FEAT_STACKING_COPY_OFFL.
Modified patch description. ]
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
Documentation/ABI/stable/sysfs-block | 24 +++++++++++++++++++
block/blk-settings.c | 36 ++++++++++++++++++++++++++++
block/blk-sysfs.c | 35 +++++++++++++++++++++++++++
include/linux/blkdev.h | 18 +++++++++++++-
4 files changed, 112 insertions(+), 1 deletion(-)
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 900b3fc4c72d..bec5e04085da 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -239,6 +239,30 @@ Description:
last zone of the device which may be smaller.
+What: /sys/block/<disk>/queue/copy_max_bytes
+Date: May 2026
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This is the maximum number of bytes that the block layer
+ will allow for a copy request. This is always smaller than or
+ equal to the maximum size allowed by the block driver.
+ Any value higher than 'copy_max_hw_bytes' will be reduced to
+ 'copy_max_hw_bytes'. Writing '0' to this attribute will disable
+ copy offloading for this block device. If copy offloading is
+ disabled, copy requests will be translated into read and write
+ requests.
+
+
+What: /sys/block/<disk>/queue/copy_max_hw_bytes
+Date: May 2026
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This is the maximum number of bytes that is allowed for
+ a single data copy request. Set by the block driver. The value
+ zero indicates that the block device does not support copy
+ offloading.
+
+
What: /sys/block/<disk>/queue/crypto/
Date: February 2022
Contact: linux-block@vger.kernel.org
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 78c83817b9d3..cb846ff2926e 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -57,6 +57,11 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_hw_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
lim->atomic_write_hw_max = UINT_MAX;
+
+ lim->max_user_copy_sectors = UINT_MAX;
+ lim->max_copy_hw_sectors = UINT_MAX;
+ lim->max_copy_src_segments = U16_MAX;
+ lim->max_copy_dst_segments = U16_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -333,6 +338,21 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
lim->atomic_write_unit_max = 0;
}
+/*
+ * Check whether max_copy_hw_sectors and max_copy_{src,dst}_segments are
+ * either all nonzero or all zero.
+ */
+static int blk_validate_copy_limits(const struct queue_limits *lim)
+{
+ if (lim->max_copy_hw_sectors && lim->max_copy_src_segments &&
+ lim->max_copy_dst_segments)
+ return 0;
+ if (!lim->max_copy_hw_sectors && !lim->max_copy_src_segments &&
+ !lim->max_copy_dst_segments)
+ return 0;
+ return -EINVAL;
+}
+
/*
* Check that the limits in lim are valid, initialize defaults for unset
* values, and cap values based on others where needed.
@@ -510,6 +530,13 @@ int blk_validate_limits(struct queue_limits *lim)
err = blk_validate_integrity_limits(lim);
if (err)
return err;
+
+ err = blk_validate_copy_limits(lim);
+ if (err)
+ return err;
+ lim->max_copy_sectors =
+ min(lim->max_copy_hw_sectors, lim->max_user_copy_sectors);
+
return blk_validate_zoned_limits(lim);
}
EXPORT_SYMBOL_GPL(blk_validate_limits);
@@ -528,6 +555,7 @@ int blk_set_default_limits(struct queue_limits *lim)
*/
lim->max_user_discard_sectors = UINT_MAX;
lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
+ lim->max_user_copy_sectors = UINT_MAX;
return blk_validate_limits(lim);
}
@@ -829,6 +857,14 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_segment_size = min_not_zero(t->max_segment_size,
b->max_segment_size);
+ t->max_copy_hw_sectors =
+ min(t->max_copy_hw_sectors, b->max_copy_hw_sectors);
+ t->max_copy_src_segments =
+ min(t->max_copy_src_segments, b->max_copy_src_segments);
+ t->max_copy_dst_segments =
+ min(t->max_copy_dst_segments, b->max_copy_dst_segments);
+ t->max_copy_sectors = min(t->max_copy_sectors, b->max_copy_sectors);
+
alignment = queue_limit_alignment_offset(b, start);
/* Bottom device has different alignment. Check that it is
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f22c1f253eb3..8e1e14d1682d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -325,6 +325,36 @@ queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
return 0;
}
+static ssize_t queue_copy_hw_max_show(struct gendisk *disk, char *page)
+{
+ return queue_var_show(
+ disk->queue->limits.max_copy_hw_sectors << SECTOR_SHIFT, page);
+}
+
+static ssize_t queue_copy_max_show(struct gendisk *disk, char *page)
+{
+ return queue_var_show(
+ disk->queue->limits.max_copy_sectors << SECTOR_SHIFT, page);
+}
+
+static int queue_copy_max_store(struct gendisk *disk, const char *page,
+ size_t count, struct queue_limits *lim)
+{
+ unsigned long max_copy_bytes;
+ ssize_t ret;
+
+ ret = queue_var_store(&max_copy_bytes, page, count);
+ if (ret < 0)
+ return ret;
+
+ if ((max_copy_bytes >> SECTOR_SHIFT) > UINT_MAX)
+ return -EINVAL;
+
+ lim->max_user_copy_sectors = max_copy_bytes >> SECTOR_SHIFT;
+
+ return 0;
+}
+
static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
size_t count, struct queue_limits *lim, blk_features_t feature)
{
@@ -652,6 +682,9 @@ QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones");
QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones");
+QUEUE_LIM_RO_ENTRY(queue_copy_hw_max, "copy_max_hw_bytes");
+QUEUE_LIM_RW_ENTRY(queue_copy_max, "copy_max_bytes");
+
QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
@@ -760,6 +793,8 @@ static const struct attribute *const queue_attrs[] = {
&queue_max_hw_wzeroes_unmap_sectors_entry.attr,
&queue_max_wzeroes_unmap_sectors_entry.attr,
&queue_max_zone_append_sectors_entry.attr,
+ &queue_copy_hw_max_entry.attr,
+ &queue_copy_max_entry.attr,
&queue_zone_write_granularity_entry.attr,
&queue_rotational_entry.attr,
&queue_zoned_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 890128cdea1c..8ae64cc0546f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -353,13 +353,17 @@ typedef unsigned int __bitwise blk_features_t;
#define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
((__force blk_features_t)(1u << 15))
+/* block driver is a stacking block driver that supports copy offloading */
+#define BLK_FEAT_STACKING_COPY_OFFL ((__force blk_features_t)(1u << 16))
+
/*
* Flags automatically inherited when stacking limits.
*/
#define BLK_FEAT_INHERIT_MASK \
(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \
BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | \
- BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE)
+ BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE | \
+ BLK_FEAT_STACKING_COPY_OFFL)
/* internal flags in queue_limits.flags */
typedef unsigned int __bitwise blk_flags_t;
@@ -415,6 +419,13 @@ struct queue_limits {
unsigned int atomic_write_hw_unit_max;
unsigned int atomic_write_unit_max;
+ /* copy offloading limits */
+ unsigned int max_copy_hw_sectors; /* set by block driver */
+ uint16_t max_copy_src_segments; /* set by block driver */
+ uint16_t max_copy_dst_segments; /* set by block driver */
+ unsigned int max_user_copy_sectors; /* set via sysfs */
+ unsigned int max_copy_sectors; /* min() of the above */
+
unsigned short max_segments;
unsigned short max_integrity_segments;
unsigned short max_discard_segments;
@@ -1454,6 +1465,11 @@ static inline unsigned int bdev_discard_granularity(struct block_device *bdev)
return bdev_limits(bdev)->discard_granularity;
}
+static inline unsigned int bdev_max_copy_sectors(struct block_device *bdev)
+{
+ return bdev_get_queue(bdev)->limits.max_copy_sectors;
+}
+
static inline unsigned int
bdev_max_secure_erase_sectors(struct block_device *bdev)
{
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 02/12] block: Add the REQ_OP_COPY_{SRC,DST} operations
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
2026-04-24 22:41 ` [PATCH 01/12] block: Introduce queue limits for " Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:41 ` [PATCH 03/12] block: Introduce blkdev_copy_offload() Bart Van Assche
` (9 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche, Anuj Gupta
From: Nitesh Shetty <nj.shetty@samsung.com>
Introduce the REQ_OP_COPY_SRC and REQ_OP_COPY_DST operations. The source
and destination LBA range information is in separate bios because any
other approach would require a rewrite of the device mapper. These bios
are associated with each other via the new bi_copy_ctx pointer. A new
pointer has been introduced because the copy offloading context
information must be preserved when cloning a bio and the bi_private bio
member must not be copied when cloning a bio.
This patch supports the following approach for copy offloading (a short
code sketch follows the list):
1. Allocate a struct bio_copy_offload_ctx instance and set phase to
BLKDEV_TRANSLATE_LBAS.
2. Allocate REQ_OP_COPY_SRC and REQ_OP_COPY_DST bios. Set the
bi_copy_ctx member of these bios.
3. Set the bio_count member of struct bio_copy_offload_ctx.
4. Submit all REQ_OP_COPY_* bios.
5. In submit_bio(), do the following for REQ_OP_COPY_* bios:
- If bio->bi_bdev is a stacking device, submit the bio. This will
send the bio to the device mapper. The device mapper will clone the
bio, translate the LBAs and will submit the cloned bio. That will
result in a recursive submit_bio() call.
- If bio->bi_bdev is not a stacking device, add the bio to the
copy_ctx->bios list and decrement copy_ctx->bio_count.
6. Once copy_ctx->bio_count == 0, call copy_ctx->translation_complete().
7. In the implementation of copy_ctx->translation_complete(), change
copy_ctx->phase from BLKDEV_TRANSLATE_LBAS into BLKDEV_COPY.
8. Submit the first REQ_OP_COPY_* bio of the copy_ctx->bios list.
9. Once this bio reaches the block driver associated with the bio,
retrieve the other bios involved in the copy operation from the copy
context data structure and convert all these bios into a copy offload
operation.
10. Once this bio completes, also complete all the other bios involved
in the copy offload operation.
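A condensed sketch of steps 1 - 4 from a hypothetical caller (error
handling plus the bi_iter and bi_end_io setup have been omitted;
my_translation_complete() is a made-up callback):

    struct bio_copy_offload_ctx *ctx = kzalloc(sizeof(*ctx), GFP_NOIO);
    struct bio *src, *dst;

    spin_lock_init(&ctx->lock);
    ctx->phase = BLKDEV_TRANSLATE_LBAS;
    ctx->translation_complete = my_translation_complete;	/* step 7 */
    src = bio_alloc(src_bdev, 0, REQ_OP_COPY_SRC, GFP_NOIO);
    dst = bio_alloc(dst_bdev, 0, REQ_OP_COPY_DST, GFP_NOIO);
    src->bi_copy_ctx = ctx;
    dst->bi_copy_ctx = ctx;
    ctx->bio_count = 2;
    submit_bio(src);
    submit_bio(dst);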
This patch increases the size of struct bio from 104 to 112 bytes on 64-bit
systems.
To be discussed further: whether adding a new member to struct bio is
acceptable or whether the new pointer should instead be stored in front of
the bio. bioset_init() supports front padding.
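For comparison, a minimal sketch of the front-padding alternative
(hypothetical; assumes a dedicated bio_set for copy bios and omits error
handling):

    static struct bio_set copy_bioset;

    /* reserve room for one context pointer in front of every bio */
    bioset_init(&copy_bioset, BIO_POOL_SIZE, sizeof(void *), 0);
    bio = bio_alloc_bioset(bdev, 0, REQ_OP_COPY_SRC, GFP_NOIO,
                           &copy_bioset);
    /* the front pad is located immediately before the bio itself */
    *(void **)((void *)bio - sizeof(void *)) = copy_ctx;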
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
[ bvanassche: changed the approach of this patch from combining the
COPY_SRC and COPY_DST operations immediately to translating the LBA
information first. ]
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/bio.c | 1 +
block/blk-core.c | 38 ++++++++++++++++++++++++++++++++
block/blk-merge.c | 13 +++++++++++
block/blk.h | 5 +++++
include/linux/blk-copy.h | 46 +++++++++++++++++++++++++++++++++++++++
include/linux/blk_types.h | 17 +++++++++++++++
6 files changed, 120 insertions(+)
create mode 100644 include/linux/blk-copy.h
diff --git a/block/bio.c b/block/bio.c
index b8972dba68a0..51480c9be27b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -852,6 +852,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter = bio_src->bi_iter;
+ bio->bi_copy_ctx = bio_src->bi_copy_ctx;
if (bio->bi_bdev) {
if (bio->bi_bdev == bio_src->bi_bdev &&
diff --git a/block/blk-core.c b/block/blk-core.c
index 17450058ea6d..37c01e717202 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-copy.h>
#include <linux/blk-pm.h>
#include <linux/blk-integrity.h>
#include <linux/highmem.h>
@@ -108,6 +109,8 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(ZONE_FINISH),
REQ_OP_NAME(ZONE_APPEND),
REQ_OP_NAME(WRITE_ZEROES),
+ REQ_OP_NAME(COPY_SRC),
+ REQ_OP_NAME(COPY_DST),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
};
@@ -782,6 +785,8 @@ void submit_bio_noacct(struct bio *bio)
struct block_device *bdev = bio->bi_bdev;
struct request_queue *q = bdev_get_queue(bdev);
blk_status_t status = BLK_STS_IOERR;
+ struct bio_copy_offload_ctx *copy_ctx;
+ u32 bio_count;
might_sleep();
@@ -875,6 +880,39 @@ void submit_bio_noacct(struct bio *bio)
* requests.
*/
fallthrough;
+ case REQ_OP_COPY_SRC:
+ case REQ_OP_COPY_DST:
+ copy_ctx = bio->bi_copy_ctx;
+ WARN_ON_ONCE(copy_ctx->phase == BLKDEV_COPY_DONE);
+ if (copy_ctx->phase == BLKDEV_COPY)
+ break;
+ /* If copy offloading is not supported, fail the bio. */
+ if (!q->limits.max_copy_sectors) {
+ scoped_guard(spinlock_irqsave, &copy_ctx->lock)
+ copy_ctx->bio_count--;
+ goto not_supported;
+ }
+ /*
+ * If the block driver is a stacking driver that supports copy
+ * offloading, submit the bio.
+ */
+ if (q->limits.features & BLK_FEAT_STACKING_COPY_OFFL)
+ break;
+ /*
+ * Append the bio at the end of the bio->bi_copy_ctx->bios list.
+ */
+ scoped_guard(spinlock_irqsave, &copy_ctx->lock) {
+ if (copy_ctx->biotail)
+ copy_ctx->biotail->bi_next = bio;
+ else
+ copy_ctx->bios = bio;
+ copy_ctx->biotail = bio;
+ bio_count = --copy_ctx->bio_count;
+ }
+ WARN_ON_ONCE((int)bio_count < 0);
+ if (bio_count == 0)
+ copy_ctx->translation_complete(copy_ctx);
+ return;
default:
goto not_supported;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fcf09325b22e..4678131650d2 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -207,6 +207,19 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
return __bio_split_discard(bio, lim, nsegs, max_sectors);
}
+struct bio *bio_split_copy(struct bio *bio, const struct queue_limits *lim,
+ unsigned int *nsegs)
+{
+ *nsegs = 1;
+ if (bio_sectors(bio) <= lim->max_copy_sectors)
+ return bio;
+
+ /* Splitting a REQ_OP_COPY_* bio is not supported. */
+ bio->bi_status = BLK_STS_NOTSUPP;
+ bio_endio(bio);
+ return NULL;
+}
+
static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
bool is_atomic)
{
diff --git a/block/blk.h b/block/blk.h
index b998a7761faf..274c226e87ee 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -379,6 +379,8 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *nr_segs);
struct bio *bio_split_zone_append(struct bio *bio,
const struct queue_limits *lim, unsigned *nr_segs);
+struct bio *bio_split_copy(struct bio *bio, const struct queue_limits *lim,
+ unsigned int *nsegs);
/*
* All drivers must accept single-segments bios that are smaller than PAGE_SIZE.
@@ -435,6 +437,9 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio,
return bio_split_discard(bio, lim, nr_segs);
case REQ_OP_WRITE_ZEROES:
return bio_split_write_zeroes(bio, lim, nr_segs);
+ case REQ_OP_COPY_SRC:
+ case REQ_OP_COPY_DST:
+ return bio_split_copy(bio, lim, nr_segs);
default:
/* other operations can't be split */
*nr_segs = 0;
diff --git a/include/linux/blk-copy.h b/include/linux/blk-copy.h
new file mode 100644
index 000000000000..5e38cfc14a71
--- /dev/null
+++ b/include/linux/blk-copy.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __LINUX_BLK_COPY_H
+#define __LINUX_BLK_COPY_H
+
+#include <linux/blk_types.h>
+#include <linux/completion.h>
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+#include <linux/workqueue_types.h>
+
+struct blk_copy_params;
+struct request;
+
+enum blkdev_copy_phase {
+ BLKDEV_TRANSLATE_LBAS,
+ BLKDEV_COPY,
+ BLKDEV_COPY_DONE,
+};
+
+/*
+ * struct bio_copy_offload_ctx - context information for blkdev_copy_offload()
+ * @params: Input parameters passed to blkdev_copy_offload().
+ * @len: Number of bytes associated with this copy context.
+ * @phase: Copy offload phase: either translating LBAs or copying data.
+ * @lock: Protects @bios, @biotail and @bio_count.
+ * @bios: List of REQ_OP_COPY_* bios for which LBA translation has completed.
+ * @biotail: Last element of the @bios list.
+ * @bio_count: Number of bios for which LBA translation has not yet completed.
+ * @status: bio completion status.
+ * @translation_complete: Called after LBA translation has completed.
+ * LBA translation has completed once bio_count drops to zero.
+ */
+struct bio_copy_offload_ctx {
+ struct blk_copy_params *params;
+ loff_t len;
+ enum blkdev_copy_phase phase;
+ spinlock_t lock;
+ struct bio *bios __guarded_by(&lock);
+ struct bio *biotail __guarded_by(&lock);
+ u32 bio_count __guarded_by(&lock);
+ blk_status_t status __guarded_by(&lock);
+ void (*translation_complete)(struct bio_copy_offload_ctx *ctx);
+};
+
+#endif /* __LINUX_BLK_COPY_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c..4e448e810b87 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -284,6 +284,8 @@ struct bio {
atomic_t __bi_cnt; /* pin count */
struct bio_set *bi_pool;
+
+ void *bi_copy_ctx;
};
#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
@@ -370,6 +372,10 @@ enum req_op {
/** @REQ_OP_ZONE_RESET_ALL: reset all the zone present on the device */
REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)19,
+ /* copy offload source and destination operations */
+ REQ_OP_COPY_SRC = (__force blk_opf_t)20,
+ REQ_OP_COPY_DST = (__force blk_opf_t)21,
+
/* Driver private requests */
/* private: */
REQ_OP_DRV_IN = (__force blk_opf_t)34,
@@ -461,6 +467,17 @@ static inline bool op_is_write(blk_opf_t op)
return !!(op & (__force blk_opf_t)1);
}
+static inline bool op_is_copy(blk_opf_t op)
+{
+ switch (op & REQ_OP_MASK) {
+ case REQ_OP_COPY_DST:
+ case REQ_OP_COPY_SRC:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* Check if the bio or request is one that needs special treatment in the
* flush state machine.
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 03/12] block: Introduce blkdev_copy_offload()
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
2026-04-24 22:41 ` [PATCH 01/12] block: Introduce queue limits for " Bart Van Assche
2026-04-24 22:41 ` [PATCH 02/12] block: Add the REQ_OP_COPY_{SRC,DST} operations Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:41 ` [PATCH 04/12] block: Add an onloaded copy implementation Bart Van Assche
` (8 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche
Introduce blkdev_copy_offload() for performing copy offloading. This
function implements the algorithm explained in the description of the
previous patch. If the input parameters exceed what can be supported
with a single copy offload operation, multiple copy offload operations
are submitted.
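A hypothetical synchronous caller that copies nbytes (a multiple of the
logical block size) between two single-range segments looks as follows:

    struct blk_copy_seg in_seg = { .pos = src_off, .len = nbytes };
    struct blk_copy_seg out_seg = { .pos = dst_off, .len = nbytes };
    struct blk_copy_params params = {
        .in_bdev = src_bdev, .in_segs = &in_seg, .in_nseg = 1,
        .out_bdev = dst_bdev, .out_segs = &out_seg, .out_nseg = 1,
        /* .end_io == NULL selects the synchronous code path */
    };
    int ret = blkdev_copy_offload(&params);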
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/Makefile | 2 +-
block/blk-copy.c | 359 ++++++++++++++++++++++++++++++++++++++++++++++
include/linux/blk_types.h | 40 +++++
include/linux/blkdev.h | 1 +
4 files changed, 401 insertions(+), 1 deletion(-)
create mode 100644 block/blk-copy.c
diff --git a/block/Makefile b/block/Makefile
index 7dce2e44276c..d99e8d4fda7d 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
- blk-mq-tag.o blk-mq-dma.o blk-stat.o \
+ blk-mq-tag.o blk-mq-dma.o blk-stat.o blk-copy.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
disk-events.o blk-ia-ranges.o early-lookup.o
diff --git a/block/blk-copy.c b/block/blk-copy.c
new file mode 100644
index 000000000000..8ac8879442f7
--- /dev/null
+++ b/block/blk-copy.c
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Offloaded and onloaded data copying support.
+ */
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/blk-copy.h>
+#include <linux/blk-mq.h>
+
+/* End all bios in the @ctx->bios list with status @ctx->status. */
+static void blkdev_end_bios(struct bio_copy_offload_ctx *ctx)
+{
+ struct bio *bio, *next;
+
+ bio = ctx->bios;
+ ctx->bios = NULL;
+ for (; bio; bio = next) {
+ next = bio->bi_next;
+ bio->bi_status = ctx->status;
+ bio_endio(bio);
+ }
+}
+
+/*
+ * Called after LBA translation has finished for all bios associated with
+ * copy context @ctx.
+ */
+static void blkdev_translation_complete(struct bio_copy_offload_ctx *ctx)
+{
+ struct module *owner = NULL;
+ struct bio *bio;
+
+ WARN_ON_ONCE(ctx->phase != BLKDEV_TRANSLATE_LBAS);
+ ctx->phase = BLKDEV_COPY;
+
+ /* Check whether all bios are associated with the same block driver. */
+ for (bio = ctx->bios; bio; bio = bio->bi_next) {
+ if (!owner) {
+ owner = bio->bi_bdev->bd_disk->fops->owner;
+ } else if (owner != bio->bi_bdev->bd_disk->fops->owner) {
+ ctx->status = BLK_STS_INVAL;
+ break;
+ }
+ }
+
+ /* Remove the first bio from the bio list and submit it. */
+ bio = ctx->bios;
+ ctx->bios = bio->bi_next;
+ bio->bi_next = NULL;
+ if (ctx->biotail == bio)
+ ctx->biotail = NULL;
+ if (ctx->status == BLK_STS_OK)
+ submit_bio(bio);
+ else
+ bio_endio(bio);
+}
+
+/* REQ_OP_COPY_* completion handler. */
+static void blkdev_req_op_copy_done(struct bio *bio)
+{
+ struct bio_copy_offload_ctx *ctx = bio->bi_copy_ctx;
+ struct blk_copy_params *params = ctx->params;
+ blk_status_t status;
+
+ switch (ctx->phase) {
+ case BLKDEV_TRANSLATE_LBAS:
+ scoped_guard(spinlock_irqsave, &ctx->lock)
+ if (!ctx->status)
+ ctx->status = bio->bi_status;
+ break;
+ case BLKDEV_COPY:
+ status = ctx->status;
+ ctx->phase = BLKDEV_COPY_DONE;
+ blkdev_end_bios(ctx);
+ kfree(ctx);
+ scoped_guard(spinlock_irqsave, &params->lock) {
+ if (!params->status)
+ params->status = status;
+ }
+ if (atomic_dec_and_test(&params->copy_ctx_count))
+ params->end_io(params);
+ break;
+ case BLKDEV_COPY_DONE:
+ break;
+ }
+}
+
+/*
+ * Check that all LBA offsets are aligned with both the source and the destination
+ * logical block sizes. Compare input and output length. Store the number of bytes
+ * to be transferred in *@len.
+ */
+static int blkdev_copy_check_params(const struct blk_copy_params *params,
+ loff_t *len)
+{
+ const unsigned int mask =
+ max(bdev_logical_block_size(params->in_bdev),
+ bdev_logical_block_size(params->out_bdev)) - 1;
+ loff_t in_len = 0, out_len = 0;
+ unsigned int i;
+
+ for (i = 0; i < params->in_nseg; i++) {
+ if ((params->in_segs[i].pos | params->in_segs[i].len) & mask)
+ return -EINVAL;
+ in_len += params->in_segs[i].len;
+ }
+
+ for (i = 0; i < params->out_nseg; i++) {
+ if ((params->out_segs[i].pos | params->out_segs[i].len) & mask)
+ return -EINVAL;
+ out_len += params->out_segs[i].len;
+ }
+
+ if (in_len != out_len)
+ return -EINVAL;
+
+ *len = in_len;
+
+ return 0;
+}
+
+/*
+ * Calculate the number of bytes in the max_copy_src_segments input segments
+ * starting from input segment @in_idx.
+ */
+static loff_t blk_max_src_len(const struct blk_copy_params *params,
+ unsigned int in_idx)
+{
+ uint16_t max_src_segments =
+ params->in_bdev->bd_queue->limits.max_copy_src_segments;
+ unsigned int max_i = min(params->in_nseg, in_idx + max_src_segments);
+ loff_t len = 0;
+
+ for (uint32_t i = in_idx; i < max_i; i++)
+ len += params->in_segs[i].len;
+
+ return len;
+}
+
+/*
+ * Calculate the number of bytes in the max_copy_dst_segments output segments
+ * starting from output segment @out_idx.
+ */
+static loff_t blk_max_dst_len(const struct blk_copy_params *params,
+ unsigned int out_idx)
+{
+ uint16_t max_dst_segments =
+ params->out_bdev->bd_queue->limits.max_copy_dst_segments;
+ unsigned int max_i = min(params->out_nseg, out_idx + max_dst_segments);
+ loff_t len = 0;
+
+ for (uint32_t i = out_idx; i < max_i; i++)
+ len += params->out_segs[i].len;
+
+ return len;
+}
+
+struct blkdev_copy_sync_ctx {
+ struct completion compl;
+ blk_status_t status;
+};
+
+static void blkdev_end_copy_sync(const struct blk_copy_params *params)
+{
+ struct blkdev_copy_sync_ctx *ctx = params->private;
+
+ ctx->status = params->status;
+ complete(&ctx->compl);
+}
+
+static int blkdev_copy_sync(struct blk_copy_params *params)
+{
+ struct blkdev_copy_sync_ctx ctx = {
+ .compl = COMPLETION_INITIALIZER_ONSTACK(ctx.compl),
+ };
+ int ret;
+
+ WARN_ON_ONCE(params->end_io || params->private);
+ params->end_io = blkdev_end_copy_sync;
+ params->private = &ctx;
+
+ ret = blkdev_copy_offload(params);
+ if (ret && ret != -EIOCBQUEUED)
+ return ret;
+
+ wait_for_completion(&ctx.compl);
+ return blk_status_to_errno(ctx.status);
+}
+
+/**
+ * blkdev_copy_chunk() - submit a single copy offload operation
+ * @params: Copy offload input parameters.
+ * @in_idx: Index of the input segment from where to start copying.
+ * @out_idx: Index of the output segment to where to start copying.
+ * @in_offset: Offset in bytes from the start of input segment @in_idx.
+ * @out_offset: Offset in bytes from the start of output segment @out_idx.
+ * @chunk: Maximum number of bytes to copy.
+ *
+ * Returns: the number of bytes covered by the submitted copy operation or a
+ * negative error number.
+ */
+static loff_t blkdev_copy_chunk(struct blk_copy_params *params, u32 *in_idx,
+ u32 *out_idx, loff_t *in_offset,
+ loff_t *out_offset, loff_t chunk)
+{
+ struct bio_copy_offload_ctx *ctx;
+ u32 bio_count;
+
+ ctx = kzalloc_obj(*ctx);
+ if (!ctx)
+ return -ENOMEM;
+
+ spin_lock_init(&ctx->lock);
+ ctx->params = params;
+ ctx->phase = BLKDEV_TRANSLATE_LBAS;
+ ctx->translation_complete = blkdev_translation_complete;
+ /*
+ * Initialized to one to prevent that ctx->translation_complete() is
+ * called before bio submission has finished.
+ */
+ ctx->bio_count = 1;
+
+ WARN_ON_ONCE(chunk <= 0);
+ chunk = min(chunk, blk_max_src_len(params, *in_idx) - *in_offset);
+ WARN_ON_ONCE(chunk <= 0);
+ chunk = min(chunk, blk_max_dst_len(params, *out_idx) - *out_offset);
+ WARN_ON_ONCE(chunk <= 0);
+ ctx->len = chunk;
+ for (loff_t bytes, remaining_in = chunk; remaining_in > 0;
+ remaining_in -= bytes) {
+ struct bio *src_bio;
+
+ src_bio = bio_alloc(params->in_bdev, 0, REQ_OP_COPY_SRC,
+ GFP_NOIO);
+ if (!src_bio) {
+ if (remaining_in == chunk)
+ goto free_ctx;
+ else
+ goto enomem;
+ }
+ if (remaining_in == chunk)
+ atomic_inc(&params->copy_ctx_count);
+ scoped_guard(spinlock_irqsave, &ctx->lock)
+ ctx->bio_count++;
+ bytes = min(remaining_in, params->in_segs[*in_idx].len -
+ *in_offset);
+ src_bio->bi_iter.bi_size = bytes;
+ src_bio->bi_iter.bi_sector = (params->in_segs[*in_idx].pos +
+ *in_offset) >> SECTOR_SHIFT;
+ src_bio->bi_copy_ctx = ctx;
+ src_bio->bi_end_io = blkdev_req_op_copy_done;
+ *in_offset += bytes;
+ if (*in_offset >= params->in_segs[*in_idx].len) {
+ *in_offset -= params->in_segs[*in_idx].len;
+ (*in_idx)++;
+ }
+ submit_bio(src_bio);
+ }
+ for (loff_t bytes, remaining_out = chunk; remaining_out;
+ remaining_out -= bytes) {
+ struct bio *dst_bio;
+
+ dst_bio = bio_alloc(params->out_bdev, 0, REQ_OP_COPY_DST,
+ GFP_NOIO);
+ if (!dst_bio)
+ goto enomem;
+ scoped_guard(spinlock_irqsave, &ctx->lock)
+ ctx->bio_count++;
+ bytes = min(remaining_out, params->out_segs[*out_idx].len -
+ *out_offset);
+ dst_bio->bi_iter.bi_size = bytes;
+ dst_bio->bi_iter.bi_sector = (params->out_segs[*out_idx].pos +
+ *out_offset) >> SECTOR_SHIFT;
+ dst_bio->bi_copy_ctx = ctx;
+ dst_bio->bi_end_io = blkdev_req_op_copy_done;
+ *out_offset += bytes;
+ if (*out_offset >= params->out_segs[*out_idx].len) {
+ *out_offset -= params->out_segs[*out_idx].len;
+ (*out_idx)++;
+ }
+ submit_bio(dst_bio);
+ }
+
+dec_bio_count:
+ scoped_guard(spinlock_irqsave, &ctx->lock)
+ bio_count = --ctx->bio_count;
+ if (bio_count == 0)
+ ctx->translation_complete(ctx);
+ return chunk;
+
+enomem:
+ scoped_guard(spinlock_irqsave, &ctx->lock)
+ if (!ctx->status)
+ ctx->status = BLK_STS_RESOURCE;
+ chunk = -ENOMEM;
+ goto dec_bio_count;
+
+free_ctx:
+ kfree(ctx);
+ return -ENOMEM;
+}
+
+/**
+ * blkdev_copy_offload() - copy data and offload copying if possible.
+ * @params: Source and destination block device, data ranges and completion
+ * callback.
+ *
+ * If @params->end_io != NULL, data is copied asynchronously. If @params->end_io
+ * == NULL, this function only returns after data copying has finished.
+ *
+ * Return: 0 upon success; -EIOCBQUEUED if the completion callback function will
+ * be called or has already been called; -EOPNOTSUPP if copy offloading is
+ * not supported by the block device or if the source or destination
+ * address ranges span more than one dm device.
+ */
+int blkdev_copy_offload(struct blk_copy_params *params)
+{
+ loff_t in_offset = 0, out_offset = 0;
+ u32 in_idx = 0, out_idx = 0;
+ loff_t len, chunk, max_chunk;
+ int ret;
+
+ might_sleep();
+
+ if (!params->end_io)
+ return blkdev_copy_sync(params);
+
+ spin_lock_init(&params->lock);
+
+ if (!bdev_max_copy_sectors(params->in_bdev) ||
+ !bdev_max_copy_sectors(params->out_bdev))
+ return -EOPNOTSUPP;
+
+ ret = blkdev_copy_check_params(params, &len);
+ if (ret)
+ return ret;
+
+ params->len = len;
+
+ max_chunk = (u64)min(bdev_max_copy_sectors(params->in_bdev),
+ bdev_max_copy_sectors(params->out_bdev))
+ << SECTOR_SHIFT;
+
+ atomic_set(&params->copy_ctx_count, 1);
+
+ for (loff_t offset = 0; offset < len; offset += chunk) {
+ chunk = min(len - offset, max_chunk);
+ chunk = blkdev_copy_chunk(params, &in_idx, &out_idx, &in_offset,
+ &out_offset, chunk);
+ if (chunk < 0)
+ break;
+ }
+
+ if (atomic_dec_and_test(&params->copy_ctx_count))
+ params->end_io(params);
+
+ return -EIOCBQUEUED;
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_offload);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4e448e810b87..27a0f92fc2cb 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -535,4 +535,44 @@ struct blk_rq_stat {
u64 batch;
};
+/* A single input or output segment descriptor. */
+struct blk_copy_seg {
+ loff_t pos;
+ loff_t len;
+};
+
+/**
+ * struct blk_copy_params - input parameters and internal parameters for copy
+ * operations.
+ * @in_bdev: Input block device.
+ * @in_segs: Input LBA ranges.
+ * @in_nseg: Number of elements in @in_segs.
+ * @out_bdev: Output block device.
+ * @out_segs: Output LBA ranges.
+ * @out_nseg: Number of elements in @out_segs.
+ * @end_io: Called after copying data finished. If %NULL, copying data happens
+ * synchronously instead of asynchronously.
+ * @private: May be used by @end_io. Not used by the block layer itself.
+ * @len: Total number of bytes to copy. Set by blkdev_copy_offload() or
+ * blkdev_copy_onload().
+ * @copy_ctx_count: Number of in-flight copy contexts associated with copy
+ * offload operations.
+ * @lock: Protects @status updates.
+ * @status: I/O completion status.
+ */
+struct blk_copy_params {
+ struct block_device *in_bdev;
+ struct blk_copy_seg *in_segs;
+ unsigned int in_nseg;
+ struct block_device *out_bdev;
+ struct blk_copy_seg *out_segs;
+ unsigned int out_nseg;
+ void (*end_io)(const struct blk_copy_params *params);
+ void *private;
+ loff_t len;
+ atomic_t copy_ctx_count;
+ spinlock_t lock;
+ blk_status_t status;
+};
+
#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8ae64cc0546f..fea296150cda 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1283,6 +1283,7 @@ void __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp);
+int blkdev_copy_offload(struct blk_copy_params *params);
#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 04/12] block: Add an onloaded copy implementation
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
` (2 preceding siblings ...)
2026-04-24 22:41 ` [PATCH 03/12] block: Introduce blkdev_copy_offload() Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:41 ` [PATCH 05/12] block: Introduce accessor functions for copy offload bios Bart Van Assche
` (7 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche, Vincent Fu, Anuj Gupta
From: Nitesh Shetty <nj.shetty@samsung.com>
For devices that do not support copy offloading, add a function that
copies data by submitting READ and WRITE operations.
Onloaded copying is implemented by reading from the source block device
into memory and by writing this data to the destination block device.
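A hypothetical asynchronous caller of the onload path (the completion
callback is required because there is no synchronous wrapper):

    static void my_copy_done(const struct blk_copy_params *params)
    {
        pr_info("copy done: %d\n", blk_status_to_errno(params->status));
    }

    params.end_io = my_copy_done;
    ret = blkdev_copy_onload(&params);
    /* -EIOCBQUEUED: my_copy_done() will be or has already been called */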
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Vincent Fu <vincent.fu@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-copy.c | 239 +++++++++++++++++++++++++++++++++++++++++
include/linux/blkdev.h | 1 +
2 files changed, 240 insertions(+)
diff --git a/block/blk-copy.c b/block/blk-copy.c
index 8ac8879442f7..459ed8581efc 100644
--- a/block/blk-copy.c
+++ b/block/blk-copy.c
@@ -7,6 +7,26 @@
#include <linux/blk-copy.h>
#include <linux/blk-mq.h>
+/**
+ * struct blkdev_copy_onload_ctx - state of a single onloaded copy operation
+ * @params: Data copy parameters.
+ * @read_work: For scheduling read work.
+ * @write_work: For scheduling write work.
+ * @buf: Data buffer.
+ * @buf_len: Length in bytes of @buf.
+ * @offset: Current copy offset in bytes. Range: [0, params->len).
+ * @chunk: Size in bytes of the chunk of data that is being copied.
+ */
+struct blkdev_copy_onload_ctx {
+ struct blk_copy_params *params;
+ struct work_struct read_work;
+ struct work_struct write_work;
+ void *buf;
+ size_t buf_len;
+ loff_t offset;
+ loff_t chunk;
+};
+
/* End all bios in the @ctx->bios list with status @ctx->status. */
static void blkdev_end_bios(struct bio_copy_offload_ctx *ctx)
{
@@ -357,3 +377,222 @@ int blkdev_copy_offload(struct blk_copy_params *params)
return -EIOCBQUEUED;
}
EXPORT_SYMBOL_GPL(blkdev_copy_offload);
+
+static void *blkdev_copy_alloc_buf(size_t req_size, size_t *alloc_size)
+{
+ unsigned int min_size = PAGE_SIZE;
+ char *buf;
+
+ while (req_size >= min_size) {
+ buf = kmalloc(req_size, GFP_NOIO | __GFP_NOWARN);
+ if (buf) {
+ *alloc_size = req_size;
+ return buf;
+ }
+ req_size >>= 1;
+ }
+
+ return NULL;
+}
+
+static struct bio *bio_map_buf(void *buf, unsigned int len)
+{
+ struct page *page;
+ struct bio *bio;
+ static const uint16_t nr_vecs = 1;
+
+ bio = bio_kmalloc(nr_vecs, GFP_NOIO);
+ if (!bio)
+ return NULL;
+ bio_init_inline(bio, /*bdev=*/NULL, /*max_vecs=*/nr_vecs, /*opf=*/0);
+
+ page = virt_to_page(buf);
+ if (bio_add_page(bio, page, len, offset_in_page(buf)) < len) {
+ /* we don't support partial mappings */
+ bio_uninit(bio);
+ kfree(bio);
+ WARN_ON_ONCE(true);
+ return NULL;
+ }
+
+ return bio;
+}
+
+static void blkdev_write_done(struct bio *bio)
+{
+ struct blkdev_copy_onload_ctx *ctx = bio->bi_copy_ctx;
+ struct blk_copy_params *params = ctx->params;
+ blk_status_t sts = bio->bi_status;
+
+ kfree(bio);
+
+ if (sts) {
+ params->status = sts;
+ kfree(ctx->buf);
+ kfree(ctx);
+ params->end_io(params);
+ return;
+ }
+
+ ctx->offset += ctx->chunk;
+
+ schedule_work(&ctx->read_work);
+}
+
+static sector_t blkdev_offset_to_out_pos(const struct blk_copy_params *params,
+ loff_t offset)
+{
+ for (int i = 0; i < params->out_nseg; i++) {
+ loff_t rem = params->out_segs[i].len - offset;
+
+ if (rem > 0)
+ return params->out_segs[i].pos + offset;
+ offset -= params->out_segs[i].len;
+ }
+ return 0;
+}
+
+static void blkdev_write_work(struct work_struct *work)
+{
+ struct blkdev_copy_onload_ctx *ctx =
+ container_of(work, typeof(*ctx), write_work);
+ struct blk_copy_params *params = ctx->params;
+ struct bio *bio;
+ loff_t out_pos;
+
+ out_pos = blkdev_offset_to_out_pos(params, ctx->offset);
+
+ bio = bio_map_buf(ctx->buf, ctx->buf_len);
+ if (!bio) {
+ params->status = BLK_STS_AGAIN;
+ kfree(ctx->buf);
+ kfree(ctx);
+ params->end_io(params);
+ return;
+ }
+ bio->bi_opf = REQ_OP_WRITE;
+ bio_set_dev(bio, params->out_bdev);
+ bio->bi_iter.bi_sector = out_pos >> SECTOR_SHIFT;
+ bio->bi_iter.bi_size = ctx->chunk;
+ bio->bi_end_io = blkdev_write_done;
+ bio->bi_copy_ctx = ctx;
+ submit_bio(bio);
+}
+
+static void blkdev_read_done(struct bio *bio)
+{
+ struct blkdev_copy_onload_ctx *ctx = bio->bi_copy_ctx;
+ struct blk_copy_params *params = ctx->params;
+ blk_status_t sts = bio->bi_status;
+
+ kfree(bio);
+
+ if (sts) {
+ params->status = sts;
+ kfree(ctx->buf);
+ kfree(ctx);
+ params->end_io(params);
+ return;
+ }
+
+ schedule_work(&ctx->write_work);
+}
+
+static sector_t blkdev_offset_to_in_pos(const struct blk_copy_params *params,
+ loff_t offset, loff_t *chunk)
+{
+ for (int i = 0; i < params->in_nseg; i++) {
+ loff_t rem = params->in_segs[i].len - offset;
+
+ if (rem > 0) {
+ if (*chunk > rem)
+ *chunk = rem;
+ return params->in_segs[i].pos + offset;
+ }
+ offset -= params->in_segs[i].len;
+ }
+ *chunk = 0;
+ return 0;
+}
+
+static void blkdev_read_work(struct work_struct *work)
+{
+ struct blkdev_copy_onload_ctx *ctx =
+ container_of(work, typeof(*ctx), read_work);
+ struct blk_copy_params *params = ctx->params;
+ loff_t offset = ctx->offset;
+ sector_t in_pos;
+ struct bio *bio;
+
+ ctx->chunk = min_t(loff_t, ctx->buf_len, params->len - offset);
+ if (ctx->chunk)
+ in_pos = blkdev_offset_to_in_pos(params, offset, &ctx->chunk);
+ if (ctx->chunk == 0) {
+ kfree(ctx->buf);
+ kfree(ctx);
+ params->end_io(params);
+ return;
+ }
+
+ bio = bio_map_buf(ctx->buf, ctx->buf_len);
+ if (!bio) {
+ params->status = BLK_STS_AGAIN;
+ kfree(ctx->buf);
+ kfree(ctx);
+ params->end_io(params);
+ return;
+ }
+ bio->bi_opf = REQ_OP_READ;
+ bio_set_dev(bio, params->in_bdev);
+ bio->bi_iter.bi_sector = in_pos >> SECTOR_SHIFT;
+ bio->bi_iter.bi_size = ctx->chunk;
+ bio->bi_end_io = blkdev_read_done;
+ bio->bi_copy_ctx = ctx;
+ submit_bio(bio);
+}
+
+/**
+ * blkdev_copy_onload - asynchronously copy data between two block devices using
+ * read and write operations.
+ * @params: Input and output block devices, input and output ranges and
+ * completion callback pointer.
+ * Return: -EIOCBQUEUED if the completion callback function will be called or
+ * has already been called; a negative error number upon failure.
+ */
+int blkdev_copy_onload(struct blk_copy_params *params)
+{
+ loff_t max_hw_bytes =
+ min(queue_max_hw_sectors(params->in_bdev->bd_queue),
+ queue_max_hw_sectors(params->out_bdev->bd_queue)) <<
+ SECTOR_SHIFT;
+ struct blkdev_copy_onload_ctx *ctx;
+ loff_t len;
+ int ret;
+
+ ret = blkdev_copy_check_params(params, &len);
+ if (ret)
+ return ret;
+
+ params->len = len;
+
+ ctx = kzalloc_obj(*ctx);
+ if (!ctx)
+ return -ENOMEM;
+
+ INIT_WORK(&ctx->read_work, blkdev_read_work);
+ INIT_WORK(&ctx->write_work, blkdev_write_work);
+ ctx->params = params;
+
+ ctx->buf = blkdev_copy_alloc_buf(min(max_hw_bytes, len), &ctx->buf_len);
+ if (!ctx->buf)
+ goto err;
+
+ blkdev_read_work(&ctx->read_work);
+
+ return -EIOCBQUEUED;
+
+err:
+ kfree(ctx);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_onload);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fea296150cda..817eeba2f207 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1284,6 +1284,7 @@ void __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp);
int blkdev_copy_offload(struct blk_copy_params *params);
+int blkdev_copy_onload(struct blk_copy_params *params);
#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 05/12] block: Introduce accessor functions for copy offload bios
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
` (3 preceding siblings ...)
2026-04-24 22:41 ` [PATCH 04/12] block: Add an onloaded copy implementation Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:41 ` [PATCH 06/12] fs/read_write: Generalize generic_copy_file_checks() Bart Van Assche
` (6 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche
Make it easy for block drivers to iterate over the bios of a copy offload
request by providing accessor functions for these bios.
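A hypothetical block driver sketch that uses these accessors to walk the
source ranges of a copy offload request:

    unsigned int nr_src = blk_copy_bio_count(rq, REQ_OP_COPY_SRC);
    struct bio *bio;

    pr_debug("%u source ranges\n", nr_src);
    for (bio = blk_first_copy_bio(rq, REQ_OP_COPY_SRC); bio;
         bio = blk_next_copy_bio(bio)) {
        /* bio->bi_iter.bi_sector and .bi_size describe one range */
    }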
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-copy.c | 47 ++++++++++++++++++++++++++++++++++++++++
include/linux/blk-copy.h | 4 ++++
2 files changed, 51 insertions(+)
diff --git a/block/blk-copy.c b/block/blk-copy.c
index 459ed8581efc..f49a5f835b4a 100644
--- a/block/blk-copy.c
+++ b/block/blk-copy.c
@@ -7,6 +7,53 @@
#include <linux/blk-copy.h>
#include <linux/blk-mq.h>
+static struct bio *__blk_next_copy_bio(struct request *rq, struct bio *prev_bio,
+ enum req_op op)
+{
+ struct bio *bio;
+
+ if (prev_bio) {
+ bio = prev_bio->bi_next;
+ } else {
+ struct bio_copy_offload_ctx *copy_ctx = rq->bio->bi_copy_ctx;
+
+ bio = copy_ctx->bios;
+ }
+
+ for (; bio && bio_op(bio) != op; bio = bio->bi_next)
+ ;
+ return bio;
+}
+
+struct bio *blk_first_copy_bio(struct request *rq, enum req_op op)
+{
+ struct bio *bio = rq->bio;
+
+ if (bio_op(bio) == op)
+ return bio;
+
+ return __blk_next_copy_bio(rq, NULL, op);
+}
+EXPORT_SYMBOL_GPL(blk_first_copy_bio);
+
+struct bio *blk_next_copy_bio(struct bio *bio)
+{
+ return __blk_next_copy_bio(NULL, bio, bio_op(bio));
+}
+EXPORT_SYMBOL_GPL(blk_next_copy_bio);
+
+unsigned int blk_copy_bio_count(struct request *rq, enum req_op op)
+{
+ unsigned int count = 0;
+
+ for (struct bio *bio = blk_first_copy_bio(rq, op); bio;
+ bio = blk_next_copy_bio(bio))
+ count++;
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(blk_copy_bio_count);
+
/**
* Tracks the state of a single onloaded copy operation.
* @params: Data copy parameters.
diff --git a/include/linux/blk-copy.h b/include/linux/blk-copy.h
index 5e38cfc14a71..4c8435312752 100644
--- a/include/linux/blk-copy.h
+++ b/include/linux/blk-copy.h
@@ -43,4 +43,8 @@ struct bio_copy_offload_ctx {
void (*translation_complete)(struct bio_copy_offload_ctx *ctx);
};
+struct bio *blk_first_copy_bio(struct request *rq, enum req_op op);
+struct bio *blk_next_copy_bio(struct bio *bio);
+unsigned int blk_copy_bio_count(struct request *rq, enum req_op op);
+
#endif /* __LINUX_BLK_COPY_H */
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 06/12] fs/read_write: Generalize generic_copy_file_checks()
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
` (4 preceding siblings ...)
2026-04-24 22:41 ` [PATCH 05/12] block: Introduce accessor functions for copy offload bios Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:41 ` [PATCH 07/12] fs, block: Add copy_file_range() support for block devices Bart Van Assche
` (5 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche, Anuj Gupta, Hannes Reinecke
From: Anuj Gupta <anuj20.g@samsung.com>
Prepare for adding copy_file_range() support for block devices by making
the following changes:
- Change file_inode(file) into file->f_mapping->host. Although only one
inode is associated with regular files, two inodes are associated
with block devices. file->f_mapping->host is the primary block device
inode.
- Change S_ISREG() into S_ISREG() || S_ISBLK().
- Add an inode->i_mode & S_IFMT check that verifies that source and
destination have the same type (block device or regular file).
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
[ bvanassche: rewrote patch description ]
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
fs/read_write.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 50bff7edc91f..d6fba5afff94 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1484,8 +1484,8 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t *req_count, unsigned int flags)
{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
+ struct inode *inode_in = file_in->f_mapping->host;
+ struct inode *inode_out = file_out->f_mapping->host;
uint64_t count = *req_count;
loff_t size_in;
int ret;
@@ -1791,7 +1791,9 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
/* Don't copy dirs, pipes, sockets... */
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ if (!S_ISREG(inode_in->i_mode) && !S_ISBLK(inode_in->i_mode))
+ return -EINVAL;
+ if ((inode_in->i_mode & S_IFMT) != (inode_out->i_mode & S_IFMT))
return -EINVAL;
if (!(file_in->f_mode & FMODE_READ) ||
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 07/12] fs, block: Add copy_file_range() support for block devices
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
` (5 preceding siblings ...)
2026-04-24 22:41 ` [PATCH 06/12] fs/read_write: Generalize generic_copy_file_checks() Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:41 ` [PATCH 08/12] nvme: Add copy offloading support Bart Van Assche
` (4 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche, Hannes Reinecke, Anuj Gupta
From: Nitesh Shetty <nj.shetty@samsung.com>
Add copy_file_range() support for block devices. If input and output block
devices have been opened with O_DIRECT and if copy offloading is supported,
use blkdev_copy_offload(). Otherwise, use splice_copy_file_range().
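A hypothetical userspace fragment (device path and offsets are made up;
error checking omitted) that exercises the offloaded path:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    int in = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);
    int out = open("/dev/nvme0n1", O_WRONLY | O_DIRECT);
    off_t pos_in = 0, pos_out = 1 << 20;
    ssize_t copied = copy_file_range(in, &pos_in, out, &pos_out,
                                     1 << 20, 0);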
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/fops.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/block/fops.c b/block/fops.c
index bb6642b45937..f438503f1b77 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -19,6 +19,7 @@
#include <linux/iomap.h>
#include <linux/module.h>
#include <linux/io_uring/cmd.h>
+#include <linux/splice.h>
#include "blk.h"
static inline struct inode *bdev_file_inode(struct file *file)
@@ -861,6 +862,58 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
+static ssize_t blkdev_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
+{
+ struct block_device *in_bdev = I_BDEV(bdev_file_inode(file_in));
+ struct block_device *out_bdev = I_BDEV(bdev_file_inode(file_out));
+ loff_t in_end, out_end;
+ int err;
+
+ if (check_add_overflow(pos_in, len, &in_end) ||
+ PAGE_ALIGN(in_end) < in_end ||
+ check_add_overflow(pos_out, len, &out_end) ||
+ PAGE_ALIGN(out_end) < out_end)
+ return -EINVAL;
+
+ /*
+ * filemap_write_and_wait_range() and filemap_invalidate_inode() expect
+ * that the 'end' argument is rounded up to the next multiple of
+ * PAGE_SIZE.
+ */
+ in_end = PAGE_ALIGN(in_end);
+ out_end = PAGE_ALIGN(out_end);
+
+ if (bdev_max_copy_sectors(in_bdev) && bdev_max_copy_sectors(out_bdev) &&
+ file_in->f_iocb_flags & file_out->f_iocb_flags & IOCB_DIRECT) {
+ struct blk_copy_seg in_seg = { .pos = pos_in, .len = len };
+ struct blk_copy_seg out_seg = { .pos = pos_out, .len = len };
+ struct blk_copy_params params = {
+ .in_bdev = in_bdev,
+ .out_bdev = out_bdev,
+ .in_nseg = 1,
+ .in_segs = &in_seg,
+ .out_nseg = 1,
+ .out_segs = &out_seg,
+ };
+ err = filemap_write_and_wait_range(file_in->f_mapping, pos_in,
+ in_end);
+ if (err)
+ return err;
+ err = filemap_invalidate_inode(bdev_file_inode(file_out),
+ /*flush=*/false,
+ pos_out, out_end);
+ if (err)
+ return err;
+ if (blkdev_copy_offload(&params) == 0)
+ return len;
+ /* If copy offloading fails, fall back to splice-based copying. */
+ }
+
+ return splice_copy_file_range(file_in, pos_in, file_out, pos_out, len);
+}
+
#define BLKDEV_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES)
@@ -967,6 +1020,7 @@ const struct file_operations def_blk_fops = {
.fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC,
+ .copy_file_range = blkdev_copy_file_range,
};
static __init int blkdev_init(void)
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 08/12] nvme: Add copy offloading support
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
` (6 preceding siblings ...)
2026-04-24 22:41 ` [PATCH 07/12] fs, block: Add copy_file_range() support for block devices Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:41 ` [PATCH 09/12] nvmet: Support the Copy command Bart Van Assche
` (3 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche, Kanchan Joshi,
Javier González, Anuj Gupta
From: Nitesh Shetty <nj.shetty@samsung.com>
Add support for the NVMe Copy command. This command supports a single
destination range and up to 256 source ranges.
Add trace event support for nvme_copy_cmd.
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Javier González <javier.gonz@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
[ bvanassche: generalized Copy support from one to 256 source ranges; fixed
an endianness issue in nvme_config_copy(); renamed rsvd91 into rsvd81 and
verified the offset with pahole ]
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/nvme/host/constants.c | 1 +
drivers/nvme/host/core.c | 106 ++++++++++++++++++++++++++++++++++
drivers/nvme/host/trace.c | 19 ++++++
include/linux/nvme.h | 46 ++++++++++++++-
4 files changed, 169 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index dc90df9e13a2..b80c7c7fb629 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -19,6 +19,7 @@ static const char * const nvme_ops[] = {
[nvme_cmd_resv_report] = "Reservation Report",
[nvme_cmd_resv_acquire] = "Reservation Acquire",
[nvme_cmd_resv_release] = "Reservation Release",
+ [nvme_cmd_copy] = "Copy Offload",
[nvme_cmd_zone_mgmt_send] = "Zone Management Send",
[nvme_cmd_zone_mgmt_recv] = "Zone Management Receive",
[nvme_cmd_zone_append] = "Zone Append",
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1e33af94c24b..6f3c1fde112f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -6,6 +6,7 @@
#include <linux/async.h>
#include <linux/blkdev.h>
+#include <linux/blk-copy.h>
#include <linux/blk-mq.h>
#include <linux/blk-integrity.h>
#include <linux/compat.h>
@@ -821,6 +822,87 @@ static inline void nvme_setup_flush(struct nvme_ns *ns,
cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}
+/*
+ * Translate REQ_OP_COPY_SRC and REQ_OP_COPY_DST bios into an NVMe Copy command.
+ * The NVMe copy command supports multiple source LBA ranges, a single
+ * destination LBA range, and also supports copying across NVMe namespaces. This
+ * implementation supports all these features except copying across NVMe
+ * namespaces.
+ */
+static inline blk_status_t nvme_setup_copy_offload(struct nvme_ns *ns,
+ struct request *req,
+ struct nvme_command *cmnd)
+{
+ const u32 nr_range = blk_copy_bio_count(req, REQ_OP_COPY_SRC);
+ struct nvme_ns *src_ns, *dst_ns;
+ struct bio *src_bio = NULL, *dst_bio;
+ struct nvme_copy_range *range;
+ u16 control = 0;
+ u64 dlba;
+
+ dst_bio = blk_first_copy_bio(req, REQ_OP_COPY_DST);
+
+ if (WARN_ON_ONCE(!dst_bio))
+ return BLK_STS_IOERR;
+
+ /* TO DO: derive dst_ns from dst_bio. */
+ dst_ns = ns;
+ dlba = nvme_sect_to_lba(dst_ns->head, dst_bio->bi_iter.bi_sector);
+
+ if (req->cmd_flags & REQ_FUA)
+ control |= NVME_RW_FUA;
+
+ if (req->cmd_flags & REQ_FAILFAST_DEV)
+ control |= NVME_RW_LR;
+
+ *cmnd = (typeof(*cmnd)){
+ .copy = {
+ .opcode = nvme_cmd_copy,
+ .nsid = cpu_to_le32(dst_ns->head->ns_id),
+ .control = cpu_to_le16(control),
+ .sdlba = cpu_to_le64(dlba),
+ .desfmt_prinfor = 2, /* DESFMT=2 */
+ .nr_range = nr_range - 1, /* 0's based */
+ }
+ };
+
+ range = kmalloc_array(nr_range, sizeof(*range),
+ GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN);
+ if (!range)
+ return BLK_STS_RESOURCE;
+
+ for (unsigned int i = 0; i < nr_range; i++) {
+ u64 slba;
+ u32 nslb;
+
+ if (!src_bio)
+ src_bio = blk_first_copy_bio(req, REQ_OP_COPY_SRC);
+ else
+ src_bio = blk_next_copy_bio(src_bio);
+ if (WARN_ON_ONCE(!src_bio))
+ goto free_range;
+ /* TO DO: derive src_ns from src_bio. */
+ src_ns = ns;
+ slba = nvme_sect_to_lba(src_ns->head,
+ src_bio->bi_iter.bi_sector);
+ nslb = src_bio->bi_iter.bi_size >> src_ns->head->lba_shift;
+ range[i].nsid = cpu_to_le32(src_ns->head->ns_id); /* requires DESFMT=2 */
+ range[i].slba = cpu_to_le64(slba);
+ range[i].nlb = cpu_to_le16(nslb - 1);
+ }
+
+ req->special_vec.bv_page = virt_to_page(range);
+ req->special_vec.bv_offset = offset_in_page(range);
+ req->special_vec.bv_len = sizeof(*range) * nr_range;
+ req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+ return BLK_STS_OK;
+
+free_range:
+ kfree(range);
+ return BLK_STS_IOERR;
+}
+
static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd)
{
@@ -1122,6 +1204,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
case REQ_OP_ZONE_APPEND:
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
break;
+ case REQ_OP_COPY_DST:
+ case REQ_OP_COPY_SRC:
+ ret = nvme_setup_copy_offload(ns, req, cmd);
+ break;
default:
WARN_ON_ONCE(1);
return BLK_STS_IOERR;
@@ -1884,6 +1970,21 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
return true;
}
+static void nvme_config_copy(struct nvme_ns *ns, struct nvme_id_ns *id,
+ struct queue_limits *lim)
+{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+
+ if (!(ctrl->oncs & NVME_CTRL_ONCS_COPY)) {
+ lim->max_copy_hw_sectors = 0;
+ return;
+ }
+ lim->max_copy_hw_sectors = nvme_lba_to_sect(ns->head,
+ le16_to_cpu(id->mssrl));
+ lim->max_copy_src_segments = 256;
+ lim->max_copy_dst_segments = 1;
+}
+
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
return uuid_equal(&a->uuid, &b->uuid) &&
@@ -2416,6 +2517,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (!nvme_update_disk_info(ns, id, nvm, &lim))
capacity = 0;
+ nvme_config_copy(ns, id, &lim);
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
ns->head->ids.csi == NVME_CSI_ZNS)
nvme_update_zone_info(ns, &lim, &zi);
@@ -2542,6 +2644,9 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
lim.physical_block_size = ns_lim->physical_block_size;
lim.io_min = ns_lim->io_min;
lim.io_opt = ns_lim->io_opt;
+ lim.max_copy_hw_sectors = UINT_MAX;
+ lim.max_copy_src_segments = U16_MAX;
+ lim.max_copy_dst_segments = U16_MAX;
queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
ns->head->disk->disk_name);
if (unsupported)
@@ -5368,6 +5473,7 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_copy_command) != 64);
BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index ad25ad1e4041..7096ade7740c 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -153,6 +153,23 @@ static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
return ret;
}
+static const char *nvme_trace_copy(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u64 sdlba = get_unaligned_le64(cdw10);
+ u8 nr_range = cdw10[8];
+ u16 control = get_unaligned_le16(cdw10 + 10);
+ u32 dsmgmt = get_unaligned_le32(cdw10 + 12);
+ u32 reftag = get_unaligned_le32(cdw10 + 16);
+
+ trace_seq_printf(p,
+ "sdlba=%llu, nr_range=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u",
+ sdlba, nr_range, control, dsmgmt, reftag);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -386,6 +403,8 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
return nvme_trace_resv_rel(p, cdw10);
case nvme_cmd_resv_report:
return nvme_trace_resv_report(p, cdw10);
+ case nvme_cmd_copy:
+ return nvme_trace_copy(p, cdw10);
default:
return nvme_trace_common(p, cdw10);
}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 041f30931a90..ead8e5128e3b 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -376,7 +376,7 @@ struct nvme_id_ctrl {
__u8 nvscc;
__u8 nwpc;
__le16 acwu;
- __u8 rsvd534[2];
+ __le16 ocfs;
__le32 sgls;
__le32 mnan;
__u8 rsvd544[224];
@@ -404,6 +404,7 @@ enum {
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_ONCS_RESERVATIONS = 1 << 5,
NVME_CTRL_ONCS_TIMESTAMP = 1 << 6,
+ NVME_CTRL_ONCS_COPY = 1 << 8,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
NVME_CTRL_OACS_NS_MNGT_SUPP = 1 << 3,
@@ -458,7 +459,10 @@ struct nvme_id_ns {
__le16 npdg;
__le16 npda;
__le16 nows;
- __u8 rsvd74[18];
+ __le16 mssrl;
+ __le32 mcl;
+ __u8 msrc;
+ __u8 rsvd81[11];
__le32 anagrpid;
__u8 rsvd96[3];
__u8 nsattr;
@@ -967,6 +971,7 @@ enum nvme_opcode {
nvme_cmd_resv_acquire = 0x11,
nvme_cmd_io_mgmt_recv = 0x12,
nvme_cmd_resv_release = 0x15,
+ nvme_cmd_copy = 0x19,
nvme_cmd_zone_mgmt_send = 0x79,
nvme_cmd_zone_mgmt_recv = 0x7a,
nvme_cmd_zone_append = 0x7d,
@@ -991,7 +996,8 @@ enum nvme_opcode {
nvme_opcode_name(nvme_cmd_resv_release), \
nvme_opcode_name(nvme_cmd_zone_mgmt_send), \
nvme_opcode_name(nvme_cmd_zone_mgmt_recv), \
- nvme_opcode_name(nvme_cmd_zone_append))
+ nvme_opcode_name(nvme_cmd_zone_append), \
+ nvme_opcode_name(nvme_cmd_copy))
@@ -1169,6 +1175,39 @@ struct nvme_dsm_range {
__le64 slba;
};
+struct nvme_copy_command {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __u64 rsvd2;
+ __le64 metadata;
+ union nvme_data_ptr dptr;
+ __le64 sdlba;
+ __u8 nr_range;
+ __u8 desfmt_prinfor;
+ __le16 control;
+ __le16 rsvd13;
+ __le16 dspec;
+ __le32 ilbrt;
+ __le16 lbat;
+ __le16 lbatm;
+};
+
+struct nvme_copy_range {
+ __le32 nsid; /* DESFMT=2 only */
+ __le32 rsvd1;
+ __le64 slba;
+ __le16 nlb;
+ __le16 rsvd18;
+ __le32 rsvd20;
+ __le32 eilbrt;
+ __le16 elbat;
+ __le16 elbatm;
+};
+
+static_assert(sizeof(struct nvme_copy_range) == 32);
+
struct nvme_write_zeroes_cmd {
__u8 opcode;
__u8 flags;
@@ -2001,6 +2040,7 @@ struct nvme_command {
struct nvme_download_firmware dlfw;
struct nvme_format_cmd format;
struct nvme_dsm_cmd dsm;
+ struct nvme_copy_command copy;
struct nvme_write_zeroes_cmd write_zeroes;
struct nvme_zone_mgmt_send_cmd zms;
struct nvme_zone_mgmt_recv_cmd zmr;
* [PATCH 09/12] nvmet: Support the Copy command
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
` (7 preceding siblings ...)
2026-04-24 22:41 ` [PATCH 08/12] nvme: Add copy offloading support Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:41 ` [PATCH 10/12] dm: Add support for copy offloading Bart Van Assche
` (2 subsequent siblings)
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche, Anuj Gupta
From: Nitesh Shetty <nj.shetty@samsung.com>
Support the Copy command for namespaces backed by a block device or by a
file. For namespaces backed by a block device, we call
blkdev_copy_offload() and fall back to blkdev_copy_onload() if necessary.
For namespaces backed by a file, we call vfs_copy_file_range().
nvmet always reports that the Copy command is supported.
Tracing support is added for the Copy command.
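A rough sketch of that flow (the helper name is hypothetical; the
-EIOCBQUEUED and fallback conventions are taken from the
nvmet_bdev_execute_copy() hunk below):

static int copy_with_fallback(struct blk_copy_params *params)
{
	int ret;

	/* Try the hardware copy offload path first. */
	ret = blkdev_copy_offload(params);
	if (ret == -EIOCBQUEUED)
		return ret;	/* queued; params->end_io() completes it */
	if (ret)
		/* Offload failed early: emulate with reads and writes. */
		ret = blkdev_copy_onload(params);
	return ret;
}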
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
[ bvanassche: Increased namespace limits. ]
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/nvme/host/trace.c | 2 +-
drivers/nvme/target/admin-cmd.c | 26 +++++++++-
drivers/nvme/target/io-cmd-bdev.c | 80 +++++++++++++++++++++++++++++++
drivers/nvme/target/io-cmd-file.c | 59 +++++++++++++++++++++--
drivers/nvme/target/trace.c | 19 ++++++++
include/linux/nvme.h | 1 +
6 files changed, 179 insertions(+), 8 deletions(-)
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 7096ade7740c..fd49363f8516 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -143,7 +143,7 @@ static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
u16 length = get_unaligned_le16(cdw10 + 8);
u16 control = get_unaligned_le16(cdw10 + 10);
u32 dsmgmt = get_unaligned_le32(cdw10 + 12);
- u32 reftag = get_unaligned_le32(cdw10 + 16);
+ u32 reftag = get_unaligned_le32(cdw10 + 16);
trace_seq_printf(p,
"slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u",
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index e4fd1caadfb0..1e404df6ad84 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -733,8 +733,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);
id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM |
NVME_CTRL_ONCS_WRITE_ZEROES |
- NVME_CTRL_ONCS_RESERVATIONS);
-
+ NVME_CTRL_ONCS_RESERVATIONS | NVME_CTRL_ONCS_COPY);
/* XXX: don't report vwc if the underlying device is write through */
id->vwc = NVME_CTRL_VWC_PRESENT;
@@ -797,6 +796,27 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
+static void nvmet_set_copy_limits(struct nvme_id_ns *id)
+{
+ /*
+ * MSRC = Maximum Source Range Count - the maximum number of
+ * source ranges that may be used to specify source data in a
+ * Copy command. 0's based.
+ */
+ id->msrc = 256 - 1;
+ /*
+ * MSSRL = Maximum Single Source Range Length - the maximum number
+ * of logical blocks that may be specified in the Number of Logical
+ * Blocks field in each valid Source Range Entries Descriptor.
+ */
+ id->mssrl = cpu_to_le16(U16_MAX);
+ /*
+ * MCL = Maximum Copy Length - the maximum number of logical
+ * blocks that may be specified in a Copy command.
+ */
+ id->mcl = cpu_to_le32(U32_MAX);
+}
+
static void nvmet_execute_identify_ns(struct nvmet_req *req)
{
struct nvme_id_ns *id;
@@ -845,6 +865,8 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
if (req->ns->bdev)
nvmet_bdev_set_limits(req->ns->bdev, id);
+ nvmet_set_copy_limits(id);
+
/*
* We just provide a single LBA format that matches what the
* underlying device reports.
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index f2d9e8901df4..4196f10b02ab 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -451,6 +451,83 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
}
}
+static void nvmet_bdev_copy_endio(const struct blk_copy_params *params)
+{
+ struct nvmet_req *rq = params->private;
+ blk_status_t status = params->status;
+
+ /*
+ * From the NVM Command Set Specification section about the Copy
+ * Command: "If the command completes with failure (i.e., completes with
+ * a status code other than Successful Completion), then: [ ... ] Dword
+ * 0 of the completion queue entry contains the number of the lowest
+ * numbered Source Range entry that was not successfully copied". Since
+ * that information is not available, clear Dword 0.
+ */
+ rq->cqe->result.u32 = cpu_to_le32(0);
+
+ nvmet_req_complete(rq, blk_to_nvme_status(rq, status));
+}
+
+static void nvmet_bdev_execute_copy(struct nvmet_req *rq)
+{
+ u32 i, nr_range = (u32)rq->cmd->copy.nr_range + 1;
+ struct blk_copy_seg *in_segs __free(kfree) = NULL;
+ struct nvme_command *cmd = rq->cmd;
+ struct nvme_copy_range range;
+ u64 src_len, copy_len = 0;
+ loff_t dst_pos, src_pos;
+ u16 status;
+ int ret;
+
+ status = NVME_SC_INTERNAL;
+ in_segs = kmalloc_array(nr_range, sizeof(*in_segs), GFP_KERNEL);
+ if (!in_segs)
+ goto err_rq_complete;
+
+ for (i = 0; i < nr_range; i++) {
+ status = nvmet_copy_from_sgl(rq, i * sizeof(range), &range,
+ sizeof(range));
+ if (WARN_ON_ONCE(status))
+ goto err_rq_complete;
+ /*
+ * TODO: implement support for different source and destination
+ * namespace IDs.
+ */
+ status = errno_to_nvme_status(rq, -EIO);
+ if (le32_to_cpu(range.nsid) != rq->ns->nsid)
+ goto err_rq_complete;
+ src_pos = le64_to_cpu(range.slba) << rq->ns->blksize_shift;
+ src_len = (le16_to_cpu(range.nlb) + 1) << rq->ns->blksize_shift;
+ in_segs[i] =
+ (struct blk_copy_seg){ .pos = src_pos, .len = src_len };
+ copy_len += src_len;
+ }
+
+ dst_pos = le64_to_cpu(cmd->copy.sdlba) << rq->ns->blksize_shift;
+ struct blk_copy_seg out_seg = { .pos = dst_pos, .len = copy_len };
+ struct blk_copy_params params = {
+ .in_bdev = rq->ns->bdev,
+ .in_segs = in_segs,
+ .in_nseg = nr_range,
+ .out_bdev = rq->ns->bdev,
+ .out_segs = &out_seg,
+ .out_nseg = 1,
+ .end_io = nvmet_bdev_copy_endio,
+ .private = rq,
+ };
+ ret = blkdev_copy_offload(&params);
+ if (ret == -EIOCBQUEUED)
+ return;
+ if (ret)
+ ret = blkdev_copy_onload(&params);
+
+ rq->cqe->result.u32 = cpu_to_le32(ret == 0);
+ status = errno_to_nvme_status(rq, ret);
+err_rq_complete:
+ nvmet_req_complete(rq, status);
+}
+
u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
{
switch (req->cmd->common.opcode) {
@@ -469,6 +546,9 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
case nvme_cmd_write_zeroes:
req->execute = nvmet_bdev_execute_write_zeroes;
return 0;
+ case nvme_cmd_copy:
+ req->execute = nvmet_bdev_execute_copy;
+ return 0;
default:
return nvmet_report_invalid_opcode(req);
}
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 0b22d183f927..5e8738b45d52 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -131,11 +131,7 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags)
if (req->f.mpool_alloc && nr_bvec > NVMET_MAX_MPOOL_BVEC)
is_sync = true;
- pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
- if (unlikely(pos + req->transfer_len > req->ns->size)) {
- nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC));
- return true;
- }
+ pos = le64_to_cpu(req->cmd->copy.sdlba) << req->ns->blksize_shift;
memset(&req->f.iocb, 0, sizeof(struct kiocb));
for_each_sg(req->sg, sg, req->sg_cnt, i) {
@@ -321,6 +317,50 @@ static void nvmet_file_dsm_work(struct work_struct *w)
}
}
+static void nvmet_file_copy_work(struct work_struct *w)
+{
+ struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+ u32 id, nr_range = req->cmd->copy.nr_range + 1;
+ loff_t dst_pos;
+ ssize_t ret;
+ u16 status;
+
+ status = errno_to_nvme_status(req, -ENOSPC);
+ dst_pos = le64_to_cpu(req->cmd->copy.sdlba) << req->ns->blksize_shift;
+
+ for (id = 0; id < nr_range; id++) {
+ struct nvme_copy_range range;
+ loff_t src_pos, src_len;
+
+ status = nvmet_copy_from_sgl(req, id * sizeof(range), &range,
+ sizeof(range));
+ if (status)
+ goto out;
+ /*
+ * TODO: implement support for different source and destination
+ * namespace IDs.
+ */
+ status = errno_to_nvme_status(req, -EIO);
+ if (le32_to_cpu(range.nsid) != req->ns->nsid)
+ goto out;
+ src_pos = le64_to_cpu(range.slba) << (req->ns->blksize_shift);
+ src_len = (le16_to_cpu(range.nlb) + 1) << req->ns->blksize_shift;
+ ret = vfs_copy_file_range(req->ns->file, src_pos, req->ns->file,
+ dst_pos, src_len, COPY_FILE_SPLICE);
+ if (ret != src_len) {
+ req->cqe->result.u32 = cpu_to_le32(id);
+ status = errno_to_nvme_status(req, ret < 0 ? ret : -EIO);
+ goto out;
+ }
+ dst_pos += ret;
+ }
+
+ status = 0;
+
+out:
+ nvmet_req_complete(req, status);
+}
+
static void nvmet_file_execute_dsm(struct nvmet_req *req)
{
if (!nvmet_check_data_len_lte(req, nvmet_dsm_len(req)))
@@ -329,6 +369,12 @@ static void nvmet_file_execute_dsm(struct nvmet_req *req)
queue_work(nvmet_wq, &req->f.work);
}
+static void nvmet_file_execute_copy(struct nvmet_req *req)
+{
+ INIT_WORK(&req->f.work, nvmet_file_copy_work);
+ queue_work(nvmet_wq, &req->f.work);
+}
+
static void nvmet_file_write_zeroes_work(struct work_struct *w)
{
struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
@@ -375,6 +421,9 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
case nvme_cmd_write_zeroes:
req->execute = nvmet_file_execute_write_zeroes;
return 0;
+ case nvme_cmd_copy:
+ req->execute = nvmet_file_execute_copy;
+ return 0;
default:
return nvmet_report_invalid_opcode(req);
}
diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c
index 6dbc7036f2e4..2baef7294491 100644
--- a/drivers/nvme/target/trace.c
+++ b/drivers/nvme/target/trace.c
@@ -92,6 +92,23 @@ static const char *nvmet_trace_dsm(struct trace_seq *p, u8 *cdw10)
return ret;
}
+static const char *nvmet_trace_copy(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u64 sdlba = get_unaligned_le64(cdw10);
+ u8 nr_range = cdw10[8];
+ u16 control = get_unaligned_le16(cdw10 + 10);
+ u32 dsmgmt = get_unaligned_le32(cdw10 + 12);
+ u32 reftag = get_unaligned_le32(cdw10 + 16);
+
+ trace_seq_printf(p,
+ "sdlba=%llu, nr_range=%u, ctrl=1x%x, dsmgmt=%u, reftag=%u",
+ sdlba, nr_range, control, dsmgmt, reftag);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
static const char *nvmet_trace_common(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -303,6 +320,8 @@ const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p,
return nvmet_trace_resv_rel(p, cdw10);
case nvme_cmd_resv_report:
return nvmet_trace_resv_report(p, cdw10);
+ case nvme_cmd_copy:
+ return nvmet_trace_copy(p, cdw10);
default:
return nvmet_trace_common(p, cdw10);
}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index ead8e5128e3b..c6325aeb13a0 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -2220,6 +2220,7 @@ enum {
NVME_SC_PMR_SAN_PROHIBITED = 0x123,
NVME_SC_ANA_GROUP_ID_INVALID = 0x124,
NVME_SC_ANA_ATTACH_FAILED = 0x125,
+ NVME_SC_COMMAND_SIZE_LIMIT_EXC = 0x183,
/*
* I/O Command Set Specific - NVM commands:
* [PATCH 10/12] dm: Add support for copy offloading
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
` (8 preceding siblings ...)
2026-04-24 22:41 ` [PATCH 09/12] nvmet: Support the Copy command Bart Van Assche
@ 2026-04-24 22:41 ` Bart Van Assche
2026-04-24 22:42 ` [PATCH 11/12] dm-linear: Enable " Bart Van Assche
2026-04-24 22:42 ` [PATCH 12/12] null_blk: Add support for REQ_OP_COPY_* Bart Van Assche
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:41 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche
In dm_calculate_queue_limits(), clear the copy offload limits if the
target does not opt in to copy offloading by setting
BLK_FEAT_STACKING_COPY_OFFL. This is necessary since
blk_set_stacking_limits() initializes the copy offload limits to their
maximum values.
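For reference, a minimal sketch of the initialization assumed here (the
helper name is hypothetical; the values match the ones applied to the
NVMe multipath disk in patch 08/12):

static void blk_set_stacking_copy_limits(struct queue_limits *lim)
{
	/*
	 * Start from the most permissive values so that stacking
	 * against each component device can only narrow them.
	 */
	lim->max_copy_hw_sectors = UINT_MAX;
	lim->max_copy_src_segments = U16_MAX;
	lim->max_copy_dst_segments = U16_MAX;
}

Because these defaults are maximal, a table that stacks a target without
BLK_FEAT_STACKING_COPY_OFFL has to zero them explicitly, which is what
the hunk below does.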
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/md/dm-table.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index dc2eff6b739d..888c5bdca5f1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1816,6 +1816,14 @@ int dm_calculate_queue_limits(struct dm_table *t,
return -EINVAL;
combine_limits:
+ if (!(ti_limits.features & BLK_FEAT_STACKING_COPY_OFFL)) {
+ ti_limits.max_copy_hw_sectors = 0;
+ ti_limits.max_copy_src_segments = 0;
+ ti_limits.max_copy_dst_segments = 0;
+ ti_limits.max_user_copy_sectors = 0;
+ ti_limits.max_copy_sectors = 0;
+ }
+
/*
* Merge this target's queue limits into the overall limits
* for the table.
* [PATCH 11/12] dm-linear: Enable copy offloading
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
` (9 preceding siblings ...)
2026-04-24 22:41 ` [PATCH 10/12] dm: Add support for copy offloading Bart Van Assche
@ 2026-04-24 22:42 ` Bart Van Assche
2026-04-24 22:42 ` [PATCH 12/12] null_blk: Add support for REQ_OP_COPY_* Bart Van Assche
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:42 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche
Set BLK_FEAT_STACKING_COPY_OFFL to enable copy offloading for dm-linear;
the copy offload limits themselves are stacked from the underlying
device.
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
drivers/md/dm-linear.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 38c17846deb0..3de8bf5f11fb 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -119,6 +119,11 @@ static void linear_status(struct dm_target *ti, status_type_t type,
}
}
+static void linear_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ limits->features |= BLK_FEAT_STACKING_COPY_OFFL;
+}
+
static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
unsigned int cmd, unsigned long arg,
bool *forward)
@@ -211,6 +216,7 @@ static struct target_type linear_target = {
.dtr = linear_dtr,
.map = linear_map,
.status = linear_status,
+ .io_hints = linear_io_hints,
.prepare_ioctl = linear_prepare_ioctl,
.iterate_devices = linear_iterate_devices,
.direct_access = linear_dax_direct_access,
* [PATCH 12/12] null_blk: Add support for REQ_OP_COPY_*
2026-04-24 22:41 [PATCH 00/12] Block storage copy offloading Bart Van Assche
` (10 preceding siblings ...)
2026-04-24 22:42 ` [PATCH 11/12] dm-linear: Enable " Bart Van Assche
@ 2026-04-24 22:42 ` Bart Van Assche
11 siblings, 0 replies; 13+ messages in thread
From: Bart Van Assche @ 2026-04-24 22:42 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, linux-scsi, linux-nvme, Christoph Hellwig,
Nitesh Shetty, Bart Van Assche, Damien Le Moal, Anuj Gupta,
Vincent Fu
From: Nitesh Shetty <nj.shetty@samsung.com>
The implementation is based on the existing read and write infrastructure.
max_copy_bytes: a new configfs and module parameter which can be used to
set the maximum copy limit supported by the hardware/driver.
Only the request-based queue mode supports copy offload.
Tracefs support is added for copy I/O tracing.
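As a hypothetical sanity check (not part of this patch; the helper and
its error handling are illustrative), the configured limit should be
visible through the copy_max_bytes queue attribute introduced earlier in
this series:

#include <stdio.h>

/* Read /sys/block/<disk>/queue/copy_max_bytes; returns -1 on error. */
static long read_copy_max_bytes(const char *disk)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/block/%s/queue/copy_max_bytes", disk);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

After 'modprobe null_blk max_copy_bytes=1048576', reading nullb0 this way
is expected to report at most 1048576 bytes, since copy_max_bytes is
capped by copy_max_hw_bytes.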
Suggested-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Vincent Fu <vincent.fu@samsung.com>
[ bvanassche: Split nullb_do_copy() into two functions. Added a
cond_resched() call inside nullb_do_copy(). ]
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
Documentation/block/null_blk.rst | 4 ++
drivers/block/null_blk/main.c | 113 ++++++++++++++++++++++++++++++
drivers/block/null_blk/null_blk.h | 1 +
3 files changed, 118 insertions(+)
diff --git a/Documentation/block/null_blk.rst b/Documentation/block/null_blk.rst
index 4dd78f24d10a..ea0616dbf7f3 100644
--- a/Documentation/block/null_blk.rst
+++ b/Documentation/block/null_blk.rst
@@ -149,3 +149,7 @@ zone_size=[MB]: Default: 256
zone_nr_conv=[nr_conv]: Default: 0
The number of conventional zones to create when block device is zoned. If
zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1.
+
+max_copy_bytes=[size in bytes]: Default: UINT_MAX
+ A module and configfs parameter which sets the maximum copy offload
+ limit supported by the hardware/driver.
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index f8c0fd57e041..87a2f3536b50 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/init.h>
+#include <linux/blk-copy.h>
#include "null_blk.h"
#undef pr_fmt
@@ -169,6 +170,10 @@ static int g_max_sectors;
module_param_named(max_sectors, g_max_sectors, int, 0444);
MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)");
+static unsigned long g_max_copy_bytes = UINT_MAX;
+module_param_named(max_copy_bytes, g_max_copy_bytes, ulong, 0444);
+MODULE_PARM_DESC(max_copy_bytes, "Maximum size of a copy command (in bytes)");
+
static unsigned int nr_devices = 1;
module_param(nr_devices, uint, 0444);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");
@@ -450,6 +455,7 @@ NULLB_DEVICE_ATTR(home_node, uint, NULL);
NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
NULLB_DEVICE_ATTR(blocksize, uint, NULL);
NULLB_DEVICE_ATTR(max_sectors, uint, NULL);
+NULLB_DEVICE_ATTR(max_copy_bytes, ulong, NULL);
NULLB_DEVICE_ATTR(irqmode, uint, NULL);
NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL);
NULLB_DEVICE_ATTR(index, uint, NULL);
@@ -601,6 +607,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_blocksize,
&nullb_device_attr_cache_size,
&nullb_device_attr_completion_nsec,
+ &nullb_device_attr_max_copy_bytes,
&nullb_device_attr_discard,
&nullb_device_attr_fua,
&nullb_device_attr_home_node,
@@ -805,6 +812,7 @@ static struct nullb_device *null_alloc_dev(void)
dev->queue_mode = g_queue_mode;
dev->blocksize = g_bs;
dev->max_sectors = g_max_sectors;
+ dev->max_copy_bytes = g_max_copy_bytes;
dev->irqmode = g_irqmode;
dev->hw_queue_depth = g_hw_queue_depth;
dev->blocking = g_blocking;
@@ -1275,6 +1283,96 @@ static blk_status_t null_transfer(struct nullb *nullb, struct page *page,
return err;
}
+static ssize_t nullb_copy_sector(struct nullb *nullb, sector_t sector_in,
+ sector_t sector_out, ssize_t rem, bool is_fua)
+{
+ struct nullb_page *t_page_in, *t_page_out;
+ loff_t offset_in, offset_out;
+ void *in, *out;
+ ssize_t chunk;
+
+ chunk = min_t(size_t, nullb->dev->blocksize, rem);
+ offset_in = (sector_in & SECTOR_MASK) << SECTOR_SHIFT;
+ offset_out = (sector_out & SECTOR_MASK) << SECTOR_SHIFT;
+
+ guard(spinlock_irq)(&nullb->lock);
+
+ if (null_cache_active(nullb) && !is_fua)
+ null_make_cache_space(nullb, PAGE_SIZE);
+
+ t_page_in = null_insert_page(nullb, sector_in,
+ !null_cache_active(nullb));
+ if (!t_page_in)
+ return -1;
+ t_page_out = null_insert_page(nullb, sector_out,
+ !null_cache_active(nullb) || is_fua);
+ if (!t_page_out)
+ return -1;
+
+ in = kmap_local_page(t_page_in->page);
+ out = kmap_local_page(t_page_out->page);
+ memcpy(out + offset_out, in + offset_in, chunk);
+ kunmap_local(out);
+ kunmap_local(in);
+
+ __set_bit(sector_out & SECTOR_MASK, t_page_out->bitmap);
+
+ if (is_fua)
+ null_free_sector(nullb, sector_out, true);
+
+ return chunk;
+}
+
+static blk_status_t nullb_do_copy(struct nullb *nullb, struct request *rq)
+{
+ sector_t sector_in, sector_in_end, sector_out, sector_out_end;
+ struct bio_copy_offload_ctx *copy_ctx = rq->bio->bi_copy_ctx;
+ ssize_t chunk, rem = copy_ctx->len;
+ struct bio *src_bio, *dst_bio;
+
+ src_bio = blk_first_copy_bio(rq, REQ_OP_COPY_SRC);
+ dst_bio = blk_first_copy_bio(rq, REQ_OP_COPY_DST);
+
+ if (WARN_ON_ONCE(!src_bio || !dst_bio))
+ return BLK_STS_IOERR;
+
+ sector_in = src_bio->bi_iter.bi_sector;
+ sector_in_end = sector_in + (src_bio->bi_iter.bi_size >> SECTOR_SHIFT);
+ sector_out = dst_bio->bi_iter.bi_sector;
+ sector_out_end = sector_out + (dst_bio->bi_iter.bi_size >> SECTOR_SHIFT);
+
+ while (rem > 0) {
+ chunk = nullb_copy_sector(nullb, sector_in, sector_out, rem,
+ rq->cmd_flags & REQ_FUA);
+ if (chunk < 0)
+ return BLK_STS_IOERR;
+ rem -= chunk;
+ if (!rem)
+ break;
+ sector_in += chunk >> SECTOR_SHIFT;
+ if (sector_in >= sector_in_end) {
+ src_bio = blk_next_copy_bio(src_bio);
+ if (WARN_ON_ONCE(!src_bio))
+ return BLK_STS_IOERR;
+ sector_in = src_bio->bi_iter.bi_sector;
+ sector_in_end = sector_in +
+ (src_bio->bi_iter.bi_size >> SECTOR_SHIFT);
+ }
+ sector_out += chunk >> SECTOR_SHIFT;
+ if (sector_out >= sector_out_end) {
+ dst_bio = blk_next_copy_bio(dst_bio);
+ if (WARN_ON_ONCE(!dst_bio))
+ return BLK_STS_IOERR;
+ sector_out = dst_bio->bi_iter.bi_sector;
+ sector_out_end = sector_out +
+ (dst_bio->bi_iter.bi_size >> SECTOR_SHIFT);
+ }
+ cond_resched();
+ }
+
+ return BLK_STS_OK;
+}
+
/*
* Transfer data for the given request. The transfer size is capped with the
* nr_sectors argument.
@@ -1292,6 +1390,9 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
struct req_iterator iter;
struct bio_vec bvec;
+ if (op_is_copy(req_op(rq)))
+ return nullb_do_copy(nullb, rq);
+
spin_lock_irq(&nullb->lock);
rq_for_each_segment(bvec, rq, iter) {
len = bvec.bv_len;
@@ -1806,6 +1907,13 @@ static void null_config_discard(struct nullb *nullb, struct queue_limits *lim)
lim->max_hw_discard_sectors = UINT_MAX >> 9;
}
+static void null_config_copy(struct nullb *nullb, struct queue_limits *lim)
+{
+ lim->max_copy_hw_sectors = nullb->dev->max_copy_bytes >> SECTOR_SHIFT;
+ lim->max_copy_src_segments = nullb->dev->max_copy_bytes ? U16_MAX : 0;
+ lim->max_copy_dst_segments = lim->max_copy_src_segments;
+}
+
static const struct block_device_operations null_ops = {
.owner = THIS_MODULE,
.report_zones = null_report_zones,
@@ -1922,6 +2030,9 @@ static int null_validate_conf(struct nullb_device *dev)
return -EINVAL;
}
+ if (dev->queue_mode == NULL_Q_BIO)
+ dev->max_copy_bytes = 0;
+
return 0;
}
@@ -1989,6 +2100,8 @@ static int null_add_dev(struct nullb_device *dev)
if (dev->virt_boundary)
lim.virt_boundary_mask = PAGE_SIZE - 1;
null_config_discard(nullb, &lim);
+ null_config_copy(nullb, &lim);
+
if (dev->zoned) {
rv = null_init_zoned_dev(dev, &lim);
if (rv)
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 6c4c4bbe7dad..c15c319ed91b 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -93,6 +93,7 @@ struct nullb_device {
unsigned int queue_mode; /* block interface */
unsigned int blocksize; /* block size */
unsigned int max_sectors; /* Max sectors per command */
+ unsigned long max_copy_bytes; /* Max copy offload length in bytes */
unsigned int irqmode; /* IRQ completion handler */
unsigned int hw_queue_depth; /* queue depth */
unsigned int index; /* index of the disk, only valid with a disk */