* [RFC PATCH 1/2] blk-integrity: add BLK_EXPECTED_REF_TAG_CAPABLE
2026-06-27 6:19 [RFC PATCH 0/2] Avoid software ref tag remapping for NVMe devices Caleb Sander Mateos
@ 2026-06-27 6:19 ` Caleb Sander Mateos
2026-06-27 6:19 ` [RFC PATCH 2/2] nvme/core: advertise BLK_EXPECTED_REF_TAG_CAPABLE Caleb Sander Mateos
1 sibling, 0 replies; 3+ messages in thread
From: Caleb Sander Mateos @ 2026-06-27 6:19 UTC (permalink / raw)
To: Jens Axboe, Keith Busch, Christoph Hellwig, Sagi Grimberg,
Martin K. Petersen
Cc: linux-block, linux-nvme, linux-kernel, Caleb Sander Mateos
Add BLK_EXPECTED_REF_TAG_CAPABLE to enum blk_integrity_flags to allow a
device to report support for specifying an expected initial ref tag in
I/O. Make blk_integrity_remap() a no-op if the flag is set, as the ref
tag seed used to generate/verify ref tags in the protection information
can be passed as the expected initial ref tag.
Ref tag remapping is necessary to merge bios with non-contiguous ref tag
seeds, as it converts both bios' ref tags to/from absolute integrity
interval numbers, which are contiguous. So don't merge bios to a
BLK_EXPECTED_REF_TAG_CAPABLE device if the next bio's ref tag seed
doesn't match the ref tag that would follow the end of the first bio.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
block/blk-integrity.c | 18 ++++++++++++++++++
block/t10-pi.c | 3 ++-
include/linux/blk-integrity.h | 2 ++
3 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 964eebbee14d..85ebe13f0912 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -139,10 +139,12 @@ EXPORT_SYMBOL_GPL(blk_rq_integrity_map_user);
bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
struct request *next)
{
struct bio_integrity_payload *bip, *bip_next;
+ struct blk_integrity *bi;
+ u64 intervals;
if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0)
return true;
if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0)
@@ -155,10 +157,17 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
if (bip->bip_flags & BIP_CHECK_APPTAG &&
bip->app_tag != bip_next->app_tag)
return false;
+ bi = blk_get_integrity(req->bio->bi_bdev->bd_disk);
+ intervals = blk_rq_bytes(req) >> bi->interval_exp;
+ if (bip->bip_flags & BIP_CHECK_REFTAG &&
+ bi->flags & BLK_EXPECTED_REF_TAG_CAPABLE &&
+ bip->bip_iter.bi_sector + intervals != bip_next->bip_iter.bi_sector)
+ return false;
+
if (req->nr_integrity_segments + next->nr_integrity_segments >
q->limits.max_integrity_segments)
return false;
if (integrity_req_gap_back_merge(req, next->bio))
@@ -169,11 +178,13 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
struct bio *bio)
{
struct bio_integrity_payload *bip, *bip_bio = bio_integrity(bio);
+ struct blk_integrity *bi;
int nr_integrity_segs;
+ u64 intervals;
if (blk_integrity_rq(req) == 0 && bip_bio == NULL)
return true;
if (blk_integrity_rq(req) == 0 || bip_bio == NULL)
@@ -185,10 +196,17 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
if (bip->bip_flags & BIP_CHECK_APPTAG &&
bip->app_tag != bip_bio->app_tag)
return false;
+ bi = blk_get_integrity(req->bio->bi_bdev->bd_disk);
+ intervals = blk_rq_bytes(req) >> bi->interval_exp;
+ if (bip->bip_flags & BIP_CHECK_REFTAG &&
+ bi->flags & BLK_EXPECTED_REF_TAG_CAPABLE &&
+ bip->bip_iter.bi_sector + intervals != bip_bio->bip_iter.bi_sector)
+ return false;
+
nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
if (req->nr_integrity_segments + nr_integrity_segs >
q->limits.max_integrity_segments)
return false;
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 71367fd082bd..becf3e316b06 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -545,11 +545,12 @@ static void blk_integrity_remap(struct request *rq, unsigned int nr_bytes,
struct blk_integrity *bi = &rq->q->limits.integrity;
u64 ref = bio_integrity_intervals(bi, blk_rq_pos(rq));
unsigned intervals = nr_bytes >> bi->interval_exp;
struct bio *bio;
- if (!(bi->flags & BLK_INTEGRITY_REF_TAG))
+ if (!(bi->flags & BLK_INTEGRITY_REF_TAG) ||
+ bi->flags & BLK_EXPECTED_REF_TAG_CAPABLE)
return;
__rq_for_each_bio(bio, rq) {
__blk_reftag_remap(bio, bi, &intervals, &ref, prep);
if (!intervals)
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index c82b2f6fe194..e314d22d9922 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -13,10 +13,12 @@ enum blk_integrity_flags {
BLK_INTEGRITY_NOGENERATE = 1 << 1,
BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2,
BLK_INTEGRITY_REF_TAG = 1 << 3,
BLK_INTEGRITY_STACKED = 1 << 4,
BLK_SPLIT_INTERVAL_CAPABLE = 1 << 5,
+ /* Device I/O specifies expected initial ref tag independent of LBA */
+ BLK_EXPECTED_REF_TAG_CAPABLE = 1 << 6,
};
const char *blk_integrity_profile_name(struct blk_integrity *bi);
bool queue_limits_stack_integrity(struct queue_limits *t,
struct queue_limits *b);
--
2.54.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [RFC PATCH 2/2] nvme/core: advertise BLK_EXPECTED_REF_TAG_CAPABLE
2026-06-27 6:19 [RFC PATCH 0/2] Avoid software ref tag remapping for NVMe devices Caleb Sander Mateos
2026-06-27 6:19 ` [RFC PATCH 1/2] blk-integrity: add BLK_EXPECTED_REF_TAG_CAPABLE Caleb Sander Mateos
@ 2026-06-27 6:19 ` Caleb Sander Mateos
1 sibling, 0 replies; 3+ messages in thread
From: Caleb Sander Mateos @ 2026-06-27 6:19 UTC (permalink / raw)
To: Jens Axboe, Keith Busch, Christoph Hellwig, Sagi Grimberg,
Martin K. Petersen
Cc: linux-block, linux-nvme, linux-kernel, Caleb Sander Mateos
NVMe Read, Write, and Write Zeroes commands include an (E)ILBRT field to
specify the expected initial reference tag for the controller to check
against the ref tags in the protection information buffer. However, the
NVMe driver currently always sets (E)ILBRT to the lower bits of the LBA.
The block integrity layer generates/verifies the PI ref tags according
to the bio's ref tag seed, so it must "remap" the ref tags, adjusting
for the difference between the ref tag seed and the absolute integrity
interval number (= LBA).
If a request has an integrity payload, set (E)ILBRT to its ref tag seed
so no ref tag remapping is required. Set BLK_EXPECTED_REF_TAG_CAPABLE in
the blk_integrity flags of NVMe devices so the block integrity layer
skips its ref tag remapping.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
drivers/nvme/host/core.c | 20 ++++++++++----------
include/linux/t10-pi.h | 5 -----
2 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 453c1f0b2dd0..8202ca706c97 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -914,34 +914,34 @@ static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
}
static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
struct request *req)
{
- u32 upper, lower;
- u64 ref48;
+ u64 ref_tag;
/* only type1 and type 2 PI formats have a reftag */
switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
break;
default:
return;
}
+ ref_tag = full_pi_ref_tag(req);
+ if (blk_integrity_rq(req))
+ ref_tag = bio_integrity(req->bio)->bip_iter.bi_sector;
+
/* both rw and write zeroes share the same reftag format */
switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
- cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
+ cmnd->rw.reftag = cpu_to_le32(lower_32_bits(ref_tag));
break;
case NVME_NVM_NS_64B_GUARD:
- ref48 = ext_pi_ref_tag(req);
- lower = lower_32_bits(ref48);
- upper = upper_32_bits(ref48);
-
- cmnd->rw.reftag = cpu_to_le32(lower);
- cmnd->rw.cdw3 = cpu_to_le32(upper);
+ ref_tag = lower_48_bits(ref_tag);
+ cmnd->rw.reftag = cpu_to_le32(lower_32_bits(ref_tag));
+ cmnd->rw.cdw3 = cpu_to_le32(upper_32_bits(ref_tag));
break;
default:
break;
}
}
@@ -1889,11 +1889,11 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
break;
default:
break;
}
- bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE;
+ bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE | BLK_EXPECTED_REF_TAG_CAPABLE;
bi->metadata_size = head->ms;
if (bi->csum_type) {
bi->pi_tuple_size = head->pi_size;
bi->pi_offset = info->pi_offset;
}
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index b6c2496866ea..5cf4859877f5 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -66,11 +66,6 @@ struct crc64_pi_tuple {
static inline u64 lower_48_bits(u64 n)
{
return n & ((1ull << 48) - 1);
}
-static inline u64 ext_pi_ref_tag(struct request *rq)
-{
- return lower_48_bits(full_pi_ref_tag(rq));
-}
-
#endif
--
2.54.0
^ permalink raw reply related [flat|nested] 3+ messages in thread