Linux block layer
 help / color / mirror / Atom feed
* [RFC PATCH 0/2] Avoid software ref tag remapping for NVMe devices
@ 2026-06-27  6:19 Caleb Sander Mateos
  2026-06-27  6:19 ` [RFC PATCH 1/2] blk-integrity: add BLK_EXPECTED_REF_TAG_CAPABLE Caleb Sander Mateos
  2026-06-27  6:19 ` [RFC PATCH 2/2] nvme/core: advertise BLK_EXPECTED_REF_TAG_CAPABLE Caleb Sander Mateos
  0 siblings, 2 replies; 3+ messages in thread
From: Caleb Sander Mateos @ 2026-06-27  6:19 UTC (permalink / raw)
  To: Jens Axboe, Keith Busch, Christoph Hellwig, Sagi Grimberg,
	Martin K. Petersen
  Cc: linux-block, linux-nvme, linux-kernel, Caleb Sander Mateos

Currently, each bio has a reference tag seed which is used to generate
the sequential ref tags in its protection information. In principle, the
ref tag seed can be any value as long as the same value is used when
blocks are written as when they are read back. However, some devices
(e.g. T10 DIF) require the ref tags to match the low bits of the
absolute integrity interval numbers. So the block integrity layer always
"remaps" ref tags to absolute integrity intervals using
blk_integrity_prepare() on writes and blk_integrity_complete() on reads.

On devices which do support an explicit "expected initial reference tag"
field in addition to the logical block address on each I/O, the software
ref tag remapping could be skipped by just passing the ref tag seed as
the expected initial ref tag.

Introduce a BLK_EXPECTED_REF_TAG_CAPABLE flag for devices to advertise
support for an expected initial ref tag. On devices that set this flag,
skip the block integrity layer ref tag remapping. Also take care not to
merge bios with non-contiguous ref tags, as the merged bio's ref tags
would no longer come from a single ref tag seed.

Set BLK_EXPECTED_REF_TAG_CAPABLE for NVMe devices and plumb the ref tag
seed (if provided) to the NVMe Read/Write (E)ILBRT field.

One potential concern would be NVMe devices which already have ref tags
written by an old kernel, which did perform the remapping (persisting
ref tags set to the low bits of the LBAs). When a new kernel that skips
the remapping reads back those blocks, it would expect the stored ref
tags to match the ref tag seed, so the ref tag verification would fail.

Caleb Sander Mateos (2):
  blk-integrity: add BLK_EXPECTED_REF_TAG_CAPABLE
  nvme/core: advertise BLK_EXPECTED_REF_TAG_CAPABLE

 block/blk-integrity.c         | 18 ++++++++++++++++++
 block/t10-pi.c                |  3 ++-
 drivers/nvme/host/core.c      | 20 ++++++++++----------
 include/linux/blk-integrity.h |  2 ++
 include/linux/t10-pi.h        |  5 -----
 5 files changed, 32 insertions(+), 16 deletions(-)

-- 
2.54.0


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [RFC PATCH 1/2] blk-integrity: add BLK_EXPECTED_REF_TAG_CAPABLE
  2026-06-27  6:19 [RFC PATCH 0/2] Avoid software ref tag remapping for NVMe devices Caleb Sander Mateos
@ 2026-06-27  6:19 ` Caleb Sander Mateos
  2026-06-27  6:19 ` [RFC PATCH 2/2] nvme/core: advertise BLK_EXPECTED_REF_TAG_CAPABLE Caleb Sander Mateos
  1 sibling, 0 replies; 3+ messages in thread
From: Caleb Sander Mateos @ 2026-06-27  6:19 UTC (permalink / raw)
  To: Jens Axboe, Keith Busch, Christoph Hellwig, Sagi Grimberg,
	Martin K. Petersen
  Cc: linux-block, linux-nvme, linux-kernel, Caleb Sander Mateos

Add BLK_EXPECTED_REF_TAG_CAPABLE to enum blk_integrity_flags to allow a
device to report support for specifying an expected initial ref tag in
I/O. Make blk_integrity_remap() a no-op if the flag is set, as the ref
tag seed used to generate/verify ref tags in the protection information
can be passed as the expected initial ref tag.

Ref tag remapping is necessary to merge bios with non-contiguous ref tag
seeds, as it converts both bios' ref tags to/from absolute integrity
interval numbers, which are contiguous. So don't merge bios to a
BLK_EXPECTED_REF_TAG_CAPABLE device if the next bio's ref tag seed
doesn't match the ref tag that would follow the end of the first bio.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
 block/blk-integrity.c         | 18 ++++++++++++++++++
 block/t10-pi.c                |  3 ++-
 include/linux/blk-integrity.h |  2 ++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 964eebbee14d..85ebe13f0912 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -139,10 +139,12 @@ EXPORT_SYMBOL_GPL(blk_rq_integrity_map_user);
 
 bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
 			    struct request *next)
 {
 	struct bio_integrity_payload *bip, *bip_next;
+	struct blk_integrity *bi;
+	u64 intervals;
 
 	if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0)
 		return true;
 
 	if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0)
@@ -155,10 +157,17 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
 
 	if (bip->bip_flags & BIP_CHECK_APPTAG &&
 	    bip->app_tag != bip_next->app_tag)
 		return false;
 
+	bi = blk_get_integrity(req->bio->bi_bdev->bd_disk);
+	intervals = blk_rq_bytes(req) >> bi->interval_exp;
+	if (bip->bip_flags & BIP_CHECK_REFTAG &&
+	    bi->flags & BLK_EXPECTED_REF_TAG_CAPABLE &&
+	    bip->bip_iter.bi_sector + intervals != bip_next->bip_iter.bi_sector)
+		return false;
+
 	if (req->nr_integrity_segments + next->nr_integrity_segments >
 	    q->limits.max_integrity_segments)
 		return false;
 
 	if (integrity_req_gap_back_merge(req, next->bio))
@@ -169,11 +178,13 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
 
 bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
 			     struct bio *bio)
 {
 	struct bio_integrity_payload *bip, *bip_bio = bio_integrity(bio);
+	struct blk_integrity *bi;
 	int nr_integrity_segs;
+	u64 intervals;
 
 	if (blk_integrity_rq(req) == 0 && bip_bio == NULL)
 		return true;
 
 	if (blk_integrity_rq(req) == 0 || bip_bio == NULL)
@@ -185,10 +196,17 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
 
 	if (bip->bip_flags & BIP_CHECK_APPTAG &&
 	    bip->app_tag != bip_bio->app_tag)
 		return false;
 
+	bi = blk_get_integrity(req->bio->bi_bdev->bd_disk);
+	intervals = blk_rq_bytes(req) >> bi->interval_exp;
+	if (bip->bip_flags & BIP_CHECK_REFTAG &&
+	    bi->flags & BLK_EXPECTED_REF_TAG_CAPABLE &&
+	    bip->bip_iter.bi_sector + intervals != bip_bio->bip_iter.bi_sector)
+		return false;
+
 	nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
 	if (req->nr_integrity_segments + nr_integrity_segs >
 	    q->limits.max_integrity_segments)
 		return false;
 
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 71367fd082bd..becf3e316b06 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -545,11 +545,12 @@ static void blk_integrity_remap(struct request *rq, unsigned int nr_bytes,
 	struct blk_integrity *bi = &rq->q->limits.integrity;
 	u64 ref = bio_integrity_intervals(bi, blk_rq_pos(rq));
 	unsigned intervals = nr_bytes >> bi->interval_exp;
 	struct bio *bio;
 
-	if (!(bi->flags & BLK_INTEGRITY_REF_TAG))
+	if (!(bi->flags & BLK_INTEGRITY_REF_TAG) ||
+	    bi->flags & BLK_EXPECTED_REF_TAG_CAPABLE)
 		return;
 
 	__rq_for_each_bio(bio, rq) {
 		__blk_reftag_remap(bio, bi, &intervals, &ref, prep);
 		if (!intervals)
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index c82b2f6fe194..e314d22d9922 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -13,10 +13,12 @@ enum blk_integrity_flags {
 	BLK_INTEGRITY_NOGENERATE	= 1 << 1,
 	BLK_INTEGRITY_DEVICE_CAPABLE	= 1 << 2,
 	BLK_INTEGRITY_REF_TAG		= 1 << 3,
 	BLK_INTEGRITY_STACKED		= 1 << 4,
 	BLK_SPLIT_INTERVAL_CAPABLE	= 1 << 5,
+	/* Device I/O specifies expected initial ref tag independent of LBA */
+	BLK_EXPECTED_REF_TAG_CAPABLE	= 1 << 6,
 };
 
 const char *blk_integrity_profile_name(struct blk_integrity *bi);
 bool queue_limits_stack_integrity(struct queue_limits *t,
 		struct queue_limits *b);
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [RFC PATCH 2/2] nvme/core: advertise BLK_EXPECTED_REF_TAG_CAPABLE
  2026-06-27  6:19 [RFC PATCH 0/2] Avoid software ref tag remapping for NVMe devices Caleb Sander Mateos
  2026-06-27  6:19 ` [RFC PATCH 1/2] blk-integrity: add BLK_EXPECTED_REF_TAG_CAPABLE Caleb Sander Mateos
@ 2026-06-27  6:19 ` Caleb Sander Mateos
  1 sibling, 0 replies; 3+ messages in thread
From: Caleb Sander Mateos @ 2026-06-27  6:19 UTC (permalink / raw)
  To: Jens Axboe, Keith Busch, Christoph Hellwig, Sagi Grimberg,
	Martin K. Petersen
  Cc: linux-block, linux-nvme, linux-kernel, Caleb Sander Mateos

NVMe Read, Write, and Write Zeroes commands include an (E)ILBRT field to
specify the expected initial reference tag for the controller to check
against the ref tags in the protection information buffer. However, the
NVMe driver currently always sets (E)ILBRT to the lower bits of the LBA.
The block integrity layer generates/verifies the PI ref tags according
to the bio's ref tag seed, so it must "remap" the ref tags, adjusting
for the difference between the ref tag seed and the absolute integrity
interval number (= LBA).

If a request has an integrity payload, set (E)ILBRT to its ref tag seed
so no ref tag remapping is required. Set BLK_EXPECTED_REF_TAG_CAPABLE in
NVMe devices' enum blk_integrity_flags to skip the block integrity layer
ref tag remapping.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
 drivers/nvme/host/core.c | 20 ++++++++++----------
 include/linux/t10-pi.h   |  5 -----
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 453c1f0b2dd0..8202ca706c97 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -914,34 +914,34 @@ static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
 }
 
 static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
 			      struct request *req)
 {
-	u32 upper, lower;
-	u64 ref48;
+	u64 ref_tag;
 
 	/* only type1 and type 2 PI formats have a reftag */
 	switch (ns->head->pi_type) {
 	case NVME_NS_DPS_PI_TYPE1:
 	case NVME_NS_DPS_PI_TYPE2:
 		break;
 	default:
 		return;
 	}
 
+	ref_tag = full_pi_ref_tag(req);
+	if (blk_integrity_rq(req))
+		ref_tag = bio_integrity(req->bio)->bip_iter.bi_sector;
+
 	/* both rw and write zeroes share the same reftag format */
 	switch (ns->head->guard_type) {
 	case NVME_NVM_NS_16B_GUARD:
-		cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
+		cmnd->rw.reftag = cpu_to_le32(lower_32_bits(ref_tag));
 		break;
 	case NVME_NVM_NS_64B_GUARD:
-		ref48 = ext_pi_ref_tag(req);
-		lower = lower_32_bits(ref48);
-		upper = upper_32_bits(ref48);
-
-		cmnd->rw.reftag = cpu_to_le32(lower);
-		cmnd->rw.cdw3 = cpu_to_le32(upper);
+		ref_tag = lower_48_bits(ref_tag);
+		cmnd->rw.reftag = cpu_to_le32(lower_32_bits(ref_tag));
+		cmnd->rw.cdw3 = cpu_to_le32(upper_32_bits(ref_tag));
 		break;
 	default:
 		break;
 	}
 }
@@ -1889,11 +1889,11 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
 		break;
 	default:
 		break;
 	}
 
-	bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE;
+	bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE | BLK_EXPECTED_REF_TAG_CAPABLE;
 	bi->metadata_size = head->ms;
 	if (bi->csum_type) {
 		bi->pi_tuple_size = head->pi_size;
 		bi->pi_offset = info->pi_offset;
 	}
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index b6c2496866ea..5cf4859877f5 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -66,11 +66,6 @@ struct crc64_pi_tuple {
 static inline u64 lower_48_bits(u64 n)
 {
 	return n & ((1ull << 48) - 1);
 }
 
-static inline u64 ext_pi_ref_tag(struct request *rq)
-{
-	return lower_48_bits(full_pi_ref_tag(rq));
-}
-
 #endif
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-06-27  6:19 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-27  6:19 [RFC PATCH 0/2] Avoid software ref tag remapping for NVMe devices Caleb Sander Mateos
2026-06-27  6:19 ` [RFC PATCH 1/2] blk-integrity: add BLK_EXPECTED_REF_TAG_CAPABLE Caleb Sander Mateos
2026-06-27  6:19 ` [RFC PATCH 2/2] nvme/core: advertise BLK_EXPECTED_REF_TAG_CAPABLE Caleb Sander Mateos

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.