* [PATCH] nvme: add DIX support for nvme-rdma
@ 2022-08-29  8:12 Chao Leng
  2022-08-29  9:02 ` Sagi Grimberg
  2022-08-29 10:43 ` Max Gurtovoy
  0 siblings, 2 replies; 19+ messages in thread
From: Chao Leng @ 2022-08-29 8:12 UTC (permalink / raw)
To: linux-nvme; +Cc: hch, sagi, kbusch, mgurtovoy

Some RDMA HBAs already support the DIX-DIF combination: host buffer to
host HBA uses DIX, and host HBA to target uses DIF.
This patch adds DIX support to nvme-rdma.

Test results:
Performance improves for all tested I/O sizes, and the CPU usage of
256k I/Os drops by more than 20%.

Signed-off-by: Chao Leng <lengchao@huawei.com>
---
 drivers/nvme/host/core.c | 19 ++++++++++++++-----
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/rdma.c | 24 ++++++++++++++++--------
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2429b11eb9a8..7055d3b7b582 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1623,7 +1623,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
-		u32 max_integrity_segments)
+		u32 max_integrity_segments, bool dix_support)
 {
 	struct blk_integrity integrity = { };
 
@@ -1631,7 +1631,11 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
 	case NVME_NS_DPS_PI_TYPE3:
 		switch (ns->guard_type) {
 		case NVME_NVM_NS_16B_GUARD:
-			integrity.profile = &t10_pi_type3_crc;
+			if (dix_support)
+				integrity.profile = &t10_pi_type3_ip;
+			else
+				integrity.profile = &t10_pi_type3_crc;
+
 			integrity.tag_size = sizeof(u16) + sizeof(u32);
 			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
 			break;
@@ -1649,7 +1653,11 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
 	case NVME_NS_DPS_PI_TYPE2:
 		switch (ns->guard_type) {
 		case NVME_NVM_NS_16B_GUARD:
-			integrity.profile = &t10_pi_type1_crc;
+			if (dix_support)
+				integrity.profile = &t10_pi_type1_ip;
+			else
+				integrity.profile = &t10_pi_type1_crc;
+
 			integrity.tag_size = sizeof(u16);
 			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
 			break;
@@ -1674,7 +1682,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
 }
 #else
 static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
-		u32 max_integrity_segments)
+		u32 max_integrity_segments, bool dix_support)
 {
 }
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
@@ -1900,7 +1908,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
 	if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
 	    (ns->features & NVME_NS_METADATA_SUPPORTED))
 		nvme_init_integrity(disk, ns,
-				    ns->ctrl->max_integrity_segments);
+				    ns->ctrl->max_integrity_segments,
+				    ns->ctrl->dix_support);
 	else if (!nvme_ns_has_pi(ns))
 		capacity = 0;
 }
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bdc0ff7ed9ab..92cf93cf120b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -276,6 +276,7 @@ struct nvme_ctrl {
 	u32 max_hw_sectors;
 	u32 max_segments;
 	u32 max_integrity_segments;
+	bool dix_support;
 	u32 max_discard_sectors;
 	u32 max_discard_segments;
 	u32 max_zeroes_sectors;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 3100643be299..0f63b626b3d4 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -872,6 +872,9 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 	    IBK_INTEGRITY_HANDOVER)
 		pi_capable = true;
 
+	if (ctrl->device->dev->attrs.sig_guard_cap & IB_GUARD_T10DIF_CSUM)
+		ctrl->ctrl.dix_support = true;
+
 	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
 							pi_capable);
 
@@ -1423,10 +1426,10 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
 
 static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
 		struct nvme_command *cmd, struct ib_sig_domain *domain,
-		u16 control, u8 pi_type)
+		u16 control, u8 pi_type, enum ib_t10_dif_bg_type dif_type)
 {
 	domain->sig_type = IB_SIG_TYPE_T10_DIF;
-	domain->sig.dif.bg_type = IB_T10DIF_CRC;
+	domain->sig.dif.bg_type = dif_type;
 	domain->sig.dif.pi_interval = 1 << bi->interval_exp;
 	domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
 	if (control & NVME_RW_PRINFO_PRCHK_REF)
@@ -1441,7 +1444,7 @@ static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
 
 static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
 		struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs,
-		u8 pi_type)
+		u8 pi_type, bool dix_support)
 {
 	u16 control = le16_to_cpu(cmd->rw.control);
 
@@ -1450,16 +1453,20 @@ static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
 		/* for WRITE_INSERT/READ_STRIP no memory domain */
 		sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
 		nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
-					 pi_type);
+					 pi_type, IB_T10DIF_CRC);
 		/* Clear the PRACT bit since HCA will generate/verify the PI */
 		control &= ~NVME_RW_PRINFO_PRACT;
 		cmd->rw.control = cpu_to_le16(control);
 	} else {
 		/* for WRITE_PASS/READ_PASS both wire/memory domains exist */
 		nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
-					 pi_type);
-		nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
-					 pi_type);
+					 pi_type, IB_T10DIF_CRC);
+		if (dix_support)
+			nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem,
+					control, pi_type, IB_T10DIF_CSUM);
+		else
+			nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem,
+					control, pi_type, IB_T10DIF_CRC);
 	}
 }
 
@@ -1501,7 +1508,8 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
 		goto mr_put;
 
 	nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
-				req->mr->sig_attrs, ns->pi_type);
+				req->mr->sig_attrs, ns->pi_type,
+				ns->ctrl->dix_support);
 	nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
 
 	ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
-- 
2.16.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread
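[Editor's note on the rdma.c hunks above: when the HCA advertises the
IB_GUARD_T10DIF_CSUM capability, the memory-domain guard becomes an IP
checksum while the wire domain keeps the DIF CRC. A minimal userspace
sketch of that selection logic follows; the types are simplified
stand-ins, not the kernel's real struct ib_sig_attrs or enum
ib_t10_dif_bg_type.]

/* Simplified stand-ins for the kernel's ib_sig_* types (illustrative only). */
#include <stdbool.h>
#include <stdio.h>

enum bg_type { BG_NONE, BG_CRC, BG_CSUM };	/* mirrors enum ib_t10_dif_bg_type */

struct sig_domain { enum bg_type guard; };
struct sig_attrs  { struct sig_domain mem, wire; };

/* Mirrors the guard selection in nvme_rdma_set_sig_attrs() above. */
static void set_sig_attrs(struct sig_attrs *sa, bool pract, bool dix_support)
{
	sa->wire.guard = BG_CRC;		/* wire domain always carries the DIF CRC */
	if (pract)
		sa->mem.guard = BG_NONE;	/* WRITE_INSERT/READ_STRIP: no memory domain */
	else
		sa->mem.guard = dix_support ? BG_CSUM : BG_CRC;
}

int main(void)
{
	struct sig_attrs sa;

	set_sig_attrs(&sa, false, true);	/* WRITE_PASS/READ_PASS with DIX */
	printf("mem guard=%d (2=CSUM), wire guard=%d (1=CRC)\n",
	       sa.mem.guard, sa.wire.guard);
	return 0;
}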
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-29  8:12 [PATCH] nvme: add DIX support for nvme-rdma Chao Leng
@ 2022-08-29  9:02 ` Sagi Grimberg
  2022-08-29 10:43 ` Max Gurtovoy
  1 sibling, 0 replies; 19+ messages in thread
From: Sagi Grimberg @ 2022-08-29 9:02 UTC (permalink / raw)
To: Chao Leng, linux-nvme; +Cc: hch, kbusch, mgurtovoy

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-29  8:12 [PATCH] nvme: add DIX support for nvme-rdma Chao Leng
  2022-08-29  9:02 ` Sagi Grimberg
@ 2022-08-29 10:43 ` Max Gurtovoy
  2022-08-29 13:16 ` Chao Leng
  1 sibling, 1 reply; 19+ messages in thread
From: Max Gurtovoy @ 2022-08-29 10:43 UTC (permalink / raw)
To: Chao Leng, linux-nvme; +Cc: hch, sagi, kbusch

hi,

On 8/29/2022 11:12 AM, Chao Leng wrote:
> Some RDMA HBAs already support the DIX-DIF combination: host buffer to
> host HBA uses DIX, and host HBA to target uses DIF.
> This patch adds DIX support to nvme-rdma.
> Test results:
> Performance improves for all tested I/O sizes, and the CPU usage of
> 256k I/Os drops by more than 20%.

You're adding a new PI guard type for 16bit mode only, aren't you?

I don't think the subject and commit message are correct.

Can you attach to the commit message please the table of performance
numbers before and after the patch?

You can mention also that in iser, if supported, the default is to use
IP_CHECKSUM for DIX and not CRC.

>
> Signed-off-by: Chao Leng <lengchao@huawei.com>

[full quoted patch snipped]

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-29 10:43 ` Max Gurtovoy
@ 2022-08-29 13:16 ` Chao Leng
  2022-08-29 14:56 ` Max Gurtovoy
  0 siblings, 1 reply; 19+ messages in thread
From: Chao Leng @ 2022-08-29 13:16 UTC (permalink / raw)
To: Max Gurtovoy, linux-nvme; +Cc: hch, sagi, kbusch

On 2022/8/29 18:43, Max Gurtovoy wrote:
> hi,
>
> On 8/29/2022 11:12 AM, Chao Leng wrote:
>> Some RDMA HBAs already support the DIX-DIF combination: host buffer to
>> host HBA uses DIX, and host HBA to target uses DIF.
>> This patch adds DIX support to nvme-rdma.
>> Test results:
>> Performance improves for all tested I/O sizes, and the CPU usage of
>> 256k I/Os drops by more than 20%.
>
> You're adding a new PI guard type for 16bit mode only, aren't you?
No. Currently DIX is only defined for 16bit mode.
>
> I don't think the subject and commit message are correct.
>
> Can you attach to the commit message please the table of performance
> numbers before and after the patch?
OK, I will give you the detailed performance numbers later.
>
> You can mention also that in iser, if supported, the default is to use
> IP_CHECKSUM for DIX and not CRC.
According to the DIX definition, DIX uses IP_CHECKSUM.
To reduce CPU utilization, the end-to-end data protection for SCSI is
DIX-DIF when supported by hardware.

[full quoted patch snipped]

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-29 13:16 ` Chao Leng
@ 2022-08-29 14:56 ` Max Gurtovoy
  2022-08-29 15:10 ` Keith Busch ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Max Gurtovoy @ 2022-08-29 14:56 UTC (permalink / raw)
To: Chao Leng, linux-nvme; +Cc: hch, sagi, kbusch

On 8/29/2022 4:16 PM, Chao Leng wrote:
>
> On 2022/8/29 18:43, Max Gurtovoy wrote:
>> hi,
>>
>> On 8/29/2022 11:12 AM, Chao Leng wrote:
>>> Some RDMA HBAs already support the DIX-DIF combination: host buffer
>>> to host HBA uses DIX, and host HBA to target uses DIF.
>>> This patch adds DIX support to nvme-rdma.
>>> Test results:
>>> Performance improves for all tested I/O sizes, and the CPU usage of
>>> 256k I/Os drops by more than 20%.
>>
>> You're adding a new PI guard type for 16bit mode only, aren't you?
> No. Currently DIX is only defined for 16bit mode.
>>
>> I don't think the subject and commit message are correct.
>>
>> Can you attach to the commit message please the table of performance
>> numbers before and after the patch?
> OK, I will give you the detailed performance numbers later.
>>
>> You can mention also that in iser, if supported, the default is to
>> use IP_CHECKSUM for DIX and not CRC.
> According to the DIX definition, DIX uses IP_CHECKSUM.
> To reduce CPU utilization, the end-to-end data protection for SCSI is
> DIX-DIF when supported by hardware.

From what I recall, DIX was protection between host_buff -> host_device
and DIF was protection between host_device -> target_device.

If it's now defined as DIX == IP_CHECKSUM and DIF == CRC, please mention
it somehow in the commit message.

Something like:

"
Some RDMA HBAs already support both DIX (IP_CHECKSUM) and DIF (CRC)
offloads. Therefore, for host buffer to host HBA it's preferred to use
DIX to get better CPU utilization.
Add DIX for host buffer to host HBA integrity, if supported by the
underlying hardware.
The wire domain protocol for data integrity was not changed (it will
keep using DIF).

Test results (add a table please):
"

[full quoted patch snipped]

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-29 14:56 ` Max Gurtovoy
@ 2022-08-29 15:10 ` Keith Busch
  2022-08-29 23:47 ` Max Gurtovoy
  2022-08-30 12:18 ` Chao Leng
  1 sibling, 2 replies; 19+ messages in thread
From: Keith Busch @ 2022-08-29 15:10 UTC (permalink / raw)
To: Max Gurtovoy; +Cc: Chao Leng, linux-nvme, hch, sagi

On Mon, Aug 29, 2022 at 05:56:39PM +0300, Max Gurtovoy wrote:
> On 8/29/2022 4:16 PM, Chao Leng wrote:
> > On 2022/8/29 18:43, Max Gurtovoy wrote:
> > > On 8/29/2022 11:12 AM, Chao Leng wrote:
> > >
> > > You can mention also that in iser, if supported, the default is to
> > > use IP_CHECKSUM for DIX and not CRC.
> > According to the DIX definition, DIX uses IP_CHECKSUM.
> > To reduce CPU utilization, the end-to-end data protection for SCSI is
> > DIX-DIF when supported by hardware.
>
> From what I recall, DIX was protection between host_buff -> host_device
> and DIF was protection between host_device -> target_device.
>
> If it's now defined as DIX == IP_CHECKSUM and DIF == CRC, please
> mention it somehow in the commit message.

Where is this coming from? The NVMe command set spec says this is the
difference between DIF and DIX:

  The primary difference between these two mechanisms is the location of
  the protection information. In DIF, the protection information is
  contiguous with the logical block data and creates an extended logical
  block, while in DIX, the protection information is stored in a separate
  buffer.

Regarding CRC vs IP Checksum, the spec also says this:

  In addition to a CRC-16, DIX also specifies an optional IP checksum
  that is not supported by the NVM Express interface.

So DIX support doesn't imply IP checksum. Even if the host device can
support it, the target device can not report it uses that guard type.

^ permalink raw reply	[flat|nested] 19+ messages in thread
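[Editor's note: to make the layout distinction the spec draws concrete,
here is a small illustrative C sketch of the two buffer arrangements,
assuming 512-byte sectors and the standard 8-byte PI tuple; the struct
names other than the tuple fields are invented for the example.]

#include <stdint.h>
#include <stdio.h>

/* 8-byte T10 PI tuple, as in the kernel's struct t10_pi_tuple. */
struct pi_tuple {
	uint16_t guard_tag;	/* CRC-16 (or IP checksum in a DIX memory domain) */
	uint16_t app_tag;
	uint32_t ref_tag;
};

#define SECTOR	512
#define NSECT	8

/* DIF: extended logical blocks -- PI is contiguous with each sector's data. */
struct dif_block { uint8_t data[SECTOR]; struct pi_tuple pi; };

int main(void)
{
	struct dif_block dif_buf[NSECT];	/* one interleaved buffer */
	uint8_t dix_data[NSECT * SECTOR];	/* DIX: plain data buffer ... */
	struct pi_tuple dix_pi[NSECT];		/* ... and a separate PI buffer */

	printf("DIF buffer: %zu bytes (520 per extended block)\n", sizeof(dif_buf));
	printf("DIX buffers: %zu data bytes + %zu PI bytes\n",
	       sizeof(dix_data), sizeof(dix_pi));
	return 0;
}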
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-29 15:10 ` Keith Busch
@ 2022-08-29 23:47 ` Max Gurtovoy
  0 siblings, 0 replies; 19+ messages in thread
From: Max Gurtovoy @ 2022-08-29 23:47 UTC (permalink / raw)
To: Keith Busch; +Cc: Chao Leng, linux-nvme, hch, sagi

On 8/29/2022 6:10 PM, Keith Busch wrote:
> On Mon, Aug 29, 2022 at 05:56:39PM +0300, Max Gurtovoy wrote:
>> On 8/29/2022 4:16 PM, Chao Leng wrote:
>>> On 2022/8/29 18:43, Max Gurtovoy wrote:
>>>> On 8/29/2022 11:12 AM, Chao Leng wrote:
>>>>
>>>> You can mention also that in iser, if supported, the default is to
>>>> use IP_CHECKSUM for DIX and not CRC.
>>> According to the DIX definition, DIX uses IP_CHECKSUM.
>>> To reduce CPU utilization, the end-to-end data protection for SCSI is
>>> DIX-DIF when supported by hardware.
>> From what I recall, DIX was protection between host_buff -> host_device
>> and DIF was protection between host_device -> target_device.
>>
>> If it's now defined as DIX == IP_CHECKSUM and DIF == CRC, please
>> mention it somehow in the commit message.
> Where is this coming from? The NVMe command set spec says this is the
> difference between DIF and DIX:
>
>   The primary difference between these two mechanisms is the location of
>   the protection information. In DIF, the protection information is
>   contiguous with the logical block data and creates an extended logical
>   block, while in DIX, the protection information is stored in a separate
>   buffer.
>
> Regarding CRC vs IP Checksum, the spec also says this:
>
>   In addition to a CRC-16, DIX also specifies an optional IP checksum
>   that is not supported by the NVM Express interface.

It's not so clear what that means.

> So DIX support doesn't imply IP checksum. Even if the host device can
> support it, the target device can not report it uses that guard type.

Right. But we don't send the IP checksum guard on the wire. The
implementation is only used for integrity between host buffer <-->
local HBA. And Fabrics supports only DIF (extended logical block), so
the IP checksum guard is not allowed on the wire.

So, I suggest rewriting the commit message according to the NVMe spec
(which defines DIX and DIF differently than SCSI) + adding perf numbers
for 4k, 8k, 16k, 32k, 64k, 128k, 256k I/O read + I/O write.

Or maybe mention only the IP checksum that becomes the default, if
supported, for offloaded integrity between host buffer <--> local HBA
(as we do in iSER).

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-29 15:10 ` Keith Busch
  2022-08-29 23:47 ` Max Gurtovoy
@ 2022-08-30 12:18 ` Chao Leng
  2022-08-30 14:49 ` Keith Busch
  1 sibling, 1 reply; 19+ messages in thread
From: Chao Leng @ 2022-08-30 12:18 UTC (permalink / raw)
To: Keith Busch, Max Gurtovoy; +Cc: linux-nvme, hch, sagi

On 2022/8/29 23:10, Keith Busch wrote:
> On Mon, Aug 29, 2022 at 05:56:39PM +0300, Max Gurtovoy wrote:
>> On 8/29/2022 4:16 PM, Chao Leng wrote:
>>> On 2022/8/29 18:43, Max Gurtovoy wrote:
>>>> On 8/29/2022 11:12 AM, Chao Leng wrote:
>>>>
>>>> You can mention also that in iser, if supported, the default is to
>>>> use IP_CHECKSUM for DIX and not CRC.
>>> According to the DIX definition, DIX uses IP_CHECKSUM.
>>> To reduce CPU utilization, the end-to-end data protection for SCSI is
>>> DIX-DIF when supported by hardware.
>>
>> From what I recall, DIX was protection between host_buff -> host_device
>> and DIF was protection between host_device -> target_device.
>>
>> If it's now defined as DIX == IP_CHECKSUM and DIF == CRC, please
>> mention it somehow in the commit message.
>
> Where is this coming from? The NVMe command set spec says this is the
> difference between DIF and DIX:
>
>   The primary difference between these two mechanisms is the location of
>   the protection information. In DIF, the protection information is
>   contiguous with the logical block data and creates an extended logical
>   block, while in DIX, the protection information is stored in a separate
>   buffer.
The patch does not conflict with the NVMe spec. DIX is only used between
host_buff -> host HBA; host HBA -> target still uses DIF.
>
> Regarding CRC vs IP Checksum, the spec also says this:
>
>   In addition to a CRC-16, DIX also specifies an optional IP checksum
>   that is not supported by the NVM Express interface.
NVMe itself does not support DIX; this should refer specifically to
PCIe. But NVMe over Fabrics can support DIX.
The NVMe base spec says:
  Additionally, support has been added for many Enterprise capabilities
  like end-to-end data protection (compatible with SCSI Protection
  Information, commonly known as T10 DIF, and SNIA DIX standards),
  enhanced error reporting, and virtualization.
>
> So DIX support doesn't imply IP checksum. Even if the host device can
> support it, the target device can not report it uses that guard type.
DIX is only used between host_buff -> host HBA. The target does not
care.

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-30 12:18 ` Chao Leng
@ 2022-08-30 14:49 ` Keith Busch
  0 siblings, 0 replies; 19+ messages in thread
From: Keith Busch @ 2022-08-30 14:49 UTC (permalink / raw)
To: Chao Leng; +Cc: Max Gurtovoy, linux-nvme, hch, sagi

On Tue, Aug 30, 2022 at 08:18:19PM +0800, Chao Leng wrote:
> On 2022/8/29 23:10, Keith Busch wrote:
> >
> > So DIX support doesn't imply IP checksum. Even if the host device can
> > support it, the target device can not report it uses that guard type.
> DIX is only used between host_buff -> host HBA. The target does not
> care.

Oops, serves me right for jumping in the middle of a thread without
knowing the details. Thanks for the clarification.

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-29 14:56 ` Max Gurtovoy
  2022-08-29 15:10 ` Keith Busch
@ 2022-08-30  2:38 ` Martin K. Petersen
  2022-08-30 12:21 ` Chao Leng
  2022-08-30 12:15 ` Chao Leng
  2 siblings, 1 reply; 19+ messages in thread
From: Martin K. Petersen @ 2022-08-30 2:38 UTC (permalink / raw)
To: Max Gurtovoy; +Cc: Chao Leng, linux-nvme, hch, sagi, kbusch

Max,

>> According to the DIX definition, DIX uses IP_CHECKSUM.
>> To reduce CPU utilization, the end-to-end data protection for SCSI is
>> DIX-DIF when supported by hardware.
>
> From what I recall, DIX was protection between host_buff ->
> host_device and DIF was protection between host_device ->
> target_device.

DIX is a specification for a SCSI host adapter interface which describes
how to put the protection information in a different buffer from the
data buffer.

The optional IP checksum guard tag was an artifact of the DIX efforts
predating CPUs having suitable CRC calculation offload. We simply
couldn't calculate the T10 DIF CRC fast enough on a general purpose CPU
in 2006.

Now that most modern processors (x86_64, ARM) support pclmulqdq or
similar, IP checksum support is pretty much obsolete.

That said, I don't have a problem with permitting IP checksum use for
NVMe RDMA adapters if the hardware is capable. But it would be good to
get some supporting benchmarks. Plus of course a description of the
performance vs. data integrity trade-off wrt. using the weaker IP
checksum.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 19+ messages in thread
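[Editor's note: for a concrete sense of the trade-off Martin describes,
a self-contained sketch of both guard computations is below. It is
illustrative only; the exact byte order and carry-folding details of
the kernel's t10_pi_type*_ip profile may differ from this RFC
1071-style checksum.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* T10-DIF guard: CRC-16, polynomial 0x8BB7, init 0, no reflection. */
static uint16_t crc16_t10dif(const uint8_t *buf, size_t len)
{
	uint16_t crc = 0;

	for (size_t i = 0; i < len; i++) {
		crc ^= (uint16_t)buf[i] << 8;
		for (int b = 0; b < 8; b++)
			crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x8bb7)
					     : (uint16_t)(crc << 1);
	}
	return crc;
}

/* DIX IP-checksum guard: 16-bit ones'-complement sum (RFC 1071 family). */
static uint16_t ip_checksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	for (size_t i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	while (sum >> 16)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t sector[512];

	memset(sector, 0xab, sizeof(sector));
	printf("crc16-t10dif = 0x%04x\n", crc16_t10dif(sector, sizeof(sector)));
	printf("ip checksum  = 0x%04x\n", ip_checksum(sector, sizeof(sector)));
	return 0;
}

[The bitwise CRC loop here does eight shift/XOR steps per byte while
the checksum does one add per byte pair -- exactly the CPU-cost gap
that table-driven and pclmulqdq-style CRC offloads were introduced to
close.]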
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-30  2:38 ` Martin K. Petersen
@ 2022-08-30 12:21 ` Chao Leng
  2022-09-05  6:37 ` Christoph Hellwig
  0 siblings, 1 reply; 19+ messages in thread
From: Chao Leng @ 2022-08-30 12:21 UTC (permalink / raw)
To: Martin K. Petersen, Max Gurtovoy; +Cc: linux-nvme, hch, sagi, kbusch

On 2022/8/30 10:38, Martin K. Petersen wrote:
>
> Max,
>
>>> According to the DIX definition, DIX uses IP_CHECKSUM.
>>> To reduce CPU utilization, the end-to-end data protection for SCSI is
>>> DIX-DIF when supported by hardware.
>>
>> From what I recall, DIX was protection between host_buff ->
>> host_device and DIF was protection between host_device ->
>> target_device.
>
> DIX is a specification for a SCSI host adapter interface which describes
> how to put the protection information in a different buffer from the
> data buffer.
>
> The optional IP checksum guard tag was an artifact of the DIX efforts
> predating CPUs having suitable CRC calculation offload. We simply
> couldn't calculate the T10 DIF CRC fast enough on a general purpose CPU
> in 2006.
>
> Now that most modern processors (x86_64, ARM) support pclmulqdq or
> similar, IP checksum support is pretty much obsolete.
From the test results, the checksum still significantly reduces CPU
utilization compared with CRC, even though modern processors handle
CRC well.
The host CPU: Intel(R) Xeon(R) Gold 6126 CPU @ 2.60GHz, 48 processors.

The test result without the patch:

  test item                      IOPS    sys(CPU usage)%  CPU total usage%
  Single concurrency 8k read     30694   0.4              0.7
  Single concurrency 8k write    20088   0.3              0.6
  Single concurrency 256k read    4945   0.9              1.0
  Single concurrency 256k write   4672   0.7              0.9
  32 concurrency 8k read        421108   6.9              11.3
  32 concurrency 8k write       288861   4.3              8.5
  32 concurrency 256k read       20215   2.9              3.2
  32 concurrency 256k write      19627   3.1              4.6

The test result after the patch is applied:

  test item                      IOPS    sys(CPU usage)%  CPU total usage%
  Single concurrency 8k read     30950   0.4              0.7
  Single concurrency 8k write    24325   0.3              0.6
  Single concurrency 256k read    6919   0.5              0.6
  Single concurrency 256k write   5477   0.4              0.7
  32 concurrency 8k read        442294   6.3              11.4
  32 concurrency 8k write       297841   3.5              8.2
  32 concurrency 256k read       20915   1.9              2.5
  32 concurrency 256k write      19814   1.8              3.3

>
> That said, I don't have a problem with permitting IP checksum use for
> NVMe RDMA adapters if the hardware is capable. But it would be good to
> get some supporting benchmarks. Plus of course a description of the
> performance vs. data integrity trade-off wrt. using the weaker IP
> checksum.
The checksum is only used between host_buff -> host HBA, and that window
is very short. If the hardware supports this, it is useful for reducing
CPU utilization, and the data integrity can be acceptable, as it is for
SCSI.

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-30 12:21 ` Chao Leng
@ 2022-09-05  6:37 ` Christoph Hellwig
  2022-09-06  2:13 ` Chao Leng
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2022-09-05 6:37 UTC (permalink / raw)
To: Chao Leng; +Cc: Martin K. Petersen, Max Gurtovoy, linux-nvme, hch, sagi, kbusch

On Tue, Aug 30, 2022 at 08:21:29PM +0800, Chao Leng wrote:
> The checksum is only used between host_buff -> host HBA, and that window
> is very short. If the hardware supports this, it is useful for reducing
> CPU utilization, and the data integrity can be acceptable, as it is for
> SCSI.

No, that is not how _end to end_ data protection works.

And as said before, the IP checksums are not actually specified in NVMe
to start with.

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-09-05  6:37 ` Christoph Hellwig
@ 2022-09-06  2:13 ` Chao Leng
  2022-09-06  6:59 ` Christoph Hellwig
  0 siblings, 1 reply; 19+ messages in thread
From: Chao Leng @ 2022-09-06 2:13 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Martin K. Petersen, Max Gurtovoy, linux-nvme, sagi, kbusch

On 2022/9/5 14:37, Christoph Hellwig wrote:
> On Tue, Aug 30, 2022 at 08:21:29PM +0800, Chao Leng wrote:
>> The checksum is only used between host_buff -> host HBA, and that window
>> is very short. If the hardware supports this, it is useful for reducing
>> CPU utilization, and the data integrity can be acceptable, as it is for
>> SCSI.
>
> No, that is not how _end to end_ data protection works.
Yes, the end-to-end DIF definition is:
  host_buf -----crc-----> host HBA -----crc-----> target
But to reduce CPU utilization, DIX defines:
  host_buf --checksum---> host HBA -----crc-----> target
The SCSI-related code is implemented according to the above definition.
>
> And as said before, the IP checksums are not actually specified in NVMe
> to start with.
Although the NVMe protocol does not define in detail how DIX should be
supported, the NVMe base spec says:
  Additionally, support has been added for many Enterprise capabilities
  like end-to-end data protection (compatible with SCSI Protection
  Information, commonly known as T10 DIF, and SNIA DIX standards),
  enhanced error reporting, and virtualization.

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-09-06  2:13 ` Chao Leng
@ 2022-09-06  6:59 ` Christoph Hellwig
  2022-09-06  9:34 ` Max Gurtovoy
  2022-09-06 10:13 ` Chao Leng
  1 sibling, 2 replies; 19+ messages in thread
From: Christoph Hellwig @ 2022-09-06 6:59 UTC (permalink / raw)
To: Chao Leng
Cc: Christoph Hellwig, Martin K. Petersen, Max Gurtovoy, linux-nvme, sagi, kbusch

On Tue, Sep 06, 2022 at 10:13:42AM +0800, Chao Leng wrote:
> Although the NVMe protocol does not define in detail how DIX should be
> supported,

NVMe describes how protection information works very well. It does not
use "DIX" at all to reference those features. And remember that even
for SCSI, DIX is not a normative on-the-wire protocol, but just an
interface for HBAs on how to allow the host to interact with the actual
wire protocol.

> the NVMe base spec says:
>   Additionally, support has been added for many Enterprise capabilities
>   like end-to-end data protection (compatible with SCSI Protection
>   Information, commonly known as T10 DIF, and SNIA DIX standards),
>   enhanced error reporting, and virtualization.

None of which is normative language.

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-09-06  6:59 ` Christoph Hellwig
@ 2022-09-06  9:34 ` Max Gurtovoy
  2022-09-06  9:35 ` Christoph Hellwig
  0 siblings, 1 reply; 19+ messages in thread
From: Max Gurtovoy @ 2022-09-06 9:34 UTC (permalink / raw)
To: Christoph Hellwig, Chao Leng; +Cc: Martin K. Petersen, linux-nvme, sagi, kbusch

On 9/6/2022 9:59 AM, Christoph Hellwig wrote:
> On Tue, Sep 06, 2022 at 10:13:42AM +0800, Chao Leng wrote:
>> Although the NVMe protocol does not define in detail how DIX should be
>> supported,
> NVMe describes how protection information works very well. It does not
> use "DIX" at all to reference those features. And remember that even
> for SCSI, DIX is not a normative on-the-wire protocol, but just an
> interface for HBAs on how to allow the host to interact with the actual
> wire protocol.

Maybe we can extend the spec and say that, for Fabrics only, we can use
IP_CSUM as protection for "host_buf --------> host HBA".

On the wire we'll use extended LBA with CRC as defined today.

>> the NVMe base spec says:
>>   Additionally, support has been added for many Enterprise capabilities
>>   like end-to-end data protection (compatible with SCSI Protection
>>   Information, commonly known as T10 DIF, and SNIA DIX standards),
>>   enhanced error reporting, and virtualization.
> None of which is normative language.

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-09-06  9:34 ` Max Gurtovoy
@ 2022-09-06  9:35 ` Christoph Hellwig
  2022-09-06 10:05 ` Max Gurtovoy
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2022-09-06 9:35 UTC (permalink / raw)
To: Max Gurtovoy
Cc: Christoph Hellwig, Chao Leng, Martin K. Petersen, linux-nvme, sagi, kbusch

On Tue, Sep 06, 2022 at 12:34:13PM +0300, Max Gurtovoy wrote:
> Maybe we can extend the spec and say that, for Fabrics only, we can use
> IP_CSUM as protection for "host_buf --------> host HBA".
>
> On the wire we'll use extended LBA with CRC as defined today.

Please read up on the concept of end to end data protection again.

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-09-06  9:35 ` Christoph Hellwig
@ 2022-09-06 10:05 ` Max Gurtovoy
  0 siblings, 0 replies; 19+ messages in thread
From: Max Gurtovoy @ 2022-09-06 10:05 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Chao Leng, Martin K. Petersen, linux-nvme, sagi, kbusch

On 9/6/2022 12:35 PM, Christoph Hellwig wrote:
> On Tue, Sep 06, 2022 at 12:34:13PM +0300, Max Gurtovoy wrote:
>> Maybe we can extend the spec and say that, for Fabrics only, we can use
>> IP_CSUM as protection for "host_buf --------> host HBA".
>>
>> On the wire we'll use extended LBA with CRC as defined today.
> Please read up on the concept of end to end data protection again.

The current spec doesn't allow the IP_CSUM. This is clear in:

"In addition to a CRC-16, DIX also specifies an optional IP checksum
that is not supported by the NVM Express interface."

I'm saying that we can extend it to support it.

For now, it only shows the following path:

  HOST ---> [CTLR ---> NVM]

but it can be extended to something like:

  HOST ---> HOST_HBA ---> TARGET_HBA ---> NVM

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-09-06  6:59 ` Christoph Hellwig
@ 2022-09-06 10:13 ` Chao Leng
  1 sibling, 0 replies; 19+ messages in thread
From: Chao Leng @ 2022-09-06 10:13 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Martin K. Petersen, Max Gurtovoy, linux-nvme, sagi, kbusch

On 2022/9/6 14:59, Christoph Hellwig wrote:
> On Tue, Sep 06, 2022 at 10:13:42AM +0800, Chao Leng wrote:
>> Although the NVMe protocol does not define in detail how DIX should be
>> supported,
>
> NVMe describes how protection information works very well. It does not
> use "DIX" at all to reference those features. And remember that even
> for SCSI, DIX is not a normative on-the-wire protocol, but just an
> interface for HBAs on how to allow the host to interact with the actual
> wire protocol.
Why can't NVMe implement a similar mechanism to SCSI's? From the actual
tests, it is very useful for reducing CPU utilization.
>
>> the NVMe base spec says:
>>   Additionally, support has been added for many Enterprise capabilities
>>   like end-to-end data protection (compatible with SCSI Protection
>>   Information, commonly known as T10 DIF, and SNIA DIX standards),
>>   enhanced error reporting, and virtualization.
>
> None of which is normative language.

^ permalink raw reply	[flat|nested] 19+ messages in thread
* Re: [PATCH] nvme: add DIX support for nvme-rdma
  2022-08-29 14:56 ` Max Gurtovoy
  2022-08-29 15:10 ` Keith Busch
  2022-08-30  2:38 ` Martin K. Petersen
@ 2022-08-30 12:15 ` Chao Leng
  2 siblings, 0 replies; 19+ messages in thread
From: Chao Leng @ 2022-08-30 12:15 UTC (permalink / raw)
To: Max Gurtovoy, linux-nvme; +Cc: hch, sagi, kbusch

On 2022/8/29 22:56, Max Gurtovoy wrote:
>
> On 8/29/2022 4:16 PM, Chao Leng wrote:
>> On 2022/8/29 18:43, Max Gurtovoy wrote:
>>> You can mention also that in iser, if supported, the default is to
>>> use IP_CHECKSUM for DIX and not CRC.
>> According to the DIX definition, DIX uses IP_CHECKSUM.
>> To reduce CPU utilization, the end-to-end data protection for SCSI is
>> DIX-DIF when supported by hardware.
>
> From what I recall, DIX was protection between host_buff -> host_device
> and DIF was protection between host_device -> target_device.
Yes, it is correct.
>
> If it's now defined as DIX == IP_CHECKSUM and DIF == CRC, please
> mention it somehow in the commit message.
The commit message is not suitable for a detailed description.
>
> Something like:
>
> "
> Some RDMA HBAs already support both DIX (IP_CHECKSUM) and DIF (CRC)
> offloads. Therefore, for host buffer to host HBA it's preferred to use
> DIX to get better CPU utilization.
> Add DIX for host buffer to host HBA integrity, if supported by the
> underlying hardware.
> The wire domain protocol for data integrity was not changed (it will
> keep using DIF).
>
> Test results (add a table please):
> "

[full quoted patch snipped]

^ permalink raw reply	[flat|nested] 19+ messages in thread
end of thread, other threads:[~2022-09-06 11:15 UTC | newest]

Thread overview: 19+ messages
2022-08-29  8:12 [PATCH] nvme: add DIX support for nvme-rdma Chao Leng
2022-08-29  9:02 ` Sagi Grimberg
2022-08-29 10:43 ` Max Gurtovoy
2022-08-29 13:16   ` Chao Leng
2022-08-29 14:56     ` Max Gurtovoy
2022-08-29 15:10       ` Keith Busch
2022-08-29 23:47         ` Max Gurtovoy
2022-08-30 12:18         ` Chao Leng
2022-08-30 14:49           ` Keith Busch
2022-08-30  2:38       ` Martin K. Petersen
2022-08-30 12:21         ` Chao Leng
2022-09-05  6:37           ` Christoph Hellwig
2022-09-06  2:13             ` Chao Leng
2022-09-06  6:59               ` Christoph Hellwig
2022-09-06  9:34                 ` Max Gurtovoy
2022-09-06  9:35                   ` Christoph Hellwig
2022-09-06 10:05                     ` Max Gurtovoy
2022-09-06 10:13                 ` Chao Leng
2022-08-30 12:15       ` Chao Leng