* [PATCH 1/4] nvme: update the explanation for not updating the limits in nvme_config_discard
2023-12-26 8:58 nvme discard limits fix and tidy ups Christoph Hellwig
@ 2023-12-26 8:58 ` Christoph Hellwig
2023-12-26 8:58 ` [PATCH 2/4] nvme: also skip discard granularity updates " Christoph Hellwig
` (3 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2023-12-26 8:58 UTC (permalink / raw)
To: Keith Busch, Sagi Grimberg; +Cc: linux-nvme
Expeand the comment a bit to explain what is going on.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/nvme/host/core.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d144d1acb09a05..56107cfc97b7bc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1743,7 +1743,13 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
queue->limits.discard_granularity = size;
- /* If discard is already enabled, don't reset queue limits */
+ /*
+ * If discard is already enabled, don't reset queue limits.
+ *
+ * This works around the fact that the block layer can't cope well with
+ * updating the hardware limits when overridden through sysfs. This is
+ * harmless because discard limits in NVMe are purely advisory.
+ */
if (queue->limits.max_discard_sectors)
return;
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 2/4] nvme: also skip discard granularity updates in nvme_config_discard
2023-12-26 8:58 nvme discard limits fix and tidy ups Christoph Hellwig
2023-12-26 8:58 ` [PATCH 1/4] nvme: update the explanation for not updating the limits in nvme_config_discard Christoph Hellwig
@ 2023-12-26 8:58 ` Christoph Hellwig
2023-12-26 23:18 ` Max Gurtovoy
2023-12-26 8:58 ` [PATCH 3/4] nvme: fix max_discard_sectors calculation Christoph Hellwig
` (2 subsequent siblings)
4 siblings, 1 reply; 9+ messages in thread
From: Christoph Hellwig @ 2023-12-26 8:58 UTC (permalink / raw)
To: Keith Busch, Sagi Grimberg; +Cc: linux-nvme
Don't just skip the discard sectors and segments but also the granularity
if a value was already set before.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/nvme/host/core.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 56107cfc97b7bc..6c52b0ab382c85 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1727,7 +1727,6 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
struct nvme_ns_head *head)
{
struct request_queue *queue = disk->queue;
- u32 size = queue_logical_block_size(queue);
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX))
ctrl->max_discard_sectors =
@@ -1741,8 +1740,6 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- queue->limits.discard_granularity = size;
-
/*
* If discard is already enabled, don't reset queue limits.
*
@@ -1755,6 +1752,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
+ queue->limits.discard_granularity = queue_logical_block_size(queue);
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH 2/4] nvme: also skip discard granularity updates in nvme_config_discard
2023-12-26 8:58 ` [PATCH 2/4] nvme: also skip discard granularity updates " Christoph Hellwig
@ 2023-12-26 23:18 ` Max Gurtovoy
2023-12-27 8:07 ` Christoph Hellwig
0 siblings, 1 reply; 9+ messages in thread
From: Max Gurtovoy @ 2023-12-26 23:18 UTC (permalink / raw)
To: Christoph Hellwig, Keith Busch, Sagi Grimberg; +Cc: linux-nvme
On 26/12/2023 10:58, Christoph Hellwig wrote:
> Don't just skip the discard sectors and segments but also the granularity
> if a value was already set before.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> drivers/nvme/host/core.c | 4 +---
> 1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 56107cfc97b7bc..6c52b0ab382c85 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1727,7 +1727,6 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
> struct nvme_ns_head *head)
> {
> struct request_queue *queue = disk->queue;
> - u32 size = queue_logical_block_size(queue);
>
> if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX))
> ctrl->max_discard_sectors =
> @@ -1741,8 +1740,6 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
> BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
> NVME_DSM_MAX_RANGES);
>
> - queue->limits.discard_granularity = size;
> -
> /*
> * If discard is already enabled, don't reset queue limits.
> *
> @@ -1755,6 +1752,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
>
> blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
> blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
> + queue->limits.discard_granularity = queue_logical_block_size(queue);
>
> if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
> blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
maybe worth to add a small helper like we have for other discard limits:
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0046b447268f..b7eee29ca8ec 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -369,6 +369,17 @@ void blk_queue_zone_write_granularity(struct
request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
+/**
+ * blk_queue_discard_granularity - set discard granularity for the queue
+ * @q: the request queue for the device
+ * @size: the discard granularity size, in bytes
+ */
+void blk_queue_discard_granularity(struct request_queue *q, unsigned
int size)
+{
+ q->limits.discard_granularity = size;
+}
+EXPORT_SYMBOL_GPL(blk_queue_discard_granularity);
+
/**
* blk_queue_alignment_offset - set physical block alignment offset
* @q: the request queue for the device
otherwise looks good,
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH 2/4] nvme: also skip discard granularity updates in nvme_config_discard
2023-12-26 23:18 ` Max Gurtovoy
@ 2023-12-27 8:07 ` Christoph Hellwig
0 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2023-12-27 8:07 UTC (permalink / raw)
To: Max Gurtovoy; +Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, linux-nvme
On Wed, Dec 27, 2023 at 01:18:13AM +0200, Max Gurtovoy wrote:
> maybe worth to add a small helper like we have for other discard limits:
I'd rather not add more wrappers like this. They dont really
scale, so I'm looking into atomic bulk updates of all settings right
now, and the last I need is random churn due to a new helper..
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 3/4] nvme: fix max_discard_sectors calculation
2023-12-26 8:58 nvme discard limits fix and tidy ups Christoph Hellwig
2023-12-26 8:58 ` [PATCH 1/4] nvme: update the explanation for not updating the limits in nvme_config_discard Christoph Hellwig
2023-12-26 8:58 ` [PATCH 2/4] nvme: also skip discard granularity updates " Christoph Hellwig
@ 2023-12-26 8:58 ` Christoph Hellwig
2024-01-02 17:28 ` Keith Busch
2023-12-26 8:58 ` [PATCH 4/4] nvme: simplify the max_discard_segments calculation Christoph Hellwig
2024-01-02 21:36 ` nvme discard limits fix and tidy ups Keith Busch
4 siblings, 1 reply; 9+ messages in thread
From: Christoph Hellwig @ 2023-12-26 8:58 UTC (permalink / raw)
To: Keith Busch, Sagi Grimberg; +Cc: linux-nvme
ctrl->max_discard_sectors stores a value that is potentially based of
the DMRSL field in Identify Controller, which is in units of LBAs and
thus dependent on the Format of a namespace.
Fix this by moving the calculation of max_discard_sectors entirely
into nvme_config_discard and replacing the ctrl->max_discard_sectors
value with a local variable so that the calculation is always
namespace-specific.
Fixes: 1a86924e4f46 ("nvme: fix interpretation of DMRSL")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/nvme/host/core.c | 20 +++++++++-----------
drivers/nvme/host/nvme.h | 1 -
2 files changed, 9 insertions(+), 12 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6c52b0ab382c85..86b9a1c4876f5f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1727,12 +1727,13 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
struct nvme_ns_head *head)
{
struct request_queue *queue = disk->queue;
+ u32 max_discard_sectors;
- if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX))
- ctrl->max_discard_sectors =
- nvme_lba_to_sect(head, ctrl->dmrsl);
-
- if (ctrl->max_discard_sectors == 0) {
+ if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
+ max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
+ } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
+ max_discard_sectors = UINT_MAX;
+ } else {
blk_queue_max_discard_sectors(queue, 0);
return;
}
@@ -1750,7 +1751,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
if (queue->limits.max_discard_sectors)
return;
- blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
+ blk_queue_max_discard_sectors(queue, max_discard_sectors);
blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
queue->limits.discard_granularity = queue_logical_block_size(queue);
@@ -2911,13 +2912,10 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
struct nvme_id_ctrl_nvm *id;
int ret;
- if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
- ctrl->max_discard_sectors = UINT_MAX;
+ if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
- } else {
- ctrl->max_discard_sectors = 0;
+ else
ctrl->max_discard_segments = 0;
- }
/*
* Even though NVMe spec explicitly states that MDTS is not applicable
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 3dbd187896d8d8..9a698c49ea0364 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -297,7 +297,6 @@ struct nvme_ctrl {
u32 max_hw_sectors;
u32 max_segments;
u32 max_integrity_segments;
- u32 max_discard_sectors;
u32 max_discard_segments;
u32 max_zeroes_sectors;
#ifdef CONFIG_BLK_DEV_ZONED
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH 3/4] nvme: fix max_discard_sectors calculation
2023-12-26 8:58 ` [PATCH 3/4] nvme: fix max_discard_sectors calculation Christoph Hellwig
@ 2024-01-02 17:28 ` Keith Busch
0 siblings, 0 replies; 9+ messages in thread
From: Keith Busch @ 2024-01-02 17:28 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Sagi Grimberg, linux-nvme
On Tue, Dec 26, 2023 at 08:58:43AM +0000, Christoph Hellwig wrote:
> ctrl->max_discard_sectors stores a value that is potentially based of
> the DMRSL field in Identify Controller, which is in units of LBAs and
> thus dependent on the Format of a namespace.
Is it just me, or is this one of the more annoying conventions in NVMe
standard: controller scoped identification fields are in units of
namespace specific formats?! Why not define these in namespace
identifications, or provide a fixed unit size? As it is now, these types
of fields would be in units of the largest LBA size of any attached
namespace, and that's kind of awkward in a multi-namespace environment.
> @@ -1750,7 +1751,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
> if (queue->limits.max_discard_sectors)
> return;
Since DMRSL can change when you format a namespace to a new LBA size,
might the old limit be using the wrong max_discard_size if we skip
updating it here? Probably not a big deal.
And let's say the user explicitly disabled max_discard_sectors through
sysfs. The next driver namespace rescan will turn it back on, probably
against the user's expectations. Should this limit check use
'max_hw_discard_sectors' instead?
Anyway, the driver was like that already, and the series looks good to
me. I just noticed this weirdness while reviewing.
> - blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
> + blk_queue_max_discard_sectors(queue, max_discard_sectors);
> blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
> queue->limits.discard_granularity = queue_logical_block_size(queue);
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 4/4] nvme: simplify the max_discard_segments calculation
2023-12-26 8:58 nvme discard limits fix and tidy ups Christoph Hellwig
` (2 preceding siblings ...)
2023-12-26 8:58 ` [PATCH 3/4] nvme: fix max_discard_sectors calculation Christoph Hellwig
@ 2023-12-26 8:58 ` Christoph Hellwig
2024-01-02 21:36 ` nvme discard limits fix and tidy ups Keith Busch
4 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2023-12-26 8:58 UTC (permalink / raw)
To: Keith Busch, Sagi Grimberg; +Cc: linux-nvme
Just stash away the DMRL value in the nvme_ctrl struture, and leave
all interpretation to nvme_config_discard, where we know DSM is
supported by the time we're configuring the number of segments.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/nvme/host/core.c | 13 +++++--------
drivers/nvme/host/nvme.h | 2 +-
2 files changed, 6 insertions(+), 9 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 86b9a1c4876f5f..50818dbcfa1ae1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1752,7 +1752,10 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
return;
blk_queue_max_discard_sectors(queue, max_discard_sectors);
- blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
+ if (ctrl->dmrl)
+ blk_queue_max_discard_segments(queue, ctrl->dmrl);
+ else
+ blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
queue->limits.discard_granularity = queue_logical_block_size(queue);
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
@@ -2912,11 +2915,6 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
struct nvme_id_ctrl_nvm *id;
int ret;
- if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
- ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
- else
- ctrl->max_discard_segments = 0;
-
/*
* Even though NVMe spec explicitly states that MDTS is not applicable
* to the write-zeroes, we are cautious and limit the size to the
@@ -2946,8 +2944,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
if (ret)
goto free_data;
- if (id->dmrl)
- ctrl->max_discard_segments = id->dmrl;
+ ctrl->dmrl = id->dmrl;
ctrl->dmrsl = le32_to_cpu(id->dmrsl);
if (id->wzsl)
ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9a698c49ea0364..297b80430f1b69 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -297,13 +297,13 @@ struct nvme_ctrl {
u32 max_hw_sectors;
u32 max_segments;
u32 max_integrity_segments;
- u32 max_discard_segments;
u32 max_zeroes_sectors;
#ifdef CONFIG_BLK_DEV_ZONED
u32 max_zone_append;
#endif
u16 crdt[3];
u16 oncs;
+ u8 dmrl;
u32 dmrsl;
u16 oacs;
u16 sqsize;
--
2.39.2
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: nvme discard limits fix and tidy ups
2023-12-26 8:58 nvme discard limits fix and tidy ups Christoph Hellwig
` (3 preceding siblings ...)
2023-12-26 8:58 ` [PATCH 4/4] nvme: simplify the max_discard_segments calculation Christoph Hellwig
@ 2024-01-02 21:36 ` Keith Busch
4 siblings, 0 replies; 9+ messages in thread
From: Keith Busch @ 2024-01-02 21:36 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Sagi Grimberg, linux-nvme
On Tue, Dec 26, 2023 at 08:58:40AM +0000, Christoph Hellwig wrote:
> Hi all,
>
> this series contains a fix that makes sure that the discard limits
> are properly calculated based on the LBA size for each namespaces,
> and also tidies up some code around that.
Applied to nvme-6.8.
^ permalink raw reply [flat|nested] 9+ messages in thread