* [PATCH 1/2] nvme/host: move nvme_ns_info into nvme header file
2025-12-21 21:26 [PATCH 0/2] nvme/host: perform delayed retries upon non-fatal error received during nvme namespace validation Alex Tran
@ 2025-12-21 21:26 ` Alex Tran
2025-12-21 21:26 ` [PATCH 2/2] nvme/host: add delayed retries upon non-fatal error during ns validation Alex Tran
1 sibling, 0 replies; 5+ messages in thread
From: Alex Tran @ 2025-12-21 21:26 UTC (permalink / raw)
To: Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg
Cc: linux-nvme, linux-kernel, Alex Tran
The nvme_ns_info struct is moved into the nvme header file.
This is done because the following patch embeds it as a field in the
nvme_ns struct to store the pending namespace info.
Signed-off-by: Alex Tran <alex.t.tran@gmail.com>
---
drivers/nvme/host/core.c | 15 ---------------
drivers/nvme/host/nvme.h | 15 +++++++++++++++
2 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7bf228df6001f1f4d0b3c570de285a5eb17bb08e..fab321e79b7cdbb89d96d950c1cc8c1128906770 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -33,21 +33,6 @@
#define NVME_MINORS (1U << MINORBITS)
-struct nvme_ns_info {
- struct nvme_ns_ids ids;
- u32 nsid;
- __le32 anagrpid;
- u8 pi_offset;
- u16 endgid;
- u64 runs;
- bool is_shared;
- bool is_readonly;
- bool is_ready;
- bool is_removed;
- bool is_rotational;
- bool no_vwc;
-};
-
unsigned int admin_timeout = 60;
module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9a5f28c5103c5c42777bd9309a983ef0196c1b95..ff4e7213131298a1a019eaa3822ca26f857b2443 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -525,6 +525,21 @@ enum nvme_ns_features {
NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeroes supported */
};
+struct nvme_ns_info {
+ struct nvme_ns_ids ids;
+ u32 nsid;
+ __le32 anagrpid;
+ u8 pi_offset;
+ u16 endgid;
+ u64 runs;
+ bool is_shared;
+ bool is_readonly;
+ bool is_ready;
+ bool is_removed;
+ bool is_rotational;
+ bool no_vwc;
+};
+
struct nvme_ns {
struct list_head list;
--
2.51.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 2/2] nvme/host: add delayed retries upon non-fatal error during ns validation
2025-12-21 21:26 [PATCH 0/2] nvme/host: perform delayed retries upon non-fatal error received during nvme namespace validation Alex Tran
2025-12-21 21:26 ` [PATCH 1/2] nvme/host: move nvme_ns_info into nvme header file Alex Tran
@ 2025-12-21 21:26 ` Alex Tran
2025-12-25 13:00 ` Sagi Grimberg
1 sibling, 1 reply; 5+ messages in thread
From: Alex Tran @ 2025-12-21 21:26 UTC (permalink / raw)
To: Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg
Cc: linux-nvme, linux-kernel, Alex Tran
If a non-fatal error is received during nvme namespace validation, it
should not be ignored and the namespace should not be removed immediately.
Rather, delayed retries should be performed on the namespace validation
process.
This handles non-fatal issues more robustly, by retrying a few times before
giving up and removing the namespace. The number of retries is set
to 3 and the interval between retries is set to 3 seconds.
Signed-off-by: Alex Tran <alex.t.tran@gmail.com>
---
drivers/nvme/host/core.c | 43 +++++++++++++++++++++++++++++++++++++++----
drivers/nvme/host/nvme.h | 9 +++++++++
2 files changed, 48 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fab321e79b7cdbb89d96d950c1cc8c1128906770..2e208d894b27f85f7f6358eb697be262ce45aed6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -139,6 +139,7 @@ static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
struct nvme_command *cmd);
static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
+static void nvme_validate_ns_work(struct work_struct *work);
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
@@ -4118,6 +4119,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
ns->ctrl = ctrl;
kref_init(&ns->kref);
+ INIT_DELAYED_WORK(&ns->validate_work, nvme_validate_ns_work);
+
if (nvme_init_ns_head(ns, info))
goto out_cleanup_disk;
@@ -4215,6 +4218,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
{
bool last_path = false;
+ cancel_delayed_work_sync(&ns->validate_work);
+
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
@@ -4285,12 +4290,42 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
out:
/*
* Only remove the namespace if we got a fatal error back from the
- * device, otherwise ignore the error and just move on.
- *
- * TODO: we should probably schedule a delayed retry here.
+ * device, otherwise delayed retries are performed.
*/
- if (ret > 0 && (ret & NVME_STATUS_DNR))
+ if (ret > 0 && (ret & NVME_STATUS_DNR)) {
nvme_ns_remove(ns);
+ } else if (ret > 0) {
+ if (ns->validate_retries < NVME_NS_VALIDATION_MAX_RETRIES) {
+ ns->validate_retries++;
+
+ if (!nvme_get_ns(ns))
+ return;
+
+ dev_warn(
+ ns->ctrl->device,
+ "validation failed for nsid %d, retry %d/%d in %ds\n",
+ ns->head->ns_id, ns->validate_retries,
+ NVME_NS_VALIDATION_MAX_RETRIES,
+ NVME_NS_VALIDATION_RETRY_INTERVAL);
+ memcpy(&ns->pending_info, info, sizeof(*info));
+ schedule_delayed_work(
+ &ns->validate_work,
+ NVME_NS_VALIDATION_RETRY_INTERVAL * HZ);
+ } else {
+ dev_err(ns->ctrl->device,
+ "validation failed for nsid %d after %d retries\n",
+ ns->head->ns_id,
+ NVME_NS_VALIDATION_MAX_RETRIES);
+ }
+ }
+}
+
+static void nvme_validate_ns_work(struct work_struct *work)
+{
+ struct nvme_ns *ns = container_of(to_delayed_work(work), struct nvme_ns,
+ validate_work);
+ nvme_validate_ns(ns, &ns->pending_info);
+ nvme_put_ns(ns);
}
static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ff4e7213131298a1a019eaa3822ca26f857b2443..17a4123e5e4da9828ef5662acca54e6aa9fd3cb9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -46,6 +46,12 @@ extern unsigned int admin_timeout;
#define NVME_CTRL_PAGE_SHIFT 12
#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT)
+/*
+ * Default to 3 retries in intervals of 3 seconds for namespace validation
+ */
+#define NVME_NS_VALIDATION_MAX_RETRIES 3
+#define NVME_NS_VALIDATION_RETRY_INTERVAL 3
+
extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
@@ -565,6 +571,9 @@ struct nvme_ns {
struct device cdev_device;
struct nvme_fault_inject fault_inject;
+ struct delayed_work validate_work;
+ struct nvme_ns_info pending_info;
+ unsigned int validate_retries;
};
/* NVMe ns supports metadata actions by the controller (generate/strip) */
--
2.51.0
^ permalink raw reply related [flat|nested] 5+ messages in thread