* [PATCH] nvme: introduce panic_on_double_cqe param
@ 2025-10-22 13:54 Guixin Liu
2025-10-23 5:14 ` Chaitanya Kulkarni
0 siblings, 1 reply; 5+ messages in thread
From: Guixin Liu @ 2025-10-22 13:54 UTC (permalink / raw)
To: Keith Busch, Jens Axboe, Sagi Grimberg; +Cc: linux-nvme
Add a new debug switch to control whether to trigger a kernel crash
when duplicate CQEs are detected, in order to preserve the kernel
context, such as sq, cq, and so on, for subsequent debugging and
analysis.
Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
---
drivers/nvme/host/core.c | 5 +++++
drivers/nvme/host/nvme.h | 3 +++
2 files changed, 8 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fa4181d7de73..7a3f9129a39c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -95,6 +95,11 @@ module_param(apst_secondary_latency_tol_us, ulong, 0644);
MODULE_PARM_DESC(apst_secondary_latency_tol_us,
"secondary APST latency tolerance in us");
+bool panic_on_double_cqe;
+EXPORT_SYMBOL_GPL(panic_on_double_cqe);
+module_param(panic_on_double_cqe, bool, 0644);
+MODULE_PARM_DESC(panic_on_double_cqe, "crash the kernel to save the scene");
+
/*
* Older kernels didn't enable protection information if it was at an offset.
* Newer kernels do, so it breaks reads on the upgrade if such formats were
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 102fae6a231c..24010d5d15ce 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -595,6 +595,8 @@ static inline u16 nvme_cid(struct request *rq)
return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag;
}
+extern bool panic_on_double_cqe;
+
static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
u16 command_id)
{
@@ -612,6 +614,7 @@ static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
dev_err(nvme_req(rq)->ctrl->device,
"request %#x genctr mismatch (got %#x expected %#x)\n",
tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr));
+ BUG_ON(panic_on_double_cqe);
return NULL;
}
return rq;
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] nvme: introduce panic_on_double_cqe param
2025-10-22 13:54 [PATCH] nvme: introduce panic_on_double_cqe param Guixin Liu
@ 2025-10-23 5:14 ` Chaitanya Kulkarni
2025-10-29 1:42 ` Guixin Liu
0 siblings, 1 reply; 5+ messages in thread
From: Chaitanya Kulkarni @ 2025-10-23 5:14 UTC (permalink / raw)
To: Guixin Liu, Keith Busch, Jens Axboe, Sagi Grimberg
Cc: linux-nvme@lists.infradead.org
On 10/22/25 6:54 AM, Guixin Liu wrote:
> Add a new debug switch to control whether to trigger a kernel crash
> when duplicate CQEs are detected, in order to preserve the kernel
> context, such as sq, cq, and so on, for subsequent debugging and
> analysis.
>
> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
> ---
> drivers/nvme/host/core.c | 5 +++++
> drivers/nvme/host/nvme.h | 3 +++
> 2 files changed, 8 insertions(+)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fa4181d7de73..7a3f9129a39c 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -95,6 +95,11 @@ module_param(apst_secondary_latency_tol_us, ulong, 0644);
> MODULE_PARM_DESC(apst_secondary_latency_tol_us,
> "secondary APST latency tolerance in us");
>
> +bool panic_on_double_cqe;
> +EXPORT_SYMBOL_GPL(panic_on_double_cqe);
> +module_param(panic_on_double_cqe, bool, 0644);
> +MODULE_PARM_DESC(panic_on_double_cqe, "crash the kernel to save the scene");
> +
> /*
> * Older kernels didn't enable protection information if it was at an offset.
> * Newer kernels do, so it breaks reads on the upgrade if such formats were
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 102fae6a231c..24010d5d15ce 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -595,6 +595,8 @@ static inline u16 nvme_cid(struct request *rq)
> return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag;
> }
>
> +extern bool panic_on_double_cqe;
> +
> static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
> u16 command_id)
> {
> @@ -612,6 +614,7 @@ static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
> dev_err(nvme_req(rq)->ctrl->device,
> "request %#x genctr mismatch (got %#x expected %#x)\n",
> tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr));
> + BUG_ON(panic_on_double_cqe);
> return NULL;
> }
> return rq;
I'm really not sure this is a good idea, I'll leave to others.
-ck
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] nvme: introduce panic_on_double_cqe param
2025-10-23 5:14 ` Chaitanya Kulkarni
@ 2025-10-29 1:42 ` Guixin Liu
2025-10-29 4:17 ` Chaitanya Kulkarni
0 siblings, 1 reply; 5+ messages in thread
From: Guixin Liu @ 2025-10-29 1:42 UTC (permalink / raw)
To: Chaitanya Kulkarni, Keith Busch, Jens Axboe, Sagi Grimberg
Cc: linux-nvme@lists.infradead.org
在 2025/10/23 13:14, Chaitanya Kulkarni 写道:
> On 10/22/25 6:54 AM, Guixin Liu wrote:
>> Add a new debug switch to control whether to trigger a kernel crash
>> when duplicate CQEs are detected, in order to preserve the kernel
>> context, such as sq, cq, and so on, for subsequent debugging and
>> analysis.
>>
>> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
>> ---
>> drivers/nvme/host/core.c | 5 +++++
>> drivers/nvme/host/nvme.h | 3 +++
>> 2 files changed, 8 insertions(+)
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index fa4181d7de73..7a3f9129a39c 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -95,6 +95,11 @@ module_param(apst_secondary_latency_tol_us, ulong, 0644);
>> MODULE_PARM_DESC(apst_secondary_latency_tol_us,
>> "secondary APST latency tolerance in us");
>>
>> +bool panic_on_double_cqe;
>> +EXPORT_SYMBOL_GPL(panic_on_double_cqe);
>> +module_param(panic_on_double_cqe, bool, 0644);
>> +MODULE_PARM_DESC(panic_on_double_cqe, "crash the kernel to save the scene");
>> +
>> /*
>> * Older kernels didn't enable protection information if it was at an offset.
>> * Newer kernels do, so it breaks reads on the upgrade if such formats were
>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>> index 102fae6a231c..24010d5d15ce 100644
>> --- a/drivers/nvme/host/nvme.h
>> +++ b/drivers/nvme/host/nvme.h
>> @@ -595,6 +595,8 @@ static inline u16 nvme_cid(struct request *rq)
>> return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag;
>> }
>>
>> +extern bool panic_on_double_cqe;
>> +
>> static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
>> u16 command_id)
>> {
>> @@ -612,6 +614,7 @@ static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
>> dev_err(nvme_req(rq)->ctrl->device,
>> "request %#x genctr mismatch (got %#x expected %#x)\n",
>> tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr));
>> + BUG_ON(panic_on_double_cqe);
>> return NULL;
>> }
>> return rq;
>
> I'm really not sure this is a good idea, I'll leave to others.
>
>
> -ck
Yeah, I think so too, and I'd also like to find a more elegant solution.
Best Regards,
Guixin Liu
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] nvme: introduce panic_on_double_cqe param
2025-10-29 1:42 ` Guixin Liu
@ 2025-10-29 4:17 ` Chaitanya Kulkarni
2025-10-29 5:03 ` Chaitanya Kulkarni
0 siblings, 1 reply; 5+ messages in thread
From: Chaitanya Kulkarni @ 2025-10-29 4:17 UTC (permalink / raw)
To: Guixin Liu
Cc: linux-nvme@lists.infradead.org, Jens Axboe, Sagi Grimberg,
Keith Busch
On 10/28/25 18:42, Guixin Liu wrote:
>
>
> 在 2025/10/23 13:14, Chaitanya Kulkarni 写道:
>> On 10/22/25 6:54 AM, Guixin Liu wrote:
>>> Add a new debug switch to control whether to trigger a kernel crash
>>> when duplicate CQEs are detected, in order to preserve the kernel
>>> context, such as sq, cq, and so on, for subsequent debugging and
>>> analysis.
>>>
>>> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
>>> ---
>>> drivers/nvme/host/core.c | 5 +++++
>>> drivers/nvme/host/nvme.h | 3 +++
>>> 2 files changed, 8 insertions(+)
>>>
>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>> index fa4181d7de73..7a3f9129a39c 100644
>>> --- a/drivers/nvme/host/core.c
>>> +++ b/drivers/nvme/host/core.c
>>> @@ -95,6 +95,11 @@ module_param(apst_secondary_latency_tol_us,
>>> ulong, 0644);
>>> MODULE_PARM_DESC(apst_secondary_latency_tol_us,
>>> "secondary APST latency tolerance in us");
>>> +bool panic_on_double_cqe;
>>> +EXPORT_SYMBOL_GPL(panic_on_double_cqe);
>>> +module_param(panic_on_double_cqe, bool, 0644);
>>> +MODULE_PARM_DESC(panic_on_double_cqe, "crash the kernel to save the
>>> scene");
>>> +
>>> /*
>>> * Older kernels didn't enable protection information if it was
>>> at an offset.
>>> * Newer kernels do, so it breaks reads on the upgrade if such
>>> formats were
>>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>> index 102fae6a231c..24010d5d15ce 100644
>>> --- a/drivers/nvme/host/nvme.h
>>> +++ b/drivers/nvme/host/nvme.h
>>> @@ -595,6 +595,8 @@ static inline u16 nvme_cid(struct request *rq)
>>> return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag;
>>> }
>>> +extern bool panic_on_double_cqe;
>>> +
>>> static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
>>> u16 command_id)
>>> {
>>> @@ -612,6 +614,7 @@ static inline struct request
>>> *nvme_find_rq(struct blk_mq_tags *tags,
>>> dev_err(nvme_req(rq)->ctrl->device,
>>> "request %#x genctr mismatch (got %#x expected %#x)\n",
>>> tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr));
>>> + BUG_ON(panic_on_double_cqe);
>>> return NULL;
>>> }
>>> return rq;
>>
>> I'm really not sure this is a good idea, I'll leave to others.
>>
>>
>> -ck
> Yeah, I think so too, and I'd also like to find a more elegant solution.
>
> Best Regards,
> Guixin Liu
>
What about logging the necessary information and still continuing the
setup ?
When you are debugging you can always setup a breakpoint there and get
the system state in the breakpoint, something like following totally
untested :-
Usage:
# Enable at runtime
echo 'module nvme +p' > /sys/kernel/debug/dynamic_debug/control
# Or at module load
modprobe nvme dyndbg='+p'
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index
c916176bd9f0..a9f9c61d3fc9 100644 --- a/drivers/nvme/host/pci.c +++
b/drivers/nvme/host/pci.c @@ -1294,6 +1294,50 @@ static inline struct
blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq) return
nvmeq->dev->tagset.tags[nvmeq->qid - 1]; } +/** + *
nvme_handle_duplicate_cqe - Handle duplicate CQE detection + * @nvmeq:
The queue where duplicate was detected + * @cqe: The completion queue
entry + * @command_id: The command ID from the CQE + * + * Logs detailed
information about the duplicate CQE including CQE details + * and full
queue state to aid in debugging hardware or firmware issues. + */
+static void nvme_handle_duplicate_cqe(struct nvme_queue *nvmeq, +
struct nvme_completion *cqe, + u16 command_id) +{ + struct nvme_dev *dev
= nvmeq->dev; + + dev_dbg(dev->ctrl.device, + "Duplicate/Invalid CQE
detected:\n" + " Queue: %d (SQ ID: %u)\n" + " Command ID: %u\n" + "
Status: %#x\n" + " Result: %#llx\n" + "Queue state:\n" + " Queue depth:
%u\n" + " SQ tail: %u, last_sq_tail: %u\n" + " CQ head: %u, phase: %u\n"
+ " CQ vector: %u\n" + " SQ DMA addr: %pad\n" + " CQ DMA addr: %pad\n" +
" Flags: %#lx (enabled=%d, sq_cmb=%d, polled=%d)\n", + nvmeq->qid,
le16_to_cpu(cqe->sq_id), + command_id, le16_to_cpu(cqe->status), +
le64_to_cpu(cqe->result.u64), + nvmeq->q_depth, + nvmeq->sq_tail,
nvmeq->last_sq_tail, + nvmeq->cq_head, nvmeq->cq_phase, +
nvmeq->cq_vector, + &nvmeq->sq_dma_addr, + &nvmeq->cq_dma_addr, +
nvmeq->flags, + test_bit(NVMEQ_ENABLED, &nvmeq->flags), +
test_bit(NVMEQ_SQ_CMB, &nvmeq->flags), + test_bit(NVMEQ_POLLED,
&nvmeq->flags)); +} + static inline void nvme_handle_cqe(struct
nvme_queue *nvmeq, struct io_comp_batch *iob, u16 idx) { @@ -1315,9
+1359,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); if
(unlikely(!req)) { - dev_warn(nvmeq->dev->ctrl.device, - "invalid id %d
completed on queue %d\n", - command_id, le16_to_cpu(cqe->sq_id)); +
nvme_handle_duplicate_cqe(nvmeq, cqe, command_id); return; } -ck
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] nvme: introduce panic_on_double_cqe param
2025-10-29 4:17 ` Chaitanya Kulkarni
@ 2025-10-29 5:03 ` Chaitanya Kulkarni
0 siblings, 0 replies; 5+ messages in thread
From: Chaitanya Kulkarni @ 2025-10-29 5:03 UTC (permalink / raw)
To: Guixin Liu
Cc: linux-nvme@lists.infradead.org, Jens Axboe, Sagi Grimberg,
Keith Busch
On 10/28/25 21:17, Chaitanya Kulkarni wrote:
> On 10/28/25 18:42, Guixin Liu wrote:
>>
>>
>> 在 2025/10/23 13:14, Chaitanya Kulkarni 写道:
>>> On 10/22/25 6:54 AM, Guixin Liu wrote:
>>>> Add a new debug switch to control whether to trigger a kernel crash
>>>> when duplicate CQEs are detected, in order to preserve the kernel
>>>> context, such as sq, cq, and so on, for subsequent debugging and
>>>> analysis.
>>>>
>>>> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
>>>> ---
>>>> drivers/nvme/host/core.c | 5 +++++
>>>> drivers/nvme/host/nvme.h | 3 +++
>>>> 2 files changed, 8 insertions(+)
>>>>
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>> index fa4181d7de73..7a3f9129a39c 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>> @@ -95,6 +95,11 @@ module_param(apst_secondary_latency_tol_us,
>>>> ulong, 0644);
>>>> MODULE_PARM_DESC(apst_secondary_latency_tol_us,
>>>> "secondary APST latency tolerance in us");
>>>> +bool panic_on_double_cqe;
>>>> +EXPORT_SYMBOL_GPL(panic_on_double_cqe);
>>>> +module_param(panic_on_double_cqe, bool, 0644);
>>>> +MODULE_PARM_DESC(panic_on_double_cqe, "crash the kernel to save
>>>> the scene");
>>>> +
>>>> /*
>>>> * Older kernels didn't enable protection information if it was
>>>> at an offset.
>>>> * Newer kernels do, so it breaks reads on the upgrade if such
>>>> formats were
>>>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>>> index 102fae6a231c..24010d5d15ce 100644
>>>> --- a/drivers/nvme/host/nvme.h
>>>> +++ b/drivers/nvme/host/nvme.h
>>>> @@ -595,6 +595,8 @@ static inline u16 nvme_cid(struct request *rq)
>>>> return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag;
>>>> }
>>>> +extern bool panic_on_double_cqe;
>>>> +
>>>> static inline struct request *nvme_find_rq(struct blk_mq_tags
>>>> *tags,
>>>> u16 command_id)
>>>> {
>>>> @@ -612,6 +614,7 @@ static inline struct request
>>>> *nvme_find_rq(struct blk_mq_tags *tags,
>>>> dev_err(nvme_req(rq)->ctrl->device,
>>>> "request %#x genctr mismatch (got %#x expected %#x)\n",
>>>> tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr));
>>>> + BUG_ON(panic_on_double_cqe);
>>>> return NULL;
>>>> }
>>>> return rq;
>>>
>>> I'm really not sure this is a good idea, I'll leave to others.
>>>
>>>
>>> -ck
>> Yeah, I think so too, and I'd also like to find a more elegant solution.
>>
>> Best Regards,
>> Guixin Liu
>>
>
> What about logging the necessary information and still continuing the
> setup ?
> When you are debugging you can always setup a breakpoint there and get
> the system state in the breakpoint, something like following totally
> untested :-
>
> Usage:
> # Enable at runtime
> echo 'module nvme +p' > /sys/kernel/debug/dynamic_debug/control
>
> # Or at module load
> modprobe nvme dyndbg='+p'
>
something is wrong with my mail trying it again :-
drivers/nvme/host/pci.c | 48 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 45 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c916176bd9f0..a9f9c61d3fc9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1294,6 +1294,50 @@ static inline struct blk_mq_tags
*nvme_queue_tagset(struct nvme_queue *nvmeq)
return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
}
+/**
+ * nvme_handle_duplicate_cqe - Handle duplicate CQE detection
+ * @nvmeq: The queue where duplicate was detected
+ * @cqe: The completion queue entry
+ * @command_id: The command ID from the CQE
+ *
+ * Logs detailed information about the duplicate CQE including CQE details
+ * and full queue state to aid in debugging hardware or firmware issues.
+ */
+static void nvme_handle_duplicate_cqe(struct nvme_queue *nvmeq,
+ struct nvme_completion *cqe,
+ u16 command_id)
+{
+ struct nvme_dev *dev = nvmeq->dev;
+
+ dev_dbg(dev->ctrl.device,
+ "Duplicate/Invalid CQE detected:\n"
+ " Queue: %d (SQ ID: %u)\n"
+ " Command ID: %u\n"
+ " Status: %#x\n"
+ " Result: %#llx\n"
+ "Queue state:\n"
+ " Queue depth: %u\n"
+ " SQ tail: %u, last_sq_tail: %u\n"
+ " CQ head: %u, phase: %u\n"
+ " CQ vector: %u\n"
+ " SQ DMA addr: %pad\n"
+ " CQ DMA addr: %pad\n"
+ " Flags: %#lx (enabled=%d, sq_cmb=%d, polled=%d)\n",
+ nvmeq->qid, le16_to_cpu(cqe->sq_id),
+ command_id, le16_to_cpu(cqe->status),
+ le64_to_cpu(cqe->result.u64),
+ nvmeq->q_depth,
+ nvmeq->sq_tail, nvmeq->last_sq_tail,
+ nvmeq->cq_head, nvmeq->cq_phase,
+ nvmeq->cq_vector,
+ &nvmeq->sq_dma_addr,
+ &nvmeq->cq_dma_addr,
+ nvmeq->flags,
+ test_bit(NVMEQ_ENABLED, &nvmeq->flags),
+ test_bit(NVMEQ_SQ_CMB, &nvmeq->flags),
+ test_bit(NVMEQ_POLLED, &nvmeq->flags));
+}
+
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
struct io_comp_batch *iob, u16 idx)
{
@@ -1315,9 +1359,7 @@ static inline void nvme_handle_cqe(struct
nvme_queue *nvmeq,
req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id);
if (unlikely(!req)) {
- dev_warn(nvmeq->dev->ctrl.device,
- "invalid id %d completed on queue %d\n",
- command_id, le16_to_cpu(cqe->sq_id));
+ nvme_handle_duplicate_cqe(nvmeq, cqe, command_id);
return;
}
-ck
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2025-10-29 5:03 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-22 13:54 [PATCH] nvme: introduce panic_on_double_cqe param Guixin Liu
2025-10-23 5:14 ` Chaitanya Kulkarni
2025-10-29 1:42 ` Guixin Liu
2025-10-29 4:17 ` Chaitanya Kulkarni
2025-10-29 5:03 ` Chaitanya Kulkarni
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox