* [PATCH] nvme-pci: add NVMe controller statistics
@ 2025-05-28 17:02 Tokunori Ikegami
2025-05-28 20:35 ` Keith Busch
0 siblings, 1 reply; 3+ messages in thread
From: Tokunori Ikegami @ 2025-05-28 17:02 UTC (permalink / raw)
To: linux-nvme; +Cc: Tokunori Ikegami
This is to count the controller warning events.
Signed-off-by: Tokunori Ikegami <ikegami.t@gmail.com>
---
drivers/nvme/host/nvme.h | 9 +++++++++
drivers/nvme/host/pci.c | 20 ++++++++++++++++++++
2 files changed, 29 insertions(+)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1de1b843afa5..aa28bea48783 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -277,6 +277,13 @@ enum nvme_ctrl_flags {
NVME_CTRL_FROZEN = 6,
};
+struct nvme_stats {
+ unsigned long timeouts;
+ unsigned long aborts;
+ unsigned long resets;
+ unsigned long disables;
+};
+
struct nvme_ctrl {
bool comp_seen;
bool identified;
@@ -410,6 +417,8 @@ struct nvme_ctrl {
enum nvme_ctrl_type cntrltype;
enum nvme_dctype dctype;
+
+ struct nvme_stats stats;
};
static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 94ed13903b1b..5ce26408e426 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1465,6 +1465,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
dev_warn(dev->ctrl.device,
"controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
csts, result);
+ dev->ctrl.stats.resets++;
if (csts != ~0)
return;
@@ -1526,6 +1527,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
dev_warn(dev->ctrl.device,
"I/O tag %d (%04x) QID %d timeout, completion polled\n",
req->tag, nvme_cid(req), nvmeq->qid);
+ dev->ctrl.stats.timeouts++;
return BLK_EH_DONE;
}
@@ -1563,6 +1565,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
req->tag, nvme_cid(req), opcode,
nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
+ dev->ctrl.stats.resets++;
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
goto disable;
}
@@ -1582,6 +1585,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
blk_rq_bytes(req));
+ dev->ctrl.stats.aborts++;
abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
BLK_MQ_REQ_NOWAIT);
@@ -2390,6 +2394,19 @@ static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(hmb);
+static ssize_t stats_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvme_stats *stats = &ctrl->stats;
+
+ return sysfs_emit(buf,
+ "timeouts: %lu, aborts: %lu, resets: %lu, disables: %lu\n",
+ stats->timeouts, stats->aborts, stats->resets,
+ stats->disables);
+}
+static DEVICE_ATTR_RO(stats);
+
static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
@@ -2414,6 +2431,7 @@ static struct attribute *nvme_pci_attrs[] = {
&dev_attr_cmbloc.attr,
&dev_attr_cmbsz.attr,
&dev_attr_hmb.attr,
+ &dev_attr_stats.attr,
NULL,
};
@@ -3055,6 +3073,7 @@ static void nvme_reset_work(struct work_struct *work)
*/
dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
result);
+ dev->ctrl.stats.disables++;
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_dev_disable(dev, true);
nvme_sync_queues(&dev->ctrl);
@@ -3591,6 +3610,7 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
case pci_channel_io_frozen:
dev_warn(dev->ctrl.device,
"frozen state error detected, reset controller\n");
+ dev->ctrl.stats.resets++;
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
nvme_dev_disable(dev, true);
return PCI_ERS_RESULT_DISCONNECT;
--
2.48.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] nvme-pci: add NVMe controller statistics
2025-05-28 17:02 [PATCH] nvme-pci: add NVMe controller statistics Tokunori Ikegami
@ 2025-05-28 20:35 ` Keith Busch
2025-05-29 17:38 ` Tokunori Ikegami
0 siblings, 1 reply; 3+ messages in thread
From: Keith Busch @ 2025-05-28 20:35 UTC (permalink / raw)
To: Tokunori Ikegami; +Cc: linux-nvme
On Thu, May 29, 2025 at 02:02:56AM +0900, Tokunori Ikegami wrote:
> +static ssize_t stats_show(struct device *dev, struct device_attribute *attr,
> + char *buf)
> +{
> + struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
> + struct nvme_stats *stats = &ctrl->stats;
> +
> + return sysfs_emit(buf,
> + "timeouts: %lu, aborts: %lu, resets: %lu, disables: %lu\n",
> + stats->timeouts, stats->aborts, stats->resets,
> + stats->disables);
> +}
Specifically on sysfs attributes, the intended use was one file, one
attribute, one value. That makes programatically parsing these much
easier. So what you'd do instead is create 4 new files called
"timeouts", "aborts", "resets", and "disables", and each just print out
a sinlge number instead of this more complex output. If you're concerned
with polluting the nvme device's sysfs directory, you can also make a
sub-directory called "stats" to collect such things.
That principle hasn't always been strictly followed, but let's not
introduce new deviants.
As to what you're tracking and reporting here, I'm not convinced it's
useful. It's just a snapshot of what's happened for the lifetime of that
instance, which may not be very long.
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] nvme-pci: add NVMe controller statistics
2025-05-28 20:35 ` Keith Busch
@ 2025-05-29 17:38 ` Tokunori Ikegami
0 siblings, 0 replies; 3+ messages in thread
From: Tokunori Ikegami @ 2025-05-29 17:38 UTC (permalink / raw)
To: Keith Busch; +Cc: linux-nvme
On 2025/05/29 5:35, Keith Busch wrote:
> Specifically on sysfs attributes, the intended use was one file, one
> attribute, one value. That makes programatically parsing these much
> easier. So what you'd do instead is create 4 new files called
> "timeouts", "aborts", "resets", and "disables", and each just print out
> a sinlge number instead of this more complex output. If you're concerned
> with polluting the nvme device's sysfs directory, you can also make a
> sub-directory called "stats" to collect such things.
Just done as mentioned by the version patch.
> That principle hasn't always been strictly followed, but let's not
> introduce new deviants.
I see.
> As to what you're tracking and reporting here, I'm not convinced it's
> useful. It's just a snapshot of what's happened for the lifetime of that
> instance, which may not be very long.
Just changed the attributes as read-write device attribute then user can
reset the counter value if needed and accumulate the counter value on
user application.
By the way I think any unstable issue behavior caused on a drive the
statistics counters can be used to check the warning behavior cause but
it is okay as not be very long for the lifetime.
Thank you.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2025-05-29 17:40 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-05-28 17:02 [PATCH] nvme-pci: add NVMe controller statistics Tokunori Ikegami
2025-05-28 20:35 ` Keith Busch
2025-05-29 17:38 ` Tokunori Ikegami
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox