All of lore.kernel.org
 help / color / mirror / Atom feed
From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-nvme@lists.infradead.org
Cc: dwagner@suse.de, hare@suse.com, kbusch@kernel.org, hch@lst.de,
	sagi@grimberg.me, axboe@kernel.dk, chaitanyak@nvidia.com,
	venkat88@linux.ibm.com, gjoyce@linux.ibm.com,
	wenxiong@linux.ibm.com, Nilay Shroff <nilay@linux.ibm.com>
Subject: [PATCHv4 4/8] nvme: export command error counters via sysfs
Date: Sun, 17 May 2026 00:06:51 +0530	[thread overview]
Message-ID: <20260516183709.269937-5-nilay@linux.ibm.com> (raw)
In-Reply-To: <20260516183709.269937-1-nilay@linux.ibm.com>

When an NVMe command completes with an error status, the driver
logs the error to the kernel log. However, these messages may be
lost or overwritten over time, since the kernel log (as shown by
dmesg) is a fixed-size ring buffer.

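Expose a command_error_count sysfs attribute, per path and per
controller, under the diag attribute group to provide persistent
visibility into error occurrences. A failed command on a request
queue that has a namespace attached is counted against that path;
a failed command on any other queue is counted against the
controller. This allows users to observe the total number of commands
that have failed on a given path or controller over time, which can be
useful for diagnosing path health and stability. Commands marked
RQF_QUIET are not counted, matching the existing error-logging
condition.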

The attribute is both readable and writable, which allows users to
reset a counter by writing a new value (for example, 0) to it. The
counters can also be consumed by observability tools such as nvme-top
to provide additional insight into NVMe error behavior.

Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
 drivers/nvme/host/core.c  | 10 +++++-
 drivers/nvme/host/nvme.h  |  2 ++
 drivers/nvme/host/sysfs.c | 66 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index bacd5e45c322..3b2f7a972941 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -438,11 +438,19 @@ static inline void nvme_end_req_zoned(struct request *req)
 
 static inline void __nvme_end_req(struct request *req)
 {
-	if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
+	struct nvme_ns *ns = req->q->queuedata;
+	struct nvme_request *nr = nvme_req(req);
+
+	if (unlikely(nr->status && !(req->rq_flags & RQF_QUIET))) {
 		if (blk_rq_is_passthrough(req))
 			nvme_log_err_passthru(req);
 		else
 			nvme_log_error(req);
+
+		if (ns)
+			atomic_long_inc(&ns->errors);
+		else
+			atomic_long_inc(&nr->ctrl->errors);
 	}
 	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 68c9df4f457a..b83d702dbb92 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -413,6 +413,7 @@ struct nvme_ctrl {
 	unsigned long ka_last_check_time;
 	struct work_struct fw_act_work;
 	unsigned long events;
+	atomic_long_t errors;
 
 #ifdef CONFIG_NVME_MULTIPATH
 	/* asymmetric namespace access: */
@@ -592,6 +593,7 @@ struct nvme_ns {
 	atomic_long_t failover;
 #endif
 	atomic_long_t retries;
+	atomic_long_t errors;
 	struct list_head siblings;
 	struct kref kref;
 	struct nvme_ns_head *head;
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 35a42fd4aec4..789518f21f40 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/nvme-auth.h>
+#include <linux/blkdev.h>
 
 #include "nvme.h"
 #include "fabrics.h"
@@ -376,8 +377,37 @@ static ssize_t command_retries_count_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(command_retries_count);
 
+static ssize_t nvme_io_errors_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+	return sysfs_emit(buf, "%lu\n", atomic_long_read(&ns->errors));
+}
+
+static ssize_t nvme_io_errors_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long errors;
+	int err;
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+	err = kstrtoul(buf, 0, &errors);
+	if (err)
+		return -EINVAL;
+
+	atomic_long_set(&ns->errors, errors);
+
+	return count;
+}
+
+struct device_attribute dev_attr_io_errors =
+	__ATTR(command_error_count, 0644,
+		nvme_io_errors_show, nvme_io_errors_store);
+
 static struct attribute *nvme_ns_diag_attrs[] = {
 	&dev_attr_command_retries_count.attr,
+	&dev_attr_io_errors.attr,
 #ifdef CONFIG_NVME_MULTIPATH
 	&dev_attr_multipath_failover_count.attr,
 #endif
@@ -393,6 +423,12 @@ static umode_t nvme_ns_diag_attrs_are_visible(struct kobject *kobj,
 		if (nvme_disk_is_ns_head(dev_to_disk(dev)))
 			return 0;
 	}
+	if (a == &dev_attr_io_errors.attr) {
+		struct gendisk *disk = dev_to_disk(dev);
+
+		if (nvme_disk_is_ns_head(disk))
+			return 0;
+	}
 #ifdef CONFIG_NVME_MULTIPATH
 	if (a == &dev_attr_multipath_failover_count.attr) {
 		if (nvme_disk_is_ns_head(dev_to_disk(dev)))
@@ -995,7 +1031,37 @@ static const struct attribute_group nvme_tls_attrs_group = {
 };
 #endif
 
+static ssize_t nvme_adm_errors_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n",
+			(unsigned long)atomic_long_read(&ctrl->errors));
+}
+
+static ssize_t nvme_adm_errors_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long errors;
+	int err;
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	err = kstrtoul(buf, 0, &errors);
+	if (err)
+		return -EINVAL;
+
+	atomic_long_set(&ctrl->errors, errors);
+
+	return count;
+}
+
+struct device_attribute dev_attr_adm_errors =
+	__ATTR(command_error_count, 0644,
+		nvme_adm_errors_show, nvme_adm_errors_store);
+
 static struct attribute *nvme_dev_diag_attrs[] = {
+	&dev_attr_adm_errors.attr,
 	NULL,
 };
 
-- 
2.53.0



  parent reply	other threads:[~2026-05-16 18:37 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-16 18:36 [PATCHv4 0/8] nvme: export additional diagnostic counters via sysfs Nilay Shroff
2026-05-16 18:36 ` [PATCHv4 1/8] nvme: add diag attribute group under sysfs Nilay Shroff
2026-05-16 18:36 ` [PATCHv4 2/8] nvme: export command retry count via sysfs Nilay Shroff
2026-05-16 18:36 ` [PATCHv4 3/8] nvme: export multipath failover " Nilay Shroff
2026-05-16 18:36 ` Nilay Shroff [this message]
2026-05-16 18:36 ` [PATCHv4 5/8] nvme: export I/O requeue count when no path is usable " Nilay Shroff
2026-05-16 18:36 ` [PATCHv4 6/8] nvme: export I/O failure count when no path is available " Nilay Shroff
2026-05-16 18:36 ` [PATCHv4 7/8] nvme: export controller reset event count " Nilay Shroff
2026-05-16 18:36 ` [PATCHv4 8/8] nvme: export controller reconnect " Nilay Shroff
2026-05-16 18:47 ` [PATCHv4 0/8] nvme: export additional diagnostic counters " Nilay Shroff
2026-05-25  9:12 ` Venkat Rao Bagalkote
2026-05-27 19:54 ` Keith Busch
2026-06-04  8:58 ` Keith Busch

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260516183709.269937-5-nilay@linux.ibm.com \
    --to=nilay@linux.ibm.com \
    --cc=axboe@kernel.dk \
    --cc=chaitanyak@nvidia.com \
    --cc=dwagner@suse.de \
    --cc=gjoyce@linux.ibm.com \
    --cc=hare@suse.com \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=sagi@grimberg.me \
    --cc=venkat88@linux.ibm.com \
    --cc=wenxiong@linux.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.