From: Ian Rogers <irogers@google.com>
To: Peter Zijlstra <peterz@infradead.org>,
Ingo Molnar <mingo@redhat.com>,
Arnaldo Carvalho de Melo <acme@kernel.org>,
Namhyung Kim <namhyung@kernel.org>, Jiri Olsa <jolsa@kernel.org>,
Ian Rogers <irogers@google.com>,
Adrian Hunter <adrian.hunter@intel.com>,
James Clark <james.clark@linaro.org>,
Tomas Glozar <tglozar@redhat.com>,
Michael Jeanson <mjeanson@efficios.com>,
Dmitrii Dolgov <9erthalion6@gmail.com>,
Alexandre Chartre <alexandre.chartre@oracle.com>,
Yuzhuo Jing <yuzhuo@google.com>, Leo Yan <leo.yan@arm.com>,
German Gomez <german.gomez@arm.com>,
Anubhav Shelat <ashelat@redhat.com>,
linux-kernel@vger.kernel.org, linux-perf-users@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Subject: [PATCH v1 2/3] perf pmu: Implement tool-provided NVMe PMU
Date: Tue, 9 Jun 2026 00:03:47 -0700 [thread overview]
Message-ID: <20260609070348.541964-3-irogers@google.com> (raw)
In-Reply-To: <20260609070348.541964-1-irogers@google.com>
Add a tool-provided PMU for NVMe devices using libnvme. This PMU
exposes metrics from various NVMe logs (SMART, Endurance Group,
FDP, Error Information, and Zoned Namespaces) as perf events under
the 'nvme_nvmeX' PMUs.
Features:
- Generic configuration encoding (log type, size, and offset mapped
inside the event config).
- Dynamic probing of supported log pages on /dev/nvmeX. Unsupported
events are marked deprecated and hidden from 'perf list' by
default.
- Correct interval delta tracking using baseline offset snapshots on
evsel->priv.
- Proper conversion and scaling of properties like temperature (K
to C).
Signed-off-by: Ian Rogers <irogers@google.com>
CONV=ca4c5d09-4ef8-405a-80bb-aa988020b436
TAG=agy
---
tools/perf/util/Build | 1 +
tools/perf/util/evsel.c | 11 +-
tools/perf/util/nvme_pmu.c | 562 +++++++++++++++++++++++++++++++++++++
tools/perf/util/nvme_pmu.h | 112 ++++++++
tools/perf/util/pmu.c | 15 +
tools/perf/util/pmu.h | 8 +-
tools/perf/util/pmus.c | 26 +-
7 files changed, 728 insertions(+), 7 deletions(-)
create mode 100644 tools/perf/util/nvme_pmu.c
create mode 100644 tools/perf/util/nvme_pmu.h
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index b22cdc24082a..e28e99634178 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -20,6 +20,7 @@ perf-util-y += disasm.o
perf-util-y += env.o
perf-util-y += event.o
perf-util-y += evlist.o
+perf-util-y += nvme_pmu.o
perf-util-y += sideband_evlist.o
perf-util-y += evsel.o
perf-util-y += evsel_fprintf.o
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 34c03f47a913..8caa626af57a 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -57,6 +57,7 @@
#include "hashmap.h"
#include "hist.h"
#include "hwmon_pmu.h"
+#include "nvme_pmu.h"
#include "intel-tpebs.h"
#include "memswap.h"
#include "off_cpu.h"
@@ -2207,6 +2208,8 @@ int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
if (evsel__is_hwmon(evsel))
return evsel__hwmon_pmu_read(evsel, cpu_map_idx, thread);
+ if (evsel__is_nvme(evsel))
+ return evsel__nvme_pmu_read(evsel, cpu_map_idx, thread);
if (evsel__is_drm(evsel))
return evsel__drm_pmu_read(evsel, cpu_map_idx, thread);
@@ -2947,8 +2950,12 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
}
if (evsel__is_hwmon(evsel)) {
err = evsel__hwmon_pmu_open(evsel, threads,
- start_cpu_map_idx,
- end_cpu_map_idx);
+ start_cpu_map_idx, end_cpu_map_idx);
+ goto out;
+ }
+ if (evsel__is_nvme(evsel)) {
+ err = evsel__nvme_pmu_open(evsel, threads,
+ start_cpu_map_idx, end_cpu_map_idx);
goto out;
}
if (evsel__is_drm(evsel)) {
diff --git a/tools/perf/util/nvme_pmu.c b/tools/perf/util/nvme_pmu.c
new file mode 100644
index 000000000000..17ba758aec59
--- /dev/null
+++ b/tools/perf/util/nvme_pmu.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "counts.h"
+#include "debug.h"
+#include "evsel.h"
+#include "hashmap.h"
+#include "nvme_pmu.h"
+#include "pmu.h"
+#include <internal/xyarray.h>
+#include <internal/threadmap.h>
+#include <perf/threadmap.h>
+#include <sys/types.h>
+#include <assert.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <api/fs/fs.h>
+#include <api/io.h>
+#include <api/io_dir.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+
+#ifdef HAVE_LIBNVME_SUPPORT
+#include <libnvme.h>
+
+/*
+ * Pack an event description into a 64-bit event config: the log selector is
+ * shifted left by 24, the field size in bytes by 16, and the byte offset of
+ * the field inside the log structure is OR-ed into the low bits.
+ * nvme_pmu__read_val() decodes these as an 8-bit log type, an 8-bit size and
+ * a 16-bit offset. The arguments are not masked here.
+ */
+#define NVME_CONFIG(log, size, offset) \
+ (((uint64_t)(log) << 24) | ((uint64_t)(size) << 16) | (offset))
+/*
+ * Log selector stored in the log-type bits of an event config. In this file
+ * the value is used to index nvme_pmu::log_supported[] and to choose which
+ * libnvme nvme_get_log_*() call fetches the data:
+ * SMART -> struct nvme_smart_log, ENDURANCE -> struct nvme_endurance_group_log,
+ * FDP -> struct nvme_fdp_stats_log, ERROR -> struct nvme_error_log_page,
+ * ZNS -> struct nvme_zns_changed_zone_log.
+ */
+enum nvme_log_type {
+ NVME_LOG_SMART = 0,
+ NVME_LOG_ENDURANCE = 1,
+ NVME_LOG_FDP = 2,
+ NVME_LOG_ERROR = 3,
+ NVME_LOG_ZNS = 4,
+};
+/*
+ * Per-log helpers around NVME_CONFIG(): each takes the field size in bytes
+ * and a member name, and encodes the member's offsetof() within the libnvme
+ * structure that corresponds to the log type.
+ */
+#define NVME_SMART(size, field) \
+ NVME_CONFIG(NVME_LOG_SMART, size, offsetof(struct nvme_smart_log, field))
+
+#define NVME_ENDURANCE(size, field) \
+ NVME_CONFIG(NVME_LOG_ENDURANCE, size, offsetof(struct nvme_endurance_group_log, field))
+
+#define NVME_FDP(size, field) \
+ NVME_CONFIG(NVME_LOG_FDP, size, offsetof(struct nvme_fdp_stats_log, field))
+
+#define NVME_ERROR(size, field) \
+ NVME_CONFIG(NVME_LOG_ERROR, size, offsetof(struct nvme_error_log_page, field))
+
+#define NVME_ZNS(size, field) \
+ NVME_CONFIG(NVME_LOG_ZNS, size, offsetof(struct nvme_zns_changed_zone_log, field))
+/*
+ * One row of the static event table.
+ * @name: event name matched case-insensitively against user terms.
+ * @desc: human readable description reported when listing events.
+ * @scale_unit: optional "<scale><unit>" string handed to
+ *              perf_pmu__convert_scale(); NULL when the event has none.
+ * @config: NVME_CONFIG() encoding of log type, field size and field offset.
+ */
+struct nvme_event {
+ const char *name;
+ const char *desc;
+ const char *scale_unit;
+ uint64_t config;
+};
+/*
+ * Events offered by every NVMe PMU, grouped by the log they are read from.
+ * Fields declared with size 16 are read by nvme_pmu__read_val() as a single
+ * little-endian 64-bit value, i.e. only their low 8 bytes are reported.
+ */
+static const struct nvme_event nvme_events[] = {
+ { "smart_data_units_read",
+ "Data units read (in 1000s of 512-byte units)",
+ "512000B", NVME_SMART(16, data_units_read) },
+ { "smart_data_units_written",
+ "Data units written (in 1000s of 512-byte units)",
+ "512000B", NVME_SMART(16, data_units_written) },
+ { "smart_host_read_commands", "Host read commands", NULL, NVME_SMART(16, host_reads) },
+ { "smart_host_write_commands", "Host write commands", NULL, NVME_SMART(16, host_writes) },
+ { "smart_ctrl_busy_time", "Controller busy time", "60s", NVME_SMART(16, ctrl_busy_time) },
+ { "smart_power_cycles", "Power cycles", NULL, NVME_SMART(16, power_cycles) },
+ { "smart_power_on_hours", "Power on hours", "1h", NVME_SMART(16, power_on_hours) },
+ { "smart_unsafe_shutdowns", "Unsafe shutdowns", NULL, NVME_SMART(16, unsafe_shutdowns) },
+ { "smart_media_errors", "Media errors", NULL, NVME_SMART(16, media_errors) },
+ { "smart_num_err_log_entries",
+ "Number of error log entries",
+ NULL, NVME_SMART(16, num_err_log_entries) },
+ { "smart_warning_temp_time",
+ "Warning temperature time",
+ "60s", NVME_SMART(4, warning_temp_time) },
+ { "smart_crit_comp_time",
+ "Critical composite temperature time",
+ "60s", NVME_SMART(4, critical_comp_time) },
+ { "smart_temperature", "Temperature", "0.001'C", NVME_SMART(2, temperature) },
+
+ { "endurance_percent_used",
+ "Endurance group percentage used",
+ NULL, NVME_ENDURANCE(1, percent_used) },
+ { "endurance_data_units_read",
+ "Endurance group data units read",
+ "512000B", NVME_ENDURANCE(16, data_units_read) },
+ { "endurance_data_units_written",
+ "Endurance group data units written",
+ "512000B", NVME_ENDURANCE(16, data_units_written) },
+ { "endurance_media_units_written",
+ "Endurance group media units written",
+ "512000B", NVME_ENDURANCE(16, media_units_written) },
+ { "endurance_host_read_cmds",
+ "Endurance group host read commands",
+ NULL, NVME_ENDURANCE(16, host_read_cmds) },
+ { "endurance_host_write_cmds",
+ "Endurance group host write commands",
+ NULL, NVME_ENDURANCE(16, host_write_cmds) },
+ { "endurance_num_err_info_log_entries",
+ "Endurance group number of error information log entries",
+ NULL, NVME_ENDURANCE(16, num_err_info_log_entries) },
+
+ { "fdp_hbmw", "FDP host bytes with metadata written", "1B", NVME_FDP(16, hbmw) },
+ { "fdp_mbmw", "FDP media bytes with metadata written", "1B", NVME_FDP(16, mbmw) },
+ { "fdp_mbe", "FDP media bytes erased", "1B", NVME_FDP(16, mbe) },
+
+ { "error_count", "Error info log error count", NULL, NVME_ERROR(8, error_count) },
+
+ { "zns_nrzid", "ZNS changed zone nrzid", NULL, NVME_ZNS(2, nrzid) },
+};
+
+/*
+ * Tool-side state for one NVMe controller PMU.
+ * @pmu: embedded generic PMU; container_of() on it recovers this struct.
+ * @dev_name: heap copy of the controller name, used to build "/dev/<name>".
+ * @support_checked: set once nvme_pmu__check_support() has run.
+ * @log_supported: per log-type flag, indexed by the log-type byte of an
+ *                 event config; events of unsupported logs are listed as
+ *                 deprecated.
+ */
+struct nvme_pmu {
+ struct perf_pmu pmu;
+ char *dev_name;
+ bool support_checked;
+ bool log_supported[256];
+};
+
+
+/* True when @pmu is non-NULL and its type falls inside the NVMe PMU type range. */
+bool perf_pmu__is_nvme(const struct perf_pmu *pmu)
+{
+	if (!pmu)
+		return false;
+
+	if (pmu->type < PERF_PMU_TYPE_NVME_START)
+		return false;
+
+	return pmu->type <= PERF_PMU_TYPE_NVME_END;
+}
+
+/* True when the PMU attached to @evsel is an NVMe PMU. */
+bool evsel__is_nvme(const struct evsel *evsel)
+{
+	const struct perf_pmu *evsel_pmu = evsel->pmu;
+
+	return perf_pmu__is_nvme(evsel_pmu);
+}
+
+/*
+ * Allocate an NVMe PMU for the controller @sysfs_name (e.g. "nvme0"), name it
+ * "nvme_<name>" and append it to @pmus.
+ *
+ * The PMU type is PERF_PMU_TYPE_NVME_START plus the controller index parsed
+ * from the digits that follow the "nvme" prefix. A name without that prefix,
+ * or without digits after it, maps to index 0.
+ *
+ * Returns the new PMU, or NULL when an allocation fails or the index does not
+ * fit inside the NVMe PMU type range.
+ */
+struct perf_pmu *nvme_pmu__new(struct list_head *pmus, const char *sysfs_name, const char *name)
+{
+	static const char prefix[] = "nvme";
+	const size_t prefix_len = sizeof(prefix) - 1;
+	const unsigned long max_index =
+		(unsigned long)(PERF_PMU_TYPE_NVME_END - PERF_PMU_TYPE_NVME_START);
+	struct nvme_pmu *nvm;
+	unsigned long index = 0;
+	char buf[64];
+	__u32 type;
+
+	/*
+	 * Only step over the prefix when it is really present, so a short or
+	 * unexpected name is never read past its terminator.
+	 */
+	if (!strncmp(sysfs_name, prefix, prefix_len))
+		index = strtoul(sysfs_name + prefix_len, NULL, 10);
+
+	/*
+	 * Range check the index before adding the base: adding first would let
+	 * a large index wrap the 32-bit type back into another PMU's range.
+	 */
+	if (index > max_index) {
+		pr_err("Unable to encode NVMe type from %s in valid PMU type\n", sysfs_name);
+		return NULL;
+	}
+	type = PERF_PMU_TYPE_NVME_START + (__u32)index;
+
+	snprintf(buf, sizeof(buf), "nvme_%s", name);
+
+	nvm = zalloc(sizeof(*nvm));
+	if (!nvm)
+		return NULL;
+
+	if (perf_pmu__init(&nvm->pmu, type, buf) != 0) {
+		free(nvm);
+		return NULL;
+	}
+
+	/* From here on perf_pmu__delete() releases everything, including nvm. */
+	nvm->dev_name = strdup(sysfs_name);
+	if (!nvm->dev_name) {
+		perf_pmu__delete(&nvm->pmu);
+		return NULL;
+	}
+	nvm->pmu.alias_name = strdup(sysfs_name);
+	if (!nvm->pmu.alias_name) {
+		perf_pmu__delete(&nvm->pmu);
+		return NULL;
+	}
+	nvm->pmu.cpus = perf_cpu_map__new_int(0);
+	if (!nvm->pmu.cpus) {
+		perf_pmu__delete(&nvm->pmu);
+		return NULL;
+	}
+	INIT_LIST_HEAD(&nvm->pmu.format);
+	INIT_LIST_HEAD(&nvm->pmu.caps);
+
+	list_add_tail(&nvm->pmu.list, pmus);
+	return &nvm->pmu;
+}
+
+/* Release the NVMe specific part of @pmu: the duplicated device name. */
+void nvme_pmu__exit(struct perf_pmu *pmu)
+{
+	struct nvme_pmu *nvm;
+
+	nvm = container_of(pmu, struct nvme_pmu, pmu);
+	zfree(&nvm->dev_name);
+}
+
+
+
+/*
+ * Probe, once per PMU, which log pages the controller answers for.
+ *
+ * Every log type starts out as supported; when "/dev/<dev_name>" can be
+ * opened each log is fetched once and marked unsupported if the fetch does
+ * not return 0. When the device cannot be opened everything stays supported.
+ */
+static void nvme_pmu__check_support(struct nvme_pmu *nvm)
+{
+	struct nvme_zns_changed_zone_log zns;
+	struct nvme_error_log_page error;
+	struct nvme_fdp_stats_log fdp;
+	struct nvme_endurance_group_log endurance;
+	struct nvme_smart_log smart;
+	bool *supported = nvm->log_supported;
+	char dev_path[PATH_MAX];
+	int dev_fd;
+
+	if (nvm->support_checked)
+		return;
+	nvm->support_checked = true;
+
+	/* Assume all supported if we can't test. */
+	memset(supported, 1, sizeof(nvm->log_supported));
+
+	snprintf(dev_path, sizeof(dev_path), "/dev/%s", nvm->dev_name);
+	dev_fd = open(dev_path, O_RDONLY);
+	if (dev_fd < 0)
+		return;
+
+	supported[NVME_LOG_SMART] =
+		nvme_get_log_smart(dev_fd, NVME_NSID_ALL, true, &smart) == 0;
+
+	supported[NVME_LOG_ENDURANCE] =
+		nvme_get_log_endurance_group(dev_fd, 0, &endurance) == 0;
+
+	supported[NVME_LOG_FDP] =
+		nvme_get_log_fdp_stats(dev_fd, 0, 0, sizeof(fdp), &fdp) == 0;
+
+	supported[NVME_LOG_ERROR] =
+		nvme_get_log_error(dev_fd, 1, true, &error) == 0;
+
+	supported[NVME_LOG_ZNS] =
+		nvme_get_log_zns_changed_zones(dev_fd, NVME_NSID_ALL, true, &zns) == 0;
+
+	close(dev_fd);
+}
+
+/*
+ * Invoke @cb once for every entry of nvme_events on behalf of @pmu, passing
+ * @state through unchanged.
+ *
+ * Log page support is probed first; events whose log page was found to be
+ * unsupported are reported with .deprecated set. Iteration stops at the first
+ * non-zero callback return, which is propagated; otherwise 0 is returned.
+ */
+int nvme_pmu__for_each_event(struct perf_pmu *pmu, void *state, pmu_event_callback cb)
+{
+	struct nvme_pmu *nvm = container_of(pmu, struct nvme_pmu, pmu);
+	size_t i;
+
+	nvme_pmu__check_support(nvm);
+	for (i = 0; i < ARRAY_SIZE(nvme_events); i++) {
+		const struct nvme_event *e = &nvme_events[i];
+		/* The log type is the byte above the size byte, see NVME_CONFIG(). */
+		const unsigned int log_type = (e->config >> 24) & 0xFF;
+		char alias_buf[64];
+		char desc_buf[256];
+		char encoding_buf[128];
+		struct pmu_event_info info = {
+			.pmu = pmu,
+			.name = e->name,
+			.alias = alias_buf,
+			.scale_unit = e->scale_unit,
+			.desc = desc_buf,
+			.long_desc = NULL,
+			.encoding_desc = encoding_buf,
+
+			.topic = "nvme",
+			.pmu_name = pmu->name,
+			.event_type_desc = "NVMe event",
+			.deprecated = !nvm->log_supported[log_type],
+		};
+		int ret;
+
+		snprintf(alias_buf, sizeof(alias_buf), "%s", e->name);
+		snprintf(desc_buf, sizeof(desc_buf), "%s", e->desc);
+		/*
+		 * uint64_t is not unsigned long on every target, so widen
+		 * explicitly and print with %llx rather than %lx.
+		 */
+		snprintf(encoding_buf, sizeof(encoding_buf), "%s/config=0x%llx/",
+			 pmu->name, (unsigned long long)e->config);
+
+		ret = cb(state, &info);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/* Number of events every NVMe PMU exposes: the size of the static table. */
+size_t nvme_pmu__num_events(struct perf_pmu *pmu __maybe_unused)
+{
+	const size_t nr_events = ARRAY_SIZE(nvme_events);
+
+	return nr_events;
+}
+
+/* Case-insensitive lookup of @name in the static event table. */
+bool nvme_pmu__have_event(struct perf_pmu *pmu __maybe_unused, const char *name)
+{
+	const struct nvme_event *const end = nvme_events + ARRAY_SIZE(nvme_events);
+	const struct nvme_event *ev;
+
+	for (ev = nvme_events; ev != end; ev++) {
+		if (strcasecmp(name, ev->name) == 0)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Translate one parsed term into @attr->config.
+ *
+ * Only user terms whose name matches an nvme_events entry (ignoring case) are
+ * accepted; they set the config and return 0. Anything else returns -EINVAL
+ * and, when @err is given, records an error message against the term.
+ */
+static int nvme_pmu__config_term(const struct nvme_pmu *nvm __maybe_unused,
+				 struct perf_event_attr *attr,
+				 struct parse_events_term *term,
+				 struct parse_events_error *err)
+{
+	char *msg;
+
+	if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER) {
+		size_t idx;
+
+		for (idx = 0; idx < ARRAY_SIZE(nvme_events); idx++) {
+			if (strcasecmp(term->config, nvme_events[idx].name) != 0)
+				continue;
+
+			attr->config = nvme_events[idx].config;
+			return 0;
+		}
+	}
+
+	if (!err)
+		return -EINVAL;
+
+	/* Fall back to a fixed message when the detailed one cannot be built. */
+	if (asprintf(&msg, "unexpected nvme event term (%s) %s",
+		     parse_events__term_type_str(term->type_term),
+		     term->config) < 0)
+		msg = strdup("unexpected nvme event term");
+
+	parse_events_error__handle(err, term->err_val, msg, NULL);
+	return -EINVAL;
+}
+
+/*
+ * Apply every term in @terms to @attr via nvme_pmu__config_term().
+ * Returns 0 when all terms are accepted, -EINVAL at the first rejected one.
+ */
+int nvme_pmu__config_terms(const struct perf_pmu *pmu,
+			   struct perf_event_attr *attr,
+			   struct parse_events_terms *terms,
+			   struct parse_events_error *err)
+{
+	struct nvme_pmu *nvm = container_of(pmu, struct nvme_pmu, pmu);
+	struct parse_events_term *cur;
+
+	list_for_each_entry(cur, &terms->terms, list) {
+		int ret = nvme_pmu__config_term(nvm, attr, cur, err);
+
+		if (ret != 0)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Resolve the first term of @terms against the event table and, when the
+ * matching event carries a scale/unit string, fill @info->scale and
+ * @info->unit from it.
+ *
+ * Returns 0 on a match. Otherwise returns -EINVAL and, when @err is given,
+ * records an error message against the term.
+ */
+int nvme_pmu__check_alias(struct parse_events_terms *terms, struct perf_pmu_info *info,
+			  struct parse_events_error *err)
+{
+	struct parse_events_term *term =
+		list_first_entry(&terms->terms, struct parse_events_term, list);
+	char *msg;
+
+	if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER) {
+		size_t idx;
+
+		for (idx = 0; idx < ARRAY_SIZE(nvme_events); idx++) {
+			const struct nvme_event *ev = &nvme_events[idx];
+
+			if (strcasecmp(term->config, ev->name) != 0)
+				continue;
+
+			if (ev->scale_unit) {
+				char *unit;
+
+				perf_pmu__convert_scale(ev->scale_unit, &unit, &info->scale);
+				info->unit = unit;
+			}
+			return 0;
+		}
+	}
+
+	if (!err)
+		return -EINVAL;
+
+	/* Fall back to a fixed message when the detailed one cannot be built. */
+	if (asprintf(&msg, "unexpected nvme event term (%s) %s",
+		     parse_events__term_type_str(term->type_term),
+		     term->config) < 0)
+		msg = strdup("unexpected nvme event term");
+
+	parse_events_error__handle(err, term->err_val, msg, NULL);
+	return -EINVAL;
+}
+
+/*
+ * Scan the NVMe topology with libnvme and create one PMU per controller,
+ * appending each to @pmus. Always returns 0; a failed scan or a failed PMU
+ * creation simply yields fewer PMUs.
+ */
+int perf_pmus__read_nvme_pmus(struct list_head *pmus)
+{
+	nvme_subsystem_t subsys;
+	nvme_host_t host;
+	nvme_ctrl_t ctrl;
+	nvme_root_t root;
+
+	root = nvme_scan(NULL);
+	if (root == NULL)
+		return 0;
+
+	nvme_for_each_host(root, host) {
+		nvme_for_each_subsystem(host, subsys) {
+			nvme_subsystem_for_each_ctrl(subsys, ctrl) {
+				const char *sysfs_name = nvme_ctrl_get_name(ctrl);
+				const char *pmu_name = nvme_ctrl_get_name(ctrl);
+
+				nvme_pmu__new(pmus, sysfs_name, pmu_name);
+			}
+		}
+	}
+
+	nvme_free_tree(root);
+	return 0;
+}
+
+
+/*
+ * Fetch the log page selected by @config from the open controller @fd and
+ * store the selected field in @val.
+ *
+ * @config is decoded as laid out by NVME_CONFIG(): log type, field size in
+ * bytes and field offset. Fields are little-endian; a field of size 16 is
+ * reported as its low 64 bits. The SMART temperature is special-cased: the
+ * two-byte Kelvin value is converted to milli-degrees Celsius
+ * (kelvin * 1000 - 273150), which wraps as an unsigned value below 273.15 K.
+ *
+ * Returns 0 on success, -EINVAL for an unknown log type, a field lying
+ * outside the log structure, an unsupported size, or a failed log fetch.
+ */
+static int nvme_pmu__read_val(int fd, uint64_t config, uint64_t *val)
+{
+	const int log_type = (config >> 24) & 0xFF;
+	const unsigned int size = (config >> 16) & 0xFF;
+	const unsigned int offset = config & 0xFFFF;
+	/*
+	 * A union gives the buffer the size and alignment of the largest log
+	 * structure, instead of casting a byte array to struct pointers.
+	 */
+	union {
+		struct nvme_smart_log smart;
+		struct nvme_endurance_group_log endurance;
+		struct nvme_fdp_stats_log fdp;
+		struct nvme_error_log_page error;
+		struct nvme_zns_changed_zone_log zns;
+	} log;
+	const uint8_t *field;
+
+	switch (log_type) {
+	case NVME_LOG_SMART:
+		if (offset + size > sizeof(log.smart))
+			return -EINVAL;
+		if (nvme_get_log_smart(fd, NVME_NSID_ALL, true, &log.smart) != 0)
+			return -EINVAL;
+
+		if (offset == offsetof(struct nvme_smart_log, temperature)) {
+			uint64_t kelvin = log.smart.temperature[0] |
+					  (log.smart.temperature[1] << 8);
+
+			*val = (kelvin * 1000) - 273150;
+			return 0;
+		}
+		break;
+	case NVME_LOG_ENDURANCE:
+		if (offset + size > sizeof(log.endurance))
+			return -EINVAL;
+		if (nvme_get_log_endurance_group(fd, 0, &log.endurance) != 0)
+			return -EINVAL;
+		break;
+	case NVME_LOG_FDP:
+		if (offset + size > sizeof(log.fdp))
+			return -EINVAL;
+		if (nvme_get_log_fdp_stats(fd, 0, 0, sizeof(log.fdp), &log.fdp) != 0)
+			return -EINVAL;
+		break;
+	case NVME_LOG_ERROR:
+		if (offset + size > sizeof(log.error))
+			return -EINVAL;
+		if (nvme_get_log_error(fd, 1, true, &log.error) != 0)
+			return -EINVAL;
+		break;
+	case NVME_LOG_ZNS:
+		if (offset + size > sizeof(log.zns))
+			return -EINVAL;
+		if (nvme_get_log_zns_changed_zones(fd, NVME_NSID_ALL, true, &log.zns) != 0)
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/*
+	 * Copy the field out with memcpy(): the offset need not be naturally
+	 * aligned and reading through a cast pointer would break aliasing rules.
+	 */
+	field = (const uint8_t *)&log + offset;
+	switch (size) {
+	case 16: /* only the low 64 bits of a 128-bit field are reported */
+	case 8: {
+		uint64_t raw;
+
+		memcpy(&raw, field, sizeof(raw));
+		*val = le64_to_cpu(raw);
+		break;
+	}
+	case 4: {
+		uint32_t raw;
+
+		memcpy(&raw, field, sizeof(raw));
+		*val = le32_to_cpu(raw);
+		break;
+	}
+	case 2: {
+		uint16_t raw;
+
+		memcpy(&raw, field, sizeof(raw));
+		*val = le16_to_cpu(raw);
+		break;
+	}
+	case 1:
+		*val = *field;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * True for events that report a current level (temperature, percentage used,
+ * changed zone count) instead of a running total.
+ */
+static bool nvme_pmu__is_gauge(uint64_t config)
+{
+	return config == NVME_SMART(2, temperature) ||
+	       config == NVME_ENDURANCE(1, percent_used) ||
+	       config == NVME_ZNS(2, nrzid);
+}
+/*
+ * Lvalue for the file descriptor stored in evsel e's core.fd xyarray at
+ * position (x, y); used below with x = CPU map index and y = thread index.
+ */
+#define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))
+
+/*
+ * Open "/dev/<dev_name>" for every (CPU map index, thread) pair in
+ * [@start_cpu_map_idx, @end_cpu_map_idx) x @threads, store the descriptors in
+ * the evsel's fd array and snapshot the event's current value as the baseline
+ * that evsel__nvme_pmu_read() subtracts from counters.
+ *
+ * Baselines live in an xyarray hung off evsel->priv, allocated on first use.
+ * NOTE(review): this assumes nothing else uses evsel->priv for NVMe evsels and
+ * no release of that array is visible in this file - confirm ownership.
+ *
+ * Returns 0 on success, -ENOMEM when the baseline array cannot be allocated,
+ * or the negated errno of a failed open(), in which case the descriptors
+ * opened so far are closed and threads->err_thread records the thread index.
+ */
+int evsel__nvme_pmu_open(struct evsel *evsel,
+			 struct perf_thread_map *threads,
+			 int start_cpu_map_idx, int end_cpu_map_idx)
+{
+	struct nvme_pmu *nvm = container_of(evsel->pmu, struct nvme_pmu, pmu);
+	int idx = 0, thread = 0, nthreads, err = 0;
+	char path[PATH_MAX];
+
+	snprintf(path, sizeof(path), "/dev/%s", nvm->dev_name);
+
+	nthreads = perf_thread_map__nr(threads);
+
+	if (!evsel->priv) {
+		int max_cpus = evsel->core.cpus ? perf_cpu_map__nr(evsel->core.cpus) : 1;
+
+		evsel->priv = xyarray__new(max_cpus, nthreads, sizeof(uint64_t));
+		/*
+		 * Without baselines counters would silently report absolute
+		 * values rather than deltas, so fail the open instead.
+		 */
+		if (!evsel->priv)
+			return -ENOMEM;
+	}
+
+	for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) {
+		for (thread = 0; thread < nthreads; thread++) {
+			uint64_t *initial_val;
+			int fd = open(path, O_RDONLY);
+
+			FD(evsel, idx, thread) = fd;
+			if (fd < 0) {
+				err = -errno;
+				goto out_close;
+			}
+
+			/* A failed first read leaves the baseline at zero. */
+			initial_val = xyarray__entry(evsel->priv, idx, thread);
+			if (nvme_pmu__read_val(fd, evsel->core.attr.config, initial_val))
+				*initial_val = 0;
+		}
+	}
+	return 0;
+out_close:
+	if (err)
+		threads->err_thread = thread;
+
+	/* Unwind: earlier threads of the failing index, then all previous indices. */
+	do {
+		while (--thread >= 0) {
+			if (FD(evsel, idx, thread) >= 0)
+				close(FD(evsel, idx, thread));
+			FD(evsel, idx, thread) = -1;
+		}
+		thread = nthreads;
+	} while (--idx >= 0);
+	return err;
+}
+
+/*
+ * Read the event's current value for (@cpu_map_idx, @thread) into the evsel's
+ * counts.
+ *
+ * Gauges are stored as read, added on top of the previous raw count when one
+ * exists. Counters are stored relative to the baseline captured at open time.
+ * run/ena advance by one per read, continuing from the previous raw count
+ * when there is one. On a bad descriptor or failed read, lost is incremented
+ * and -EINVAL returned.
+ */
+int evsel__nvme_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
+{
+	struct perf_counts_values *prev = NULL;
+	struct perf_counts_values *cur;
+	uint64_t baseline = 0;
+	uint64_t raw = 0;
+	int fd;
+
+	if (evsel->prev_raw_counts)
+		prev = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);
+
+	cur = perf_counts(evsel->counts, cpu_map_idx, thread);
+	fd = FD(evsel, cpu_map_idx, thread);
+
+	if (fd < 0 || nvme_pmu__read_val(fd, evsel->core.attr.config, &raw)) {
+		cur->lost++;
+		return -EINVAL;
+	}
+
+	if (evsel->priv) {
+		uint64_t *slot = xyarray__entry(evsel->priv, cpu_map_idx, thread);
+
+		if (slot)
+			baseline = *slot;
+	}
+
+	if (!nvme_pmu__is_gauge(evsel->core.attr.config))
+		cur->val = raw - baseline;
+	else if (prev)
+		cur->val = prev->val + raw;
+	else
+		cur->val = raw;
+
+	if (prev) {
+		cur->run = prev->run + 1;
+		cur->ena = prev->ena + 1;
+	} else {
+		cur->run++;
+		cur->ena++;
+	}
+	return 0;
+}
+
+
+#endif
diff --git a/tools/perf/util/nvme_pmu.h b/tools/perf/util/nvme_pmu.h
new file mode 100644
index 000000000000..6d5d2bbe4167
--- /dev/null
+++ b/tools/perf/util/nvme_pmu.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __NVME_PMU_H
+#define __NVME_PMU_H
+
+#include "pmu.h"
+#include <stdbool.h>
+#include <errno.h>
+
+struct list_head;
+struct perf_thread_map;
+struct evsel;
+/*
+ * NVMe PMU interface, implemented in nvme_pmu.c when perf is built with
+ * libnvme (HAVE_LIBNVME_SUPPORT): PMU creation and teardown, event
+ * enumeration and lookup, event term parsing, type predicates, PMU
+ * discovery, and the evsel open/read hooks.
+ */
+#ifdef HAVE_LIBNVME_SUPPORT
+struct perf_pmu *nvme_pmu__new(struct list_head *pmus, const char *sysfs_name, const char *name);
+void nvme_pmu__exit(struct perf_pmu *pmu);
+
+int nvme_pmu__for_each_event(struct perf_pmu *pmu, void *state, pmu_event_callback cb);
+size_t nvme_pmu__num_events(struct perf_pmu *pmu);
+bool nvme_pmu__have_event(struct perf_pmu *pmu, const char *name);
+int nvme_pmu__config_terms(const struct perf_pmu *pmu,
+ struct perf_event_attr *attr,
+ struct parse_events_terms *terms,
+ struct parse_events_error *err);
+int nvme_pmu__check_alias(struct parse_events_terms *terms, struct perf_pmu_info *info,
+ struct parse_events_error *err);
+
+bool perf_pmu__is_nvme(const struct perf_pmu *pmu);
+bool evsel__is_nvme(const struct evsel *evsel);
+
+int perf_pmus__read_nvme_pmus(struct list_head *pmus);
+
+int evsel__nvme_pmu_open(struct evsel *evsel,
+ struct perf_thread_map *threads,
+ int start_cpu_map_idx, int end_cpu_map_idx);
+int evsel__nvme_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread);
+#else /*
+ * Built without libnvme: inline stand-ins so callers need no #ifdefs.
+ * Creation returns NULL, the predicates return false, enumeration reports
+ * no events, term parsing and alias checks return -EINVAL, and discovery,
+ * open and read are no-ops returning 0.
+ */
+static inline struct perf_pmu *nvme_pmu__new(struct list_head *pmus __maybe_unused,
+ const char *sysfs_name __maybe_unused,
+ const char *name __maybe_unused)
+{
+ return NULL;
+}
+
+static inline void nvme_pmu__exit(struct perf_pmu *pmu __maybe_unused)
+{
+}
+
+static inline int nvme_pmu__for_each_event(struct perf_pmu *pmu __maybe_unused,
+ void *state __maybe_unused,
+ pmu_event_callback cb __maybe_unused)
+{
+ return 0;
+}
+
+static inline size_t nvme_pmu__num_events(struct perf_pmu *pmu __maybe_unused)
+{
+ return 0;
+}
+
+static inline bool nvme_pmu__have_event(struct perf_pmu *pmu __maybe_unused,
+ const char *name __maybe_unused)
+{
+ return false;
+}
+
+static inline int nvme_pmu__config_terms(const struct perf_pmu *pmu __maybe_unused,
+ struct perf_event_attr *attr __maybe_unused,
+ struct parse_events_terms *terms __maybe_unused,
+ struct parse_events_error *err __maybe_unused)
+{
+ return -EINVAL;
+}
+
+static inline int nvme_pmu__check_alias(struct parse_events_terms *terms __maybe_unused,
+ struct perf_pmu_info *info __maybe_unused,
+ struct parse_events_error *err __maybe_unused)
+{
+ return -EINVAL;
+}
+
+static inline bool perf_pmu__is_nvme(const struct perf_pmu *pmu __maybe_unused)
+{
+ return false;
+}
+
+static inline bool evsel__is_nvme(const struct evsel *evsel __maybe_unused)
+{
+ return false;
+}
+
+static inline int perf_pmus__read_nvme_pmus(struct list_head *pmus __maybe_unused)
+{
+ return 0;
+}
+
+static inline int evsel__nvme_pmu_open(struct evsel *evsel __maybe_unused,
+ struct perf_thread_map *threads __maybe_unused,
+ int start_cpu_map_idx __maybe_unused,
+ int end_cpu_map_idx __maybe_unused)
+{
+ return 0;
+}
+
+static inline int evsel__nvme_pmu_read(struct evsel *evsel __maybe_unused,
+ int cpu_map_idx __maybe_unused,
+ int thread __maybe_unused)
+{
+ return 0;
+}
+#endif
+
+#endif /* __NVME_PMU_H */
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 9994709ef12b..26ec19753644 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -22,6 +22,7 @@
#include "pmu.h"
#include "drm_pmu.h"
#include "hwmon_pmu.h"
+#include "nvme_pmu.h"
#include "pmus.h"
#include "tool_pmu.h"
#include "tp_pmu.h"
@@ -1687,6 +1688,8 @@ int perf_pmu__config_terms(const struct perf_pmu *pmu,
if (perf_pmu__is_hwmon(pmu))
return hwmon_pmu__config_terms(pmu, attr, terms, err);
+ if (perf_pmu__is_nvme(pmu))
+ return nvme_pmu__config_terms(pmu, attr, terms, err);
if (perf_pmu__is_drm(pmu))
return drm_pmu__config_terms(pmu, attr, terms, err);
@@ -1851,6 +1854,10 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_
ret = hwmon_pmu__check_alias(head_terms, info, err);
goto out;
}
+ if (perf_pmu__is_nvme(pmu)) {
+ ret = nvme_pmu__check_alias(head_terms, info, err);
+ goto out;
+ }
if (perf_pmu__is_drm(pmu)) {
ret = drm_pmu__check_alias(pmu, head_terms, info, err);
goto out;
@@ -2071,6 +2078,8 @@ bool perf_pmu__have_event(struct perf_pmu *pmu, const char *name)
return tp_pmu__have_event(pmu, name);
if (perf_pmu__is_hwmon(pmu))
return hwmon_pmu__have_event(pmu, name);
+ if (perf_pmu__is_nvme(pmu))
+ return nvme_pmu__have_event(pmu, name);
if (perf_pmu__is_drm(pmu))
return drm_pmu__have_event(pmu, name);
if (perf_pmu__find_alias(pmu, name, /*load=*/ true) != NULL)
@@ -2092,6 +2101,8 @@ size_t perf_pmu__num_events(struct perf_pmu *pmu)
return tp_pmu__num_events(pmu);
if (perf_pmu__is_hwmon(pmu))
return hwmon_pmu__num_events(pmu);
+ if (perf_pmu__is_nvme(pmu))
+ return nvme_pmu__num_events(pmu);
if (perf_pmu__is_drm(pmu))
return drm_pmu__num_events(pmu);
@@ -2223,6 +2234,8 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus,
return tp_pmu__for_each_event(pmu, state, cb);
if (perf_pmu__is_hwmon(pmu))
return hwmon_pmu__for_each_event(pmu, state, cb);
+ if (perf_pmu__is_nvme(pmu))
+ return nvme_pmu__for_each_event(pmu, state, cb);
if (perf_pmu__is_drm(pmu))
return drm_pmu__for_each_event(pmu, state, cb);
@@ -2714,6 +2727,8 @@ void perf_pmu__delete(struct perf_pmu *pmu)
if (perf_pmu__is_hwmon(pmu))
hwmon_pmu__exit(pmu);
+ if (perf_pmu__is_nvme(pmu))
+ nvme_pmu__exit(pmu);
else if (perf_pmu__is_drm(pmu))
drm_pmu__exit(pmu);
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 0d9f3c57e8e8..0fe47dd429e8 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -45,6 +45,8 @@ enum pmu_kind {
PERF_PMU_KIND_DRM,
/* A perf tool provided HWMON PMU. */
PERF_PMU_KIND_HWMON,
+ /* A perf tool provided NVME PMU. */
+ PERF_PMU_KIND_NVME,
/* Perf tool provided PMU for tool events like time. */
PERF_PMU_KIND_TOOL,
/* A testing PMU kind. */
@@ -53,7 +55,9 @@ enum pmu_kind {
enum {
PERF_PMU_TYPE_PE_START = 0,
- PERF_PMU_TYPE_PE_END = 0xFFFDFFFF,
+ PERF_PMU_TYPE_PE_END = 0xFFFCFFFF,
+ PERF_PMU_TYPE_NVME_START = 0xFFFD0000,
+ PERF_PMU_TYPE_NVME_END = 0xFFFDFFFF,
PERF_PMU_TYPE_DRM_START = 0xFFFE0000,
PERF_PMU_TYPE_DRM_END = 0xFFFEFFFF,
PERF_PMU_TYPE_HWMON_START = 0xFFFF0000,
@@ -363,6 +367,8 @@ static inline enum pmu_kind perf_pmu__kind(const struct perf_pmu *pmu)
type = pmu->type;
if (type <= PERF_PMU_TYPE_PE_END)
return PERF_PMU_KIND_PE;
+ if (type <= PERF_PMU_TYPE_NVME_END)
+ return PERF_PMU_KIND_NVME;
if (type <= PERF_PMU_TYPE_DRM_END)
return PERF_PMU_KIND_DRM;
if (type <= PERF_PMU_TYPE_HWMON_END)
diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c
index 5e3f571450fe..83777f941e9a 100644
--- a/tools/perf/util/pmus.c
+++ b/tools/perf/util/pmus.c
@@ -17,6 +17,7 @@
#include "pmus.h"
#include "pmu.h"
#include "hwmon_pmu.h"
+#include "nvme_pmu.h"
#include "tool_pmu.h"
#include "print-events.h"
#include "strbuf.h"
@@ -44,18 +45,21 @@ enum perf_tool_pmu_type {
PERF_TOOL_PMU_TYPE_PE_OTHER,
PERF_TOOL_PMU_TYPE_TOOL,
PERF_TOOL_PMU_TYPE_HWMON,
+ PERF_TOOL_PMU_TYPE_NVME,
PERF_TOOL_PMU_TYPE_DRM,
#define PERF_TOOL_PMU_TYPE_PE_CORE_MASK (1 << PERF_TOOL_PMU_TYPE_PE_CORE)
#define PERF_TOOL_PMU_TYPE_PE_OTHER_MASK (1 << PERF_TOOL_PMU_TYPE_PE_OTHER)
#define PERF_TOOL_PMU_TYPE_TOOL_MASK (1 << PERF_TOOL_PMU_TYPE_TOOL)
#define PERF_TOOL_PMU_TYPE_HWMON_MASK (1 << PERF_TOOL_PMU_TYPE_HWMON)
+#define PERF_TOOL_PMU_TYPE_NVME_MASK (1 << PERF_TOOL_PMU_TYPE_NVME)
#define PERF_TOOL_PMU_TYPE_DRM_MASK (1 << PERF_TOOL_PMU_TYPE_DRM)
#define PERF_TOOL_PMU_TYPE_ALL_MASK (PERF_TOOL_PMU_TYPE_PE_CORE_MASK | \
PERF_TOOL_PMU_TYPE_PE_OTHER_MASK | \
PERF_TOOL_PMU_TYPE_TOOL_MASK | \
PERF_TOOL_PMU_TYPE_HWMON_MASK | \
+ PERF_TOOL_PMU_TYPE_NVME_MASK | \
PERF_TOOL_PMU_TYPE_DRM_MASK)
};
static unsigned int read_pmu_types;
@@ -175,12 +179,15 @@ struct perf_pmu *perf_pmus__find(const char *name)
return pmu;
/* Looking up an individual perf event PMU failed, check if a tool PMU should be read. */
- if (!strncmp(name, "hwmon_", 6))
- to_read_pmus |= PERF_TOOL_PMU_TYPE_HWMON_MASK;
- else if (!strncmp(name, "drm_", 4))
+ if (!strncmp(name, "hwmon_", 6)) {
+ to_read_pmus = PERF_TOOL_PMU_TYPE_HWMON_MASK;
+ } else if (!strncmp(name, "nvme_", 5)) {
+ to_read_pmus = PERF_TOOL_PMU_TYPE_NVME_MASK;
+ } else if (!strncmp(name, "drm_", 4)) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_DRM_MASK;
- else if (!strcmp(name, "tool"))
+ } else if (!strcmp(name, "tool")) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_TOOL_MASK;
+ }
if (to_read_pmus) {
pmu_read_sysfs(to_read_pmus);
@@ -279,6 +286,10 @@ static void pmu_read_sysfs(unsigned int to_read_types)
(read_pmu_types & PERF_TOOL_PMU_TYPE_HWMON_MASK) == 0)
perf_pmus__read_hwmon_pmus(&other_pmus);
+ if ((to_read_types & PERF_TOOL_PMU_TYPE_NVME_MASK) != 0 &&
+ (read_pmu_types & PERF_TOOL_PMU_TYPE_NVME_MASK) == 0)
+ perf_pmus__read_nvme_pmus(&other_pmus);
+
if ((to_read_types & PERF_TOOL_PMU_TYPE_DRM_MASK) != 0 &&
(read_pmu_types & PERF_TOOL_PMU_TYPE_DRM_MASK) == 0)
perf_pmus__read_drm_pmus(&other_pmus);
@@ -387,6 +398,10 @@ struct perf_pmu *perf_pmus__scan_for_event(struct perf_pmu *pmu, const char *eve
if (strlen(event) > 4 && strncmp("drm-", event, 4) == 0)
to_read_pmus |= PERF_TOOL_PMU_TYPE_DRM_MASK;
+ /* Could the event be an nvme event? */
+ if (nvme_pmu__have_event(NULL, event))
+ to_read_pmus |= PERF_TOOL_PMU_TYPE_NVME_MASK;
+
pmu_read_sysfs(to_read_pmus);
pmu = list_prepare_entry(pmu, &core_pmus, list);
}
@@ -424,11 +439,14 @@ struct perf_pmu *perf_pmus__scan_matching_wildcard(struct perf_pmu *pmu, const c
*/
if (strisglob(wildcard)) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_HWMON_MASK |
+ PERF_TOOL_PMU_TYPE_NVME_MASK |
PERF_TOOL_PMU_TYPE_DRM_MASK;
} else if (strlen(wildcard) >= 4 && strncmp("drm_", wildcard, 4) == 0) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_DRM_MASK;
} else if (strlen(wildcard) >= 5 && strncmp("hwmon", wildcard, 5) == 0) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_HWMON_MASK;
+ } else if (strlen(wildcard) >= 4 && strncmp("nvme", wildcard, 4) == 0) {
+ to_read_pmus |= PERF_TOOL_PMU_TYPE_NVME_MASK;
}
pmu_read_sysfs(to_read_pmus);
--
2.54.0.1064.gd145956f57-goog
next prev parent reply other threads:[~2026-06-09 7:04 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-09 7:03 [PATCH v1 0/3] perf pmu: Add tool-provided NVMe PMU Ian Rogers
2026-06-09 7:03 ` [PATCH v1 1/3] perf build: Add libnvme feature detection Ian Rogers
2026-06-09 7:12 ` sashiko-bot
2026-06-09 7:03 ` Ian Rogers [this message]
2026-06-09 7:21 ` [PATCH v1 2/3] perf pmu: Implement tool-provided NVMe PMU sashiko-bot
2026-06-09 7:03 ` [PATCH v1 3/3] perf tests: Add NVMe PMU event parsing test Ian Rogers
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260609070348.541964-3-irogers@google.com \
--to=irogers@google.com \
--cc=9erthalion6@gmail.com \
--cc=acme@kernel.org \
--cc=adrian.hunter@intel.com \
--cc=alexandre.chartre@oracle.com \
--cc=ashelat@redhat.com \
--cc=german.gomez@arm.com \
--cc=james.clark@linaro.org \
--cc=jolsa@kernel.org \
--cc=leo.yan@arm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=linux-perf-users@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=mjeanson@efficios.com \
--cc=namhyung@kernel.org \
--cc=peterz@infradead.org \
--cc=tglozar@redhat.com \
--cc=yuzhuo@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox