From: Ian Rogers <irogers@google.com>
To: Peter Zijlstra <peterz@infradead.org>,
Ingo Molnar <mingo@redhat.com>,
Arnaldo Carvalho de Melo <acme@kernel.org>,
Namhyung Kim <namhyung@kernel.org>, Jiri Olsa <jolsa@kernel.org>,
Ian Rogers <irogers@google.com>,
Adrian Hunter <adrian.hunter@intel.com>,
James Clark <james.clark@linaro.org>,
Tomas Glozar <tglozar@redhat.com>,
Michael Jeanson <mjeanson@efficios.com>,
Dmitrii Dolgov <9erthalion6@gmail.com>,
Alexandre Chartre <alexandre.chartre@oracle.com>,
Yuzhuo Jing <yuzhuo@google.com>, Leo Yan <leo.yan@arm.com>,
German Gomez <german.gomez@arm.com>,
Anubhav Shelat <ashelat@redhat.com>,
linux-kernel@vger.kernel.org, linux-perf-users@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Subject: [PATCH v1 2/3] perf pmu: Implement tool-provided NVMe PMU
Date: Tue, 9 Jun 2026 00:03:47 -0700 [thread overview]
Message-ID: <20260609070348.541964-3-irogers@google.com> (raw)
In-Reply-To: <20260609070348.541964-1-irogers@google.com>
Add a tool-provided PMU for NVMe devices using libnvme. This PMU
exposes metrics from various NVMe logs (SMART, Endurance Group,
FDP, Error Information, and Zoned Namespaces) as perf events under
the 'nvme_nvmeX' PMUs.
Features:
- Generic configuration encoding (log type, size, and offset mapped
inside the event config).
- Dynamic probing of supported log pages on /dev/nvmeX. Unsupported
events are marked deprecated and hidden from 'perf list' by
default.
- Correct interval delta tracking using baseline offset snapshots on
evsel->priv.
- Proper conversion and scaling of properties like temperature (K
to C).
Signed-off-by: Ian Rogers <irogers@google.com>
CONV=ca4c5d09-4ef8-405a-80bb-aa988020b436
TAG=agy
---
tools/perf/util/Build | 1 +
tools/perf/util/evsel.c | 11 +-
tools/perf/util/nvme_pmu.c | 562 +++++++++++++++++++++++++++++++++++++
tools/perf/util/nvme_pmu.h | 112 ++++++++
tools/perf/util/pmu.c | 15 +
tools/perf/util/pmu.h | 8 +-
tools/perf/util/pmus.c | 26 +-
7 files changed, 728 insertions(+), 7 deletions(-)
create mode 100644 tools/perf/util/nvme_pmu.c
create mode 100644 tools/perf/util/nvme_pmu.h
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index b22cdc24082a..e28e99634178 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -20,6 +20,7 @@ perf-util-y += disasm.o
perf-util-y += env.o
perf-util-y += event.o
perf-util-y += evlist.o
+perf-util-y += nvme_pmu.o
perf-util-y += sideband_evlist.o
perf-util-y += evsel.o
perf-util-y += evsel_fprintf.o
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 34c03f47a913..8caa626af57a 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -57,6 +57,7 @@
#include "hashmap.h"
#include "hist.h"
#include "hwmon_pmu.h"
+#include "nvme_pmu.h"
#include "intel-tpebs.h"
#include "memswap.h"
#include "off_cpu.h"
@@ -2207,6 +2208,8 @@ int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
if (evsel__is_hwmon(evsel))
return evsel__hwmon_pmu_read(evsel, cpu_map_idx, thread);
+ if (evsel__is_nvme(evsel))
+ return evsel__nvme_pmu_read(evsel, cpu_map_idx, thread);
if (evsel__is_drm(evsel))
return evsel__drm_pmu_read(evsel, cpu_map_idx, thread);
@@ -2947,8 +2950,12 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
}
if (evsel__is_hwmon(evsel)) {
err = evsel__hwmon_pmu_open(evsel, threads,
- start_cpu_map_idx,
- end_cpu_map_idx);
+ start_cpu_map_idx, end_cpu_map_idx);
+ goto out;
+ }
+ if (evsel__is_nvme(evsel)) {
+ err = evsel__nvme_pmu_open(evsel, threads,
+ start_cpu_map_idx, end_cpu_map_idx);
goto out;
}
if (evsel__is_drm(evsel)) {
diff --git a/tools/perf/util/nvme_pmu.c b/tools/perf/util/nvme_pmu.c
new file mode 100644
index 000000000000..17ba758aec59
--- /dev/null
+++ b/tools/perf/util/nvme_pmu.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "counts.h"
+#include "debug.h"
+#include "evsel.h"
+#include "hashmap.h"
+#include "nvme_pmu.h"
+#include "pmu.h"
+#include <internal/xyarray.h>
+#include <internal/threadmap.h>
+#include <perf/threadmap.h>
+#include <sys/types.h>
+#include <assert.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <api/fs/fs.h>
+#include <api/io.h>
+#include <api/io_dir.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+
+#ifdef HAVE_LIBNVME_SUPPORT
+#include <libnvme.h>
+
+
+#define NVME_CONFIG(log, size, offset) \
+ (((uint64_t)(log) << 24) | ((uint64_t)(size) << 16) | (offset))
+
+enum nvme_log_type {
+ NVME_LOG_SMART = 0,
+ NVME_LOG_ENDURANCE = 1,
+ NVME_LOG_FDP = 2,
+ NVME_LOG_ERROR = 3,
+ NVME_LOG_ZNS = 4,
+};
+
+#define NVME_SMART(size, field) \
+ NVME_CONFIG(NVME_LOG_SMART, size, offsetof(struct nvme_smart_log, field))
+
+#define NVME_ENDURANCE(size, field) \
+ NVME_CONFIG(NVME_LOG_ENDURANCE, size, offsetof(struct nvme_endurance_group_log, field))
+
+#define NVME_FDP(size, field) \
+ NVME_CONFIG(NVME_LOG_FDP, size, offsetof(struct nvme_fdp_stats_log, field))
+
+#define NVME_ERROR(size, field) \
+ NVME_CONFIG(NVME_LOG_ERROR, size, offsetof(struct nvme_error_log_page, field))
+
+#define NVME_ZNS(size, field) \
+ NVME_CONFIG(NVME_LOG_ZNS, size, offsetof(struct nvme_zns_changed_zone_log, field))
+
+struct nvme_event {
+ const char *name;
+ const char *desc;
+ const char *scale_unit;
+ uint64_t config;
+};
+
+static const struct nvme_event nvme_events[] = {
+ { "smart_data_units_read",
+ "Data units read (in 1000s of 512-byte units)",
+ "512000B", NVME_SMART(16, data_units_read) },
+ { "smart_data_units_written",
+ "Data units written (in 1000s of 512-byte units)",
+ "512000B", NVME_SMART(16, data_units_written) },
+ { "smart_host_read_commands", "Host read commands", NULL, NVME_SMART(16, host_reads) },
+ { "smart_host_write_commands", "Host write commands", NULL, NVME_SMART(16, host_writes) },
+ { "smart_ctrl_busy_time", "Controller busy time", "60s", NVME_SMART(16, ctrl_busy_time) },
+ { "smart_power_cycles", "Power cycles", NULL, NVME_SMART(16, power_cycles) },
+ { "smart_power_on_hours", "Power on hours", "1h", NVME_SMART(16, power_on_hours) },
+ { "smart_unsafe_shutdowns", "Unsafe shutdowns", NULL, NVME_SMART(16, unsafe_shutdowns) },
+ { "smart_media_errors", "Media errors", NULL, NVME_SMART(16, media_errors) },
+ { "smart_num_err_log_entries",
+ "Number of error log entries",
+ NULL, NVME_SMART(16, num_err_log_entries) },
+ { "smart_warning_temp_time",
+ "Warning temperature time",
+ "60s", NVME_SMART(4, warning_temp_time) },
+ { "smart_crit_comp_time",
+ "Critical composite temperature time",
+ "60s", NVME_SMART(4, critical_comp_time) },
+ { "smart_temperature", "Temperature", "0.001'C", NVME_SMART(2, temperature) },
+
+ { "endurance_percent_used",
+ "Endurance group percentage used",
+ NULL, NVME_ENDURANCE(1, percent_used) },
+ { "endurance_data_units_read",
+ "Endurance group data units read",
+ "512000B", NVME_ENDURANCE(16, data_units_read) },
+ { "endurance_data_units_written",
+ "Endurance group data units written",
+ "512000B", NVME_ENDURANCE(16, data_units_written) },
+ { "endurance_media_units_written",
+ "Endurance group media units written",
+ "512000B", NVME_ENDURANCE(16, media_units_written) },
+ { "endurance_host_read_cmds",
+ "Endurance group host read commands",
+ NULL, NVME_ENDURANCE(16, host_read_cmds) },
+ { "endurance_host_write_cmds",
+ "Endurance group host write commands",
+ NULL, NVME_ENDURANCE(16, host_write_cmds) },
+ { "endurance_num_err_info_log_entries",
+ "Endurance group number of error information log entries",
+ NULL, NVME_ENDURANCE(16, num_err_info_log_entries) },
+
+ { "fdp_hbmw", "FDP host bytes with metadata written", "1B", NVME_FDP(16, hbmw) },
+ { "fdp_mbmw", "FDP media bytes with metadata written", "1B", NVME_FDP(16, mbmw) },
+ { "fdp_mbe", "FDP media bytes erased", "1B", NVME_FDP(16, mbe) },
+
+ { "error_count", "Error info log error count", NULL, NVME_ERROR(8, error_count) },
+
+ { "zns_nrzid", "ZNS changed zone nrzid", NULL, NVME_ZNS(2, nrzid) },
+};
+
+
+struct nvme_pmu {
+ struct perf_pmu pmu;
+ char *dev_name;
+ bool support_checked;
+ bool log_supported[256];
+};
+
+
+bool perf_pmu__is_nvme(const struct perf_pmu *pmu)
+{
+ return pmu && pmu->type >= PERF_PMU_TYPE_NVME_START &&
+ pmu->type <= PERF_PMU_TYPE_NVME_END;
+}
+
+bool evsel__is_nvme(const struct evsel *evsel)
+{
+ return perf_pmu__is_nvme(evsel->pmu);
+}
+
+struct perf_pmu *nvme_pmu__new(struct list_head *pmus, const char *sysfs_name, const char *name)
+{
+ struct nvme_pmu *nvm;
+ char buf[64];
+ __u32 type;
+
+ /*
+ * Usually sysfs_name is something like "nvme0".
+ * We try to extract the number. If parsing fails, we use 0.
+ */
+ type = PERF_PMU_TYPE_NVME_START + strtoul(sysfs_name + 4, NULL, 10);
+
+ if (type > PERF_PMU_TYPE_NVME_END) {
+ pr_err("Unable to encode NVMe type from %s in valid PMU type\n", sysfs_name);
+ return NULL;
+ }
+
+ snprintf(buf, sizeof(buf), "nvme_%s", name);
+
+ nvm = zalloc(sizeof(*nvm));
+ if (!nvm)
+ return NULL;
+
+ if (perf_pmu__init(&nvm->pmu, type, buf) != 0) {
+ free(nvm);
+ return NULL;
+ }
+
+ nvm->dev_name = strdup(sysfs_name);
+ if (!nvm->dev_name) {
+ perf_pmu__delete(&nvm->pmu);
+ return NULL;
+ }
+ nvm->pmu.alias_name = strdup(sysfs_name);
+ if (!nvm->pmu.alias_name) {
+ perf_pmu__delete(&nvm->pmu);
+ return NULL;
+ }
+ nvm->pmu.cpus = perf_cpu_map__new_int(0);
+ if (!nvm->pmu.cpus) {
+ perf_pmu__delete(&nvm->pmu);
+ return NULL;
+ }
+ INIT_LIST_HEAD(&nvm->pmu.format);
+ INIT_LIST_HEAD(&nvm->pmu.caps);
+
+ list_add_tail(&nvm->pmu.list, pmus);
+ return &nvm->pmu;
+}
+
+void nvme_pmu__exit(struct perf_pmu *pmu)
+{
+ struct nvme_pmu *nvm = container_of(pmu, struct nvme_pmu, pmu);
+
+ zfree(&nvm->dev_name);
+}
+
+
+
+static void nvme_pmu__check_support(struct nvme_pmu *nvm)
+{
+ int fd;
+ char path[PATH_MAX];
+ struct nvme_smart_log smart_log;
+ struct nvme_endurance_group_log endurance_log;
+ struct nvme_fdp_stats_log fdp_log;
+ struct nvme_error_log_page error_log;
+ struct nvme_zns_changed_zone_log zns_log;
+
+ if (nvm->support_checked)
+ return;
+
+ nvm->support_checked = true;
+
+ /* Assume all supported if we can't test. */
+ memset(nvm->log_supported, 1, sizeof(nvm->log_supported));
+
+ snprintf(path, sizeof(path), "/dev/%s", nvm->dev_name);
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ return;
+
+ if (nvme_get_log_smart(fd, NVME_NSID_ALL, true, &smart_log) != 0)
+ nvm->log_supported[NVME_LOG_SMART] = false;
+
+ if (nvme_get_log_endurance_group(fd, 0, &endurance_log) != 0)
+ nvm->log_supported[NVME_LOG_ENDURANCE] = false;
+
+ if (nvme_get_log_fdp_stats(fd, 0, 0, sizeof(fdp_log), &fdp_log) != 0)
+ nvm->log_supported[NVME_LOG_FDP] = false;
+
+ if (nvme_get_log_error(fd, 1, true, &error_log) != 0)
+ nvm->log_supported[NVME_LOG_ERROR] = false;
+
+ if (nvme_get_log_zns_changed_zones(fd, NVME_NSID_ALL, true, &zns_log) != 0)
+ nvm->log_supported[NVME_LOG_ZNS] = false;
+
+ close(fd);
+}
+
+int nvme_pmu__for_each_event(struct perf_pmu *pmu, void *state, pmu_event_callback cb)
+{
+ struct nvme_pmu *nvm = container_of(pmu, struct nvme_pmu, pmu);
+ size_t i;
+
+ nvme_pmu__check_support(nvm);
+ for (i = 0; i < ARRAY_SIZE(nvme_events); i++) {
+ const struct nvme_event *e = &nvme_events[i];
+ char alias_buf[64];
+ char desc_buf[256];
+ char encoding_buf[128];
+ struct pmu_event_info info = {
+ .pmu = pmu,
+ .name = e->name,
+ .alias = alias_buf,
+ .scale_unit = e->scale_unit,
+ .desc = desc_buf,
+ .long_desc = NULL,
+ .encoding_desc = encoding_buf,
+
+ .topic = "nvme",
+ .pmu_name = pmu->name,
+ .event_type_desc = "NVMe event",
+ .deprecated = !nvm->log_supported[(e->config >> 24) & 0xFF],
+ };
+
+ int ret;
+
+ snprintf(alias_buf, sizeof(alias_buf), "%s", e->name);
+ snprintf(desc_buf, sizeof(desc_buf), "%s", e->desc);
+ snprintf(encoding_buf, sizeof(encoding_buf),
+ "%s/config=0x%lx/", pmu->name, e->config);
+
+ ret = cb(state, &info);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+size_t nvme_pmu__num_events(struct perf_pmu *pmu __maybe_unused)
+{
+ return ARRAY_SIZE(nvme_events);
+}
+
+bool nvme_pmu__have_event(struct perf_pmu *pmu __maybe_unused, const char *name)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(nvme_events); i++) {
+ if (!strcasecmp(name, nvme_events[i].name))
+ return true;
+ }
+ return false;
+}
+
+static int nvme_pmu__config_term(const struct nvme_pmu *nvm __maybe_unused,
+ struct perf_event_attr *attr,
+ struct parse_events_term *term,
+ struct parse_events_error *err)
+{
+ if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER) {
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(nvme_events); i++) {
+ if (!strcasecmp(term->config, nvme_events[i].name)) {
+ attr->config = nvme_events[i].config;
+ return 0;
+ }
+ }
+ }
+ if (err) {
+ char *err_str;
+
+ parse_events_error__handle(err, term->err_val,
+ asprintf(&err_str,
+ "unexpected nvme event term (%s) %s",
+ parse_events__term_type_str(term->type_term),
+ term->config) < 0
+ ? strdup("unexpected nvme event term")
+ : err_str,
+ NULL);
+ }
+ return -EINVAL;
+}
+
+int nvme_pmu__config_terms(const struct perf_pmu *pmu,
+ struct perf_event_attr *attr,
+ struct parse_events_terms *terms,
+ struct parse_events_error *err)
+{
+ struct nvme_pmu *nvm = container_of(pmu, struct nvme_pmu, pmu);
+ struct parse_events_term *term;
+
+ list_for_each_entry(term, &terms->terms, list) {
+ if (nvme_pmu__config_term(nvm, attr, term, err))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int nvme_pmu__check_alias(struct parse_events_terms *terms, struct perf_pmu_info *info,
+ struct parse_events_error *err)
+{
+ struct parse_events_term *term =
+ list_first_entry(&terms->terms, struct parse_events_term, list);
+
+ if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER) {
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(nvme_events); i++) {
+ if (!strcasecmp(term->config, nvme_events[i].name)) {
+ if (nvme_events[i].scale_unit) {
+ char *unit;
+
+ perf_pmu__convert_scale(nvme_events[i].scale_unit,
+ &unit, &info->scale);
+ info->unit = unit;
+ }
+ return 0;
+ }
+ }
+ }
+ if (err) {
+ char *err_str;
+
+ parse_events_error__handle(err, term->err_val,
+ asprintf(&err_str,
+ "unexpected nvme event term (%s) %s",
+ parse_events__term_type_str(term->type_term),
+ term->config) < 0
+ ? strdup("unexpected nvme event term")
+ : err_str,
+ NULL);
+ }
+ return -EINVAL;
+}
+
+int perf_pmus__read_nvme_pmus(struct list_head *pmus)
+{
+ nvme_root_t r = nvme_scan(NULL);
+ nvme_host_t h;
+ nvme_subsystem_t s;
+ nvme_ctrl_t c;
+
+ if (!r)
+ return 0;
+
+ nvme_for_each_host(r, h) {
+ nvme_for_each_subsystem(h, s) {
+ nvme_subsystem_for_each_ctrl(s, c) {
+ nvme_pmu__new(pmus, nvme_ctrl_get_name(c), nvme_ctrl_get_name(c));
+ }
+ }
+ }
+ nvme_free_tree(r);
+ return 0;
+}
+
+
+static int nvme_pmu__read_val(int fd, uint64_t config, uint64_t *val)
+{
+ int log_type = (config >> 24) & 0xFF;
+ unsigned int size = (config >> 16) & 0xFF;
+ unsigned int offset = config & 0xFFFF;
+ uint8_t buf[4096];
+ uint8_t *p;
+
+ if (log_type == NVME_LOG_SMART) {
+ if (offset + size > sizeof(struct nvme_smart_log))
+ return -EINVAL;
+ if (nvme_get_log_smart(fd, NVME_NSID_ALL, true, (struct nvme_smart_log *)buf) != 0)
+ return -EINVAL;
+
+ if (offset == offsetof(struct nvme_smart_log, temperature)) {
+ uint64_t kelvin = ((struct nvme_smart_log *)buf)->temperature[0] |
+ (((struct nvme_smart_log *)buf)->temperature[1] << 8);
+ *val = (kelvin * 1000) - 273150;
+ return 0;
+ }
+ } else if (log_type == NVME_LOG_ENDURANCE) {
+ if (offset + size > sizeof(struct nvme_endurance_group_log))
+ return -EINVAL;
+ if (nvme_get_log_endurance_group(fd, 0,
+ (struct nvme_endurance_group_log *)buf) != 0)
+ return -EINVAL;
+ } else if (log_type == NVME_LOG_FDP) {
+ if (offset + size > sizeof(struct nvme_fdp_stats_log))
+ return -EINVAL;
+ if (nvme_get_log_fdp_stats(fd, 0, 0, sizeof(struct nvme_fdp_stats_log), buf) != 0)
+ return -EINVAL;
+ } else if (log_type == NVME_LOG_ERROR) {
+ if (offset + size > sizeof(struct nvme_error_log_page))
+ return -EINVAL;
+ if (nvme_get_log_error(fd, 1, true, (struct nvme_error_log_page *)buf) != 0)
+ return -EINVAL;
+ } else if (log_type == NVME_LOG_ZNS) {
+ if (offset + size > sizeof(struct nvme_zns_changed_zone_log))
+ return -EINVAL;
+ if (nvme_get_log_zns_changed_zones(fd, NVME_NSID_ALL, true,
+ (struct nvme_zns_changed_zone_log *)buf) != 0)
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
+ p = buf + offset;
+ if (size == 16 || size == 8)
+ *val = le64_to_cpu(*(uint64_t *)p);
+ else if (size == 4)
+ *val = le32_to_cpu(*(uint32_t *)p);
+ else if (size == 2)
+ *val = le16_to_cpu(*(uint16_t *)p);
+ else if (size == 1)
+ *val = *(uint8_t *)p;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+static bool nvme_pmu__is_gauge(uint64_t config)
+{
+ if (config == NVME_SMART(2, temperature) ||
+ config == NVME_ENDURANCE(1, percent_used) ||
+ config == NVME_ZNS(2, nrzid))
+ return true;
+ return false;
+}
+
+#define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))
+
+int evsel__nvme_pmu_open(struct evsel *evsel,
+ struct perf_thread_map *threads,
+ int start_cpu_map_idx, int end_cpu_map_idx)
+{
+ struct nvme_pmu *nvm = container_of(evsel->pmu, struct nvme_pmu, pmu);
+ int idx = 0, thread = 0, nthreads, err = 0;
+ char path[PATH_MAX];
+
+ snprintf(path, sizeof(path), "/dev/%s", nvm->dev_name);
+
+ nthreads = perf_thread_map__nr(threads);
+
+ if (!evsel->priv) {
+ int max_cpus = evsel->core.cpus ? perf_cpu_map__nr(evsel->core.cpus) : 1;
+
+ evsel->priv = xyarray__new(max_cpus, nthreads, sizeof(uint64_t));
+ }
+
+ for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) {
+ for (thread = 0; thread < nthreads; thread++) {
+ int fd = open(path, O_RDONLY);
+
+ FD(evsel, idx, thread) = fd;
+ if (fd < 0) {
+ err = -errno;
+ goto out_close;
+ }
+ if (evsel->priv) {
+ uint64_t *initial_val = xyarray__entry(evsel->priv, idx, thread);
+
+ if (nvme_pmu__read_val(fd, evsel->core.attr.config, initial_val))
+ *initial_val = 0;
+ }
+ }
+ }
+ return 0;
+out_close:
+ if (err)
+ threads->err_thread = thread;
+
+ do {
+ while (--thread >= 0) {
+ if (FD(evsel, idx, thread) >= 0)
+ close(FD(evsel, idx, thread));
+ FD(evsel, idx, thread) = -1;
+ }
+ thread = nthreads;
+ } while (--idx >= 0);
+ return err;
+}
+
+int evsel__nvme_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread)
+{
+ int fd;
+ struct perf_counts_values *count, *old_count = NULL;
+ uint64_t val = 0;
+ uint64_t *initial_val = NULL;
+
+ if (evsel->prev_raw_counts)
+ old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);
+
+ count = perf_counts(evsel->counts, cpu_map_idx, thread);
+ fd = FD(evsel, cpu_map_idx, thread);
+
+ if (fd < 0 || nvme_pmu__read_val(fd, evsel->core.attr.config, &val)) {
+ count->lost++;
+ return -EINVAL;
+ }
+
+ if (evsel->priv)
+ initial_val = xyarray__entry(evsel->priv, cpu_map_idx, thread);
+
+ if (old_count) {
+ if (nvme_pmu__is_gauge(evsel->core.attr.config))
+ count->val = old_count->val + val;
+ else
+ count->val = val - (initial_val ? *initial_val : 0);
+ count->run = old_count->run + 1;
+ count->ena = old_count->ena + 1;
+ } else {
+ if (nvme_pmu__is_gauge(evsel->core.attr.config))
+ count->val = val;
+ else
+ count->val = val - (initial_val ? *initial_val : 0);
+ count->run++;
+ count->ena++;
+ }
+ return 0;
+}
+
+
+#endif
diff --git a/tools/perf/util/nvme_pmu.h b/tools/perf/util/nvme_pmu.h
new file mode 100644
index 000000000000..6d5d2bbe4167
--- /dev/null
+++ b/tools/perf/util/nvme_pmu.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __NVME_PMU_H
+#define __NVME_PMU_H
+
+#include "pmu.h"
+#include <stdbool.h>
+#include <errno.h>
+
+struct list_head;
+struct perf_thread_map;
+struct evsel;
+
+#ifdef HAVE_LIBNVME_SUPPORT
+struct perf_pmu *nvme_pmu__new(struct list_head *pmus, const char *sysfs_name, const char *name);
+void nvme_pmu__exit(struct perf_pmu *pmu);
+
+int nvme_pmu__for_each_event(struct perf_pmu *pmu, void *state, pmu_event_callback cb);
+size_t nvme_pmu__num_events(struct perf_pmu *pmu);
+bool nvme_pmu__have_event(struct perf_pmu *pmu, const char *name);
+int nvme_pmu__config_terms(const struct perf_pmu *pmu,
+ struct perf_event_attr *attr,
+ struct parse_events_terms *terms,
+ struct parse_events_error *err);
+int nvme_pmu__check_alias(struct parse_events_terms *terms, struct perf_pmu_info *info,
+ struct parse_events_error *err);
+
+bool perf_pmu__is_nvme(const struct perf_pmu *pmu);
+bool evsel__is_nvme(const struct evsel *evsel);
+
+int perf_pmus__read_nvme_pmus(struct list_head *pmus);
+
+int evsel__nvme_pmu_open(struct evsel *evsel,
+ struct perf_thread_map *threads,
+ int start_cpu_map_idx, int end_cpu_map_idx);
+int evsel__nvme_pmu_read(struct evsel *evsel, int cpu_map_idx, int thread);
+#else
+static inline struct perf_pmu *nvme_pmu__new(struct list_head *pmus __maybe_unused,
+ const char *sysfs_name __maybe_unused,
+ const char *name __maybe_unused)
+{
+ return NULL;
+}
+
+static inline void nvme_pmu__exit(struct perf_pmu *pmu __maybe_unused)
+{
+}
+
+static inline int nvme_pmu__for_each_event(struct perf_pmu *pmu __maybe_unused,
+ void *state __maybe_unused,
+ pmu_event_callback cb __maybe_unused)
+{
+ return 0;
+}
+
+static inline size_t nvme_pmu__num_events(struct perf_pmu *pmu __maybe_unused)
+{
+ return 0;
+}
+
+static inline bool nvme_pmu__have_event(struct perf_pmu *pmu __maybe_unused,
+ const char *name __maybe_unused)
+{
+ return false;
+}
+
+static inline int nvme_pmu__config_terms(const struct perf_pmu *pmu __maybe_unused,
+ struct perf_event_attr *attr __maybe_unused,
+ struct parse_events_terms *terms __maybe_unused,
+ struct parse_events_error *err __maybe_unused)
+{
+ return -EINVAL;
+}
+
+static inline int nvme_pmu__check_alias(struct parse_events_terms *terms __maybe_unused,
+ struct perf_pmu_info *info __maybe_unused,
+ struct parse_events_error *err __maybe_unused)
+{
+ return -EINVAL;
+}
+
+static inline bool perf_pmu__is_nvme(const struct perf_pmu *pmu __maybe_unused)
+{
+ return false;
+}
+
+static inline bool evsel__is_nvme(const struct evsel *evsel __maybe_unused)
+{
+ return false;
+}
+
+static inline int perf_pmus__read_nvme_pmus(struct list_head *pmus __maybe_unused)
+{
+ return 0;
+}
+
+static inline int evsel__nvme_pmu_open(struct evsel *evsel __maybe_unused,
+ struct perf_thread_map *threads __maybe_unused,
+ int start_cpu_map_idx __maybe_unused,
+ int end_cpu_map_idx __maybe_unused)
+{
+ return 0;
+}
+
+static inline int evsel__nvme_pmu_read(struct evsel *evsel __maybe_unused,
+ int cpu_map_idx __maybe_unused,
+ int thread __maybe_unused)
+{
+ return 0;
+}
+#endif
+
+#endif /* __NVME_PMU_H */
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 9994709ef12b..26ec19753644 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -22,6 +22,7 @@
#include "pmu.h"
#include "drm_pmu.h"
#include "hwmon_pmu.h"
+#include "nvme_pmu.h"
#include "pmus.h"
#include "tool_pmu.h"
#include "tp_pmu.h"
@@ -1687,6 +1688,8 @@ int perf_pmu__config_terms(const struct perf_pmu *pmu,
if (perf_pmu__is_hwmon(pmu))
return hwmon_pmu__config_terms(pmu, attr, terms, err);
+ if (perf_pmu__is_nvme(pmu))
+ return nvme_pmu__config_terms(pmu, attr, terms, err);
if (perf_pmu__is_drm(pmu))
return drm_pmu__config_terms(pmu, attr, terms, err);
@@ -1851,6 +1854,10 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_
ret = hwmon_pmu__check_alias(head_terms, info, err);
goto out;
}
+ if (perf_pmu__is_nvme(pmu)) {
+ ret = nvme_pmu__check_alias(head_terms, info, err);
+ goto out;
+ }
if (perf_pmu__is_drm(pmu)) {
ret = drm_pmu__check_alias(pmu, head_terms, info, err);
goto out;
@@ -2071,6 +2078,8 @@ bool perf_pmu__have_event(struct perf_pmu *pmu, const char *name)
return tp_pmu__have_event(pmu, name);
if (perf_pmu__is_hwmon(pmu))
return hwmon_pmu__have_event(pmu, name);
+ if (perf_pmu__is_nvme(pmu))
+ return nvme_pmu__have_event(pmu, name);
if (perf_pmu__is_drm(pmu))
return drm_pmu__have_event(pmu, name);
if (perf_pmu__find_alias(pmu, name, /*load=*/ true) != NULL)
@@ -2092,6 +2101,8 @@ size_t perf_pmu__num_events(struct perf_pmu *pmu)
return tp_pmu__num_events(pmu);
if (perf_pmu__is_hwmon(pmu))
return hwmon_pmu__num_events(pmu);
+ if (perf_pmu__is_nvme(pmu))
+ return nvme_pmu__num_events(pmu);
if (perf_pmu__is_drm(pmu))
return drm_pmu__num_events(pmu);
@@ -2223,6 +2234,8 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus,
return tp_pmu__for_each_event(pmu, state, cb);
if (perf_pmu__is_hwmon(pmu))
return hwmon_pmu__for_each_event(pmu, state, cb);
+ if (perf_pmu__is_nvme(pmu))
+ return nvme_pmu__for_each_event(pmu, state, cb);
if (perf_pmu__is_drm(pmu))
return drm_pmu__for_each_event(pmu, state, cb);
@@ -2714,6 +2727,8 @@ void perf_pmu__delete(struct perf_pmu *pmu)
if (perf_pmu__is_hwmon(pmu))
hwmon_pmu__exit(pmu);
+ if (perf_pmu__is_nvme(pmu))
+ nvme_pmu__exit(pmu);
else if (perf_pmu__is_drm(pmu))
drm_pmu__exit(pmu);
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 0d9f3c57e8e8..0fe47dd429e8 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -45,6 +45,8 @@ enum pmu_kind {
PERF_PMU_KIND_DRM,
/* A perf tool provided HWMON PMU. */
PERF_PMU_KIND_HWMON,
+ /* A perf tool provided NVME PMU. */
+ PERF_PMU_KIND_NVME,
/* Perf tool provided PMU for tool events like time. */
PERF_PMU_KIND_TOOL,
/* A testing PMU kind. */
@@ -53,7 +55,9 @@ enum pmu_kind {
enum {
PERF_PMU_TYPE_PE_START = 0,
- PERF_PMU_TYPE_PE_END = 0xFFFDFFFF,
+ PERF_PMU_TYPE_PE_END = 0xFFFCFFFF,
+ PERF_PMU_TYPE_NVME_START = 0xFFFD0000,
+ PERF_PMU_TYPE_NVME_END = 0xFFFDFFFF,
PERF_PMU_TYPE_DRM_START = 0xFFFE0000,
PERF_PMU_TYPE_DRM_END = 0xFFFEFFFF,
PERF_PMU_TYPE_HWMON_START = 0xFFFF0000,
@@ -363,6 +367,8 @@ static inline enum pmu_kind perf_pmu__kind(const struct perf_pmu *pmu)
type = pmu->type;
if (type <= PERF_PMU_TYPE_PE_END)
return PERF_PMU_KIND_PE;
+ if (type <= PERF_PMU_TYPE_NVME_END)
+ return PERF_PMU_KIND_NVME;
if (type <= PERF_PMU_TYPE_DRM_END)
return PERF_PMU_KIND_DRM;
if (type <= PERF_PMU_TYPE_HWMON_END)
diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c
index 5e3f571450fe..83777f941e9a 100644
--- a/tools/perf/util/pmus.c
+++ b/tools/perf/util/pmus.c
@@ -17,6 +17,7 @@
#include "pmus.h"
#include "pmu.h"
#include "hwmon_pmu.h"
+#include "nvme_pmu.h"
#include "tool_pmu.h"
#include "print-events.h"
#include "strbuf.h"
@@ -44,18 +45,21 @@ enum perf_tool_pmu_type {
PERF_TOOL_PMU_TYPE_PE_OTHER,
PERF_TOOL_PMU_TYPE_TOOL,
PERF_TOOL_PMU_TYPE_HWMON,
+ PERF_TOOL_PMU_TYPE_NVME,
PERF_TOOL_PMU_TYPE_DRM,
#define PERF_TOOL_PMU_TYPE_PE_CORE_MASK (1 << PERF_TOOL_PMU_TYPE_PE_CORE)
#define PERF_TOOL_PMU_TYPE_PE_OTHER_MASK (1 << PERF_TOOL_PMU_TYPE_PE_OTHER)
#define PERF_TOOL_PMU_TYPE_TOOL_MASK (1 << PERF_TOOL_PMU_TYPE_TOOL)
#define PERF_TOOL_PMU_TYPE_HWMON_MASK (1 << PERF_TOOL_PMU_TYPE_HWMON)
+#define PERF_TOOL_PMU_TYPE_NVME_MASK (1 << PERF_TOOL_PMU_TYPE_NVME)
#define PERF_TOOL_PMU_TYPE_DRM_MASK (1 << PERF_TOOL_PMU_TYPE_DRM)
#define PERF_TOOL_PMU_TYPE_ALL_MASK (PERF_TOOL_PMU_TYPE_PE_CORE_MASK | \
PERF_TOOL_PMU_TYPE_PE_OTHER_MASK | \
PERF_TOOL_PMU_TYPE_TOOL_MASK | \
PERF_TOOL_PMU_TYPE_HWMON_MASK | \
+ PERF_TOOL_PMU_TYPE_NVME_MASK | \
PERF_TOOL_PMU_TYPE_DRM_MASK)
};
static unsigned int read_pmu_types;
@@ -175,12 +179,15 @@ struct perf_pmu *perf_pmus__find(const char *name)
return pmu;
/* Looking up an individual perf event PMU failed, check if a tool PMU should be read. */
- if (!strncmp(name, "hwmon_", 6))
- to_read_pmus |= PERF_TOOL_PMU_TYPE_HWMON_MASK;
- else if (!strncmp(name, "drm_", 4))
+ if (!strncmp(name, "hwmon_", 6)) {
+ to_read_pmus = PERF_TOOL_PMU_TYPE_HWMON_MASK;
+ } else if (!strncmp(name, "nvme_", 5)) {
+ to_read_pmus = PERF_TOOL_PMU_TYPE_NVME_MASK;
+ } else if (!strncmp(name, "drm_", 4)) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_DRM_MASK;
- else if (!strcmp(name, "tool"))
+ } else if (!strcmp(name, "tool")) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_TOOL_MASK;
+ }
if (to_read_pmus) {
pmu_read_sysfs(to_read_pmus);
@@ -279,6 +286,10 @@ static void pmu_read_sysfs(unsigned int to_read_types)
(read_pmu_types & PERF_TOOL_PMU_TYPE_HWMON_MASK) == 0)
perf_pmus__read_hwmon_pmus(&other_pmus);
+ if ((to_read_types & PERF_TOOL_PMU_TYPE_NVME_MASK) != 0 &&
+ (read_pmu_types & PERF_TOOL_PMU_TYPE_NVME_MASK) == 0)
+ perf_pmus__read_nvme_pmus(&other_pmus);
+
if ((to_read_types & PERF_TOOL_PMU_TYPE_DRM_MASK) != 0 &&
(read_pmu_types & PERF_TOOL_PMU_TYPE_DRM_MASK) == 0)
perf_pmus__read_drm_pmus(&other_pmus);
@@ -387,6 +398,10 @@ struct perf_pmu *perf_pmus__scan_for_event(struct perf_pmu *pmu, const char *eve
if (strlen(event) > 4 && strncmp("drm-", event, 4) == 0)
to_read_pmus |= PERF_TOOL_PMU_TYPE_DRM_MASK;
+ /* Could the event be an nvme event? */
+ if (nvme_pmu__have_event(NULL, event))
+ to_read_pmus |= PERF_TOOL_PMU_TYPE_NVME_MASK;
+
pmu_read_sysfs(to_read_pmus);
pmu = list_prepare_entry(pmu, &core_pmus, list);
}
@@ -424,11 +439,14 @@ struct perf_pmu *perf_pmus__scan_matching_wildcard(struct perf_pmu *pmu, const c
*/
if (strisglob(wildcard)) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_HWMON_MASK |
+ PERF_TOOL_PMU_TYPE_NVME_MASK |
PERF_TOOL_PMU_TYPE_DRM_MASK;
} else if (strlen(wildcard) >= 4 && strncmp("drm_", wildcard, 4) == 0) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_DRM_MASK;
} else if (strlen(wildcard) >= 5 && strncmp("hwmon", wildcard, 5) == 0) {
to_read_pmus |= PERF_TOOL_PMU_TYPE_HWMON_MASK;
+ } else if (strlen(wildcard) >= 4 && strncmp("nvme", wildcard, 4) == 0) {
+ to_read_pmus |= PERF_TOOL_PMU_TYPE_NVME_MASK;
}
pmu_read_sysfs(to_read_pmus);
--
2.54.0.1064.gd145956f57-goog
next prev parent reply other threads:[~2026-06-09 7:04 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-09 7:03 [PATCH v1 0/3] perf pmu: Add tool-provided NVMe PMU Ian Rogers
2026-06-09 7:03 ` [PATCH v1 1/3] perf build: Add libnvme feature detection Ian Rogers
2026-06-09 7:12 ` sashiko-bot
2026-06-09 7:03 ` Ian Rogers [this message]
2026-06-09 7:21 ` [PATCH v1 2/3] perf pmu: Implement tool-provided NVMe PMU sashiko-bot
2026-06-09 7:03 ` [PATCH v1 3/3] perf tests: Add NVMe PMU event parsing test Ian Rogers
2026-06-09 16:57 ` [PATCH v2 0/3] perf pmu: Add tool-provided NVMe PMU Ian Rogers
2026-06-09 16:57 ` [PATCH v2 1/3] perf build: Add libnvme feature detection Ian Rogers
2026-06-09 17:08 ` sashiko-bot
2026-06-09 16:57 ` [PATCH v2 2/3] perf pmu: Implement tool-provided NVMe PMU Ian Rogers
2026-06-09 17:19 ` sashiko-bot
2026-06-09 16:57 ` [PATCH v2 3/3] perf tests: Add NVMe PMU event parsing test Ian Rogers
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260609070348.541964-3-irogers@google.com \
--to=irogers@google.com \
--cc=9erthalion6@gmail.com \
--cc=acme@kernel.org \
--cc=adrian.hunter@intel.com \
--cc=alexandre.chartre@oracle.com \
--cc=ashelat@redhat.com \
--cc=german.gomez@arm.com \
--cc=james.clark@linaro.org \
--cc=jolsa@kernel.org \
--cc=leo.yan@arm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=linux-perf-users@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=mjeanson@efficios.com \
--cc=namhyung@kernel.org \
--cc=peterz@infradead.org \
--cc=tglozar@redhat.com \
--cc=yuzhuo@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.