* [PATCH v4 09/31] firmware: arm_scmi: Add support to parse SHMTIs areas
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>
Add logic to scan the SHMTI areas, parsing the TDCF descriptors while
collecting DataEvent, BlockTimestamp and UUID lines.
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
v3 --> v4
- use kzalloc_obj
- track SHMTI-discovered Telemetry DE states
- refactor TDE cache with dedicated helpers
- force tstamp to zero when timestamp is NOT supported or disabled
v2 --> v3
- split from monolithic Telemetry patch
- avoid devres allocation for resources that are added to the xa_lines XArray
- simplify prototype of line parsing helpers to drop unneeded dev
- flip tstamp logic in scmi_telemetry_line_data_parse() to properly emit
a TLM ftrace event
- use ternary ops to simplify quite a few expressions
---
drivers/firmware/arm_scmi/telemetry.c | 629 ++++++++++++++++++++++++++
1 file changed, 629 insertions(+)
diff --git a/drivers/firmware/arm_scmi/telemetry.c b/drivers/firmware/arm_scmi/telemetry.c
index a5c61ec37065..087a3b6d18e4 100644
--- a/drivers/firmware/arm_scmi/telemetry.c
+++ b/drivers/firmware/arm_scmi/telemetry.c
@@ -254,6 +254,23 @@ struct uuid_line {
u32 dwords[SCMI_TLM_DE_IMPL_MAX_DWORDS];
};
+/*
+ * Read the 64-bit data value of a TDCF line: the two 32-bit halves are
+ * fetched with ioread32() (via _I) and combined with TO_CPU_64().
+ * @f must point to a line type having data_high/data_low members and is
+ * evaluated only once.
+ */
+#define LINE_DATA_GET(f) \
+({ \
+ typeof(f) _f = (f); \
+ \
+ (TO_CPU_64(_I(&_f->data_high), _I(&_f->data_low))); \
+})
+
+/*
+ * Read the 64-bit timestamp of a TDCF line, same access pattern as above.
+ * @f must point to a line type having ts_high/ts_low members.
+ */
+#define LINE_TSTAMP_GET(f) \
+({ \
+ typeof(f) _f = (f); \
+ \
+ (TO_CPU_64(_I(&_f->ts_high), _I(&_f->ts_low))); \
+})
+
+/* A BLK_TS line carries its timestamp in ts_high/ts_low ... */
+#define BLK_TS_STAMP(f) LINE_TSTAMP_GET(f)
+/* ... while its clock rate is read from the payload ID word */
+#define BLK_TS_RATE(p) PAYLD_ID(p)
+
enum tdcf_line_types {
TDCF_DATA_LINE,
TDCF_BLK_TS_LINE,
@@ -365,6 +382,7 @@ struct telemetry_line {
refcount_t users;
u32 last_magic;
struct payload __iomem *payld;
+ struct xarray *xa_lines;
/* Protect line accesses */
struct mutex mtx;
};
@@ -413,18 +431,34 @@ struct telemetry_de {
#define to_tde(d) container_of(d, struct telemetry_de, de)
+/*
+ * Evaluate to true when the DE both supports timestamps and has them
+ * currently enabled.
+ *
+ * The argument is parenthesized and evaluated exactly once; the locals use a
+ * double-underscore prefix so that they cannot capture a caller's variable
+ * (e.g. an argument literally named 't' or 'ts').
+ */
+#define TDE_HAS_TSTAMP(_t)						\
+	({								\
+		struct telemetry_de *__tde = (_t);			\
+		bool __has_ts;						\
+									\
+		__has_ts = __tde->de.tstamp_support &&			\
+			   __tde->de.tstamp_enabled;			\
+		__has_ts;						\
+	})
+
#define DE_ENABLED_WITH_TSTAMP 2
+/* Indexes into the telemetry_info des_enabled[] array of counters */
+enum de_state {
+ /* Counter of enabled DEs */
+ ENA_STATE,
+ /* Counter of DEs enabled with timestamp */
+ ENA_TSTAMP,
+ /* Number of counters: used to size des_enabled[] */
+ ENA_MAX
+};
+
struct telemetry_info {
bool streaming_mode;
unsigned int num_shmti;
unsigned int num_des_tstamp;
+ atomic_t des_enabled[ENA_MAX];
unsigned int default_blk_ts_rate;
const struct scmi_protocol_handle *ph;
struct telemetry_shmti *shmti;
struct telemetry_de *tdes;
struct scmi_telemetry_group *grps;
struct xarray xa_des;
+ struct xarray xa_lines;
/* Mutex to protect access to @free_des */
struct mutex free_mtx;
struct list_head free_des;
@@ -439,6 +473,21 @@ struct telemetry_info {
static struct scmi_telemetry_res_info *
__scmi_telemetry_resources_get(struct telemetry_info *ti);
+/**
+ * scmi_telemetry_de_state_update - Track how many DEs are in a given state
+ * @ti: A reference to the telemetry info descriptor
+ * @state: Selects which @ti->des_enabled counter is updated
+ * @current_state: Optional reference to the cached boolean state; when NULL
+ *		   the counter is always adjusted
+ * @next_state: The state being entered: true increments, false decrements
+ *
+ * The counter is adjusted only on a real transition (or unconditionally when
+ * no cached state is provided) and the cached state, if any, is updated.
+ */
+static inline void
+scmi_telemetry_de_state_update(struct telemetry_info *ti, enum de_state state,
+			       bool *current_state, bool next_state)
+{
+	if (!current_state || *current_state != next_state)
+		atomic_add(next_state ? 1 : -1, &ti->des_enabled[state]);
+
+	if (current_state)
+		*current_state = next_state;
+
+	/* atomic_read() returns a signed int: print it as such */
+	dev_dbg(ti->ph->dev, "Telemetry des_enabled[%s]:%d\n",
+		state == ENA_STATE ? "STATE" : "TSTAMP",
+		atomic_read(&ti->des_enabled[state]));
+}
+
static struct telemetry_de *
scmi_telemetry_free_tde_get(struct telemetry_info *ti)
{
@@ -518,6 +567,27 @@ static int scmi_telemetry_tde_register(struct telemetry_info *ti,
return ret;
}
+/* Tell whether @magic is the same sequence number last cached for @tde */
+static bool
+scmi_telemetry_tde_cache_unchanged(struct telemetry_de *tde, u32 magic)
+{
+	bool unchanged;
+
+	mutex_lock(&tde->mtx);
+	unchanged = (magic == tde->last_magic);
+	mutex_unlock(&tde->mtx);
+
+	return unchanged;
+}
+
+/*
+ * Refresh the per-DE cache with a freshly read value.
+ *
+ * @magic, when NULL, invalidates the cached sequence number by storing
+ * TDCF_BAD_END_SEQ. The timestamp is cached only when the DE has timestamps
+ * supported and enabled, otherwise zero is stored; in both cases the value
+ * effectively cached is written back through @tstamp when provided.
+ */
+static void
+scmi_telemetry_tde_cache_update(struct telemetry_de *tde, u64 val,
+				u64 *tstamp, u32 *magic)
+{
+	u64 ts = 0;
+
+	guard(mutex)(&tde->mtx);
+
+	if (magic)
+		tde->last_magic = *magic;
+	else
+		tde->last_magic = TDCF_BAD_END_SEQ;
+
+	tde->last_val = val;
+
+	if (tstamp && TDE_HAS_TSTAMP(tde))
+		ts = *tstamp;
+	tde->last_ts = ts;
+
+	if (tstamp)
+		*tstamp = ts;
+}
+
struct scmi_tlm_de_priv {
struct telemetry_info *ti;
void *next;
@@ -1122,6 +1192,555 @@ scmi_telemetry_resources_get(const struct scmi_protocol_handle *ph)
return ti->res_get(ti);
}
+/*
+ * Return the timestamp of a BLK_TS line, re-reading it from the SHMTI only
+ * when @magic differs from the sequence number cached with the line.
+ * A missing or unreferenced @bts triggers a warning and yields zero.
+ */
+static u64
+scmi_telemetry_blkts_read(u32 magic, struct telemetry_block_ts *bts)
+{
+	struct telemetry_line *line;
+
+	if (WARN_ON(!bts || !refcount_read(&bts->line.users)))
+		return 0;
+
+	line = &bts->line;
+	guard(mutex)(&line->mtx);
+
+	/* Note that the bts->last_rate can change ONLY on creation */
+	if (line->last_magic != magic) {
+		bts->last_ts = BLK_TS_STAMP(&line->payld->blk_tsl);
+		line->last_magic = magic;
+	}
+
+	return bts->last_ts;
+}
+
+/*
+ * Refresh timestamp and clock rate of a BLK_TS descriptor, unless the cached
+ * sequence number already matches @magic.
+ */
+static void scmi_telemetry_blkts_update(struct telemetry_info *ti, u32 magic,
+					struct telemetry_block_ts *bts)
+{
+	struct telemetry_line *line = &bts->line;
+
+	guard(mutex)(&line->mtx);
+
+	if (line->last_magic == magic)
+		return;
+
+	bts->last_ts = BLK_TS_STAMP(&line->payld->blk_tsl);
+	bts->last_rate = BLK_TS_RATE(line->payld);
+	/* BLK_TS clock rate value can change ONLY here on creation */
+	if (!bts->last_rate)
+		bts->last_rate = ti->default_blk_ts_rate;
+	line->last_magic = magic;
+}
+
+/*
+ * Drop one reference on @line; when it was the last one, remove the line from
+ * the XArray it was registered in and free @blob.
+ *
+ * @blob is the allocation to release: the callers in this file pass the
+ * descriptor that embeds @line.
+ */
+static void scmi_telemetry_line_put(struct telemetry_line *line, void *blob)
+{
+ if (refcount_dec_and_test(&line->users)) {
+ /* The guard is released before kfree(): the mutex lives in @line */
+ scoped_guard(mutex, &line->mtx)
+ xa_erase(line->xa_lines, (unsigned long)line->payld);
+ kfree(blob);
+ }
+}
+
+/* Drop the DE reference on its BLK_TS line; tde->bts must be non-NULL */
+static void scmi_telemetry_blkts_unlink(struct telemetry_de *tde)
+{
+ scmi_telemetry_line_put(&tde->bts->line, tde->bts);
+ tde->bts = NULL;
+}
+
+/* Drop the DE reference on its UUID line; tde->uuid must be non-NULL */
+static void scmi_telemetry_uuid_unlink(struct telemetry_de *tde)
+{
+ scmi_telemetry_line_put(&tde->uuid->line, tde->uuid);
+ tde->uuid = NULL;
+}
+
+/*
+ * Release the BLK_TS and UUID lines, if any, currently linked to @de: each
+ * unlink drops one reference and clears the pointer in the DE.
+ */
+static void scmi_telemetry_de_unlink(struct scmi_telemetry_de *de)
+{
+ struct telemetry_de *tde = to_tde(de);
+
+ /* Unlink all related lines triggering their deallocation */
+ if (tde->bts)
+ scmi_telemetry_blkts_unlink(tde);
+ if (tde->uuid)
+ scmi_telemetry_uuid_unlink(tde);
+}
+
+/*
+ * Look up the line registered in @xa_lines under the @payld address and, when
+ * found, take one more reference on it.
+ *
+ * Return: the referenced line, or NULL when no line is registered at @payld.
+ *
+ * NOTE(review): lookup and refcount_inc() are not serialized against
+ * scmi_telemetry_line_put(), which drops the last reference before erasing
+ * the XArray entry; if a get can run concurrently with the final put,
+ * refcount_inc_not_zero() would be needed here - TODO confirm the locking
+ * guaranteed by the callers.
+ */
+static struct telemetry_line *
+scmi_telemetry_line_get(struct xarray *xa_lines, struct payload *payld)
+{
+ struct telemetry_line *line;
+
+ line = xa_load(xa_lines, (unsigned long)payld);
+ if (!line)
+ return NULL;
+
+ refcount_inc(&line->users);
+
+ return line;
+}
+
+/*
+ * Initialize a freshly allocated line descriptor with a single user and
+ * register it in @xa_lines, keyed by the address of its @payld.
+ *
+ * Return: the value returned by xa_insert(), zero on success.
+ */
+static int
+scmi_telemetry_line_init(struct telemetry_line *line, struct xarray *xa_lines,
+ struct payload __iomem *payld)
+{
+ refcount_set(&line->users, 1);
+ line->payld = payld;
+ line->xa_lines = xa_lines;
+ mutex_init(&line->mtx);
+
+ /* The line is fully set up before being made visible to lookups */
+ return xa_insert(xa_lines, (unsigned long)payld, line, GFP_KERNEL);
+}
+
+/*
+ * Allocate a zeroed BLK_TS descriptor for the line at @payld and register it
+ * in @xa_lines, emitting a SHMTI_NEW_BLKTS trace event on success.
+ * @dev is not used by this function.
+ *
+ * Return: the new descriptor, or NULL on allocation or registration failure.
+ */
+static struct telemetry_block_ts *
+scmi_telemetry_blkts_create(struct device *dev, struct xarray *xa_lines,
+ struct payload *payld)
+{
+ struct telemetry_block_ts *bts;
+ int ret;
+
+ bts = kzalloc_obj(*bts);
+ if (!bts)
+ return NULL;
+
+ ret = scmi_telemetry_line_init(&bts->line, xa_lines, payld);
+ if (ret) {
+ kfree(bts);
+ return NULL;
+ }
+
+ trace_scmi_tlm_collect(0, (u64)payld, 0, "SHMTI_NEW_BLKTS");
+
+ return bts;
+}
+
+/*
+ * Return a referenced BLK_TS descriptor for the line at @payld, creating and
+ * registering a new one when none is already tracked in @xa_lines.
+ */
+static struct telemetry_block_ts *
+scmi_telemetry_blkts_get_or_create(struct device *dev, struct xarray *xa_lines,
+				   struct payload *payld)
+{
+	struct telemetry_line *line = scmi_telemetry_line_get(xa_lines, payld);
+
+	if (!line)
+		return scmi_telemetry_blkts_create(dev, xa_lines, payld);
+
+	return to_blkts(line);
+}
+
+/*
+ * Allocate a zeroed UUID descriptor for the line at @payld, snapshot the
+ * implementation version dwords carried by the line and register the
+ * descriptor in @xa_lines, emitting a SHMTI_NEW_UUID trace event on success.
+ * @dev is not used by this function.
+ *
+ * Return: the new descriptor, or NULL on allocation or registration failure.
+ */
+static struct telemetry_uuid *
+scmi_telemetry_uuid_create(struct device *dev, struct xarray *xa_lines,
+			   struct payload *payld)
+{
+	struct telemetry_uuid *uuid;
+	int ret;
+
+	uuid = kzalloc_obj(*uuid);
+	if (!uuid)
+		return NULL;
+
+	/*
+	 * The line lives in the shared memory area: read it with the same
+	 * ioread32() accessor used for every other TDCF field, which already
+	 * yields CPU-endian values.
+	 */
+	for (int i = 0; i < SCMI_TLM_DE_IMPL_MAX_DWORDS; i++)
+		uuid->de_impl_version[i] = _I(&payld->uuid_l.dwords[i]);
+
+	ret = scmi_telemetry_line_init(&uuid->line, xa_lines, payld);
+	if (ret) {
+		kfree(uuid);
+		return NULL;
+	}
+
+	trace_scmi_tlm_collect(0, (u64)payld, 0, "SHMTI_NEW_UUID");
+
+	return uuid;
+}
+
+/*
+ * Return a referenced UUID descriptor for the line at @payld, creating and
+ * registering a new one when none is already tracked in @xa_lines.
+ */
+static struct telemetry_uuid *
+scmi_telemetry_uuid_get_or_create(struct device *dev, struct xarray *xa_lines,
+				  struct payload *payld)
+{
+	struct telemetry_line *line = scmi_telemetry_line_get(xa_lines, payld);
+
+	if (!line)
+		return scmi_telemetry_uuid_create(dev, xa_lines, payld);
+
+	return to_uuid(line);
+}
+
+/*
+ * Process a TDCF UUID line: lines failing the UUID_INVALID() flags check are
+ * traced and ignored, otherwise the matching UUID descriptor is looked up or
+ * created and reported through @active_uuid as the currently active one.
+ * @active_uuid is left untouched on any failure; @shmti is not used here.
+ */
+static void scmi_telemetry_tdcf_uuid_parse(struct telemetry_info *ti,
+ struct payload __iomem *payld,
+ struct telemetry_shmti *shmti,
+ void **active_uuid)
+{
+ struct telemetry_uuid *uuid;
+
+ if (UUID_INVALID(payld)) {
+ trace_scmi_tlm_access(0, "UUID_INVALID", 0, 0);
+ return;
+ }
+
+ /* A UUID descriptor MUST be returned: it is found or it is created */
+ uuid = scmi_telemetry_uuid_get_or_create(ti->ph->dev, &ti->xa_lines,
+ payld);
+ if (WARN_ON(!uuid))
+ return;
+
+ *active_uuid = uuid;
+}
+
+/*
+ * Walk the TDCF lines of @shmti from the first payload up to, and excluding,
+ * @last and return the last line of type @ltype met along the way, i.e. the
+ * one nearest to @last.
+ *
+ * Return: the nearest preceding line of the requested type, NULL if none.
+ */
+static struct payload *
+scmi_telemetry_nearest_line_by_type(struct telemetry_shmti *shmti,
+				    void *last, enum tdcf_line_types ltype)
+{
+	struct tdcf __iomem *tdcf = shmti->base;
+	void *next, *found = NULL;
+
+	/* Scan from start of TDCF payloads up to @last */
+	next = tdcf->payld;
+	while (next < last) {
+		struct payload *payld = next;
+
+		if (LINE_TYPE(payld) == ltype)
+			found = next;
+
+		/*
+		 * The line length is expressed in 32-bit words while @next
+		 * is a byte cursor: convert before advancing, otherwise the
+		 * walk lands in the middle of the current line.
+		 */
+		next += LINE_LENGTH_WORDS(payld) * sizeof(u32);
+	}
+
+	return found;
+}
+
+/*
+ * Return a referenced BLK_TS descriptor for the DE line at @payld.
+ *
+ * When the caller knows the location of the BLK_TS line it is passed as
+ * @bts_payld; when NULL, the BLK_TS line nearest to and preceding @payld in
+ * @shmti is searched for instead.
+ *
+ * Return: the descriptor, or NULL when no BLK_TS line could be found or the
+ * descriptor could not be created.
+ */
+static struct telemetry_block_ts *
+scmi_telemetry_blkts_bind(struct device *dev, struct telemetry_shmti *shmti,
+ struct payload *payld, struct xarray *xa_lines,
+ struct payload *bts_payld)
+{
+ /* Trigger a manual search when no BLK_TS payload offset was provided */
+ if (!bts_payld) {
+ /* Find the BLK_TS immediately preceding this DE payld */
+ bts_payld = scmi_telemetry_nearest_line_by_type(shmti, payld,
+ TDCF_BLK_TS_LINE);
+ if (!bts_payld)
+ return NULL;
+ }
+
+ return scmi_telemetry_blkts_get_or_create(dev, xa_lines, bts_payld);
+}
+
+/**
+ * scmi_telemetry_tdcf_blkts_parse - A BLK_TS line parser
+ *
+ * @ti: A reference to the telemetry_info descriptor
+ * @payld: TDCF payld line to process
+ * @shmti: SHMTI descriptor inside which the scan is happening
+ * @active_bts: Input/output reference to keep track of the last blk_ts found
+ *
+ * Process a valid TDCF BLK_TS line and, after having looked up or created a
+ * blk_ts descriptor, update the related data and return it as the currently
+ * active blk_ts, given that it is effectively the last found during this
+ * scan.
+ *
+ * @active_bts is left untouched when the line is not spec compliant or when
+ * no descriptor could be obtained.
+ */
+static void scmi_telemetry_tdcf_blkts_parse(struct telemetry_info *ti,
+ struct payload __iomem *payld,
+ struct telemetry_shmti *shmti,
+ void **active_bts)
+{
+ struct telemetry_block_ts *bts;
+
+ /* Check for spec compliance */
+ if (BLK_TS_INVALID(payld)) {
+ trace_scmi_tlm_access(0, "BLK_TS_INVALID", 0, 0);
+ return;
+ }
+
+ /* A BLK_TS descriptor MUST be returned: it is found or it is created */
+ bts = scmi_telemetry_blkts_get_or_create(ti->ph->dev,
+ &ti->xa_lines, payld);
+ if (WARN_ON(!bts))
+ return;
+
+ /* Update the descriptor with the latest TS */
+ scmi_telemetry_blkts_update(ti, shmti->last_magic, bts);
+ *active_bts = bts;
+}
+
+/*
+ * Set up and register a DE descriptor for a Data Event discovered while
+ * scanning the SHMTI: the DE is marked as enabled and its timestamp state is
+ * derived from the flags of the line at @payld.
+ *
+ * Return: the registered descriptor, or NULL when no descriptor could be
+ * obtained or the registration failed.
+ */
+static inline struct telemetry_de *
+scmi_telemetry_tde_allocate(struct telemetry_info *ti, u32 de_id,
+ struct payload __iomem *payld)
+{
+ struct telemetry_de *tde;
+
+ tde = scmi_telemetry_tde_get(ti, de_id);
+ if (IS_ERR(tde))
+ return NULL;
+
+ tde->de.info->id = de_id;
+ tde->de.enabled = true;
+ /* A timestamp is in use if carried by the line or by a BLK_TS */
+ tde->de.tstamp_enabled = LINE_TS_VALID(payld) || USE_BLK_TS(payld);
+
+ if (scmi_telemetry_tde_register(ti, tde)) {
+ scmi_telemetry_free_tde_put(ti, tde);
+ return NULL;
+ }
+
+ /* No cached state to compare against: counters are always bumped */
+ scmi_telemetry_de_state_update(ti, ENA_STATE, NULL, true);
+ if (tde->de.tstamp_enabled)
+ scmi_telemetry_de_state_update(ti, ENA_TSTAMP, NULL, true);
+
+ return tde;
+}
+
+/*
+ * Read value and, when @tstamp is provided, timestamp of the Data line at
+ * @payld, emit a SHMTI_DE_READ trace event and refresh the DE cache tagging
+ * it with @magic.
+ *
+ * The timestamp comes from the linked BLK_TS when the line is flagged
+ * USE_BLK_TS, from the line itself when LINE_TS_VALID, and is zero otherwise.
+ * Note that the trace event is emitted before the cache update, which can
+ * still force *@tstamp to zero.
+ */
+static inline void
+scmi_telemetry_line_data_parse(struct telemetry_de *tde, u64 *val, u64 *tstamp,
+ struct payload __iomem *payld, u32 magic)
+{
+ /* Data is always valid since we are NOT handling BLK TS lines here */
+ *val = LINE_DATA_GET(&payld->l);
+ if (tstamp) {
+ if (USE_BLK_TS(payld)) {
+ /* Read out the actual BLK_TS */
+ *tstamp = scmi_telemetry_blkts_read(magic, tde->bts);
+ } else if (LINE_TS_VALID(payld)) {
+ /*
+ * Note that LINE_TS_VALID implies HAS_LINE_EXT and that
+ * the per DE line_ts_rate is advertised in the DE
+ * descriptor.
+ */
+ *tstamp = LINE_TSTAMP_GET(&payld->tsl);
+ } else {
+ *tstamp = 0;
+ }
+ }
+
+ trace_scmi_tlm_collect(tstamp ? *tstamp : 0, tde->de.info->id,
+ *val, "SHMTI_DE_READ");
+
+ scmi_telemetry_tde_cache_update(tde, *val, tstamp, &magic);
+}
+
+/*
+ * Link @bts to @tde taking a reference on behalf of the DE; the reference is
+ * dropped by scmi_telemetry_blkts_unlink().
+ */
+static inline void scmi_telemetry_bts_link(struct telemetry_de *tde,
+ struct telemetry_block_ts *bts)
+{
+ refcount_inc(&bts->line.users);
+ tde->bts = bts;
+ /* Update TS clock rate if provided by the BLK_TS */
+ if (tde->bts->last_rate)
+ tde->de.info->ts_rate = tde->bts->last_rate;
+}
+
+/*
+ * Link @uuid to @tde taking a reference on behalf of the DE; the reference is
+ * dropped by scmi_telemetry_uuid_unlink(). Each call takes a new reference,
+ * so it is meant to be called only when the DE has no UUID linked yet.
+ */
+static inline void scmi_telemetry_uuid_link(struct telemetry_de *tde,
+ struct telemetry_uuid *uuid)
+{
+ refcount_inc(&uuid->line.users);
+ tde->uuid = uuid;
+}
+
+/**
+ * scmi_telemetry_tdcf_data_parse - TDCF DataLine parsing
+ * @ti: A reference to the telemetry info descriptor
+ * @payld: Line payload to parse
+ * @shmti: A reference to the containing SHMTI area
+ * @mode: A flag to determine the behaviour of the scan
+ * @active_bts: A pointer to keep track and report any found BLK timestamp line
+ * @active_uuid: A pointer to keep track and report any found UUID line
+ *
+ * This routine takes care to:
+ *  - verify line consistency in relation to the used flags and the current
+ *    context: e.g. is there an active preceding BLK_TS line if the DataLine
+ *    sports a USE_BLKTS flag ?
+ *  - verify the related Data Event ID exists OR create a brand new DE
+ *    (depending on the @mode of operation)
+ *  - links any active BLK_TS or UUID line to the current DE, unless one is
+ *    already linked: each link holds exactly one reference, which is
+ *    released when the DE is unlinked
+ *  - read and save value/tstamp for the DE ONLY if anything has changed (by
+ *    tracking the last TDCF magic) and update related magic: this allows to
+ *    minimize future needs of single-DE reads
+ *
+ * Modes of operation.
+ *
+ * The scan behaviour depends on the chosen @mode:
+ *  - SCAN_LOOKUP: the basic scan which aims to update value associated to
+ *		   existing DEs. Any discovered DataLine that could NOT be
+ *		   matched to an existing, previously discovered, DE is
+ *		   discarded. This is the normal scan behaviour.
+ *  - SCAN_UPDATE: a more advanced scan which provides all the SCAN_LOOKUP
+ *		   features plus takes care to update the DEs location
+ *		   coordinates inside the SHMTI: note that the related DEs are
+ *		   still supposed to have been previously discovered when
+ *		   this scan runs. This is used to update location
+ *		   coordinates for DEs contained in a Group when such group
+ *		   is enabled.
+ *  - SCAN_DISCOVERY: the most advanced scan available which provides all
+ *		      the SCAN_LOOKUP features plus discovery capabilities:
+ *		      any DataLine referring to a previously unknown DE leads
+ *		      to the allocation of a new DE descriptor.
+ *		      This mode is used on the first scan at init time, ONLY
+ *		      if Telemetry was found to be already enabled at boot on
+ *		      the platform side: this helps to maximize gathered
+ *		      information when dealing with out of spec firmwares.
+ *		      Any usage of this discovery mode other than in a boot-on
+ *		      enabled scenario is discouraged since it can easily
+ *		      lead to spurious DE discoveries.
+ */
+static void scmi_telemetry_tdcf_data_parse(struct telemetry_info *ti,
+					   struct payload __iomem *payld,
+					   struct telemetry_shmti *shmti,
+					   enum scan_mode mode,
+					   void *active_bts, void *active_uuid)
+{
+	bool use_blk_ts = USE_BLK_TS(payld);
+	struct telemetry_de *tde;
+	u64 val, tstamp = 0;
+	u32 de_id;
+
+	de_id = PAYLD_ID(payld);
+	/* Discard malformed lines...a preceding BLK_TS must exist */
+	if (use_blk_ts && !active_bts) {
+		trace_scmi_tlm_access(de_id, "BAD_USE_BLK_TS", 0, 0);
+		return;
+	}
+
+	/* Is this DE ID known ? */
+	tde = scmi_telemetry_tde_lookup(ti, de_id);
+	if (!tde) {
+		if (mode != SCAN_DISCOVERY) {
+			trace_scmi_tlm_access(de_id, "DE_INVALID", 0, 0);
+			return;
+		}
+
+		/* In SCAN_DISCOVERY mode we allocate new DEs for unknown IDs */
+		tde = scmi_telemetry_tde_allocate(ti, de_id, payld);
+		if (!tde)
+			return;
+	}
+
+	/* Update DE location refs if requested: normally done only on enable */
+	if (mode >= SCAN_UPDATE) {
+		tde->base = shmti->base;
+		tde->eplg = SHMTI_EPLG(shmti);
+		tde->offset = (void *)payld - (void *)shmti->base;
+
+		dev_dbg(ti->ph->dev,
+			"TDCF-updated DE_ID:0x%08X - shmti:%pK offset:%u\n",
+			tde->de.info->id, tde->base, tde->offset);
+	}
+
+	/* Has any value/tstamp really changed ?*/
+	if (scmi_telemetry_tde_cache_unchanged(tde, shmti->last_magic))
+		return;
+
+	/* Link the related BTS when needed, it's unlinked on disable */
+	if (use_blk_ts && !tde->bts)
+		scmi_telemetry_bts_link(tde, active_bts);
+
+	/*
+	 * Link the active UUID when existent and not yet linked, it's unlinked
+	 * on disable: linking again on every updated scan would take one more
+	 * reference each time while the unlink drops only one, leaking the
+	 * UUID line.
+	 */
+	if (active_uuid && !tde->uuid)
+		scmi_telemetry_uuid_link(tde, active_uuid);
+
+	/* Parse data words */
+	scmi_telemetry_line_data_parse(tde, &val, &tstamp, payld,
+				       shmti->last_magic);
+}
+
+/*
+ * Parse one TDCF line dispatching on its type: Data lines are handed the
+ * currently active BLK_TS/UUID, while BLK_TS and UUID lines can update
+ * @active_bts and @active_uuid for the lines that follow. Lines flagged as
+ * invalid or of unknown type are only traced.
+ *
+ * Return: the length of the line in qwords, whatever its type or validity,
+ * so that the caller can always step to the next line.
+ */
+static int scmi_telemetry_tdcf_line_parse(struct telemetry_info *ti,
+ struct payload __iomem *payld,
+ struct telemetry_shmti *shmti,
+ enum scan_mode mode,
+ void **active_bts, void **active_uuid)
+{
+ int used_qwords;
+
+ used_qwords = LINE_LENGTH_QWORDS(payld);
+ /* Invalid lines are not an error, could simply be disabled DEs */
+ if (DATA_INVALID(payld)) {
+ trace_scmi_tlm_access(PAYLD_ID(payld), "TDCF_INVALID", 0, 0);
+ return used_qwords;
+ }
+
+ switch (LINE_TYPE(payld)) {
+ case TDCF_DATA_LINE:
+ scmi_telemetry_tdcf_data_parse(ti, payld, shmti, mode,
+ *active_bts, *active_uuid);
+ break;
+ case TDCF_BLK_TS_LINE:
+ scmi_telemetry_tdcf_blkts_parse(ti, payld, shmti, active_bts);
+ break;
+ case TDCF_UUID_LINE:
+ scmi_telemetry_tdcf_uuid_parse(ti, payld, shmti, active_uuid);
+ break;
+ default:
+ trace_scmi_tlm_access(PAYLD_ID(payld), "TDCF_UNKNOWN", 0, 0);
+ break;
+ }
+
+ return used_qwords;
+}
+
+/**
+ * scmi_telemetry_shmti_scan - Full SHMTI scan
+ * @ti: A reference to the telemetry info descriptor
+ * @shmti_id: ID of the SHMTI area that has to be scanned
+ * @mode: A flag to determine the behaviour of the scan
+ *
+ * Return: 0 on Success, -ENODEV when the SHMTI has no base address, -EINVAL
+ *	   when a line does not fit in the remaining advertised qwords,
+ *	   -EPROTO when no consistent snapshot could be obtained within
+ *	   SCMI_TLM_TDCF_MAX_RETRIES attempts.
+ */
+static int scmi_telemetry_shmti_scan(struct telemetry_info *ti,
+				     unsigned int shmti_id, enum scan_mode mode)
+{
+	struct telemetry_shmti *shmti = &ti->shmti[shmti_id];
+	struct tdcf __iomem *tdcf = shmti->base;
+	int retries = SCMI_TLM_TDCF_MAX_RETRIES;
+	u32 startm = 0, endm = TDCF_BAD_END_SEQ;
+	bool consistent = false;
+
+	if (!tdcf)
+		return -ENODEV;
+
+	do {
+		void *active_bts = NULL, *active_uuid = NULL;
+		unsigned int qwords;
+		void __iomem *next;
+
+		/* A linearly growing backoff between retries: 0, 1, 2.. ms */
+		fsleep((SCMI_TLM_TDCF_MAX_RETRIES - retries) * 1000);
+
+		/*
+		 * Note that during a full SHMTI scan the magic seq numbers are
+		 * checked only at the start and at the end of the scan, NOT
+		 * between each parsed line and this has these consequences:
+		 *  - TDCF magic numbers accesses are reduced to 2 reads
+		 *  - the set of values obtained from a full scan belong all
+		 *    to the same platform update (same magic number)
+		 *  - a SHMTI full scan is an all or nothing operation: when
+		 *    a potentially corrupted read is detected along the way
+		 *    (MSEQ_MISMATCH) another full scan is triggered.
+		 */
+		startm = TDCF_START_SEQ_GET(tdcf);
+		if (IS_BAD_START_SEQ(startm)) {
+			trace_scmi_tlm_access(0, "MSEQ_BADSTART", startm, 0);
+			/*
+			 * Nothing was scanned: the retry decision relies on
+			 * @consistent and not on comparing against a stale
+			 * @endm, which could match by accident.
+			 */
+			continue;
+		}
+
+		/* On a BAD_SEQ this will be updated on the next attempt */
+		shmti->last_magic = startm;
+
+		qwords = QWORDS(tdcf);
+		next = tdcf->payld;
+		while (qwords) {
+			struct payload __iomem *payld = next;
+			unsigned int used_qwords = LINE_LENGTH_QWORDS(payld);
+
+			/* Never parse a line overrunning the payload area */
+			if (qwords < used_qwords) {
+				trace_scmi_tlm_access(PAYLD_ID(next),
+						      "BAD_QWORDS", startm, 0);
+				return -EINVAL;
+			}
+
+			scmi_telemetry_tdcf_line_parse(ti, payld, shmti, mode,
+						       &active_bts,
+						       &active_uuid);
+
+			next += used_qwords * 8;
+			qwords -= used_qwords;
+		}
+
+		endm = TDCF_END_SEQ_GET(SHMTI_EPLG(shmti));
+		consistent = startm == endm;
+		if (!consistent)
+			trace_scmi_tlm_access(0, "MSEQ_MISMATCH", startm, endm);
+	} while (!consistent && --retries);
+
+	if (!consistent) {
+		trace_scmi_tlm_access(0, "TDCF_SCAN_FAIL", startm, endm);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
static const struct scmi_telemetry_proto_ops tlm_proto_ops = {
.info_get = scmi_telemetry_info_get,
.de_lookup = scmi_telemetry_de_lookup,
@@ -1211,6 +1830,13 @@ static void scmi_telemetry_resources_free(void *arg)
struct telemetry_info *ti = arg;
struct scmi_telemetry_res_info *rinfo = ACCESS_PRIVATE(ti, rinfo);
+ /*
+ * Unlinking all the BLK_TS/UUID lines related to a DE triggers also
+ * the deallocation of such lines when the embedded refcount hits zero.
+ */
+ for (int i = 0; i < rinfo->num_des; i++)
+ scmi_telemetry_de_unlink(rinfo->des[i]);
+
kfree(ti->tdes);
kfree(rinfo->des);
kfree(rinfo->dei_store);
@@ -1316,6 +1942,9 @@ static int scmi_telemetry_instance_init(struct telemetry_info *ti)
return ret;
xa_init(&ti->xa_des);
+ xa_init(&ti->xa_lines);
+ atomic_set(&ti->des_enabled[ENA_STATE], 0);
+ atomic_set(&ti->des_enabled[ENA_TSTAMP], 0);
/* Setup resources lazy initialization */
atomic_set(&ti->rinfo_initializing, 0);
init_completion(&ti->rinfo_initdone);
--
2.54.0
^ permalink raw reply related
* [PATCH v4 08/31] firmware: arm_scmi: Add basic Telemetry support
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>
Add SCMIv4.0 Telemetry basic support to enable initialization and resources
enumeration: add all the telemetry messages definitions and parsing logic
but only a few simple state gathering protocol operations.
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
v3 -->v4
- bail-out on FW_BUG errors
- count timestamp capable DE to track enables
v2 --> v3
- split from monolithic Telemetry patch
- fix checkpatch macros complaints
- fix ACCESS_PRIVATE usage
- add a few comments on allocation/enumeration lifetime
- use interval.num_intervals
- removed needless cleanup handler usage
- simply return from scmi_telemetry_de_lookup()
- fixed composing_des name length to 08X
---
drivers/firmware/arm_scmi/Makefile | 2 +-
drivers/firmware/arm_scmi/driver.c | 2 +
drivers/firmware/arm_scmi/protocols.h | 1 +
drivers/firmware/arm_scmi/telemetry.c | 1380 +++++++++++++++++++++++++
include/linux/scmi_protocol.h | 135 ++-
5 files changed, 1518 insertions(+), 2 deletions(-)
create mode 100644 drivers/firmware/arm_scmi/telemetry.c
diff --git a/drivers/firmware/arm_scmi/Makefile b/drivers/firmware/arm_scmi/Makefile
index 780cd62b2f78..fe55b7aa0707 100644
--- a/drivers/firmware/arm_scmi/Makefile
+++ b/drivers/firmware/arm_scmi/Makefile
@@ -8,7 +8,7 @@ scmi-driver-$(CONFIG_ARM_SCMI_RAW_MODE_SUPPORT) += raw_mode.o
scmi-transport-$(CONFIG_ARM_SCMI_HAVE_SHMEM) = shmem.o
scmi-transport-$(CONFIG_ARM_SCMI_HAVE_MSG) += msg.o
scmi-protocols-y := base.o clock.o perf.o power.o reset.o sensors.o system.o voltage.o powercap.o
-scmi-protocols-y += pinctrl.o
+scmi-protocols-y += pinctrl.o telemetry.o
scmi-module-objs := $(scmi-driver-y) $(scmi-protocols-y) $(scmi-transport-y)
obj-$(CONFIG_ARM_SCMI_PROTOCOL) += transports/
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 9c1ab9925b1d..dd9446b54858 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -3512,6 +3512,7 @@ static int __init scmi_driver_init(void)
scmi_system_register();
scmi_powercap_register();
scmi_pinctrl_register();
+ scmi_telemetry_register();
return platform_driver_register(&scmi_driver);
}
@@ -3530,6 +3531,7 @@ static void __exit scmi_driver_exit(void)
scmi_system_unregister();
scmi_powercap_unregister();
scmi_pinctrl_unregister();
+ scmi_telemetry_unregister();
platform_driver_unregister(&scmi_driver);
diff --git a/drivers/firmware/arm_scmi/protocols.h b/drivers/firmware/arm_scmi/protocols.h
index 3e7b6f8aa72c..3250d981664b 100644
--- a/drivers/firmware/arm_scmi/protocols.h
+++ b/drivers/firmware/arm_scmi/protocols.h
@@ -386,5 +386,6 @@ DECLARE_SCMI_REGISTER_UNREGISTER(sensors);
DECLARE_SCMI_REGISTER_UNREGISTER(voltage);
DECLARE_SCMI_REGISTER_UNREGISTER(system);
DECLARE_SCMI_REGISTER_UNREGISTER(powercap);
+DECLARE_SCMI_REGISTER_UNREGISTER(telemetry);
#endif /* _SCMI_PROTOCOLS_H */
diff --git a/drivers/firmware/arm_scmi/telemetry.c b/drivers/firmware/arm_scmi/telemetry.c
new file mode 100644
index 000000000000..a5c61ec37065
--- /dev/null
+++ b/drivers/firmware/arm_scmi/telemetry.c
@@ -0,0 +1,1380 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * System Control and Management Interface (SCMI) Telemetry Protocol
+ *
+ * Copyright (C) 2026 ARM Ltd.
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitfield.h>
+#include <linux/device.h>
+#include <linux/compiler_types.h>
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/limits.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/refcount.h>
+#include <linux/slab.h>
+#include <linux/sprintf.h>
+#include <linux/string.h>
+#include <linux/xarray.h>
+
+#include "protocols.h"
+#include "notify.h"
+
+#include <trace/events/scmi.h>
+
+/* Updated only after ALL the mandatory features for that version are merged */
+#define SCMI_PROTOCOL_SUPPORTED_VERSION 0x10000
+
+#define SCMI_TLM_TDCF_MAX_RETRIES 5
+
+/* Telemetry protocol command identifiers */
+enum scmi_telemetry_protocol_cmd {
+ TELEMETRY_LIST_SHMTI = 0x3,
+ TELEMETRY_DE_DESCRIPTION = 0x4,
+ TELEMETRY_LIST_UPDATE_INTERVALS = 0x5,
+ TELEMETRY_DE_CONFIGURE = 0x6,
+ TELEMETRY_DE_ENABLED_LIST = 0x7,
+ TELEMETRY_CONFIG_SET = 0x8,
+ /* Deliberately the same identifier as TELEMETRY_CONFIG_SET */
+ TELEMETRY_READING_COMPLETE = TELEMETRY_CONFIG_SET,
+ TELEMETRY_CONFIG_GET = 0x9,
+ TELEMETRY_RESET = 0xA,
+};
+
+/*
+ * Response layout of the Telemetry protocol attributes message.
+ *
+ * The SUPPORTS_*() helpers test bits of the value passed in as-is: they
+ * presumably expect @attributes already converted with le32_to_cpu() -
+ * TODO confirm against the callers.
+ */
+struct scmi_msg_resp_telemetry_protocol_attributes {
+ __le32 de_num;
+ __le32 groups_num;
+ __le32 de_implementation_rev_dword[SCMI_TLM_DE_IMPL_MAX_DWORDS];
+ __le32 attributes;
+#define SUPPORTS_SINGLE_READ(x) ((x) & BIT(31))
+/* NOTE(review): 'CONTINUOS' is a misspelling of 'CONTINUOUS' */
+#define SUPPORTS_CONTINUOS_UPDATE(x) ((x) & BIT(30))
+#define SUPPORTS_PER_GROUP_CONFIG(x) ((x) & BIT(18))
+#define SUPPORTS_RESET(x) ((x) & BIT(17))
+#define SUPPORTS_FC(x) ((x) & BIT(16))
+ __le32 default_blk_ts_rate;
+};
+
+struct scmi_telemetry_update_notify_payld {
+ __le32 agent_id;
+ __le32 status;
+ __le32 num_dwords;
+ __le32 array[] __counted_by(num_dwords);
+};
+
+struct scmi_shmti_desc {
+ __le32 id;
+ __le32 addr_low;
+ __le32 addr_high;
+ __le32 length;
+ __le32 flags;
+};
+
+struct scmi_msg_resp_telemetry_shmti_list {
+ __le32 num_shmti;
+ struct scmi_shmti_desc desc[] __counted_by(num_shmti);
+};
+
+struct de_desc_fc {
+ __le32 addr_low;
+ __le32 addr_high;
+ __le32 size;
+};
+
+/*
+ * Wire layout of one Data Event descriptor.
+ *
+ * All the accessors take a pointer to the descriptor and convert the
+ * little-endian attribute words to CPU endianness before extracting any
+ * bit or field, so that they behave the same on big-endian kernels.
+ */
+struct scmi_de_desc {
+	__le32 id;
+	__le32 grp_id;
+	__le32 data_sz;
+	__le32 attr_1;
+#define IS_NAME_SUPPORTED(d)	(le32_to_cpu((d)->attr_1) & BIT(31))
+#define IS_FC_SUPPORTED(d)	(le32_to_cpu((d)->attr_1) & BIT(30))
+#define GET_DE_TYPE(d)		(le32_get_bits((d)->attr_1, GENMASK(29, 22)))
+#define IS_PERSISTENT(d)	(le32_to_cpu((d)->attr_1) & BIT(21))
+/* The unit exponent is an 8-bit two's complement value: sign-extend it */
+#define GET_DE_UNIT_EXP(d)						\
+	({								\
+		__u32 __signed_exp =					\
+			le32_get_bits((d)->attr_1, GENMASK(20, 13));	\
+									\
+		sign_extend32(__signed_exp, 7);				\
+	})
+#define GET_DE_UNIT(d)		(le32_get_bits((d)->attr_1, GENMASK(12, 5)))
+#define TSTAMP_SUPPORT(d)	(le32_get_bits((d)->attr_1, GENMASK(1, 0)))
+	__le32 attr_2;
+#define GET_DE_INSTA_ID(d)	(le32_get_bits((d)->attr_2, GENMASK(31, 24)))
+#define GET_COMPO_INSTA_ID(d)	(le32_get_bits((d)->attr_2, GENMASK(23, 8)))
+#define GET_COMPO_TYPE(d)	(le32_get_bits((d)->attr_2, GENMASK(7, 0)))
+	__le32 reserved;
+};
+
+struct scmi_msg_resp_telemetry_de_description {
+ __le32 num_desc;
+ struct scmi_de_desc desc[] __counted_by(num_desc);
+};
+
+struct scmi_msg_telemetry_update_intervals {
+ __le32 index;
+ __le32 group_identifier;
+#define ALL_DES_NO_GROUP 0x0
+#define SPECIFIC_GROUP_DES 0x1
+#define ALL_DES_ANY_GROUP 0x2
+ __le32 flags;
+};
+
+struct scmi_msg_resp_telemetry_update_intervals {
+ __le32 flags;
+#define INTERVALS_DISCRETE(x) (!((x) & BIT(12)))
+ __le32 intervals[];
+};
+
+struct scmi_msg_telemetry_de_enabled_list {
+ __le32 index;
+ __le32 flags;
+};
+
+struct scmi_enabled_de_desc {
+ __le32 id;
+ __le32 mode;
+};
+
+struct scmi_msg_resp_telemetry_de_enabled_list {
+ __le32 flags;
+ struct scmi_enabled_de_desc entry[];
+};
+
+struct scmi_msg_telemetry_de_configure {
+ __le32 id;
+ __le32 flags;
+#define DE_ENABLE_NO_TSTAMP BIT(0)
+#define DE_ENABLE_WTH_TSTAMP BIT(1)
+#define DE_DISABLE_ALL BIT(2)
+#define GROUP_SELECTOR BIT(3)
+#define EVENT_DE 0
+#define EVENT_GROUP 1
+#define DE_DISABLE_ONE 0x0
+};
+
+struct scmi_msg_resp_telemetry_de_configure {
+ __le32 shmti_id;
+#define IS_SHMTI_ID_VALID(x) ((x) != 0xFFFFFFFF)
+ __le32 shmti_de_offset;
+ __le32 blk_ts_offset;
+};
+
+struct scmi_msg_telemetry_config_set {
+ __le32 grp_id;
+ __le32 control;
+#define TELEMETRY_ENABLE (BIT(0))
+
+#define TELEMETRY_MODE_SET(x) (FIELD_PREP(GENMASK(4, 1), (x)))
+#define TLM_ONDEMAND (0)
+#define TLM_NOTIFS (1)
+#define TLM_SINGLE (2)
+#define TELEMETRY_MODE_ONDEMAND TELEMETRY_MODE_SET(TLM_ONDEMAND)
+#define TELEMETRY_MODE_NOTIFS TELEMETRY_MODE_SET(TLM_NOTIFS)
+#define TELEMETRY_MODE_SINGLE TELEMETRY_MODE_SET(TLM_SINGLE)
+
+#define TLM_ORPHANS (0)
+#define TLM_GROUP (1)
+#define TLM_ALL (2)
+#define TELEMETRY_SET_SELECTOR(x) (FIELD_PREP(GENMASK(8, 5), (x)))
+#define TELEMETRY_SET_SELECTOR_ORPHANS TELEMETRY_SET_SELECTOR(TLM_ORPHANS)
+#define TELEMETRY_SET_SELECTOR_GROUP TELEMETRY_SET_SELECTOR(TLM_GROUP)
+#define TELEMETRY_SET_SELECTOR_ALL TELEMETRY_SET_SELECTOR(TLM_ALL)
+ __le32 sampling_rate;
+};
+
+struct scmi_msg_resp_telemetry_reading_complete {
+ __le32 num_dwords;
+ __le32 dwords[] __counted_by(num_dwords);
+};
+
+struct scmi_msg_telemetry_config_get {
+ __le32 grp_id;
+ __le32 flags;
+#define TELEMETRY_GET_SELECTOR(x) (FIELD_PREP(GENMASK(3, 0), (x)))
+#define TELEMETRY_GET_SELECTOR_ORPHANS TELEMETRY_GET_SELECTOR(TLM_ORPHANS)
+#define TELEMETRY_GET_SELECTOR_GROUP TELEMETRY_GET_SELECTOR(TLM_GROUP)
+#define TELEMETRY_GET_SELECTOR_ALL TELEMETRY_GET_SELECTOR(TLM_ALL)
+};
+
+/* Response layout of the TELEMETRY_CONFIG_GET message */
+struct scmi_msg_resp_telemetry_config_get {
+	__le32 control;
+/*
+ * Extract the telemetry mode, bits [4:1], from a CPU-endian control word.
+ * FIELD_GET() needs both the mask and the value to extract from.
+ */
+#define TELEMETRY_MODE_GET(x)	(FIELD_GET(GENMASK(4, 1), (x)))
+	__le32 sampling_rate;
+};
+
+/* TDCF */
+
+/* Read a 32-bit word from the shared memory area with ioread32() */
+#define _I(__a) (ioread32((void __iomem *)(__a)))
+
+/* Build a 64-bit value from its high and low 32-bit halves */
+#define TO_CPU_64(h, l) ((((u64)(h)) << 32) | (l))
+
+/*
+ * Behaviour of a SHMTI scan: selects which information is gathered and which
+ * Telemetry items can be updated. The values are ordered by increasing
+ * capabilities.
+ */
+enum scan_mode {
+ SCAN_LOOKUP, /* Update only value/tstamp */
+ SCAN_UPDATE, /* Update also location offset */
+ SCAN_DISCOVERY /* Update xa_des: allows for new DEs to be discovered */
+};
+
+struct fc_line {
+ u32 data_low;
+ u32 data_high;
+};
+
+struct fc_tsline {
+ u32 data_low;
+ u32 data_high;
+ u32 ts_low;
+ u32 ts_high;
+};
+
+struct line {
+ u32 data_low;
+ u32 data_high;
+};
+
+struct blk_tsline {
+ u32 ts_low;
+ u32 ts_high;
+};
+
+struct tsline {
+ u32 data_low;
+ u32 data_high;
+ u32 ts_low;
+ u32 ts_high;
+};
+
+struct uuid_line {
+ u32 dwords[SCMI_TLM_DE_IMPL_MAX_DWORDS];
+};
+
+enum tdcf_line_types {
+ TDCF_DATA_LINE,
+ TDCF_BLK_TS_LINE,
+ TDCF_UUID_LINE,
+};
+
+/*
+ * A generic TDCF line as laid out in the shared memory area: a metadata
+ * word, an ID word and a type dependent body.
+ *
+ * All the accessors read the metadata word with ioread32() (via _I), which
+ * already returns a CPU-endian value: plain bitfield helpers are used on the
+ * result, without any further little-endian conversion.
+ */
+struct payload {
+	u32 meta;
+#define LINE_TYPE(x)	((u32)FIELD_GET(GENMASK(7, 4), _I(&((x)->meta))))
+#define IS_DATA_LINE(x)		(LINE_TYPE(x) == TDCF_DATA_LINE)
+#define IS_BLK_TS_LINE(x)	(LINE_TYPE(x) == TDCF_BLK_TS_LINE)
+#define IS_UUID_LINE(x)		(LINE_TYPE(x) == TDCF_UUID_LINE)
+#define USE_BLK_TS(x)		(_I(&((x)->meta)) & BIT(3))
+#define HAS_LINE_EXT(x)		(_I(&((x)->meta)) & BIT(2))
+#define LINE_TS_VALID(x)	(_I(&((x)->meta)) & BIT(1))
+#define DATA_INVALID(x)		(_I(&((x)->meta)) & BIT(0))
+/* A BLK_TS line must not carry any of the per-line flags */
+#define BLK_TS_INVALID(p)						\
+({									\
+	typeof(p) _p = (p);						\
+	bool invalid;							\
+									\
+	invalid = LINE_TS_VALID(_p) || HAS_LINE_EXT(_p) ||		\
+		  USE_BLK_TS(_p) || DATA_INVALID(_p);			\
+	invalid;							\
+})
+
+/* A UUID line must be an extended line with no other flag set */
+#define UUID_INVALID(p)							\
+({									\
+	typeof(p) _p = (p);						\
+	bool invalid;							\
+									\
+	invalid = LINE_TS_VALID(_p) || USE_BLK_TS(_p) ||		\
+		  DATA_INVALID(_p) || !HAS_LINE_EXT(_p);		\
+	invalid;							\
+})
+	u32 id;
+	union {
+		struct line l;
+		struct tsline tsl;
+		struct blk_tsline blk_tsl;
+		struct uuid_line uuid_l;
+	};
+};
+
+/* Read the identifier word of a line through _I() */
+#define PAYLD_ID(x) (_I(&(((struct payload *)(x))->id)))
+
+/* Size in 32bit words of a plain line: meta + id + struct line */
+#define LINE_DATA_PAYLD_WORDS \
+ ((sizeof(u32) + sizeof(u32) + sizeof(struct line)) / sizeof(u32))
+/* Size in 32bit words of an extended line: meta + id + struct tsline */
+#define EXT_LINE_DATA_PAYLD_WORDS \
+ ((sizeof(u32) + sizeof(u32) + sizeof(struct tsline)) / sizeof(u32))
+
+/* Pick the line size in 32bit words depending on the extension flag */
+#define LINE_LENGTH_WORDS(x) \
+ (HAS_LINE_EXT((x)) ? EXT_LINE_DATA_PAYLD_WORDS : LINE_DATA_PAYLD_WORDS)
+
+/* Same size expressed in 64bit words */
+#define LINE_LENGTH_QWORDS(x) ((LINE_LENGTH_WORDS(x)) / 2)
+
+/* TDCF prologue: start signature, start sequence, length and metadata. */
+struct prlg {
+	u32 sign_start;
+#define SIGNATURE_START	0x5442474E  /* TBGN */
+	u32 match_start;
+	u32 num_qwords;
+	u32 hdr_meta_1;
+/*
+ * Revision, bits [7:0] of hdr_meta_1: @x is a pointer to the prologue.
+ * Read through _I() like every other TDCF accessor, since the prologue
+ * lives in the shared area and _I() already returns a CPU-endian value.
+ */
+#define TDCF_REVISION_GET(x)	(FIELD_GET(GENMASK(7, 0), _I(&((x)->hdr_meta_1))))
+};
+
+/* TDCF epilogue: end sequence followed by the end signature */
+struct eplg {
+ u32 match_end;
+ u32 sign_end;
+#define SIGNATURE_END 0x54454E44 /* TEND */
+};
+
+/* Size in bytes of the epilogue */
+#define TDCF_EPLG_SZ (sizeof(struct eplg))
+
+/* A TDCF area: the prologue followed by a variably sized payload */
+struct tdcf {
+ struct prlg prlg;
+ unsigned char payld[];
+};
+
+/* Read the num_qwords field of the prologue */
+#define QWORDS(_t) (_I(&(_t)->prlg.num_qwords))
+
+/* Smallest acceptable area: prologue plus epilogue, with an empty payload */
+#define SHMTI_MIN_SIZE (sizeof(struct tdcf) + TDCF_EPLG_SZ)
+
+/* Prologue accessors: @x is a pointer to the struct tdcf */
+#define TDCF_START_SIGNATURE(x) (_I(&((x)->prlg.sign_start)))
+#define TDCF_START_SEQ_GET(x) (_I(&((x)->prlg.match_start)))
+/* True when the lowest bit of the given start sequence value is set */
+#define IS_BAD_START_SEQ(s) ((s) & 0x1)
+
+/* Epilogue accessors: @e is a pointer to the struct eplg */
+#define TDCF_END_SEQ_GET(e) (_I(&((e)->match_end)))
+#define TDCF_END_SIGNATURE(e) (_I(&((e)->sign_end)))
+#define TDCF_BAD_END_SEQ GENMASK(31, 0)
+
+/*
+ * Descriptor of one SHMTI area: @id, @flags, @base and @len are filled by
+ * iter_shmti_process_response(), where @base is the ioremapped address and
+ * @len the mapped length in bytes.
+ */
+struct telemetry_shmti {
+ int id;
+ u32 flags;
+ void __iomem *base;
+ u32 len;
+ u32 last_magic;
+};
+
+/* Locate the epilogue, which sits in the last TDCF_EPLG_SZ bytes of the area */
+#define SHMTI_EPLG(s) \
+ ({ \
+ struct telemetry_shmti *_s = (s); \
+ struct eplg *_eplg; \
+ \
+ _eplg = _s->base + _s->len - TDCF_EPLG_SZ; \
+ (_eplg); \
+ })
+
+/* Common bookkeeping for a tracked line: refcount, payload pointer and lock */
+struct telemetry_line {
+ refcount_t users;
+ u32 last_magic;
+ struct payload __iomem *payld;
+ /* Protect line accesses */
+ struct mutex mtx;
+};
+
+/* A tracked block timestamp line with its last seen timestamp and rate */
+struct telemetry_block_ts {
+ u64 last_ts;
+ u32 last_rate;
+ struct telemetry_line line;
+};
+
+/* Map an embedded telemetry_line back to its telemetry_block_ts container */
+#define to_blkts(l) container_of(l, struct telemetry_block_ts, line)
+
+/* A tracked UUID line with a copy of the DE implementation version words */
+struct telemetry_uuid {
+ u32 de_impl_version[SCMI_TLM_DE_IMPL_MAX_DWORDS];
+ struct telemetry_line line;
+};
+
+/* Map an embedded telemetry_line back to its telemetry_uuid container */
+#define to_uuid(l) container_of(l, struct telemetry_uuid, line)
+
+/* Kind of timestamp associated with a DE; TSTAMP_NONE is zero */
+enum timestamps {
+ TSTAMP_NONE,
+ TSTAMP_LINE,
+ TSTAMP_BLK
+};
+
+/*
+ * Internal descriptor wrapping the public scmi_telemetry_de (@de).
+ * @enumerated is set once the DE has been registered while parsing the DE
+ * descriptors; @base, @offset and @fc_size are set up from the FastChannel
+ * section of the descriptor when present; @item links the descriptor into
+ * either the free pool or the FastChannels list of telemetry_info.
+ */
+struct telemetry_de {
+ enum timestamps ts_type;
+ u32 ts_rate;
+ bool enumerated;
+ bool cached_msg;
+ void __iomem *base;
+ struct eplg __iomem *eplg;
+ u32 offset;
+ /* NOTE THAT DE data_sz is registered in scmi_telemetry_de */
+ u32 fc_size;
+ /* Protect last_val/ts/magic accesses */
+ struct mutex mtx;
+ u64 last_val;
+ u64 last_ts;
+ u32 last_magic;
+ struct list_head item;
+ struct telemetry_block_ts *bts;
+ struct telemetry_uuid *uuid;
+ struct scmi_telemetry_de de;
+};
+
+/* Map an embedded scmi_telemetry_de back to its telemetry_de container */
+#define to_tde(d) container_of(d, struct telemetry_de, de)
+
+#define DE_ENABLED_WITH_TSTAMP 2
+
+/*
+ * Per-instance telemetry state.
+ * @num_shmti and @default_blk_ts_rate come from the protocol attributes;
+ * @num_des_tstamp counts the DEs parsed with timestamp support;
+ * @xa_des maps a DE ID to its scmi_telemetry_de; @free_des is the pool of
+ * unused descriptors and @fcs_des the list of DEs with a FastChannel;
+ * @res_get is the resources accessor, swapped after the first enumeration.
+ */
+struct telemetry_info {
+ bool streaming_mode;
+ unsigned int num_shmti;
+ unsigned int num_des_tstamp;
+ unsigned int default_blk_ts_rate;
+ const struct scmi_protocol_handle *ph;
+ struct telemetry_shmti *shmti;
+ struct telemetry_de *tdes;
+ struct scmi_telemetry_group *grps;
+ struct xarray xa_des;
+ /* Mutex to protect access to @free_des */
+ struct mutex free_mtx;
+ struct list_head free_des;
+ struct list_head fcs_des;
+ struct scmi_telemetry_info info;
+ atomic_t rinfo_initializing;
+ struct completion rinfo_initdone;
+ struct scmi_telemetry_res_info __private *rinfo;
+ struct scmi_telemetry_res_info *(*res_get)(struct telemetry_info *ti);
+};
+
+/* Plain accessor installed in ti->res_get once enumeration has been run */
+static struct scmi_telemetry_res_info *
+__scmi_telemetry_resources_get(struct telemetry_info *ti);
+
+/*
+ * Detach and return the first descriptor of the free DEs pool, holding
+ * ti->free_mtx for the whole operation.
+ *
+ * Return: the detached descriptor, or NULL when the pool is empty.
+ */
+static struct telemetry_de *
+scmi_telemetry_free_tde_get(struct telemetry_info *ti)
+{
+	struct telemetry_de *first;
+
+	guard(mutex)(&ti->free_mtx);
+
+	first = list_first_entry_or_null(&ti->free_des, struct telemetry_de,
+					 item);
+	if (first)
+		list_del(&first->item);
+
+	return first;
+}
+
+/* Queue a descriptor at the tail of the free DEs pool, under ti->free_mtx. */
+static void scmi_telemetry_free_tde_put(struct telemetry_info *ti,
+					struct telemetry_de *tde)
+{
+	struct list_head *pool = &ti->free_des;
+
+	guard(mutex)(&ti->free_mtx);
+
+	list_add_tail(&tde->item, pool);
+}
+
+/*
+ * Find the internal descriptor registered in ti->xa_des for the given DE ID.
+ *
+ * Return: the enclosing telemetry_de, or NULL when the ID is not registered.
+ */
+static struct telemetry_de *scmi_telemetry_tde_lookup(struct telemetry_info *ti,
+						      unsigned int de_id)
+{
+	struct scmi_telemetry_de *found = xa_load(&ti->xa_des, de_id);
+
+	return found ? to_tde(found) : NULL;
+}
+
+/*
+ * Obtain an unused descriptor from the free DEs pool for the given DE ID.
+ *
+ * Return: the descriptor on success, ERR_PTR(-ENOSPC) when the pool is empty.
+ */
+static struct telemetry_de *scmi_telemetry_tde_get(struct telemetry_info *ti,
+						   unsigned int de_id)
+{
+	/*
+	 * Plain automatic variable: a static one would be shared, unlocked,
+	 * by all the callers and all the instances.
+	 */
+	struct telemetry_de *tde;
+
+	/* Pick a new tde */
+	tde = scmi_telemetry_free_tde_get(ti);
+	if (!tde) {
+		dev_err(ti->ph->dev, "Cannot allocate DE for ID:0x%08X\n", de_id);
+		return ERR_PTR(-ENOSPC);
+	}
+
+	return tde;
+}
+
+/*
+ * Make a descriptor reachable both by ID, through ti->xa_des, and by
+ * position, through the rinfo->des array.
+ *
+ * Return: 0 on success, -ENOSPC when all the advertised DEs have already
+ * been registered, or the xa_insert() error otherwise.
+ */
+static int scmi_telemetry_tde_register(struct telemetry_info *ti,
+				       struct telemetry_de *tde)
+{
+	struct scmi_telemetry_res_info *rinfo = ACCESS_PRIVATE(ti, rinfo);
+	int ret = -ENOSPC;
+
+	/* Index by de_id only while there is still room for a new DE ... */
+	if (rinfo->num_des < ti->info.base.num_des)
+		ret = xa_insert(&ti->xa_des, tde->de.info->id, &tde->de,
+				GFP_KERNEL);
+	if (ret) {
+		dev_err(ti->ph->dev, "Cannot register DE for ID:0x%08X\n",
+			tde->de.info->id);
+		return ret;
+	}
+
+	/* ... then track it in the general array too */
+	rinfo->des[rinfo->num_des++] = &tde->de;
+
+	return 0;
+}
+
+/*
+ * Iterator private data for DE descriptors parsing: the telemetry instance
+ * and a cursor (@next) pointing at the next descriptor still to be parsed.
+ */
+struct scmi_tlm_de_priv {
+ struct telemetry_info *ti;
+ void *next;
+};
+
+/*
+ * Issue PROTOCOL_ATTRIBUTES and, on success, record into @ti the number of
+ * DEs and groups, the DE implementation version, the supported features,
+ * the number of SHMTIs and the default block timestamp rate.
+ *
+ * Return: 0 on success, the transfer error otherwise.
+ */
+static int
+scmi_telemetry_protocol_attributes_get(struct telemetry_info *ti)
+{
+	struct scmi_msg_resp_telemetry_protocol_attributes *resp;
+	const struct scmi_protocol_handle *ph = ti->ph;
+	struct scmi_telemetry_info *info = &ti->info;
+	struct scmi_xfer *t;
+	__le32 attr;
+	int ret;
+
+	ret = ph->xops->xfer_get_init(ph, PROTOCOL_ATTRIBUTES, 0,
+				      sizeof(*resp), &t);
+	if (ret)
+		return ret;
+
+	resp = t->rx.buf;
+	ret = ph->xops->do_xfer(ph, t);
+	if (ret)
+		goto out_put;
+
+	attr = resp->attributes;
+
+	info->base.num_des = le32_to_cpu(resp->de_num);
+	info->base.num_groups = le32_to_cpu(resp->groups_num);
+	for (int w = 0; w < SCMI_TLM_DE_IMPL_MAX_DWORDS; w++)
+		info->base.de_impl_version[w] =
+			le32_to_cpu(resp->de_implementation_rev_dword[w]);
+
+	info->single_read_support = SUPPORTS_SINGLE_READ(attr);
+	info->continuos_update_support = SUPPORTS_CONTINUOS_UPDATE(attr);
+	info->per_group_config_support = SUPPORTS_PER_GROUP_CONFIG(attr);
+	info->reset_support = SUPPORTS_RESET(attr);
+	info->fc_support = SUPPORTS_FC(attr);
+
+	/* The SHMTIs count lives in the lower half of the attributes */
+	ti->num_shmti = le32_get_bits(attr, GENMASK(15, 0));
+	ti->default_blk_ts_rate = le32_to_cpu(resp->default_blk_ts_rate);
+
+out_put:
+	ph->xops->xfer_put(ph, t);
+
+	return ret;
+}
+
+/*
+ * Iterator callback: the whole request is the descriptor index to start
+ * from, stored as a little-endian 32bit word. @priv is not used.
+ */
+static void iter_tlm_prepare_message(void *message,
+				     unsigned int desc_index, const void *priv)
+{
+	void *request = message;
+
+	put_unaligned_le32(desc_index, request);
+}
+
+/*
+ * Iterator callback: extract the returned/remaining counters from the reply
+ * and point the parsing cursor at the first descriptor.
+ *
+ * Return: 0 on success, -EINVAL when the reply is shorter than the header
+ * plus the fixed part of all the returned descriptors.
+ */
+static int iter_de_descr_update_state(struct scmi_iterator_state *st,
+				      const void *response, void *priv)
+{
+	const struct scmi_msg_resp_telemetry_de_description *reply = response;
+	struct scmi_tlm_de_priv *cursor = priv;
+	const __le32 counters = reply->num_desc;
+
+	/* Lower half: descriptors in this reply; upper half: still pending */
+	st->num_returned = le32_get_bits(counters, GENMASK(15, 0));
+	st->num_remaining = le32_get_bits(counters, GENMASK(31, 16));
+
+	if (st->rx_len <
+	    (sizeof(*reply) + sizeof(reply->desc[0]) * st->num_returned))
+		return -EINVAL;
+
+	/* Parsing starts from the first descriptor */
+	cursor->next = (void *)reply->desc;
+
+	return 0;
+}
+
+/*
+ * Parse one variably sized DE descriptor out of a message payload.
+ * @ti: telemetry instance the DE belongs to
+ * @tde: internal descriptor to be filled
+ * @next: in/out cursor; on success it is advanced past the fixed part and
+ *	  past every optional section (line timestamp rate, FastChannel
+ *	  description, name) carried by this descriptor
+ *
+ * NOTE(review): the optional sections are consumed without checking them
+ * against the received message length - confirm the callers guarantee it.
+ *
+ * Return: 0 on success, -EINVAL on an out-of-range group ID, -EADDRNOTAVAIL
+ * when the FastChannel area cannot be mapped.
+ */
+static int scmi_telemetry_de_descriptor_parse(struct telemetry_info *ti,
+					      struct telemetry_de *tde,
+					      void **next)
+{
+	struct scmi_telemetry_res_info *rinfo = ACCESS_PRIVATE(ti, rinfo);
+	const struct scmi_de_desc *desc = *next;
+	unsigned int grp_id;
+
+	tde->de.info->id = le32_to_cpu(desc->id);
+	grp_id = le32_to_cpu(desc->grp_id);
+	if (grp_id != SCMI_TLM_GRP_INVALID) {
+		/* Group descriptors are empty but allocated at this point */
+		if (grp_id >= ti->info.base.num_groups)
+			return -EINVAL;
+
+		/* Link to parent group */
+		tde->de.info->grp_id = grp_id;
+		tde->de.grp = &rinfo->grps[grp_id];
+	}
+
+	tde->de.info->data_sz = le32_to_cpu(desc->data_sz);
+	tde->de.info->type = GET_DE_TYPE(desc);
+	tde->de.info->unit = GET_DE_UNIT(desc);
+	tde->de.info->unit_exp = GET_DE_UNIT_EXP(desc);
+	tde->de.info->instance_id = GET_DE_INSTA_ID(desc);
+	tde->de.info->compo_instance_id = GET_COMPO_INSTA_ID(desc);
+	tde->de.info->compo_type = GET_COMPO_TYPE(desc);
+	tde->de.info->persistent = IS_PERSISTENT(desc);
+	tde->ts_type = TSTAMP_SUPPORT(desc);
+	tde->de.tstamp_support = !!tde->ts_type;
+	/* Count timestamped DEs */
+	ti->num_des_tstamp += !!tde->de.tstamp_support;
+	tde->de.fc_support = IS_FC_SUPPORTED(desc);
+	tde->de.name_support = IS_NAME_SUPPORTED(desc);
+	/* Update DE_DESCRIPTOR size for the next iteration */
+	*next += sizeof(*desc);
+
+	if (tde->ts_type == TSTAMP_LINE) {
+		/*
+		 * The rate is a word of the message payload: it is
+		 * little-endian like every other field parsed here.
+		 */
+		const __le32 *line_ts_rate = *next;
+
+		tde->de.info->ts_rate = le32_to_cpu(*line_ts_rate);
+
+		/* Variably sized depending on TS support */
+		*next += sizeof(*line_ts_rate);
+	} else if (tde->ts_type == TSTAMP_BLK) {
+		/* Setup default BLK TS value at first */
+		tde->de.info->ts_rate = ti->default_blk_ts_rate;
+	}
+
+	if (tde->de.fc_support) {
+		u32 size;
+		u64 phys_addr;
+		void __iomem *addr;
+		struct de_desc_fc *dfc;
+
+		dfc = *next;
+		phys_addr = le32_to_cpu(dfc->addr_low);
+		phys_addr |= (u64)le32_to_cpu(dfc->addr_high) << 32;
+
+		size = le32_to_cpu(dfc->size);
+		addr = devm_ioremap(ti->ph->dev, phys_addr, size);
+		if (!addr)
+			return -EADDRNOTAVAIL;
+
+		tde->base = addr;
+		tde->offset = 0;
+		tde->fc_size = size;
+
+		/* Add to FastChannels list */
+		list_add(&tde->item, &ti->fcs_des);
+
+		/* Variably sized depending on FC support */
+		*next += sizeof(*dfc);
+	}
+
+	if (tde->de.name_support) {
+		const char *de_name = *next;
+
+		strscpy(tde->de.info->name, de_name, SCMI_SHORT_NAME_MAX_SIZE);
+		/* Variably sized depending on name support */
+		*next += SCMI_SHORT_NAME_MAX_SIZE;
+	}
+
+	return 0;
+}
+
+/*
+ * Iterator callback: parse the descriptor at the current cursor, creating
+ * and registering a new DE unless one with the same ID is already known.
+ *
+ * NOTE(review): a DE that was already known and gets described here is not
+ * marked as enumerated, so a later duplicate of it is not rejected - confirm
+ * whether this is intended.
+ *
+ * Return: 0 on success, -EINVAL on a duplicated ID, or the error reported
+ * while obtaining, parsing or registering the descriptor.
+ */
+static int iter_de_descr_process_response(const struct scmi_protocol_handle *ph,
+					  const void *response,
+					  struct scmi_iterator_state *st,
+					  void *priv)
+{
+	struct scmi_tlm_de_priv *p = priv;
+	struct telemetry_info *ti = p->ti;
+	const struct scmi_de_desc *desc = p->next;
+	struct telemetry_de *tde;
+	bool discovered = false;
+	unsigned int de_id;
+	int ret;
+
+	de_id = le32_to_cpu(desc->id);
+	/* Check if this DE has already been discovered by other means... */
+	tde = scmi_telemetry_tde_lookup(ti, de_id);
+	if (!tde) {
+		/* Create a new one */
+		tde = scmi_telemetry_tde_get(ti, de_id);
+		if (IS_ERR(tde))
+			return PTR_ERR(tde);
+
+		discovered = true;
+	} else if (tde->enumerated) {
+		/* Cannot be a duplicate of a DE already created by enumeration */
+		dev_err(ph->dev,
+			"Discovered INVALID DE with DUPLICATED ID:0x%08X\n",
+			de_id);
+		return -EINVAL;
+	}
+
+	ret = scmi_telemetry_de_descriptor_parse(ti, tde, &p->next);
+	if (ret)
+		goto err;
+
+	if (discovered) {
+		/* Register if it was not already ... */
+		ret = scmi_telemetry_tde_register(ti, tde);
+		if (ret) {
+			/*
+			 * A successful parsing has linked an FC-capable DE
+			 * into the FastChannels list: unlink it before its
+			 * list node is reused to rejoin the free pool.
+			 */
+			if (tde->de.fc_support)
+				list_del(&tde->item);
+			goto err;
+		}
+
+		tde->enumerated = true;
+	}
+
+	/* Account for this DE in group num_de counter */
+	if (tde->de.grp)
+		tde->de.grp->info->num_des++;
+
+	return 0;
+
+err:
+	/*
+	 * Recycle only the descriptors picked from the free pool by this very
+	 * call: a pre-existing one is still referenced by the xa_des XArray.
+	 */
+	if (discovered)
+		scmi_telemetry_free_tde_put(ti, tde);
+
+	return ret;
+}
+
+/*
+ * Build, for every group, the array of its member DEs and the string
+ * listing their IDs. It relies on the per-group num_des counters
+ * accumulated while parsing the DE descriptors to size the buffers, then
+ * resets and rebuilds those counters while filling the arrays.
+ * @dev is not used by this function.
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure.
+ */
+static int
+scmi_telemetry_de_groups_init(struct device *dev, struct telemetry_info *ti)
+{
+ struct scmi_telemetry_res_info *rinfo = ACCESS_PRIVATE(ti, rinfo);
+
+ /* Allocate all groups DEs IDs arrays at first ... */
+ for (int i = 0; i < ti->info.base.num_groups; i++) {
+ struct scmi_telemetry_group *grp = &rinfo->grps[i];
+ size_t des_str_sz;
+
+ /* Scoped allocations: auto-freed on the early error returns below */
+ unsigned int *des __free(kfree) = kcalloc(grp->info->num_des,
+ sizeof(unsigned int),
+ GFP_KERNEL);
+ if (!des)
+ return -ENOMEM;
+
+ /*
+ * Max size 32bit ID string in Hex: 0xCAFECAFE
+ * - 10 digits + ' '/'\n' = 11 bytes per number
+ * - terminating NUL character
+ */
+ des_str_sz = grp->info->num_des * 11 + 1;
+ char *des_str __free(kfree) = kzalloc(des_str_sz, GFP_KERNEL);
+ if (!des_str)
+ return -ENOMEM;
+
+ /* Ownership moves to the group: no automatic free from here on */
+ grp->des = no_free_ptr(des);
+ grp->des_str = no_free_ptr(des_str);
+ /* Reset group DE counter */
+ grp->info->num_des = 0;
+ }
+
+ /* Scan DEs and populate DE IDs arrays for all groups */
+ for (int i = 0; i < rinfo->num_des; i++) {
+ struct scmi_telemetry_group *grp = rinfo->des[i]->grp;
+
+ if (!grp)
+ continue;
+
+ /*
+ * Note that, at this point, num_des is guaranteed to be
+ * sane (in-bounds) by construction.
+ */
+ /* What gets stored is the DE position inside rinfo->des */
+ grp->des[grp->info->num_des++] = i;
+ }
+
+ /* Build composing DES string */
+ for (int i = 0; i < ti->info.base.num_groups; i++) {
+ struct scmi_telemetry_group *grp = &rinfo->grps[i];
+ size_t bufsize = grp->info->num_des * 11 + 1;
+ char *buf = grp->des_str;
+
+ for (int j = 0; j < grp->info->num_des; j++) {
+ /* IDs are space separated; the last one is NUL terminated */
+ char term = j != (grp->info->num_des - 1) ? ' ' : '\0';
+ int len;
+
+ len = scnprintf(buf, bufsize, "0x%08X%c",
+ rinfo->des[grp->des[j]]->info->id, term);
+
+ buf += len;
+ bufsize -= len;
+ }
+ }
+
+ rinfo->num_groups = ti->info.base.num_groups;
+
+ return 0;
+}
+
+/*
+ * Retrieve and parse all the DE descriptors, then build the per-group
+ * data. Nothing is done when no DE was advertised.
+ *
+ * Return: 0 on success, the iterator or groups initialization error
+ * otherwise.
+ */
+static int scmi_telemetry_de_descriptors_get(struct telemetry_info *ti)
+{
+	const struct scmi_protocol_handle *ph = ti->ph;
+	struct scmi_iterator_ops de_ops = {
+		.prepare_message = iter_tlm_prepare_message,
+		.update_state = iter_de_descr_update_state,
+		.process_response = iter_de_descr_process_response,
+	};
+	struct scmi_tlm_de_priv de_priv = {
+		.ti = ti,
+		.next = NULL,
+	};
+	unsigned int num_des = ti->info.base.num_des;
+	void *iter;
+	int ret;
+
+	if (!num_des)
+		return 0;
+
+	iter = ph->hops->iter_response_init(ph, &de_ops, num_des,
+					    TELEMETRY_DE_DESCRIPTION,
+					    sizeof(u32), &de_priv);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	ret = ph->hops->iter_response_run(iter);
+
+	return ret ? ret : scmi_telemetry_de_groups_init(ph->dev, ti);
+}
+
+/*
+ * Iterator private data for update intervals enumeration: where to store
+ * the allocated intervals (@intrvs) plus the group ID and flags to place in
+ * each request.
+ */
+struct scmi_tlm_ivl_priv {
+ struct device *dev;
+ struct scmi_tlm_intervals **intrvs;
+ unsigned int grp_id;
+ unsigned int flags;
+};
+
+/*
+ * Iterator callback: fill the update intervals request with the starting
+ * index, the group identifier and the flags, all as little-endian words.
+ */
+static void iter_intervals_prepare_message(void *message,
+					   unsigned int desc_index,
+					   const void *priv)
+{
+	struct scmi_msg_telemetry_update_intervals *msg = message;
+	const struct scmi_tlm_ivl_priv *p = priv;
+
+	msg->index = cpu_to_le32(desc_index);
+	msg->group_identifier = cpu_to_le32(p->grp_id);
+	/* Flags live in bits [3:0] and go on the wire like the other fields */
+	msg->flags = cpu_to_le32(FIELD_PREP(GENMASK(3, 0), p->flags));
+}
+
+/*
+ * Iterator callback: extract the returned/remaining counters and, on the
+ * first reply only, allocate the intervals descriptor sized for the total
+ * number of intervals, storing it where the caller asked for.
+ *
+ * Return: 0 on success, -EINVAL on a short or inconsistent reply, -ENOMEM
+ * on allocation failure.
+ */
+static int iter_intervals_update_state(struct scmi_iterator_state *st,
+					const void *response, void *priv)
+{
+	const struct scmi_msg_resp_telemetry_update_intervals *reply = response;
+	struct scmi_tlm_ivl_priv *ivl = priv;
+	struct scmi_tlm_intervals *fresh;
+	bool is_discrete;
+	int total;
+
+	st->num_returned = le32_get_bits(reply->flags, GENMASK(11, 0));
+	st->num_remaining = le32_get_bits(reply->flags, GENMASK(31, 16));
+
+	if (st->rx_len <
+	    (sizeof(*reply) + sizeof(reply->intervals[0]) * st->num_returned))
+		return -EINVAL;
+
+	/* Already sized on a previous reply: nothing more to do */
+	if (st->max_resources)
+		return 0;
+
+	/*
+	 * The total number of intervals is not declared anywhere else, so it
+	 * is assumed to be returned+remaining as seen on the first reply.
+	 */
+	is_discrete = INTERVALS_DISCRETE(reply->flags);
+	/* A non-discrete set must come as exactly three values at once */
+	if (!is_discrete && (st->num_returned != 3 || st->num_remaining != 0))
+		return -EINVAL;
+
+	total = st->num_returned + st->num_remaining;
+	fresh = kzalloc(sizeof(*fresh) + total * sizeof(__u32), GFP_KERNEL);
+	if (!fresh)
+		return -ENOMEM;
+
+	fresh->num_intervals = total;
+	fresh->discrete = is_discrete;
+	st->max_resources = fresh->num_intervals;
+
+	*ivl->intrvs = fresh;
+
+	return 0;
+}
+
+/*
+ * Iterator callback: copy the interval at the current loop position of the
+ * reply into its absolute slot of the previously allocated descriptor.
+ * NOTE(review): the value is stored as received, with no byte-order
+ * conversion - confirm this is what the consumers expect.
+ */
+static int
+iter_intervals_process_response(const struct scmi_protocol_handle *ph,
+				const void *response,
+				struct scmi_iterator_state *st, void *priv)
+{
+	const struct scmi_msg_resp_telemetry_update_intervals *reply = response;
+	struct scmi_tlm_ivl_priv *ivl = priv;
+	struct scmi_tlm_intervals *dst = *ivl->intrvs;
+	unsigned int src_idx = st->loop_idx;
+	unsigned int dst_idx = st->desc_index + src_idx;
+
+	dst->update_intervals[dst_idx] = reply->intervals[src_idx];
+
+	return 0;
+}
+
+/*
+ * Enumerate the update intervals selected by @grp_id and @flags; the
+ * descriptor allocated along the way is stored in *@intervals.
+ *
+ * Return: 0 on success, the iterator error otherwise.
+ */
+static int
+scmi_tlm_enumerate_update_intervals(struct telemetry_info *ti,
+				    struct scmi_tlm_intervals **intervals,
+				    int grp_id, unsigned int flags)
+{
+	const struct scmi_protocol_handle *ph = ti->ph;
+	struct scmi_iterator_ops ivl_ops = {
+		.prepare_message = iter_intervals_prepare_message,
+		.update_state = iter_intervals_update_state,
+		.process_response = iter_intervals_process_response,
+	};
+	struct scmi_tlm_ivl_priv ivl_priv = {
+		.dev = ph->dev,
+		.grp_id = grp_id,
+		.intrvs = intervals,
+		.flags = flags,
+	};
+	void *iter;
+
+	/* Zero max resources: the total gets learned from the first reply */
+	iter = ph->hops->iter_response_init(ph, &ivl_ops, 0,
+					    TELEMETRY_LIST_UPDATE_INTERVALS,
+					    sizeof(struct scmi_msg_telemetry_update_intervals),
+					    &ivl_priv);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	return ph->hops->iter_response_run(iter);
+}
+
+/*
+ * Enumerate the update intervals of every group, one group at a time,
+ * mirroring the resulting count into the matching group info descriptor.
+ * Nothing is done when per-group configuration is not supported.
+ *
+ * Return: 0 on success, the first enumeration error otherwise.
+ */
+static int
+scmi_telemetry_enumerate_groups_intervals(struct telemetry_info *ti)
+{
+	struct scmi_telemetry_res_info *rinfo = ACCESS_PRIVATE(ti, rinfo);
+	int ret;
+
+	if (!ti->info.per_group_config_support)
+		return 0;
+
+	for (int id = 0; id < rinfo->num_groups; id++) {
+		struct scmi_telemetry_group *grp = &rinfo->grps[id];
+
+		ret = scmi_tlm_enumerate_update_intervals(ti, &grp->intervals,
+							  id,
+							  SPECIFIC_GROUP_DES);
+		if (ret)
+			return ret;
+
+		rinfo->grps_store[id].num_intervals =
+			grp->intervals->num_intervals;
+	}
+
+	return 0;
+}
+
+/* Release action: free the intervals descriptor passed as argument */
+static void scmi_telemetry_intervals_free(void *interval)
+{
+ kfree(interval);
+}
+
+/*
+ * Enumerate the update intervals not tied to a specific group: the ones of
+ * all the DEs when per-group configuration is not supported, the ones of
+ * the group-less DEs otherwise.
+ *
+ * Return: 0 on success, the enumeration or devres registration error
+ * otherwise.
+ */
+static int
+scmi_telemetry_enumerate_common_intervals(struct telemetry_info *ti)
+{
+	unsigned int flags;
+	int ret;
+
+	flags = !ti->info.per_group_config_support ?
+		ALL_DES_ANY_GROUP : ALL_DES_NO_GROUP;
+
+	ret = scmi_tlm_enumerate_update_intervals(ti, &ti->info.intervals,
+						  SCMI_TLM_GRP_INVALID, flags);
+	if (ret) {
+		/*
+		 * The descriptor is allocated on the first reply, so it can
+		 * already exist when a later step fails: do not leak it.
+		 */
+		kfree(ti->info.intervals);
+		ti->info.intervals = NULL;
+		return ret;
+	}
+
+	/* A copy for UAPI access... */
+	ti->info.base.num_intervals = ti->info.intervals->num_intervals;
+
+	/* Delegate freeing of allocated intervals to unbind time */
+	return devm_add_action_or_reset(ti->ph->dev,
+					scmi_telemetry_intervals_free,
+					ti->info.intervals);
+}
+
+/*
+ * Iterator callback: extract the returned/remaining SHMTI counters.
+ * @priv is not used.
+ *
+ * Return: 0 on success, -EINVAL when the reply is shorter than the header
+ * plus all the returned descriptors.
+ */
+static int iter_shmti_update_state(struct scmi_iterator_state *st,
+				   const void *response, void *priv)
+{
+	const struct scmi_msg_resp_telemetry_shmti_list *reply = response;
+	const __le32 counters = reply->num_shmti;
+
+	st->num_returned = le32_get_bits(counters, GENMASK(15, 0));
+	st->num_remaining = le32_get_bits(counters, GENMASK(31, 16));
+
+	if (st->rx_len <
+	    (sizeof(*reply) + sizeof(reply->desc[0]) * st->num_returned))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Check that a mapped SHMTI area carries the expected signatures in both
+ * its prologue and its epilogue.
+ *
+ * Return: 0 when both signatures match, -EINVAL otherwise.
+ */
+static inline int
+scmi_telemetry_shmti_validate(struct device *dev, struct telemetry_shmti *shmti)
+{
+	struct tdcf __iomem *area = shmti->base;
+	u32 head_sign, tail_sign;
+
+	head_sign = TDCF_START_SIGNATURE(area);
+	tail_sign = TDCF_END_SIGNATURE(SHMTI_EPLG(shmti));
+
+	if (head_sign == SIGNATURE_START && tail_sign == SIGNATURE_END)
+		return 0;
+
+	dev_err(dev,
+		"BAD signature for SHMTI ID:%u @phys:%pK - START:0x%04X END:0x%04X\n",
+		shmti->id, shmti->base, head_sign, tail_sign);
+
+	return -EINVAL;
+}
+
+/*
+ * Iterator callback: record one SHMTI descriptor, map the area it describes
+ * and validate its signatures.
+ *
+ * Return: 0 on success, -EINVAL on an unexpected descriptor, a too short
+ * area or bad signatures, -EADDRNOTAVAIL when the area cannot be mapped.
+ */
+static int iter_shmti_process_response(const struct scmi_protocol_handle *ph,
+				       const void *response,
+				       struct scmi_iterator_state *st,
+				       void *priv)
+{
+	const struct scmi_msg_resp_telemetry_shmti_list *r = response;
+	struct telemetry_info *ti = priv;
+	unsigned int idx = st->desc_index + st->loop_idx;
+	struct telemetry_shmti *shmti;
+	const struct scmi_shmti_desc *desc;
+	void __iomem *addr;
+	u64 phys_addr;
+	u32 len;
+
+	/* ti->shmti holds num_shmti entries only: never index past them */
+	if (idx >= ti->num_shmti) {
+		dev_err(ph->dev, "Unexpected SHMTI descriptor at index:%u\n",
+			idx);
+		return -EINVAL;
+	}
+
+	desc = &r->desc[st->loop_idx];
+	shmti = &ti->shmti[idx];
+
+	shmti->id = le32_to_cpu(desc->id);
+	shmti->flags = le32_to_cpu(desc->flags);
+	phys_addr = le32_to_cpu(desc->addr_low);
+	phys_addr |= (u64)le32_to_cpu(desc->addr_high) << 32;
+
+	/* The area must at least fit an empty TDCF: prologue plus epilogue */
+	len = le32_to_cpu(desc->length);
+	if (len < SHMTI_MIN_SIZE) {
+		dev_err(ph->dev, "Invalid length for SHMTI ID:%u len:%u\n",
+			shmti->id, len);
+		return -EINVAL;
+	}
+
+	addr = devm_ioremap(ph->dev, phys_addr, len);
+	if (!addr)
+		return -EADDRNOTAVAIL;
+
+	shmti->base = addr;
+	shmti->len = len;
+
+	return scmi_telemetry_shmti_validate(ph->dev, shmti);
+}
+
+/*
+ * Retrieve the descriptors of all the advertised SHMTI areas.
+ *
+ * Return: 0 on success, the iterator error otherwise.
+ */
+static int scmi_telemetry_shmti_list(const struct scmi_protocol_handle *ph,
+				     struct telemetry_info *ti)
+{
+	struct scmi_iterator_ops ops = {
+		.prepare_message = iter_tlm_prepare_message,
+		.update_state = iter_shmti_update_state,
+		.process_response = iter_shmti_process_response,
+	};
+	void *iter;
+
+	/*
+	 * The resources being listed are SHMTIs, stored in an array of
+	 * ti->num_shmti entries: that, and not the number of DEs, is the
+	 * upper bound to hand to the iterator.
+	 */
+	iter = ph->hops->iter_response_init(ph, &ops, ti->num_shmti,
+					    TELEMETRY_LIST_SHMTI,
+					    sizeof(u32), ti);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	return ph->hops->iter_response_run(iter);
+}
+
+/*
+ * Allocate the SHMTI descriptors array and populate it by listing the
+ * areas. Nothing is done when no SHMTI was advertised.
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure, the listing error
+ * otherwise.
+ */
+static int scmi_telemetry_enumerate_shmti(struct telemetry_info *ti)
+{
+	const struct scmi_protocol_handle *ph = ti->ph;
+	int ret;
+
+	if (!ti->num_shmti)
+		return 0;
+
+	ti->shmti = devm_kcalloc(ph->dev, ti->num_shmti, sizeof(*ti->shmti),
+				 GFP_KERNEL);
+	if (!ti->shmti)
+		return -ENOMEM;
+
+	ret = scmi_telemetry_shmti_list(ph, ti);
+	if (ret) {
+		/* Newline terminated, like every other message in here */
+		dev_err(ph->dev, "Cannot get SHMTI list descriptors\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Protocol operation: expose the info descriptor embedded in the instance. */
+static const struct scmi_telemetry_info *
+scmi_telemetry_info_get(const struct scmi_protocol_handle *ph)
+{
+	struct telemetry_info *instance = ph->get_priv(ph);
+
+	return &instance->info;
+}
+
+/*
+ * Protocol operation: find a DE by ID. The resources accessor is invoked
+ * first, its result being ignored, before searching the xa_des XArray.
+ *
+ * Return: the DE descriptor, or NULL when the ID is not registered.
+ */
+static const struct scmi_telemetry_de *
+scmi_telemetry_de_lookup(const struct scmi_protocol_handle *ph, u32 id)
+{
+	struct telemetry_info *instance = ph->get_priv(ph);
+
+	instance->res_get(instance);
+
+	return xa_load(&instance->xa_des, id);
+}
+
+/* Protocol operation: return whatever the current resources accessor yields. */
+static const struct scmi_telemetry_res_info *
+scmi_telemetry_resources_get(const struct scmi_protocol_handle *ph)
+{
+	struct telemetry_info *instance = ph->get_priv(ph);
+
+	return instance->res_get(instance);
+}
+
+/* Telemetry protocol operations, exposed via the .ops of scmi_telemetry */
+static const struct scmi_telemetry_proto_ops tlm_proto_ops = {
+ .info_get = scmi_telemetry_info_get,
+ .de_lookup = scmi_telemetry_de_lookup,
+ .res_get = scmi_telemetry_resources_get,
+};
+
+/**
+ * scmi_telemetry_resources_alloc - Resources allocation
+ * @ti: A reference to the telemetry info descriptor for this instance
+ *
+ * This allocates and initializes dedicated resources for the maximum possible
+ * number of needed telemetry resources, based on information gathered from
+ * the initial enumeration: these allocations represent an upper bound on
+ * the number of discoverable telemetry resources and they will be later
+ * populated during late deferred further discovery phases.
+ *
+ * Every buffer is held by a scoped __free(kfree) variable, so that an early
+ * error return releases whatever was allocated up to that point; ownership
+ * is handed over with no_free_ptr() only once all allocations succeeded.
+ *
+ * Return: 0 on Success, errno otherwise
+ */
+static int scmi_telemetry_resources_alloc(struct telemetry_info *ti)
+{
+ /* Array to hold pointers to discovered DEs */
+ struct scmi_telemetry_de **des __free(kfree) =
+ kcalloc(ti->info.base.num_des, sizeof(*des), GFP_KERNEL);
+ if (!des)
+ return -ENOMEM;
+
+ /* The allocated DE descriptors */
+ struct telemetry_de *tdes __free(kfree) =
+ kcalloc(ti->info.base.num_des, sizeof(*tdes), GFP_KERNEL);
+ if (!tdes)
+ return -ENOMEM;
+
+ /* Allocate a set of contiguous DE info descriptors. */
+ struct scmi_tlm_de_info *dei_store __free(kfree) =
+ kcalloc(ti->info.base.num_des, sizeof(*dei_store), GFP_KERNEL);
+ if (!dei_store)
+ return -ENOMEM;
+
+ /* Array to hold descriptors of discovered GROUPs */
+ struct scmi_telemetry_group *grps __free(kfree) =
+ kcalloc(ti->info.base.num_groups, sizeof(*grps), GFP_KERNEL);
+ if (!grps)
+ return -ENOMEM;
+
+ /* Allocate a set of contiguous Group info descriptors. */
+ struct scmi_tlm_grp_info *grps_store __free(kfree) =
+ kcalloc(ti->info.base.num_groups, sizeof(*grps_store), GFP_KERNEL);
+ if (!grps_store)
+ return -ENOMEM;
+
+ struct scmi_telemetry_res_info *rinfo __free(kfree) =
+ kzalloc(sizeof(*rinfo), GFP_KERNEL);
+ if (!rinfo)
+ return -ENOMEM;
+
+ /* Every DE descriptor starts its life in the free pool */
+ mutex_init(&ti->free_mtx);
+ INIT_LIST_HEAD(&ti->free_des);
+ for (int i = 0; i < ti->info.base.num_des; i++) {
+ mutex_init(&tdes[i].mtx);
+ /* Bind contiguous DE info structures */
+ tdes[i].de.info = &dei_store[i];
+ list_add_tail(&tdes[i].item, &ti->free_des);
+ }
+
+ /* Group IDs simply match the position inside the groups array */
+ for (int i = 0; i < ti->info.base.num_groups; i++) {
+ grps_store[i].id = i;
+ /* Bind contiguous Group info struct */
+ grps[i].info = &grps_store[i];
+ }
+
+ INIT_LIST_HEAD(&ti->fcs_des);
+
+ /* No failure possible from here on: hand over all the buffers */
+ ti->tdes = no_free_ptr(tdes);
+
+ rinfo->des = no_free_ptr(des);
+ rinfo->dei_store = no_free_ptr(dei_store);
+ rinfo->grps = no_free_ptr(grps);
+ rinfo->grps_store = no_free_ptr(grps_store);
+
+ ACCESS_PRIVATE(ti, rinfo) = no_free_ptr(rinfo);
+
+ return 0;
+}
+
+/*
+ * Release action undoing scmi_telemetry_resources_alloc(), together with
+ * the per-group buffers that get attached to the groups later on, while
+ * enumerating DEs and group intervals.
+ * @arg: the telemetry_info instance owning the resources
+ */
+static void scmi_telemetry_resources_free(void *arg)
+{
+	struct telemetry_info *ti = arg;
+	struct scmi_telemetry_res_info *rinfo = ACCESS_PRIVATE(ti, rinfo);
+
+	/*
+	 * Groups are zero-allocated: any buffer that was never attached is
+	 * still NULL, which kfree() handles as a no-op.
+	 */
+	for (int i = 0; i < ti->info.base.num_groups; i++) {
+		kfree(rinfo->grps[i].intervals);
+		kfree(rinfo->grps[i].des_str);
+		kfree(rinfo->grps[i].des);
+	}
+
+	kfree(ti->tdes);
+	kfree(rinfo->des);
+	kfree(rinfo->dei_store);
+	kfree(rinfo->grps);
+	kfree(rinfo->grps_store);
+
+	kfree(rinfo);
+
+	ACCESS_PRIVATE(ti, rinfo) = NULL;
+}
+
+/* Plain accessor: hand out the resources descriptor with no further work. */
+static struct scmi_telemetry_res_info *
+__scmi_telemetry_resources_get(struct telemetry_info *ti)
+{
+	struct scmi_telemetry_res_info *resources = ACCESS_PRIVATE(ti, rinfo);
+
+	return resources;
+}
+
+/**
+ * scmi_telemetry_resources_enumerate - Enumeration helper
+ * @ti: A reference to the telemetry info descriptor for this instance
+ *
+ * This helper is configured to be called once on the first enumeration
+ * attempt, when triggered by invoking ti->res_get() from somewhere else.
+ * Once run it substitutes itself in ti->res_get() with the simple accessor
+ * __scmi_telemetry_resources_get, which returns a descriptor to the resources
+ * that were possibly discovered.
+ *
+ * Note that, while it attempts to fully enumerate Data Events and Groups, it
+ * does NOT fail when such enumerations fail, instead it simply gives up with
+ * the end result that only a partially populated, but consistent, resources
+ * descriptor will be returned; in such a case the incomplete descriptor will
+ * be marked as NOT fully_enumerated: this design enables the kernel to deal
+ * with badly implemented out-of-spec firmware support while still providing
+ * a minimal sane, albeit possibly incomplete, set of telemetry resources.
+ *
+ * Return: A reference to a fully or partially populated resources descriptor
+ */
+static struct scmi_telemetry_res_info *
+scmi_telemetry_resources_enumerate(struct telemetry_info *ti)
+{
+ struct scmi_telemetry_res_info *rinfo = ACCESS_PRIVATE(ti, rinfo);
+ struct device *dev = ti->ph->dev;
+ int ret;
+
+ /*
+ * Ensure this init function can be called only once and
+ * handles properly concurrent calls.
+ */
+ /* Only the caller flipping the flag from 0 to 1 runs the enumeration */
+ if (atomic_cmpxchg(&ti->rinfo_initializing, 0, 1)) {
+ if (!completion_done(&ti->rinfo_initdone))
+ wait_for_completion(&ti->rinfo_initdone);
+ goto out;
+ }
+
+ ret = scmi_telemetry_de_descriptors_get(ti);
+ if (ret) {
+ dev_err(dev, FW_BUG "Cannot fully enumerate DEs resources. Degraded system.\n");
+ goto done;
+ }
+
+ ret = scmi_telemetry_enumerate_groups_intervals(ti);
+ if (ret) {
+ dev_err(dev, FW_BUG "Cannot fully enumerate group intervals. Degraded system.\n");
+ goto done;
+ }
+
+ /* If we got here, the enumeration was fully successful */
+ rinfo->fully_enumerated = true;
+done:
+ /* Disable initialization permanently */
+ smp_store_mb(ti->res_get, __scmi_telemetry_resources_get);
+ /* Release every caller that was waiting for the enumeration to end */
+ complete_all(&ti->rinfo_initdone);
+
+out:
+ return rinfo;
+}
+
+/**
+ * scmi_telemetry_instance_init - Instance initializer
+ * @ti: A reference to the telemetry info descriptor for this instance
+ *
+ * Note that this allocates and initializes all the resources possibly needed
+ * and then sets up the @scmi_telemetry_resources_enumerate helper as the
+ * default method for the first call to ti->res_get(): this mechanism enables
+ * the possibility of optionally implementing deferred enumeration policies
+ * which optionally delay the discovery phase and related SCMI message exchanges
+ * to a later point in time.
+ *
+ * Return: 0 on Success, errno otherwise
+ */
+static int scmi_telemetry_instance_init(struct telemetry_info *ti)
+{
+ int ret;
+
+ /* Allocate and Initialize on first call... */
+ ret = scmi_telemetry_resources_alloc(ti);
+ if (ret)
+ return ret;
+
+ /* On failure the release action is run right away by the helper */
+ ret = devm_add_action_or_reset(ti->ph->dev,
+ scmi_telemetry_resources_free, ti);
+ if (ret)
+ return ret;
+
+ xa_init(&ti->xa_des);
+ /* Setup resources lazy initialization */
+ atomic_set(&ti->rinfo_initializing, 0);
+ init_completion(&ti->rinfo_initdone);
+ /* Ensure the new res_get() operation is visible after this point */
+ smp_store_mb(ti->res_get, scmi_telemetry_resources_enumerate);
+
+ return 0;
+}
+
+/*
+ * Protocol instance initializer: allocate the instance descriptor, then
+ * retrieve the protocol attributes, set up the instance resources and
+ * enumerate the common update intervals and the SHMTI areas, in this order;
+ * any failure aborts the initialization.
+ *
+ * Return: 0 on success, errno otherwise.
+ */
+static int scmi_telemetry_protocol_init(const struct scmi_protocol_handle *ph)
+{
+	struct device *dev = ph->dev;
+	struct telemetry_info *tinfo;
+	int err;
+
+	dev_dbg(dev, "Telemetry Version %d.%d\n",
+		PROTOCOL_REV_MAJOR(ph->version), PROTOCOL_REV_MINOR(ph->version));
+
+	tinfo = devm_kzalloc(dev, sizeof(*tinfo), GFP_KERNEL);
+	if (!tinfo)
+		return -ENOMEM;
+
+	tinfo->ph = ph;
+
+	err = scmi_telemetry_protocol_attributes_get(tinfo);
+	if (err) {
+		dev_err(dev, FW_BUG "Cannot retrieve protocol attributes. Abort.\n");
+		return err;
+	}
+
+	err = scmi_telemetry_instance_init(tinfo);
+	if (err) {
+		dev_err(dev, "Cannot initialize instance. Abort.\n");
+		return err;
+	}
+
+	err = scmi_telemetry_enumerate_common_intervals(tinfo);
+	if (err) {
+		dev_err(dev, FW_BUG "Cannot enumerate update intervals. Abort.\n");
+		return err;
+	}
+
+	err = scmi_telemetry_enumerate_shmti(tinfo);
+	if (err) {
+		dev_err(dev, FW_BUG "Cannot enumerate SHMTIs. Abort.\n");
+		return err;
+	}
+
+	tinfo->info.base.version = ph->version;
+
+	/* Make the instance reachable through ph->get_priv() */
+	return ph->set_priv(ph, tinfo);
+}
+
+/* Telemetry protocol descriptor: ties the ID to its initializer and ops */
+static const struct scmi_protocol scmi_telemetry = {
+ .id = SCMI_PROTOCOL_TELEMETRY,
+ .owner = THIS_MODULE,
+ .instance_init = &scmi_telemetry_protocol_init,
+ .ops = &tlm_proto_ops,
+ .supported_version = SCMI_PROTOCOL_SUPPORTED_VERSION,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(telemetry, scmi_telemetry)
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index aafaac1496b0..fcb45bd4b44c 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -2,17 +2,21 @@
/*
* SCMI Message Protocol driver header
*
- * Copyright (C) 2018-2021 ARM Ltd.
+ * Copyright (C) 2018-2026 ARM Ltd.
*/
#ifndef _LINUX_SCMI_PROTOCOL_H
#define _LINUX_SCMI_PROTOCOL_H
#include <linux/bitfield.h>
+#include <linux/bitops.h>
#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/types.h>
+#include <uapi/linux/limits.h>
+#include <uapi/linux/scmi.h>
+
#define SCMI_MAX_STR_SIZE 64
#define SCMI_SHORT_NAME_MAX_SIZE 16
#define SCMI_MAX_NUM_RATES 16
@@ -820,6 +824,134 @@ struct scmi_pinctrl_proto_ops {
int (*pin_free)(const struct scmi_protocol_handle *ph, u32 pin);
};
+enum scmi_telemetry_de_type {
+ SCMI_TLM_DE_TYPE_USPECIFIED,
+ SCMI_TLM_DE_TYPE_ACCUMUL_IDLE_RESIDENCY,
+ SCMI_TLM_DE_TYPE_ACCUMUL_IDLE_COUNTS,
+ SCMI_TLM_DE_TYPE_ACCUMUL_OTHERS,
+ SCMI_TLM_DE_TYPE_INSTA_IDLE_STATE,
+ SCMI_TLM_DE_TYPE_INSTA_OTHERS,
+ SCMI_TLM_DE_TYPE_AVERAGE,
+ SCMI_TLM_DE_TYPE_STATUS,
+ SCMI_TLM_DE_TYPE_RESERVED_START,
+ SCMI_TLM_DE_TYPE_RESERVED_END = 0xef,
+ SCMI_TLM_DE_TYPE_OEM_START = 0xf0,
+ SCMI_TLM_DE_TYPE_OEM_END = 0xff,
+};
+
+enum scmi_telemetry_compo_type {
+ SCMI_TLM_COMPO_TYPE_USPECIFIED,
+ SCMI_TLM_COMPO_TYPE_CPU,
+ SCMI_TLM_COMPO_TYPE_CLUSTER,
+ SCMI_TLM_COMPO_TYPE_GPU,
+ SCMI_TLM_COMPO_TYPE_NPU,
+ SCMI_TLM_COMPO_TYPE_INTERCONNECT,
+ SCMI_TLM_COMPO_TYPE_MEM_CNTRL,
+ SCMI_TLM_COMPO_TYPE_L1_CACHE,
+ SCMI_TLM_COMPO_TYPE_L2_CACHE,
+ SCMI_TLM_COMPO_TYPE_L3_CACHE,
+ SCMI_TLM_COMPO_TYPE_LL_CACHE,
+ SCMI_TLM_COMPO_TYPE_SYS_CACHE,
+ SCMI_TLM_COMPO_TYPE_DISP_CNTRL,
+ SCMI_TLM_COMPO_TYPE_IPU,
+ SCMI_TLM_COMPO_TYPE_CHIPLET,
+ SCMI_TLM_COMPO_TYPE_PACKAGE,
+ SCMI_TLM_COMPO_TYPE_SOC,
+ SCMI_TLM_COMPO_TYPE_SYSTEM,
+ SCMI_TLM_COMPO_TYPE_SMCU,
+ SCMI_TLM_COMPO_TYPE_ACCEL,
+ SCMI_TLM_COMPO_TYPE_BATTERY,
+ SCMI_TLM_COMPO_TYPE_CHARGER,
+ SCMI_TLM_COMPO_TYPE_PMIC,
+ SCMI_TLM_COMPO_TYPE_BOARD,
+ SCMI_TLM_COMPO_TYPE_MEMORY,
+ SCMI_TLM_COMPO_TYPE_PERIPH,
+ SCMI_TLM_COMPO_TYPE_PERIPH_SUBC,
+ SCMI_TLM_COMPO_TYPE_LID,
+ SCMI_TLM_COMPO_TYPE_DISPLAY,
+ SCMI_TLM_COMPO_TYPE_RESERVED_START = 0x1d,
+ SCMI_TLM_COMPO_TYPE_RESERVED_END = 0xdf,
+ SCMI_TLM_COMPO_TYPE_OEM_START = 0xe0,
+ SCMI_TLM_COMPO_TYPE_OEM_END = 0xff,
+};
+
+#define SCMI_TLM_GET_UPDATE_INTERVAL_SECS(x) \
+ (le32_get_bits((x), GENMASK(20, 5)))
+#define SCMI_TLM_GET_UPDATE_INTERVAL_EXP(x) (sign_extend32((x), 4))
+
+#define SCMI_TLM_GET_UPDATE_INTERVAL(x) (FIELD_GET(GENMASK(20, 0), (x)))
+#define SCMI_TLM_BUILD_UPDATE_INTERVAL(s, e) \
+ (FIELD_PREP(GENMASK(20, 5), (s)) | FIELD_PREP(GENMASK(4, 0), (e)))
+
+enum scmi_telemetry_collection {
+ SCMI_TLM_ONDEMAND,
+ SCMI_TLM_NOTIFICATION,
+ SCMI_TLM_SINGLE_READ,
+};
+
+#define SCMI_TLM_GRP_INVALID 0xFFFFFFFF
+struct scmi_telemetry_group {
+ bool enabled;
+ bool tstamp_enabled;
+ unsigned int *des;
+ char *des_str;
+ struct scmi_tlm_grp_info *info;
+ unsigned int active_update_interval;
+ struct scmi_tlm_intervals *intervals;
+ enum scmi_telemetry_collection current_mode;
+};
+
+struct scmi_telemetry_de {
+ bool tstamp_support;
+ bool fc_support;
+ bool name_support;
+ struct scmi_tlm_de_info *info;
+ struct scmi_telemetry_group *grp;
+ bool enabled;
+ bool tstamp_enabled;
+};
+
+struct scmi_telemetry_res_info {
+ bool fully_enumerated;
+ unsigned int num_des;
+ struct scmi_telemetry_de **des;
+ struct scmi_tlm_de_info *dei_store;
+ unsigned int num_groups;
+ struct scmi_telemetry_group *grps;
+ struct scmi_tlm_grp_info *grps_store;
+};
+
+struct scmi_telemetry_info {
+ bool single_read_support;
+ bool continuos_update_support;
+ bool per_group_config_support;
+ bool reset_support;
+ bool fc_support;
+ struct scmi_tlm_base_info base;
+ unsigned int active_update_interval;
+ struct scmi_tlm_intervals *intervals;
+ bool enabled;
+ bool notif_enabled;
+ enum scmi_telemetry_collection current_mode;
+};
+
+/**
+ * struct scmi_telemetry_proto_ops - represents the various operations provided
+ * by SCMI Telemetry Protocol
+ *
+ * @info_get: get the general Telemetry information.
+ * @de_lookup: get a specific DE descriptor from the DE id.
+ * @res_get: get a reference to the Telemetry resources descriptor.
+ */
+struct scmi_telemetry_proto_ops {
+ const struct scmi_telemetry_info __must_check *(*info_get)
+ (const struct scmi_protocol_handle *ph);
+ const struct scmi_telemetry_de __must_check *(*de_lookup)
+ (const struct scmi_protocol_handle *ph, u32 id);
+ const struct scmi_telemetry_res_info __must_check *(*res_get)
+ (const struct scmi_protocol_handle *ph);
+};
+
/**
* struct scmi_notify_ops - represents notifications' operations provided by
* SCMI core
@@ -926,6 +1058,7 @@ enum scmi_std_protocol {
SCMI_PROTOCOL_VOLTAGE = 0x17,
SCMI_PROTOCOL_POWERCAP = 0x18,
SCMI_PROTOCOL_PINCTRL = 0x19,
+ SCMI_PROTOCOL_TELEMETRY = 0x1b,
};
enum scmi_system_events {
--
2.54.0
^ permalink raw reply related
* [PATCH v4 07/31] include: trace: Add Telemetry trace events
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>
Add custom traces to report Telemetry failed accesses and to report when DE
values are updated internally after a notification is processed.
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
include/trace/events/scmi.h | 48 ++++++++++++++++++++++++++++++++++++-
1 file changed, 47 insertions(+), 1 deletion(-)
diff --git a/include/trace/events/scmi.h b/include/trace/events/scmi.h
index 703b7bb68e44..b70b26e467b8 100644
--- a/include/trace/events/scmi.h
+++ b/include/trace/events/scmi.h
@@ -7,7 +7,8 @@
#include <linux/tracepoint.h>
-#define TRACE_SCMI_MAX_TAG_LEN 6
+#define TRACE_SCMI_MAX_TAG_LEN 6
+#define TRACE_SCMI_TLM_MAX_TAG_LEN 16
TRACE_EVENT(scmi_fc_call,
TP_PROTO(u8 protocol_id, u8 msg_id, u32 res_id, u32 val1, u32 val2),
@@ -180,6 +181,51 @@ TRACE_EVENT(scmi_msg_dump,
__entry->tag, __entry->msg_id, __entry->seq, __entry->status,
__print_hex_str(__get_dynamic_array(cmd), __entry->len))
);
+
+TRACE_EVENT(scmi_tlm_access,
+ TP_PROTO(u64 de_id, unsigned char *tag, u64 startm, u64 endm),
+ TP_ARGS(de_id, tag, startm, endm),
+
+ TP_STRUCT__entry(
+ __field(u64, de_id)
+ __array(char, tag, TRACE_SCMI_TLM_MAX_TAG_LEN)
+ __field(u64, startm)
+ __field(u64, endm)
+ ),
+
+ TP_fast_assign(
+ __entry->de_id = de_id;
+ strscpy(__entry->tag, tag, TRACE_SCMI_TLM_MAX_TAG_LEN);
+ __entry->startm = startm;
+ __entry->endm = endm;
+ ),
+
+ TP_printk("de_id=0x%llX [%s] - startm=%016llX endm=%016llX",
+ __entry->de_id, __entry->tag, __entry->startm, __entry->endm)
+);
+
+TRACE_EVENT(scmi_tlm_collect,
+ TP_PROTO(u64 ts, u64 de_id, u64 value, unsigned char *tag),
+ TP_ARGS(ts, de_id, value, tag),
+
+ TP_STRUCT__entry(
+ __field(u64, ts)
+ __field(u64, de_id)
+ __field(u64, value)
+ __array(char, tag, TRACE_SCMI_TLM_MAX_TAG_LEN)
+ ),
+
+ TP_fast_assign(
+ __entry->ts = ts;
+ __entry->de_id = de_id;
+ __entry->value = value;
+ strscpy(__entry->tag, tag, TRACE_SCMI_TLM_MAX_TAG_LEN);
+ ),
+
+ TP_printk("ts=%llu de_id=0x%04llX value=%016llu [%s]",
+ __entry->ts, __entry->de_id, __entry->value, __entry->tag)
+);
+
#endif /* _TRACE_SCMI_H */
/* This part must be outside protection */
--
2.54.0
^ permalink raw reply related
* [PATCH v4 06/31] dt-bindings: firmware: arm,scmi: Add support for telemetry protocol
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi, Rob Herring, Krzysztof Kozlowski,
Conor Dooley, devicetree
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>
Add new SCMI v4.0 Telemetry protocol bindings definitions.
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
v3 --> v4
- changed protocol number to lowercase 1b
- fixed misplaced block for protocol 0x1b
Cc: Rob Herring <robh@kernel.org>
Cc: Krzysztof Kozlowski <krzk+dt@kernel.org>
Cc: Conor Dooley <conor+dt@kernel.org>
Cc: devicetree@vger.kernel.org
---
Documentation/devicetree/bindings/firmware/arm,scmi.yaml | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/Documentation/devicetree/bindings/firmware/arm,scmi.yaml b/Documentation/devicetree/bindings/firmware/arm,scmi.yaml
index d06cca9273c4..be733f8d60a0 100644
--- a/Documentation/devicetree/bindings/firmware/arm,scmi.yaml
+++ b/Documentation/devicetree/bindings/firmware/arm,scmi.yaml
@@ -324,6 +324,14 @@ properties:
required:
- reg
+ protocol@1b:
+ $ref: '#/$defs/protocol-node'
+ unevaluatedProperties: false
+
+ properties:
+ reg:
+ const: 0x1b
+
unevaluatedProperties: false
$defs:
--
2.54.0
^ permalink raw reply related
* [PATCH v4 05/31] uapi: Add ARM SCMI definitions
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>
Add a number of structures and ioctls definitions used by the ARM
SCMI Telemetry protocol.
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
v2 --> v3
- Change tstamp_exp to ts_rate
- renamed num -> num_intervals in scmi_tlm_interval
- added padding in scmi_tlm_de_sample to avoid packing issues on 32bit
v1 --> v2
- Added proper __counted_by marks
- Fixed a few dox comments
- Renamed reserved[] fields to pad[]
---
MAINTAINERS | 1 +
include/uapi/linux/scmi.h | 289 ++++++++++++++++++++++++++++++++++++++
2 files changed, 290 insertions(+)
create mode 100644 include/uapi/linux/scmi.h
diff --git a/MAINTAINERS b/MAINTAINERS
index e035a3be797c..cfee3d514bee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -25999,6 +25999,7 @@ F: drivers/regulator/scmi-regulator.c
F: drivers/reset/reset-scmi.c
F: include/linux/sc[mp]i_protocol.h
F: include/trace/events/scmi.h
+F: include/uapi/linux/scmi.h
F: include/uapi/linux/virtio_scmi.h
SYSTEM CONTROL MANAGEMENT INTERFACE (SCMI) i.MX Extension Message Protocol drivers
diff --git a/include/uapi/linux/scmi.h b/include/uapi/linux/scmi.h
new file mode 100644
index 000000000000..abf68bb99960
--- /dev/null
+++ b/include/uapi/linux/scmi.h
@@ -0,0 +1,289 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2026 ARM Ltd.
+ */
+#ifndef _UAPI_LINUX_SCMI_H
+#define _UAPI_LINUX_SCMI_H
+
+/*
+ * Userspace interface SCMI Telemetry
+ */
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define SCMI_TLM_DE_IMPL_MAX_DWORDS 4
+
+#define SCMI_TLM_GRP_INVALID 0xFFFFFFFF
+
+/**
+ * struct scmi_tlm_base_info - Basic info about an instance
+ *
+ * @version: SCMI Telemetry protocol version
+ * @de_impl_version: SCMI Telemetry DE implementation revision
+ * @num_des: Number of defined DEs
+ * @num_groups: Number of defined DE groups
+ * @num_intervals: Number of update intervals available (instance-level)
+ * @flags: Instance specific feature-support bitmap
+ *
+ * Used by:
+ * RO - SCMI_TLM_GET_INFO
+ *
+ * Supported by:
+ * control/
+ */
+struct scmi_tlm_base_info {
+ __u32 version;
+ __u32 de_impl_version[SCMI_TLM_DE_IMPL_MAX_DWORDS];
+ __u32 num_des;
+ __u32 num_groups;
+ __u32 num_intervals;
+ __u32 flags;
+#define SCMI_TLM_CAN_RESET (1 << 0)
+};
+
+/**
+ * struct scmi_tlm_config - Whole instance or group configuration
+ *
+ * @enable: Enable/Disable Telemetry for the whole instance or the group.
+ * @t_enable: Enable/Disable timestamping for all the DEs belonging to a group.
+ * @pad: Padding fields to enforce alignment.
+ * @current_update_interval: Get/Set currently active update interval for the
+ * whole instance or a group.
+ *
+ * Used by:
+ * RO - SCMI_TLM_GET_CFG
+ * WO - SCMI_TLM_SET_CFG
+ *
+ * Supported by:
+ * control/
+ * groups/<N>/control
+ */
+struct scmi_tlm_config {
+ __u8 enable;
+ __u8 t_enable;
+ __u8 pad[2];
+ __u32 current_update_interval;
+};
+
+/**
+ * struct scmi_tlm_intervals - Update intervals descriptor
+ *
+ * @discrete: Flag to indicate the nature of the intervals described in
+ * @update_intervals.
+ * When 'false' @update_intervals is a triplet: min/max/step
+ * @pad: Padding fields to enforce alignment.
+ * @num_intervals: Number of entries of @update_intervals
+ * @update_intervals: A variably-sized array containing the update intervals
+ *
+ * Used by:
+ * RW - SCMI_TLM_GET_INTRVS
+ *
+ * Supported by:
+ * control/
+ * groups/<N>/control
+ */
+struct scmi_tlm_intervals {
+ __u8 discrete;
+ __u8 pad[3];
+ __u32 num_intervals;
+#define SCMI_TLM_UPDATE_INTVL_SEGMENT_LOW 0
+#define SCMI_TLM_UPDATE_INTVL_SEGMENT_HIGH 1
+#define SCMI_TLM_UPDATE_INTVL_SEGMENT_STEP 2
+ __u32 update_intervals[] __counted_by(num_intervals);
+};
+
+/**
+ * struct scmi_tlm_de_config - DE configuration
+ *
+ * @id: Identifier of the DE to act upon (ignored by SCMI_TLM_SET_ALL_CFG)
+ * @enable: A boolean to enable/disable the DE
+ * @t_enable: A boolean to enable/disable the timestamp for this DE
+ * (if supported)
+ *
+ * Used by:
+ * RW - SCMI_TLM_GET_DE_CFG
+ * RW - SCMI_TLM_SET_DE_CFG
+ * WO - SCMI_TLM_SET_ALL_CFG
+ *
+ * Supported by:
+ * control/
+ */
+struct scmi_tlm_de_config {
+ __u32 id;
+ __u32 enable;
+ __u32 t_enable;
+};
+
+/**
+ * struct scmi_tlm_de_info - DE Descriptor
+ *
+ * @id: DE identifier
+ * @grp_id: Identifier of the group which this DE belongs to; reported as
+ * SCMI_TLM_GRP_INVALID when not part of any group
+ * @data_sz: DE data size in bytes
+ * @type: DE type
+ * @unit: DE unit of measurements
+ * @unit_exp: Power-of-10 multiplier for DE unit
+ * @ts_rate: Clock rate in kHz used to generate the DE timestamp
+ * @instance_id: DE instance ID
+ * @compo_instance_id: DE component instance ID
+ * @compo_type: Type of component which is associated to this DE
+ * @persistent: Data value for this DE survives reboot (non-cold ones)
+ * @name: Optional name of this DE
+ *
+ * Used to get the full description of a DE: it reflects DE Descriptors
+ * definitions in 3.12.4.6.
+ *
+ * Used by:
+ * RW - SCMI_TLM_GET_DE_INFO
+ *
+ * Supported by:
+ * control/
+ */
+struct scmi_tlm_de_info {
+ __u32 id;
+ __u32 grp_id;
+ __u32 data_sz;
+ __u32 type;
+ __u32 unit;
+ __s32 unit_exp;
+ __s32 ts_rate;
+ __u32 instance_id;
+ __u32 compo_instance_id;
+ __u32 compo_type;
+ __u32 persistent;
+ __u8 name[16];
+};
+
+/**
+ * struct scmi_tlm_des_list - List of all defined DEs
+ *
+ * @num_des: Number of entries in @des
+ * @des: An array containing descriptors for all defined DEs
+ *
+ * Used by:
+ * RW - SCMI_TLM_GET_DE_LIST
+ *
+ * Supported by:
+ * control/
+ */
+struct scmi_tlm_des_list {
+ __u32 num_des;
+ struct scmi_tlm_de_info des[] __counted_by(num_des);
+};
+
+/**
+ * struct scmi_tlm_de_sample - A DE reading
+ *
+ * @id: DE identifier
+ * @pad: Padding fields to enforce alignment.
+ * @tstamp: DE reading timestamp (equal to 0 if NOT supported)
+ * @val: Reading of the DE data value
+ *
+ * Used by:
+ * RW - SCMI_TLM_GET_DE_VALUE
+ *
+ * Supported by:
+ * control/
+ */
+struct scmi_tlm_de_sample {
+ __u32 id;
+ __u32 pad;
+ __u64 tstamp;
+ __u64 val;
+};
+
+/**
+ * struct scmi_tlm_data_read - Bulk read of multiple DEs
+ *
+ * @num_samples: Number of entries returned in @samples
+ * @samples: An array of samples containing an entry for each DE that was
+ * enabled when the single sample read request was issued.
+ *
+ * Used by:
+ * RW - SCMI_TLM_SINGLE_SAMPLE
+ * RW - SCMI_TLM_BULK_READ
+ *
+ * Supported by:
+ * control/
+ * groups/<N>/control
+ */
+struct scmi_tlm_data_read {
+ __u32 num_samples;
+ struct scmi_tlm_de_sample samples[] __counted_by(num_samples);
+};
+
+/**
+ * struct scmi_tlm_grp_info - DE-group descriptor
+ *
+ * @id: Group ID number
+ * @num_des: Number of DEs part of this group
+ * @num_intervals: Number of update intervals supported. Zero if group does not
+ * support per-group update interval configuration.
+ *
+ * Used by:
+ * RO - SCMI_TLM_GET_GRP_INFO
+ *
+ * Supported by:
+ * groups/<N>/control
+ */
+struct scmi_tlm_grp_info {
+ __u32 id;
+ __u32 num_des;
+ __u32 num_intervals;
+};
+
+/**
+ * struct scmi_tlm_grps_list - DE-groups List
+ *
+ * @num_grps: Number of entries returned in @grps
+ * @grps: An array containing descriptors for all defined DE Groups
+ *
+ * Used by:
+ * RW - SCMI_TLM_GET_GRP_LIST
+ *
+ * Supported by:
+ * control/
+ */
+struct scmi_tlm_grps_list {
+ __u32 num_grps;
+ struct scmi_tlm_grp_info grps[] __counted_by(num_grps);
+};
+
+/**
+ * struct scmi_tlm_grp_desc - Group descriptor
+ *
+ * @num_des: Number of DEs part of this group
+ * @composing_des: An array containing the DE IDs that belongs to this group.
+ *
+ * Used by:
+ * RW - SCMI_TLM_GET_GRP_DESC
+ *
+ * Supported by:
+ * groups/<N>/control
+ */
+struct scmi_tlm_grp_desc {
+ __u32 num_des;
+ __u32 composing_des[] __counted_by(num_des);
+};
+
+#define SCMI 0xF1
+
+#define SCMI_TLM_GET_INFO _IOR(SCMI, 0x00, struct scmi_tlm_base_info)
+#define SCMI_TLM_GET_CFG _IOR(SCMI, 0x01, struct scmi_tlm_config)
+#define SCMI_TLM_SET_CFG _IOW(SCMI, 0x02, struct scmi_tlm_config)
+#define SCMI_TLM_GET_INTRVS _IOWR(SCMI, 0x03, struct scmi_tlm_intervals)
+#define SCMI_TLM_GET_DE_CFG _IOWR(SCMI, 0x04, struct scmi_tlm_de_config)
+#define SCMI_TLM_SET_DE_CFG _IOWR(SCMI, 0x05, struct scmi_tlm_de_config)
+#define SCMI_TLM_GET_DE_INFO _IOWR(SCMI, 0x06, struct scmi_tlm_de_info)
+#define SCMI_TLM_GET_DE_LIST _IOWR(SCMI, 0x07, struct scmi_tlm_des_list)
+#define SCMI_TLM_GET_DE_VALUE _IOWR(SCMI, 0x08, struct scmi_tlm_de_sample)
+#define SCMI_TLM_SET_ALL_CFG _IOW(SCMI, 0x09, struct scmi_tlm_de_config)
+#define SCMI_TLM_GET_GRP_LIST _IOWR(SCMI, 0x0A, struct scmi_tlm_grps_list)
+#define SCMI_TLM_GET_GRP_INFO _IOR(SCMI, 0x0B, struct scmi_tlm_grp_info)
+#define SCMI_TLM_GET_GRP_DESC _IOWR(SCMI, 0x0C, struct scmi_tlm_grp_desc)
+#define SCMI_TLM_SINGLE_SAMPLE _IOWR(SCMI, 0x0D, struct scmi_tlm_data_read)
+#define SCMI_TLM_BULK_READ _IOWR(SCMI, 0x0E, struct scmi_tlm_data_read)
+
+#endif /* _UAPI_LINUX_SCMI_H */
--
2.54.0
^ permalink raw reply related
* [PATCH v4 04/31] firmware: arm_scmi: Allow protocols to register for notifications
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>
Allow protocols themselves to register for their own notifications and
provide their own notifier callbacks.
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
v2-->v3
- split out unrelated changes on event sizing
v1-->v2
- Fixed multiline comment format
---
drivers/firmware/arm_scmi/common.h | 4 ++++
drivers/firmware/arm_scmi/driver.c | 12 ++++++++++++
drivers/firmware/arm_scmi/notify.c | 6 +++---
drivers/firmware/arm_scmi/protocols.h | 6 ++++++
4 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/drivers/firmware/arm_scmi/common.h b/drivers/firmware/arm_scmi/common.h
index abe2fa3d41dd..a8a45bacfa3f 100644
--- a/drivers/firmware/arm_scmi/common.h
+++ b/drivers/firmware/arm_scmi/common.h
@@ -17,6 +17,7 @@
#include <linux/hashtable.h>
#include <linux/list.h>
#include <linux/module.h>
+#include <linux/notifier.h>
#include <linux/refcount.h>
#include <linux/scmi_protocol.h>
#include <linux/spinlock.h>
@@ -533,5 +534,8 @@ static struct platform_driver __drv = { \
void scmi_notification_instance_data_set(const struct scmi_handle *handle,
void *priv);
void *scmi_notification_instance_data_get(const struct scmi_handle *handle);
+int scmi_notifier_register(const struct scmi_handle *handle, u8 proto_id,
+ u8 evt_id, const u32 *src_id,
+ struct notifier_block *nb);
int scmi_inflight_count(const struct scmi_handle *handle);
#endif /* _SCMI_COMMON_H */
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 2ee4cad1f93d..9c1ab9925b1d 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -1655,6 +1655,17 @@ static void *scmi_get_protocol_priv(const struct scmi_protocol_handle *ph)
return pi->priv;
}
+static int
+scmi_register_instance_notifier(const struct scmi_protocol_handle *ph,
+ u8 evt_id, const u32 *src_id,
+ struct notifier_block *nb)
+{
+ const struct scmi_protocol_instance *pi = ph_to_pi(ph);
+
+ return scmi_notifier_register(pi->handle, pi->proto->id,
+ evt_id, src_id, nb);
+}
+
static const struct scmi_xfer_ops xfer_ops = {
.xfer_get_init = xfer_get_init,
.reset_rx_to_maxsz = reset_rx_to_maxsz,
@@ -2223,6 +2234,7 @@ scmi_alloc_init_protocol_instance(struct scmi_info *info,
pi->ph.hops = &helpers_ops;
pi->ph.set_priv = scmi_set_protocol_priv;
pi->ph.get_priv = scmi_get_protocol_priv;
+ pi->ph.notifier_register = scmi_register_instance_notifier;
refcount_set(&pi->users, 1);
/*
diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
index 3e4c97ab7b61..2a8efdf0bab8 100644
--- a/drivers/firmware/arm_scmi/notify.c
+++ b/drivers/firmware/arm_scmi/notify.c
@@ -1389,9 +1389,9 @@ static int scmi_event_handler_enable_events(struct scmi_event_handler *hndl)
*
* Return: 0 on Success
*/
-static int scmi_notifier_register(const struct scmi_handle *handle,
- u8 proto_id, u8 evt_id, const u32 *src_id,
- struct notifier_block *nb)
+int scmi_notifier_register(const struct scmi_handle *handle,
+ u8 proto_id, u8 evt_id, const u32 *src_id,
+ struct notifier_block *nb)
{
int ret = 0;
u32 evt_key;
diff --git a/drivers/firmware/arm_scmi/protocols.h b/drivers/firmware/arm_scmi/protocols.h
index f51245aca259..3e7b6f8aa72c 100644
--- a/drivers/firmware/arm_scmi/protocols.h
+++ b/drivers/firmware/arm_scmi/protocols.h
@@ -166,6 +166,9 @@ struct scmi_proto_helpers_ops;
* can be used by the protocol implementation to generate SCMI messages.
* @set_priv: A method to set protocol private data for this instance.
* @get_priv: A method to get protocol private data previously set.
+ * @notifier_register: A method to register interest for notifications from
+ * within a protocol implementation unit: notifiers can
+ * be registered only for the same protocol.
*
* This structure represents a protocol initialized against specific SCMI
* instance and it will be used as follows:
@@ -185,6 +188,9 @@ struct scmi_protocol_handle {
const struct scmi_proto_helpers_ops *hops;
int (*set_priv)(const struct scmi_protocol_handle *ph, void *priv);
void *(*get_priv)(const struct scmi_protocol_handle *ph);
+ int (*notifier_register)(const struct scmi_protocol_handle *ph,
+ u8 evt_id, const u32 *src_id,
+ struct notifier_block *nb);
};
/**
--
2.54.0
^ permalink raw reply related
* [PATCH v4 03/31] firmware: arm_scmi: Allow registration of unknown-size events/reports
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>
Allow protocols to register events with build-time unknown sizes: such
events can be declared zero-sized and let the core SCMI stack perform the
needed safe-net boundary checks based on the configured transport size.
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
v2 --> v3
- split out of previous patch on protocol notifier
- use max() instead of max_t()
---
drivers/firmware/arm_scmi/notify.c | 24 +++++++++++++++++++-----
drivers/firmware/arm_scmi/notify.h | 8 ++++++--
2 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
index 40ec184eedae..3e4c97ab7b61 100644
--- a/drivers/firmware/arm_scmi/notify.c
+++ b/drivers/firmware/arm_scmi/notify.c
@@ -595,7 +595,13 @@ int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
if (!r_evt)
return -EINVAL;
- if (len > r_evt->evt->max_payld_sz) {
+ /*
+ * Events with a zero max_payld_sz are sized to be of the maximum
+ * size allowed by the transport: no need to be size-checked here
+ * since the transport layer would have already dropped such
+ * over-sized messages.
+ */
+ if (r_evt->evt->max_payld_sz && len > r_evt->evt->max_payld_sz) {
dev_err(handle->dev, "discard badly sized message\n");
return -EINVAL;
}
@@ -754,7 +760,7 @@ int scmi_register_protocol_events(const struct scmi_handle *handle, u8 proto_id,
const struct scmi_protocol_handle *ph,
const struct scmi_protocol_events *ee)
{
- int i;
+ int i, max_msg_sz;
unsigned int num_sources;
size_t payld_sz = 0;
struct scmi_registered_events_desc *pd;
@@ -769,6 +775,8 @@ int scmi_register_protocol_events(const struct scmi_handle *handle, u8 proto_id,
if (!ni)
return -ENOMEM;
+ max_msg_sz = ph->hops->get_max_msg_size(ph);
+
/* num_sources cannot be <= 0 */
if (ee->num_sources) {
num_sources = ee->num_sources;
@@ -781,8 +789,13 @@ int scmi_register_protocol_events(const struct scmi_handle *handle, u8 proto_id,
}
evt = ee->evts;
- for (i = 0; i < ee->num_events; i++)
- payld_sz = max_t(size_t, payld_sz, evt[i].max_payld_sz);
+ for (i = 0; i < ee->num_events; i++) {
+ if (evt[i].max_payld_sz == 0) {
+ payld_sz = max_msg_sz;
+ break;
+ }
+ payld_sz = max(payld_sz, evt[i].max_payld_sz);
+ }
payld_sz += sizeof(struct scmi_event_header);
pd = scmi_allocate_registered_events_desc(ni, proto_id, ee->queue_sz,
@@ -811,7 +824,8 @@ int scmi_register_protocol_events(const struct scmi_handle *handle, u8 proto_id,
mutex_init(&r_evt->sources_mtx);
r_evt->report = devm_kzalloc(ni->handle->dev,
- evt->max_report_sz, GFP_KERNEL);
+ evt->max_report_sz ?: max_msg_sz,
+ GFP_KERNEL);
if (!r_evt->report)
return -ENOMEM;
diff --git a/drivers/firmware/arm_scmi/notify.h b/drivers/firmware/arm_scmi/notify.h
index 76758a736cf4..ecfa4b746487 100644
--- a/drivers/firmware/arm_scmi/notify.h
+++ b/drivers/firmware/arm_scmi/notify.h
@@ -18,8 +18,12 @@
/**
* struct scmi_event - Describes an event to be supported
* @id: Event ID
- * @max_payld_sz: Max possible size for the payload of a notification message
- * @max_report_sz: Max possible size for the report of a notification message
+ * @max_payld_sz: Max possible size for the payload of a notification message.
+ * Set to zero to use the maximum payload size allowed by the
+ * transport.
+ * @max_report_sz: Max possible size for the report of a notification message.
+ * Set to zero to use the maximum payload size allowed by the
+ * transport.
*
* Each SCMI protocol, during its initialization phase, can describe the events
* it wishes to support in a few struct scmi_event and pass them to the core
--
2.54.0
^ permalink raw reply related
* [PATCH v4 02/31] firmware: arm_scmi: Reduce the scope of protocols mutex
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>
Currently the mutex dedicated to the protection of the list of registered
protocols is held during all the protocol initialization phase.
Such a wide locking region is not needed and causes problems when trying to
initialize notifications from within a protocol initialization routine.
Reduce the scope of the protocol mutex.
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
v1-->v2
- Fixed improper mixed usage of cleanup and goto constructs
---
drivers/firmware/arm_scmi/driver.c | 50 ++++++++++++++----------------
1 file changed, 24 insertions(+), 26 deletions(-)
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index f167194f7cf6..2ee4cad1f93d 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -17,6 +17,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/bitmap.h>
+#include <linux/cleanup.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/export.h>
@@ -2190,7 +2191,6 @@ static void scmi_protocol_version_initialize(struct device *dev,
* all resources management is handled via a dedicated per-protocol devres
* group.
*
- * Context: Assumes to be called with @protocols_mtx already acquired.
* Return: A reference to a freshly allocated and initialized protocol instance
* or ERR_PTR on failure. On failure the @proto reference is at first
* put using @scmi_protocol_put() before releasing all the devres group.
@@ -2236,8 +2236,10 @@ scmi_alloc_init_protocol_instance(struct scmi_info *info,
if (ret)
goto clean;
- ret = idr_alloc(&info->protocols, pi, proto->id, proto->id + 1,
- GFP_KERNEL);
+ /* Finally register the initialized protocol */
+ mutex_lock(&info->protocols_mtx);
+ ret = idr_alloc(&info->protocols, pi, proto->id, proto->id + 1, GFP_KERNEL);
+ mutex_unlock(&info->protocols_mtx);
if (ret != proto->id)
goto clean;
@@ -2284,27 +2286,25 @@ scmi_alloc_init_protocol_instance(struct scmi_info *info,
static struct scmi_protocol_instance * __must_check
scmi_get_protocol_instance(const struct scmi_handle *handle, u8 protocol_id)
{
- struct scmi_protocol_instance *pi;
struct scmi_info *info = handle_to_scmi_info(handle);
+ const struct scmi_protocol *proto;
- mutex_lock(&info->protocols_mtx);
- pi = idr_find(&info->protocols, protocol_id);
-
- if (pi) {
- refcount_inc(&pi->users);
- } else {
- const struct scmi_protocol *proto;
+ scoped_guard(mutex, &info->protocols_mtx) {
+ struct scmi_protocol_instance *pi;
- /* Fails if protocol not registered on bus */
- proto = scmi_protocol_get(protocol_id, &info->version);
- if (proto)
- pi = scmi_alloc_init_protocol_instance(info, proto);
- else
- pi = ERR_PTR(-EPROBE_DEFER);
+ pi = idr_find(&info->protocols, protocol_id);
+ if (pi) {
+ refcount_inc(&pi->users);
+ return pi;
+ }
}
- mutex_unlock(&info->protocols_mtx);
- return pi;
+ /* Fails if protocol not registered on bus */
+ proto = scmi_protocol_get(protocol_id, &info->version);
+ if (!proto)
+ return ERR_PTR(-EPROBE_DEFER);
+
+ return scmi_alloc_init_protocol_instance(info, proto);
}
/**
@@ -2335,10 +2335,11 @@ void scmi_protocol_release(const struct scmi_handle *handle, u8 protocol_id)
struct scmi_info *info = handle_to_scmi_info(handle);
struct scmi_protocol_instance *pi;
- mutex_lock(&info->protocols_mtx);
- pi = idr_find(&info->protocols, protocol_id);
- if (WARN_ON(!pi))
- goto out;
+ scoped_guard(mutex, &info->protocols_mtx) {
+ pi = idr_find(&info->protocols, protocol_id);
+ if (WARN_ON(!pi))
+ return;
+ }
if (refcount_dec_and_test(&pi->users)) {
void *gid = pi->gid;
@@ -2357,9 +2358,6 @@ void scmi_protocol_release(const struct scmi_handle *handle, u8 protocol_id)
dev_dbg(handle->dev, "De-Initialized protocol: 0x%X\n",
protocol_id);
}
-
-out:
- mutex_unlock(&info->protocols_mtx);
}
void scmi_setup_protocol_implemented(const struct scmi_protocol_handle *ph,
--
2.54.0
^ permalink raw reply related
* [PATCH v4 01/31] firmware: arm_scmi: Add new SCMIv4.0 error codes definitions
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>
SCMIv4.0 introduces a couple of new possible protocol error codes: add
the needed definitions and mappings to Linux error values.
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
drivers/firmware/arm_scmi/common.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/firmware/arm_scmi/common.h b/drivers/firmware/arm_scmi/common.h
index 7c9617d080a0..abe2fa3d41dd 100644
--- a/drivers/firmware/arm_scmi/common.h
+++ b/drivers/firmware/arm_scmi/common.h
@@ -45,6 +45,8 @@ enum scmi_error_codes {
SCMI_ERR_GENERIC = -8, /* Generic Error */
SCMI_ERR_HARDWARE = -9, /* Hardware Error */
SCMI_ERR_PROTOCOL = -10,/* Protocol Error */
+ SCMI_ERR_IN_USE = -11, /* In Use Error */
+ SCMI_ERR_PARTIAL = -12, /* Partial Error */
};
static const int scmi_linux_errmap[] = {
@@ -60,6 +62,8 @@ static const int scmi_linux_errmap[] = {
-EIO, /* SCMI_ERR_GENERIC */
-EREMOTEIO, /* SCMI_ERR_HARDWARE */
-EPROTO, /* SCMI_ERR_PROTOCOL */
+ -EPERM, /* SCMI_ERR_IN_USE */
+ -EINVAL, /* SCMI_ERR_PARTIAL */
};
static inline int scmi_to_linux_errno(int errno)
--
2.54.0
^ permalink raw reply related
* [PATCH v4 00/31] Introduce SCMI Telemetry FS support
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
linux-doc
Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
elif.topuz, lukasz.luba, philip.radford, brauner,
souvik.chakravarty, leitao, kas, puranjay, usama.arif,
kernel-team, Cristian Marussi
Hi all,
--------------------------------------------------------------------------------
[TLDR Summary]
This series introduces a new SCMI driver which uses a new Telemetry FS to expose
and configure SCMI Telemetry Data Events retrieved from the platform SCMI FW
at runtime. The patches carrying the new STLMFS Filesystem support are tagged
with 'stlmfs'.
--------------------------------------------------------------------------------
The upcoming SCMI v4.0 specification [0] introduces a new SCMI protocol
dedicated to System Telemetry.
In a nutshell, the SCMI Telemetry protocol allows an agent to discover at
runtime the set of Telemetry Data Events (DEs) available on a specific
platform and provides the means to configure the set of DEs that a user is
interested in, while reading them back using the collection method that
is deemed more suitable for the usecase at hand. (...amongst the various
possible collection methods allowed by SCMI specification)
Without delving into the gory details of the whole SCMI Telemetry protocol
let's just say that the SCMI platform/server firmware advertises a number
of Telemetry Data Events, each one identified by a 32bit unique ID, and an
SCMI agent/client, like Linux, can discover them and read back at will the
associated data value in a number of ways.
Data collection is mainly intended to happen on demand via shared memory
areas exposed by the platform firmware, discovered dynamically via SCMI
Telemetry and accessed by Linux on-demand, but some DE can also be reported
via SCMI Notifications asynchronous messages or via direct dedicated
FastChannels (another kind of SCMI memory based access): all of this
underlying mechanism is anyway hidden to the user since it is mediated by
the kernel driver which will return the proper data value when queried.
Anyway, the set of well-known architected DE IDs defined by the spec is
limited to a dozen IDs, which means that the vast majority of DE IDs are
customizable per-platform: as a consequence, though, the same ID, say
'0x1234', could represent completely different things on different systems.
Precise definitions and semantic of such custom Data Event IDs are out of
the scope of the SCMI Telemetry specification and of this implementation:
they are supposed to be provided using some kind of JSON-like description
file that will have to be consumed by a userspace tool which would be
finally in charge of making sense of the set of available DEs.
IOW, in turn, this means that even though the DEs enumerated via SCMI come
with some sort of topological and qualitative description provided by the
protocol (like unit of measurements, name, topology info etc), kernel-wise
we CANNOT be completely sure of "what is what" without being fed-back some
sort of information about the DEs by the aforementioned userspace tool.
For these reasons, currently this series does NOT attempt to register any
of these DEs with any of the usual in-kernel subsystems (like HWMON, IIO,
PERF etc), simply because we cannot be sure which DE is suitable, or even
desirable, for a given subsystem. This also means there are NO in-kernel
users of these Telemetry data events as of now.
So, while we do not exclude, for the future, to feed/register some of the
discovered DEs to/with some of the above mentioned Kernel subsystems, as
of now we have ONLY modeled a custom userspace API to make SCMI Telemetry
available to userspace tools.
In deciding which kind of interface to expose SCMI Telemetry data to a
user, this new SCMI Telemetry driver aims at satisfying 2 main reqs:
- exposing an FS-based human-readable interface that can be used to
discover, configure and access our Telemetry data directly also from
the shell without special tools
- exposing alternative machine-friendly, more-performant, binary
interfaces that can be used to avoid the overhead of multiple accesses
to the VFS and that can be more suitable to access with custom tools
In the initial RFC posted a few months ago [1], the above was achieved
with a combination of a SysFS interface, for the human-readable side of
the story, and a classic chardev/ioctl for the plain binary access.
Since V1, instead, we moved away from this combined approach, especially
away from SysFS, for the following reasons:
1. "Abusing SysFS": SysFS is a handy way to expose device related
properties in a common way, using a few common helpers built on
kernfs; this means, though, that unfortunately in our scenario I had
to generate a dummy simple device for EACH SCMI Telemetry DataEvent
that I got to discover at runtime and attach to them, all of the
properties I need.
This by itself seemed to me abusing the SysFS framework, but, even
ignoring this, the impact on the system when we have to deal with
hundreds or tens of thousands of DEs is significant.
In some test scenario I ended with 50k DE devices and half-a-million
related property files ... O_o
2. "SysFS constraints": SysFS usage itself has its well-known constraints
and best practices, like the one-file/one-value rule, and due to the
fact that any virtual file with a complex structure or handling logic
is frowned upon, you can forget about IOCTLs and mmap'ing to provide
a more performant interface within SysFs, which is the reason why,
in the previous RFC, there was an additional alternative chardev
interface.
These latter limitations around the implementation of files with a
more complex semantic (i.e. with a broader set of file_operations)
derive from the underlying KernFS support, so KernFS is equally not
suitable as a building block for our implementation.
3. "Chardev limitations": Given the nature of the protocol, the hybrid
approach employing character devices was itself problematic: first
of all because there is an upper limit on the number of chardev we
can create, dictated by the range of available minor numbers, and
then because the fact itself to have to maintain 2 completely
different interfaces (FS + chardev) is painful.
As a final remark, please NOTE THAT all of this is supposed to be available
in production systems across a number of heterogeneous platforms: for these
reasons the easy choice, debugFS, is NOT an option here.
Due to the above reasoning, since V1 we opted for a new approach with the
proposed interfaces now based on a full fledged, unified, virtual pseudo
filesystem implemented from scratch, so that we can:
- expose all the DEs property we like as before with SysFS, but without
any of the constraint imposed by the usage of SysFs or kernfs.
- easily expose additional alternative views of the same set of DEs
using symlinking capabilities (e.g. alternative topological view)
- additionally expose a few alternative and more performant interfaces
by embedding in that same FS, a few special virtual files:
+ 'control': to issue IOCTLs for quicker discovery and on-demand access
to data
+ 'pipe' [TBD]: to provide a stream of events using a virtual
infinite-style file
+ 'raw_<N>' [TBD]: to provide direct memory mapped access to the raw
SCMI Telemetry data from userspace
- use a mount option to enable a lazy enumeration operation mode to delay
SCMI related background discovery activities to the effective point in
time when the user needs it (if ever) so as to mitigate the effect at
boot-time of the initial SCMI full discovery process
INTERFACES
===========
We propose a couple of interfaces, both rooted in the same unified
SCMI Telemetry Filesystem STLMFS, which can be mounted with:
mount -t stlmfs none /sys/fs/arm_telemetry/
The new pseudo FS rationale, design and related ABI interface is documented
in detail at:
- Documentation/filesystems/stlmfs.rst
- Documentation/ABI/testing/stlmfs
...anyway, roughly, STLMFS exposes the following interfaces, rooted at
different points in the FS:
1. a FS based human-readable API tree
This API presents the discovered DEs and DEs-groups rooted under a
structure like this:
/sys/fs/arm_telemetry/tlm_0/
|-- all_des_enable
|-- all_des_tstamp_enable
|-- available_update_intervals_ms
|-- current_update_interval_ms
|-- de_implementation_version
|-- des
| |-- 0x00000000/
| |-- 0x00000016/
| |-- 0x00001010/
| |-- 0x0000A000/
| |-- 0x0000A001/
| |-- 0x0000A002/
| |-- 0x0000A005/
| |-- 0x0000A007/
| |-- 0x0000A008/
| |-- 0x0000A00A/
| |-- 0x0000A00B/
| |-- 0x0000A00C/
| `-- 0x0000A010/
|-- des_bulk_read
|-- des_single_sample_read
|-- groups
| |-- 0/
| `-- 1/
|-- intervals_discrete
|-- reset
|-- tlm_enable
`-- version
At the top level we have general configuration knobs to:
- enable/disable all DEs with or without tstamp
- configure the update interval that the platform will use
- enable Telemetry as a whole
- des_bulk_read to read all the enabled DEs in a single buffer one-per-line:
<DE_ID> <TIMESTAMP> <DATA_VALUE>
- des_single_sample_read to request an immediate updated read of
all the enabled DEs in a single buffer one-per-line:
<DE_ID> <TIMESTAMP> <DATA_VALUE>
where each DE in turn is represented by a flat subtree like:
tlm_0/des/0x0000A001/
|-- compo_instance_id
|-- compo_type
|-- enable
|-- instance_id
|-- name
|-- persistent
|-- tstamp_enable
|-- tstamp_exp
|-- type
|-- unit
|-- unit_exp
`-- value
where, beside a bunch of description items, you can:
- enable/disable a single DE
- read back its tstamp and data from 'value' as in:
<TIMESTAMP>: <DATA_VALUE>
then for each (optionally) discovered group of DEs:
scmi_tlm_0/groups/0/
|-- available_update_intervals_ms
|-- composing_des
|-- current_update_interval_ms
|-- des_bulk_read
|-- des_single_sample_read
|-- enable
|-- intervals_discrete
`-- tstamp_enable
you can find the knobs to:
- enable/disable the group as a whole
- lookup group composition
- set a per-group update interval (if supported)
- des_bulk_read to read all the enabled DEs for this group in a
single buffer one-per-line:
<DE_ID> <TIMESTAMP> <DATA_VALUE>
- des_single_sample_read to request an immediate updated read of
all the enabled DEs for this group in a single buffer
one-per-line:
<DE_ID> <TIMESTAMP> <DATA_VALUE>
2. Leveraging the capabilities offered by the full-fledged filesystem
implementation and the topological information provided by SCMI
Telemetry we also expose an alternative view of the above tree, by
symlinking a few of the same entries above under another, topologically
sorted, subtree:
by-components/
├── cpu
│ ├── 0
│ │ ├── celsius
│ │ │ └── 0
│ │ │ └── 0x00000001[pe_0] -> ../../../../../des/0x00000001
│ │ └── cycles
│ │ ├── 0
│ │ │ └── 0x00001010[] -> ../../../../../des/0x00001010
│ │ └── 1
│ │ └── 0x00002020[] -> ../../../../../des/0x00002020
│ ├── 1
│ │ └── celsius
│ │ └── 0
│ │ └── 0x00000002[pe_1] -> ../../../../../des/0x00000002
│ └── 2
│ └── celsius
│ └── 0
│ └── 0x00000003[pe_2] -> ../../../../../des/0x00000003
├── interconnnect
│ └── 0
│ └── hertz
│ └── 0
│ ├── 0x0000A008[A008_de] -> ../../../../../des/0x0000A008
│ └── 0x0000A00B[] -> ../../../../../des/0x0000A00B
├── mem_cntrl
│ └── 0
│ ├── bps
│ │ └── 0
│ │ └── 0x0000A00A[] -> ../../../../../des/0x0000A00A
│ ├── celsius
│ │ └── 0
│ │ └── 0x0000A007[DRAM_temp] -> ../../../../../des/0x0000A007
│ └── joules
│ └── 0
│ └── 0x0000A002[DRAM_energy] -> ../../../../../des/0x0000A002
├── periph
│ ├── 0
│ │ └── messages
│ │ └── 0
│ │ └── 0x00000016[device_16] -> ../../../../../des/0x00000016
│ ├── 1
│ │ └── messages
│ │ └── 0
│ │ └── 0x00000017[device_17] -> ../../../../../des/0x00000017
│ └── 2
│ └── messages
│ └── 0
│ └── 0x00000018[device_18] -> ../../../../../des/0x00000018
└── unspec
└── 0
├── celsius
│ └── 0
│ └── 0x0000A005[] -> ../../../../../des/0x0000A005
├── counts
│ └── 0
│ └── 0x0000A00C[] -> ../../../../../des/0x0000A00C
├── joules
│ └── 0
│ ├── 0x0000A000[SOC_Energy] -> ../../../../../des/0x0000A000
│ └── 0x0000A001[] -> ../../../../../des/0x0000A001
└── state
└── 0
└── 0x0000A010[] -> ../../../../../des/0x0000A010
...so as to provide the human user with a more understandable topological
layout of the madness...
All of this is nice and fancy human-readable, easily scriptable, but
certainly not the fastest possible to access especially on huge trees...
... so for the afore-mentioned reasons we alternatively expose
3. a more performant API based on IOCTLs as described fully in:
include/uapi/linux/scmi.h
As described succinctly in the above UAPI header too, this API is meant
to be called on a few special files named 'control' that are populated
into the tree:
.
|-- all_des_enable
.....
|-- components
| |-- cpu
| |-- interconnnect
| |-- mem_cntrl
| |-- periph
| `-- unspec
|-- control
.....................
|-- groups
| |-- 0
| | |-- available_update_intervals_ms
| | |-- composing_des
| | |-- control
.....................
| |-- 1
| | |-- available_update_intervals_ms
| | |-- composing_des
| | |-- control
.....................
| `-- 2
| |-- available_update_intervals_ms
| |-- composing_des
| |-- control
.....................
This allows a tool to:
- use some IOCTLs to configure a set of properties equivalent to the
ones above in FS
- use some other IOCTLs for direct access to data in binary format
for a single DEs or all of them
4. [FUTURE/NOT IN THIS SERIES]
Add another alternative, completely binary, direct raw access interface
via a new set of memory mappable special files so as to allow userspace
tools to access SCMI Telemetry data directly in binary form without any
kernel mediation.
NOTE THAT this series, at the firmware interface level NOW supports ONLY
the latest SCMI v4.0 specification [0].
Missing feats & future steps
----------------------------
- add direct access interface via mmap-able 'raw' files
- add streaming mode interface via 'pipe' file (tentative)
- evolve/enhance app in tools/testing/scmi/stlm to be interactive
KNOWN ISSUES
------------
- STLMFS code layout and location...nothing lives in fs/ and no distinct
FS Kconfig...but the SCMI Telemetry driver itself has no point in existing
without the FS that exposes...so should I split the pure FS part into fs/
anyway or not ?
- residual sparse/smatch static analyzers errors
- stlm tool utility is minimal for testing or development
Based on V7.1-rc7, tested on an emulated setup.
This series is available also at [2].
If you are still reading...any feedback welcome :P
Thanks,
Cristian
----
v3 --> v4
- rebased on v7.1-rc7
- updated doc to detail Concurrency model
- bail out on FW_BUG errors
- make all_des_enable/all_des_tstamp_enable entry readable
- refactored access to TDE values
- refactored common accessors for tlm_priv (FIX WARN on kfree)
- make all files by default world readable and user writable (if needed)
- added uid/gid/umask mount options (and docs)
- added generation counter to aid spotting config changes (and docs)
- added DebugFS configurable support to debug/dump SHMTI areas (and docs)
- hide FS entries when NOT supported (like des_single_sample_read)
- fixed output format of des/<NNN>/value to -> <TS> <VALUE>
- renamed top-dir by_components to by-components
- add a .remove method to SCMI System Telemetry Driver
- use kzalloc_obj
V2 --> V3
- rebased on v7.0-rc5
- ported the firmware interface to SCMI v4.0 BETA
- split the SCMI protocol layer in a lot of small patches
- completd filesystem and ABI documentation
- renamed components subtree to by_components
- fixed uninitialized var in scmi_telemetry_de_subdir_symlink
- renamed tstamp_exp to tstamp_rate
- swap logic in scmi_telemetry_initial_state_lookup
- use memcpy_from_le32 where required
- changed a few dev_err into Telemetry traces
- define and use new helper scmi_telemetry_de_unlink
- simplify a few assignments with ternary ops
- added a missing __must_check on the internal SCMI API
- reworked and clarified de_data_read returned errno:
ENODATA vs EINVAL vs ENODEV/ENOENT
- removed some risky/unneeded devres allocations
- various checkpatch fixes
- reworked and clarified usage of traces in Telemetry
- added the missing DT binding for protocol 0x1B
- split out unrelated change around notification from patch
adding support for protocol internal notifier
- more comments
V1 --> V2
- rebased on v6.19-rc3
- harden TDCF shared memory areas accesses by using proper accessors
- reworked protocol resources lifecycle to allow lazy enumeration
- using NEW FS mount API
- reworked FS inode allocation to use a std kmem_cache
- fixed a few IOCTLs support routine to support lazy enumeration
- added (RFC) a new FS lazy mount option to support lazily population of
some subtrees of the FS (des/ groups/ components/)
- reworked implementation of components/ alternative FS view to use
symlinks instead of hardlinks
- added a basic simple (RFC) testing tool to exercise UAPI ioctls interface
- hardened Telemetry protocol and driver to support partial out-of-spec FW
lacking some cmds (best effort)
- reworked probing races handling
- reviewed behaviour on unmount/unload
- added support for Boot_ON Telemetry by supporting SCMI Telemetry cmds:
+ DE_ENABLED_LIST
+ CONFIG_GET
- added FS and ABI docs
RFC --> V1
---
- moved from SysFS/chardev to a full fledged FS
- added support for SCMI Telemetry BLK timestamps
Thanks,
Cristian
[0]: https://developer.arm.com/documentation/den0056/f/?lang=en
[1]: https://lore.kernel.org/arm-scmi/20250620192813.2463367-1-cristian.marussi@arm.com/
[2]: https://git.kernel.org/pub/scm/linux/kernel/git/cris/linux.git/log/?h=scmi_telemetry_unified_fs_V4
Cristian Marussi (31):
firmware: arm_scmi: Add new SCMIv4.0 error codes definitions
firmware: arm_scmi: Reduce the scope of protocols mutex
firmware: arm_scmi: Allow registration of unknown-size events/reports
firmware: arm_scmi: Allow protocols to register for notifications
uapi: Add ARM SCMI definitions
dt-bindings: firmware: arm,scmi: Add support for telemetry protocol
include: trace: Add Telemetry trace events
firmware: arm_scmi: Add basic Telemetry support
firmware: arm_scmi: Add support to parse SHMTIs areas
firmware: arm_scmi: Add Telemetry configuration operations
firmware: arm_scmi: Add Telemetry DataEvent read capabilities
firmware: arm_scmi: Add support for Telemetry reset
firmware: arm_scmi: Add Telemetry notification support
firmware: arm_scmi: Add support for boot-on Telemetry
firmware: arm_scmi: Add Telemetry generation counter
firmware: arm_scmi: Add common per-protocol debugfs support
firmware: arm_scmi: Add Telemetry debugfs SHMTI dump support
firmware: arm_scmi: Add Telemetry debugfs ABI documentation
firmware: arm_scmi: stlmfs: Add System Telemetry filesystem driver
fs/stlmfs: Document ARM SCMI Telemetry filesystem
firmware: arm_scmi: stlmfs: Add basic mount options
fs/stlmfs: Document ARM SCMI Telemetry FS mount options
firmware: arm_scmi: stlmfs: Add ioctls support
fs/stlmfs: Document alternative ioctl based binary interface
firmware: arm_scmi: stlmfs: Add by-components view
fs/stlmfs: Document alternative topological view
firmware: arm_scmi: stlmfs: Add generation file
[RFC] docs: stlmfs: Document ARM SCMI Telemetry FS ABI
firmware: arm_scmi: stlmfs: Add lazy population support
fs/stlmfs: Document lazy mode and related mount option
[RFC] tools/scmi: Add SCMI Telemetry testing tool
Documentation/ABI/testing/debugfs-scmi | 22 +
Documentation/ABI/testing/stlmfs | 348 ++
.../bindings/firmware/arm,scmi.yaml | 8 +
Documentation/filesystems/stlmfs.rst | 342 ++
MAINTAINERS | 1 +
drivers/firmware/arm_scmi/Kconfig | 24 +
drivers/firmware/arm_scmi/Makefile | 3 +-
drivers/firmware/arm_scmi/common.h | 10 +
drivers/firmware/arm_scmi/driver.c | 93 +-
drivers/firmware/arm_scmi/notify.c | 30 +-
drivers/firmware/arm_scmi/notify.h | 8 +-
drivers/firmware/arm_scmi/protocols.h | 13 +
.../firmware/arm_scmi/scmi_system_telemetry.c | 3146 ++++++++++++++++
drivers/firmware/arm_scmi/telemetry.c | 3300 +++++++++++++++++
include/linux/scmi_protocol.h | 203 +-
include/trace/events/scmi.h | 48 +-
include/uapi/linux/scmi.h | 289 ++
tools/testing/scmi/Makefile | 25 +
tools/testing/scmi/stlm.c | 434 +++
19 files changed, 8307 insertions(+), 40 deletions(-)
create mode 100644 Documentation/ABI/testing/stlmfs
create mode 100644 Documentation/filesystems/stlmfs.rst
create mode 100644 drivers/firmware/arm_scmi/scmi_system_telemetry.c
create mode 100644 drivers/firmware/arm_scmi/telemetry.c
create mode 100644 include/uapi/linux/scmi.h
create mode 100644 tools/testing/scmi/Makefile
create mode 100644 tools/testing/scmi/stlm.c
--
2.54.0
^ permalink raw reply
* Re: [PATCH net-next v09 1/5] hinic3: Add ethtool queue ops
From: Harshitha Ramamurthy @ 2026-06-12 22:09 UTC (permalink / raw)
To: Fan Gong
Cc: Wu Di, Teng Peisen, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
Ioana Ciornei, Mohsin Bashir, linux-kernel, linux-doc, luosifu,
Xin Guo, Zhou Shuai, Wu Like, Shi Jing, Zheng Jiezhen,
Maxime Chevallier
In-Reply-To: <02e87952a65aa268526ade2f03de6c76fbc1fe9d.1781062575.git.wudi234@huawei.com>
On Wed, Jun 10, 2026 at 12:05 AM Fan Gong <gongfan1@huawei.com> wrote:
>
> Implement following ethtool callback function:
> .get_ringparam
> .set_ringparam
>
> These callbacks allow users to utilize ethtool for detailed
> queue depth configuration and monitoring.
The patch adds a new mutex. Would be good to call it out in the commit message.
>
> Co-developed-by: Wu Di <wudi234@huawei.com>
> Signed-off-by: Wu Di <wudi234@huawei.com>
> Co-developed-by: Teng Peisen <tengpeisen@huawei.com>
> Signed-off-by: Teng Peisen <tengpeisen@huawei.com>
> Signed-off-by: Fan Gong <gongfan1@huawei.com>
> ---
> .../ethernet/huawei/hinic3/hinic3_ethtool.c | 93 ++++++++++++++++
> .../net/ethernet/huawei/hinic3/hinic3_irq.c | 5 +-
> .../net/ethernet/huawei/hinic3/hinic3_main.c | 6 +
> .../huawei/hinic3/hinic3_netdev_ops.c | 104 ++++++++++++++++--
> .../ethernet/huawei/hinic3/hinic3_nic_dev.h | 9 ++
> .../ethernet/huawei/hinic3/hinic3_nic_io.c | 4 +-
> .../ethernet/huawei/hinic3/hinic3_nic_io.h | 8 +-
> .../net/ethernet/huawei/hinic3/hinic3_rx.c | 2 +-
> 8 files changed, 217 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> index 90fc16288de9..be9992a235f7 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> @@ -9,6 +9,7 @@
> #include <linux/errno.h>
> #include <linux/etherdevice.h>
> #include <linux/netdevice.h>
> +#include <linux/netlink.h>
> #include <linux/ethtool.h>
>
> #include "hinic3_lld.h"
> @@ -409,6 +410,96 @@ hinic3_get_link_ksettings(struct net_device *netdev,
> return 0;
> }
>
> +static void hinic3_get_ringparam(struct net_device *netdev,
> + struct ethtool_ringparam *ring,
> + struct kernel_ethtool_ringparam *kernel_ring,
> + struct netlink_ext_ack *extack)
> +{
> + struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> +
> + ring->rx_max_pending = HINIC3_MAX_RX_QUEUE_DEPTH;
> + ring->tx_max_pending = HINIC3_MAX_TX_QUEUE_DEPTH;
> + ring->rx_pending = nic_dev->q_params.rq_depth;
> + ring->rx_pending = nic_dev->q_params.sq_depth;
copy-paste error: the second assignment should set ring->tx_pending, not ring->rx_pending
> +}
> +
> +static void hinic3_update_qp_depth(struct net_device *netdev,
> + u32 sq_depth, u32 rq_depth)
> +{
> + struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> + u16 i;
> +
> + nic_dev->q_params.sq_depth = sq_depth;
> + nic_dev->q_params.rq_depth = rq_depth;
> + for (i = 0; i < nic_dev->max_qps; i++) {
> + nic_dev->txqs[i].q_depth = sq_depth;
> + nic_dev->txqs[i].q_mask = sq_depth - 1;
> + nic_dev->rxqs[i].q_depth = rq_depth;
> + nic_dev->rxqs[i].q_mask = rq_depth - 1;
> + }
> +}
> +
> +static int hinic3_check_ringparam_valid(struct net_device *netdev,
> + const struct ethtool_ringparam *ring,
> + struct netlink_ext_ack *extack)
> +{
> + if (ring->tx_pending < HINIC3_MIN_QUEUE_DEPTH ||
> + ring->rx_pending < HINIC3_MIN_QUEUE_DEPTH) {
> + NL_SET_ERR_MSG_FMT_MOD(extack,
> + "Queue depth out of range tx[%d-%d] rx[%d-%d]",
> + HINIC3_MIN_QUEUE_DEPTH,
> + HINIC3_MAX_TX_QUEUE_DEPTH,
> + HINIC3_MIN_QUEUE_DEPTH,
> + HINIC3_MAX_RX_QUEUE_DEPTH);
Consider updating this error message to only call out when the ring
sizes are below the minimum supported value - since that's the check
introduced here and also since ethtool core will reject any values
that are higher than the maximum supported ring sizes.
> +
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static int hinic3_set_ringparam(struct net_device *netdev,
> + struct ethtool_ringparam *ring,
> + struct kernel_ethtool_ringparam *kernel_ring,
> + struct netlink_ext_ack *extack)
> +{
> + struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> + struct hinic3_dyna_txrxq_params q_params = {};
> + u32 new_sq_depth, new_rq_depth;
> + int err;
> +
> + err = hinic3_check_ringparam_valid(netdev, ring, extack);
> + if (err)
> + return err;
> +
> + new_sq_depth = 1U << ilog2(ring->tx_pending);
> + new_rq_depth = 1U << ilog2(ring->rx_pending);
Why not use rounddown_pow_of_two()? More readable...
> + if (new_sq_depth == nic_dev->q_params.sq_depth &&
> + new_rq_depth == nic_dev->q_params.rq_depth)
> + return 0;
> +
> + if (new_sq_depth != ring->tx_pending ||
> + new_rq_depth != ring->rx_pending)
> + NL_SET_ERR_MSG_FMT_MOD(extack,
> + "Requested Tx/Rx ring depth %u/%u trimmed to %u/%u",
> + ring->tx_pending, ring->rx_pending,
> + new_sq_depth, new_rq_depth);
> +
> + if (!netif_running(netdev)) {
> + hinic3_update_qp_depth(netdev, new_sq_depth, new_rq_depth);
> + } else {
> + q_params = nic_dev->q_params;
> + q_params.sq_depth = new_sq_depth;
> + q_params.rq_depth = new_rq_depth;
> +
> + err = hinic3_change_channel_settings(netdev, &q_params);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +
> static const struct ethtool_ops hinic3_ethtool_ops = {
> .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
> ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
> @@ -417,6 +508,8 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
> .get_msglevel = hinic3_get_msglevel,
> .set_msglevel = hinic3_set_msglevel,
> .get_link = ethtool_op_get_link,
> + .get_ringparam = hinic3_get_ringparam,
> + .set_ringparam = hinic3_set_ringparam,
> };
>
> void hinic3_set_ethtool_ops(struct net_device *netdev)
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
> index e7d6c2033b45..bc4d879f9be4 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
> @@ -137,7 +137,8 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
> struct hinic3_interrupt_info info = {};
> int err;
>
> - if (q_id >= nic_dev->q_params.num_qps)
> + if (q_id >= nic_dev->q_params.num_qps ||
> + !mutex_trylock(&nic_dev->change_res_mutex))
> return 0;
>
> info.interrupt_coalesc_set = 1;
> @@ -156,6 +157,8 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
> nic_dev->rxqs[q_id].last_pending_limit = pending_limit;
> }
>
> + mutex_unlock(&nic_dev->change_res_mutex);
> +
> return err;
> }
>
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
> index 0a888fe4c975..c87624a5e5dc 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
> @@ -179,6 +179,7 @@ static int hinic3_sw_init(struct net_device *netdev)
> int err;
>
> mutex_init(&nic_dev->port_state_mutex);
> + mutex_init(&nic_dev->change_res_mutex);
>
> nic_dev->q_params.sq_depth = HINIC3_SQ_DEPTH;
> nic_dev->q_params.rq_depth = HINIC3_RQ_DEPTH;
> @@ -315,6 +316,9 @@ static void hinic3_link_status_change(struct net_device *netdev,
> {
> struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
>
> + if (!mutex_trylock(&nic_dev->change_res_mutex))
> + return;
> +
> if (link_status_up) {
> if (netif_carrier_ok(netdev))
> return;
> @@ -330,6 +334,8 @@ static void hinic3_link_status_change(struct net_device *netdev,
> netif_carrier_off(netdev);
> netdev_dbg(netdev, "Link is down\n");
> }
> +
> + mutex_unlock(&nic_dev->change_res_mutex);
> }
>
> static void hinic3_port_module_event_handler(struct net_device *netdev,
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
> index da73811641a9..047214cfc753 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
> @@ -288,7 +288,8 @@ static void hinic3_free_channel_resources(struct net_device *netdev,
> hinic3_free_qps(nic_dev, qp_params);
> }
>
> -static int hinic3_open_channel(struct net_device *netdev)
> +static int hinic3_prepare_channel(struct net_device *netdev,
> + struct hinic3_dyna_txrxq_params *qp_params)
> {
> struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> int err;
> @@ -299,16 +300,28 @@ static int hinic3_open_channel(struct net_device *netdev)
> return err;
> }
>
> - err = hinic3_configure_txrxqs(netdev, &nic_dev->q_params);
> + err = hinic3_configure_txrxqs(netdev, qp_params);
> if (err) {
> netdev_err(netdev, "Failed to configure txrxqs\n");
> goto err_free_qp_ctxts;
> }
>
> + return 0;
> +
> +err_free_qp_ctxts:
> + hinic3_free_qp_ctxts(nic_dev);
> +
> + return err;
> +}
> +
> +static int hinic3_open_channel(struct net_device *netdev)
> +{
> + int err;
> +
> err = hinic3_qps_irq_init(netdev);
> if (err) {
> netdev_err(netdev, "Failed to init txrxq irq\n");
> - goto err_free_qp_ctxts;
> + return err;
> }
>
> err = hinic3_configure(netdev);
> @@ -321,8 +334,6 @@ static int hinic3_open_channel(struct net_device *netdev)
>
> err_uninit_qps_irq:
> hinic3_qps_irq_uninit(netdev);
> -err_free_qp_ctxts:
> - hinic3_free_qp_ctxts(nic_dev);
>
> return err;
> }
> @@ -428,6 +439,74 @@ static void hinic3_vport_down(struct net_device *netdev)
> }
> }
>
> +int
> +hinic3_change_channel_settings(struct net_device *netdev,
> + struct hinic3_dyna_txrxq_params *trxq_params)
> +{
> + struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> + struct hinic3_dyna_txrxq_params cur_trxq_params = {};
> + struct hinic3_dyna_qp_params new_qp_params = {};
> + struct hinic3_dyna_qp_params cur_qp_params = {};
> + int err;
> +
> + cur_trxq_params = nic_dev->q_params;
> +
> + hinic3_config_num_qps(netdev, trxq_params);
> +
> + err = hinic3_alloc_channel_resources(netdev, &new_qp_params,
> + trxq_params);
> + if (err) {
> + netdev_err(netdev, "Failed to alloc channel resources\n");
> + return err;
> + }
> +
> + mutex_lock(&nic_dev->change_res_mutex);
> + hinic3_vport_down(netdev);
> + hinic3_close_channel(netdev);
> + hinic3_get_cur_qps(nic_dev, &cur_qp_params);
> +
> + hinic3_init_qps(nic_dev, &new_qp_params);
> +
> + err = hinic3_prepare_channel(netdev, trxq_params);
> + if (err)
> + goto err_uninit_qps;
> +
> + if (nic_dev->num_qp_irq > trxq_params->num_qps)
> + hinic3_qp_irq_change(netdev, trxq_params->num_qps);
> +
> + nic_dev->q_params = *trxq_params;
> +
> + err = hinic3_open_channel(netdev);
> + if (err)
> + goto err_qp_irq_reset;
> +
> + err = hinic3_vport_up(netdev);
> + if (err)
> + goto err_close_channel;
> +
> + hinic3_free_channel_resources(netdev, &cur_qp_params, &cur_trxq_params);
> +
> + mutex_unlock(&nic_dev->change_res_mutex);
> +
> + return 0;
> +
> +err_close_channel:
> + hinic3_close_channel(netdev);
> +err_qp_irq_reset:
> + nic_dev->q_params = cur_trxq_params;
> +
> + if (trxq_params->num_qps > cur_trxq_params.num_qps)
> + hinic3_qp_irq_change(netdev, cur_trxq_params.num_qps);
> + hinic3_free_qp_ctxts(nic_dev);
> +err_uninit_qps:
> + hinic3_get_cur_qps(nic_dev, &new_qp_params);
> + hinic3_free_channel_resources(netdev, &new_qp_params, trxq_params);
> + hinic3_free_channel_resources(netdev, &cur_qp_params, &cur_trxq_params);
> + mutex_unlock(&nic_dev->change_res_mutex);
> +
> + return err;
> +}
> +
> static int hinic3_open(struct net_device *netdev)
> {
> struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> @@ -458,6 +537,10 @@ static int hinic3_open(struct net_device *netdev)
>
> hinic3_init_qps(nic_dev, &qp_params);
>
> + err = hinic3_prepare_channel(netdev, &nic_dev->q_params);
> + if (err)
> + goto err_uninit_qps;
> +
> err = hinic3_open_channel(netdev);
> if (err)
> goto err_uninit_qps;
> @@ -473,7 +556,7 @@ static int hinic3_open(struct net_device *netdev)
> err_close_channel:
> hinic3_close_channel(netdev);
> err_uninit_qps:
> - hinic3_uninit_qps(nic_dev, &qp_params);
> + hinic3_get_cur_qps(nic_dev, &qp_params);
> hinic3_free_channel_resources(netdev, &qp_params, &nic_dev->q_params);
> err_destroy_num_qps:
> hinic3_destroy_num_qps(netdev);
> @@ -493,10 +576,15 @@ static int hinic3_close(struct net_device *netdev)
> return 0;
> }
>
> + mutex_lock(&nic_dev->change_res_mutex);
> hinic3_vport_down(netdev);
> hinic3_close_channel(netdev);
> - hinic3_uninit_qps(nic_dev, &qp_params);
> - hinic3_free_channel_resources(netdev, &qp_params, &nic_dev->q_params);
> + hinic3_get_cur_qps(nic_dev, &qp_params);
> + hinic3_free_channel_resources(netdev, &qp_params,
> + &nic_dev->q_params);
> + hinic3_free_nicio_res(nic_dev);
> + hinic3_destroy_num_qps(netdev);
> + mutex_unlock(&nic_dev->change_res_mutex);
>
> return 0;
> }
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
> index 9502293ff710..005b2c01a988 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
> @@ -10,6 +10,9 @@
> #include "hinic3_hw_cfg.h"
> #include "hinic3_hwdev.h"
> #include "hinic3_mgmt_interface.h"
> +#include "hinic3_nic_io.h"
> +#include "hinic3_tx.h"
> +#include "hinic3_rx.h"
>
> #define HINIC3_VLAN_BITMAP_BYTE_SIZE(nic_dev) (sizeof(*(nic_dev)->vlan_bitmap))
> #define HINIC3_VLAN_BITMAP_SIZE(nic_dev) \
> @@ -129,6 +132,8 @@ struct hinic3_nic_dev {
> struct work_struct rx_mode_work;
> /* lock for enable/disable port */
> struct mutex port_state_mutex;
> + /* mutex to serialize channel/resource changes */
> + struct mutex change_res_mutex;
>
> struct list_head uc_filter_list;
> struct list_head mc_filter_list;
> @@ -143,6 +148,10 @@ struct hinic3_nic_dev {
>
> void hinic3_set_netdev_ops(struct net_device *netdev);
> int hinic3_set_hw_features(struct net_device *netdev);
> +int
> +hinic3_change_channel_settings(struct net_device *netdev,
> + struct hinic3_dyna_txrxq_params *trxq_params);
> +
> int hinic3_qps_irq_init(struct net_device *netdev);
> void hinic3_qps_irq_uninit(struct net_device *netdev);
>
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.c b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.c
> index 87e736adba02..0e7a0ccfba98 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.c
> @@ -484,8 +484,8 @@ void hinic3_init_qps(struct hinic3_nic_dev *nic_dev,
> }
> }
>
> -void hinic3_uninit_qps(struct hinic3_nic_dev *nic_dev,
> - struct hinic3_dyna_qp_params *qp_params)
> +void hinic3_get_cur_qps(struct hinic3_nic_dev *nic_dev,
> + struct hinic3_dyna_qp_params *qp_params)
> {
> struct hinic3_nic_io *nic_io = nic_dev->nic_io;
>
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
> index 12eefabcf1db..571b34d63950 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
> @@ -14,6 +14,10 @@ struct hinic3_nic_dev;
> #define HINIC3_RQ_WQEBB_SHIFT 3
> #define HINIC3_SQ_WQEBB_SIZE BIT(HINIC3_SQ_WQEBB_SHIFT)
>
> +#define HINIC3_MAX_TX_QUEUE_DEPTH 65536
> +#define HINIC3_MAX_RX_QUEUE_DEPTH 16384
> +#define HINIC3_MIN_QUEUE_DEPTH 128
> +
> /* ******************** RQ_CTRL ******************** */
> enum hinic3_rq_wqe_type {
> HINIC3_NORMAL_RQ_WQE = 1,
> @@ -136,8 +140,8 @@ void hinic3_free_qps(struct hinic3_nic_dev *nic_dev,
> struct hinic3_dyna_qp_params *qp_params);
> void hinic3_init_qps(struct hinic3_nic_dev *nic_dev,
> struct hinic3_dyna_qp_params *qp_params);
> -void hinic3_uninit_qps(struct hinic3_nic_dev *nic_dev,
> - struct hinic3_dyna_qp_params *qp_params);
> +void hinic3_get_cur_qps(struct hinic3_nic_dev *nic_dev,
> + struct hinic3_dyna_qp_params *qp_params);
>
> int hinic3_init_qp_ctxts(struct hinic3_nic_dev *nic_dev);
> void hinic3_free_qp_ctxts(struct hinic3_nic_dev *nic_dev);
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
> index 309ab5901379..b5b601469517 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
> @@ -541,7 +541,7 @@ int hinic3_configure_rxqs(struct net_device *netdev, u16 num_rq,
> rq_associate_cqes(rxq);
>
> pkts = hinic3_rx_fill_buffers(rxq);
> - if (!pkts) {
> + if (pkts < rxq->q_depth - 1) {
> netdev_err(netdev, "Failed to fill Rx buffer\n");
> return -ENOMEM;
> }
> --
> 2.43.0
>
>
^ permalink raw reply
* Re: [PATCH v6 1/7] dt-bindings: hwmon: Add Apple System Management Controller hwmon schema
From: Rob Herring @ 2026-06-12 22:07 UTC (permalink / raw)
To: James Calligeros
Cc: Sven Peter, Janne Grunau, Alyssa Rosenzweig, Neal Gompa,
Lee Jones, Krzysztof Kozlowski, Conor Dooley, Alexandre Belloni,
Jean Delvare, Guenter Roeck, Dmitry Torokhov, Jonathan Corbet,
asahi, linux-arm-kernel, devicetree, linux-kernel, linux-rtc,
linux-hwmon, linux-input, linux-doc
In-Reply-To: <20251215-macsmc-subdevs-v6-1-0518cb5f28ae@gmail.com>
On Mon, Dec 15, 2025 at 07:37:45PM +1000, James Calligeros wrote:
> Apple Silicon devices integrate a vast array of sensors, monitoring
> current, power, temperature, and voltage across almost every part of
> the system. The sensors themselves are all connected to the System
> Management Controller (SMC). The SMC firmware exposes the data
> reported by these sensors via its standard FourCC-based key-value
> API. The SMC is also responsible for monitoring and controlling any
> fans connected to the system, exposing them in the same way.
>
> For reasons known only to Apple, each device exposes its sensors with
> an almost totally unique set of keys. This is true even for devices
> which share an SoC. An M1 Mac mini, for example, will report its core
> temperatures on different keys to an M1 MacBook Pro. Worse still, the
> SMC does not provide a way to enumerate the available keys at runtime,
> nor do the keys follow any sort of reasonable or consistent naming
> rules that could be used to deduce their purpose. We must therefore
> know which keys are present on any given device, and which function
> they serve, ahead of time.
>
> Add a schema so that we can describe the available sensors for a given
> Apple Silicon device in the Devicetree.
>
> Reviewed-by: Neal Gompa <neal@gompa.dev>
> Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
> Signed-off-by: James Calligeros <jcalligeros99@gmail.com>
> ---
> .../bindings/hwmon/apple,smc-hwmon.yaml | 86 +++++++++++++++++++++++++
> .../bindings/mfd/apple,smc.yaml | 36 +++++++++++
> MAINTAINERS | 1 +
> 3 files changed, 123 insertions(+)
I fixed up the error and applied. Please ensure all your patches get
applied.
Rob
^ permalink raw reply
* Re: [PATCH] Documentation: ABI: sysfs-class-reboot-mode-reboot_modes: fix doc warnings
From: Sebastian Reichel @ 2026-06-12 22:05 UTC (permalink / raw)
To: linux-kernel, Randy Dunlap
Cc: linux-pm, linux-arm-kernel, Bartosz Golaszewski, Shivendra Pratap,
linux-doc
In-Reply-To: <20260426232705.422938-1-rdunlap@infradead.org>
On Sun, 26 Apr 2026 16:27:05 -0700, Randy Dunlap wrote:
> Repair the docs build warnings in this file by unindenting the description,
> adding blank lines, and using `` to quote *arg.
>
> WARNING: Documentation/ABI/testing/sysfs-class-reboot-mode-reboot_modes:36: abi_sys_class_reboot_mode_driver_reboot_modes doesn't have a description
> Documentation/ABI/testing/sysfs-class-reboot-mode-reboot_modes:1: ERROR: Unexpected indentation. [docutils]
> Documentation/ABI/testing/sysfs-class-reboot-mode-reboot_modes:1: ERROR: Unexpected indentation. [docutils]
> Documentation/ABI/testing/sysfs-class-reboot-mode-reboot_modes:1: WARNING: Inline emphasis start-string without end-string. [docutils]
> Documentation/ABI/testing/sysfs-class-reboot-mode-reboot_modes:1: ERROR: Unexpected indentation. [docutils]
>
> [...]
Applied, thanks!
[1/1] Documentation: ABI: sysfs-class-reboot-mode-reboot_modes: fix doc warnings
commit: a888754e51e915731c8974c4d6d62709facb35d3
Best regards,
--
Sebastian Reichel <sebastian.reichel@collabora.com>
^ permalink raw reply
* Re: [swap tier discussion] Re: [PATCH v3 2/4] mm/zswap: Implement proactive writeback
From: Yosry Ahmed @ 2026-06-12 21:31 UTC (permalink / raw)
To: Shakeel Butt
Cc: YoungJun Park, Hao Jia, Johannes Weiner, mhocko, tj, mkoutny,
roman.gushchin, Nhat Pham, akpm, chengming.zhou, muchun.song,
cgroups, linux-mm, linux-kernel, linux-doc, Hao Jia, chrisl,
kasong, baoquan.he, joshua.hahnjy
In-Reply-To: <aiw2p5ANjsQUCIHA@linux.dev>
> > > Is Hao's work needed for some followup work/development? The earliest Hao's
> > > work can is 7.3, so if we aim to figure out swap tiering interfaces in next
> > > couple of weeks then option 3 is the way to go. If swap tiers take more time
> > > then we can discuss other options as well.
> > > However I would need zswap folks (Yosry & Nhat) help in figuring out swap tiers
> > > interfaces. Zswap is the current top tier swap usage in real world. I want
> > > zswap users to eaily (and hopefully transparently) migrate to swap tiers.
> >
> > I am looking forward to the discussion on this interface!
> >
> > To help boost the discussion and progress, I would like to share a few of my thoughts.
> > We could either introduce a new interface to trigger demotion/promotion,
> > or we could reuse the existing one (using tier just internally)
> >
> > Based on the memcg interface currently proposed in swap_tier
> > (memory.swap.tiers, memory.swap.tiers.effective), I think it aligns well
> > with the current direction. It provides a foundation for selectively
> > targeting devices in tier order.
>
> Here instead of cpuset like interface, we may want more zswap like interface
> where you can put limit on the usage i.e. memory.swap.tier*.max. We can start
> with allowing only two values i.e. 0 and max which effectively will be the
> same as what you need.
>
> I will respond to your other points later when I have time.
If we will have one interface for all the tiers for memory tiering,
I'd rather we do the same for swap tiering. So maybe
memory.swap.tiers.max or memory.swap.tiered.max?
The file can show the limits for all tiers when read, and maybe write
something like "echo 'tierX max' > memory.swap.tiers.max" to it to set
a new limit. We can support only 0/max for now to enable/disable
tiers. In the future, we can also allow something like "auto" to
automatically scale the limit based on the swapfile size and
memory.swap.max, similar to the direction memory tiering is heading
in.
I think we can start with just this interface for now, and expand
incrementally. For proactive zswap writeback, we can add
memory.swap.tiers.demote or something, and only support zswap
initially?
^ permalink raw reply
* Re: [RFC V2 3/3] mm: Replace pgtable entry prints with new format
From: Hugh Dickins @ 2026-06-12 21:26 UTC (permalink / raw)
To: David Hildenbrand (Arm)
Cc: Anshuman Khandual, linux-mm, Andy Shevchenko, Rasmus Villemoes,
Sergey Senozhatsky, Petr Mladek, Steven Rostedt, Jonathan Corbet,
Andrew Morton, linux-kernel, linux-doc, Lorenzo Stoakes
In-Reply-To: <fc57bb9a-4564-489e-8da4-65068b5283ae@kernel.org>
On Fri, 12 Jun 2026, David Hildenbrand (Arm) wrote:
...
>
> After some off-list discussion, I wonder if we can make our life easier.
>
> I think, even with your patch, there is still the case:
>
> pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm,
> pgtable_level_to_str(level), entry);
>
> Where we cast all entries to an "unsigned long" in the callers. We'd have to rework all
> that for 128bit entries either way (passing them in some struct instead).
>
> I really just extended what we used to do here in print_bad_pte() before commit ec63a44011d.
>
> Maybe we should just drop the "print the involved page table entries" thing?
>
> I mean, we do have the actual page, and we do have the address in the address space, which
> we all print.
>
> Not sure if the actual page table entries are that relevant?
The page table entry is BUGgily Bad: we want to see what it looks like
(sometimes, a sequence of bad page map entries may even show up as ASCII).
Hugh
^ permalink raw reply
* Re: [PATCH] docs: pt_BR: Translate 3.Early-stage.rst into Portuguese
From: Jonathan Corbet @ 2026-06-12 19:40 UTC (permalink / raw)
To: Daniel Pereira; +Cc: linux-doc, Daniel Pereira
In-Reply-To: <20260601192346.192752-1-danielmaraboo@gmail.com>
Daniel Pereira <danielmaraboo@gmail.com> writes:
> Translate the documentation file '3.Early-stage.rst' into Portuguese.
>
> This section addresses corporate kernel development constraints,
> the balance between company secrecy and the open-loop approach,
> and the use of NDAs or Linux Foundation programs to avoid
> integration issues.
>
> Signed-off-by: Daniel Pereira <danielmaraboo@gmail.com>
> ---
> .../pt_BR/process/3.Early-stage.rst | 233 ++++++++++++++++++
> .../pt_BR/process/development-process.rst | 1 +
> 2 files changed, 234 insertions(+)
> create mode 100644 Documentation/translations/pt_BR/process/3.Early-stage.rst
>
Applied, thanks.
jon
^ permalink raw reply
* Re: [PATCH] kdoc: xforms: ignore special static/inline macros
From: Jonathan Corbet @ 2026-06-12 19:34 UTC (permalink / raw)
To: Randy Dunlap, linux-doc
Cc: Randy Dunlap, Shuah Khan, Mauro Carvalho Chehab, Harry Wentland,
Alex Hung, Ivan Lipski, Dan Wheeler, Alex Deucher,
Christian König, amd-gfx
In-Reply-To: <20260602031214.2817411-1-rdunlap@infradead.org>
Randy Dunlap <rdunlap@infradead.org> writes:
> drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c contains 7 (for
> now) functions that use STATIC_IFN_KUNIT or INLINE_IFN_KUNIT macros for
> function qualifiers (static or not, inline or not).
>
> These cause parse warnings from kernel-doc:
> Invalid C declaration: Expected identifier in nested name, got keyword:
> struct [error at 29]
> STATIC_IFN_KUNIT const struct drm_color_lut * __extract_blob_lut (const
> struct drm_property_blob *blob, uint32_t *size)
>
> Handle these in kernel-doc to prevent multiple warnings.
>
> Fixes: 647d1fd04652 ("drm/amd/display: Add KUnit test for color helpers")
> Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
> ---
> Cc: Jonathan Corbet <corbet@lwn.net>
> Cc: Shuah Khan <skhan@linuxfoundation.org>
> Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
> Cc: Harry Wentland <harry.wentland@amd.com>
> Cc: Alex Hung <alex.hung@amd.com>
> Cc: Ivan Lipski <ivan.lipski@amd.com>
> Cc: Dan Wheeler <daniel.wheeler@amd.com>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: amd-gfx@lists.freedesktop.org
>
> tools/lib/python/kdoc/xforms_lists.py | 2 ++
> 1 file changed, 2 insertions(+)
>
> --- linux-next-20260601.orig/tools/lib/python/kdoc/xforms_lists.py
> +++ linux-next-20260601/tools/lib/python/kdoc/xforms_lists.py
> @@ -104,6 +104,8 @@ class CTransforms:
> (CMatch("__context_unsafe"), ""),
> (CMatch("__attribute_const__"), ""),
> (CMatch("__attribute__"), ""),
> + (CMatch("STATIC_IFN_KUNIT"), ""),
> + (CMatch("INLINE_IFN_KUNIT"), ""),
So I can't get this one to apply; which tree did you patch here?
Thanks,
jon
^ permalink raw reply
* Re: [PATCH] Documentation: index.rst: add entry of other sub-directory
From: Jonathan Corbet @ 2026-06-12 19:30 UTC (permalink / raw)
To: Manuel Ebner, Sebastian Andrzej Siewior, Clark Williams,
Steven Rostedt, Shuah Khan,
open list:Real-time Linux (PREEMPT_RT), open list:DOCUMENTATION,
open list
Cc: Manuel Ebner
In-Reply-To: <20260603080430.344391-2-manuelebner@mailbox.org>
Manuel Ebner <manuelebner@mailbox.org> writes:
> add reference to scheduler/sched-rt-group.rst
>
> Signed-off-by: Manuel Ebner <manuelebner@mailbox.org>
> ---
> Documentation/core-api/real-time/index.rst | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/Documentation/core-api/real-time/index.rst b/Documentation/core-api/real-time/index.rst
> index f08d2395a22c..661b419e7f8f 100644
> --- a/Documentation/core-api/real-time/index.rst
> +++ b/Documentation/core-api/real-time/index.rst
> @@ -15,3 +15,4 @@ the required changes compared to a non-PREEMPT_RT configuration.
> differences
> hardware
> architecture-porting
> + Real-Time group scheduling <../../scheduler/sched-rt-group>
This file already appears in the toctree in
Documentation/scheduler/index.rst; adding it elsewhere can create build
problems. If you really want a link here, the way to do it is to add a
cross reference instead.
jon
^ permalink raw reply
* [PATCH 21/21] KVM: arm64: selftests: Relax testing for exceptions when partitioned
From: Colton Lewis @ 2026-06-12 19:29 UTC (permalink / raw)
To: kvm
Cc: Alexandru Elisei, Paolo Bonzini, Jonathan Corbet, Russell King,
Catalin Marinas, Will Deacon, Marc Zyngier, Oliver Upton,
Mingwei Zhang, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Mark Rutland, Shuah Khan, Ganapatrao Kulkarni, James Clark,
linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-perf-users, linux-kselftest, Colton Lewis
In-Reply-To: <20260612192909.1153907-1-coltonlewis@google.com>
Because the Partitioned PMU must lean heavily on underlying hardware
support, it can't guarantee an exception occurs when accessing an
invalid pmc index.
The ARM manual specifies that accessing PMEVCNTR<n>_EL0 where n is
greater than the number of counters on the system is constrained
unpredictable when FEAT_FGT is not implemented, and it is desired the
Partitioned PMU still work without FEAT_FGT.
Though KVM could enforce exceptions here since all PMU accesses
without FEAT_FGT are trapped, that creates further difficulties. For
one example, the manual also says that after writing a value to
PMSELR_EL0 greater than the number of counters on a system, direct
reads will return an unknown value, meaning KVM could not rely on the
hardware register to hold the correct value.
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
.../selftests/kvm/arm64/vpmu_counter_access.c | 20 ++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c
index 9be6034335283..e8c3856df77b7 100644
--- a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c
+++ b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c
@@ -38,10 +38,14 @@ const char *pmu_impl_str[] = {
struct vpmu_vm {
struct kvm_vm *vm;
struct kvm_vcpu *vcpu;
+};
+
+struct guest_context {
bool pmu_partitioned;
};
static struct vpmu_vm vpmu_vm;
+static struct guest_context guest_context;
struct pmreg_sets {
u64 set_reg_id;
@@ -342,11 +346,16 @@ static void test_access_invalid_pmc_regs(struct pmc_accessor *acc, int pmc_idx)
/*
* Reading/writing the event count/type registers should cause
* an UNDEFINED exception.
+ *
+ * If the pmu is partitioned, we can't guarantee it because
+ * hardware doesn't.
*/
- TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_cntr(pmc_idx));
- TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_cntr(pmc_idx, 0));
- TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_typer(pmc_idx));
- TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_typer(pmc_idx, 0));
+ if (!guest_context.pmu_partitioned) {
+ TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_cntr(pmc_idx));
+ TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_cntr(pmc_idx, 0));
+ TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->read_typer(pmc_idx));
+ TEST_EXCEPTION(ESR_ELx_EC_UNKNOWN, acc->write_typer(pmc_idx, 0));
+ }
/*
* The bit corresponding to the (unimplemented) counter in
* {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers should be RAZ.
@@ -459,7 +468,7 @@ static void create_vpmu_vm(void *guest_code, enum pmu_impl impl)
vpmu_vm.vcpu, KVM_ARM_VCPU_PMU_V3_CTRL, KVM_ARM_VCPU_PMU_V3_ENABLE_PARTITION);
if (!ret) {
vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &part_attr);
- vpmu_vm.pmu_partitioned = partition;
+ guest_context.pmu_partitioned = partition;
pr_debug("Set PMU partitioning: %d\n", partition);
}
@@ -511,6 +520,7 @@ static void test_create_vpmu_vm_with_nr_counters(
TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret));
vcpu_device_attr_set(vcpu, KVM_ARM_VCPU_PMU_V3_CTRL, KVM_ARM_VCPU_PMU_V3_INIT, NULL);
+ sync_global_to_guest(vpmu_vm.vm, guest_context);
}
/*
--
2.54.0.1136.gdb2ca164c4-goog
^ permalink raw reply related
* [PATCH 20/21] KVM: arm64: selftests: Add test case for Partitioned PMU
From: Colton Lewis @ 2026-06-12 19:29 UTC (permalink / raw)
To: kvm
Cc: Alexandru Elisei, Paolo Bonzini, Jonathan Corbet, Russell King,
Catalin Marinas, Will Deacon, Marc Zyngier, Oliver Upton,
Mingwei Zhang, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Mark Rutland, Shuah Khan, Ganapatrao Kulkarni, James Clark,
linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-perf-users, linux-kselftest, Colton Lewis
In-Reply-To: <20260612192909.1153907-1-coltonlewis@google.com>
Rerun all tests for a Partitioned PMU in vpmu_counter_access.
Create an enum specifying whether we are testing the emulated or
Partitioned PMU, and modify all the test functions to take the
implementation as an argument and adjust their setup
accordingly.
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
.../selftests/kvm/arm64/vpmu_counter_access.c | 94 ++++++++++++++-----
1 file changed, 73 insertions(+), 21 deletions(-)
diff --git a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c
index 22223395969e0..9be6034335283 100644
--- a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c
+++ b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c
@@ -25,9 +25,20 @@
/* The cycle counter bit position that's common among the PMU registers */
#define ARMV8_PMU_CYCLE_IDX 31
+enum pmu_impl {
+ EMULATED,
+ PARTITIONED
+};
+
+const char *pmu_impl_str[] = {
+ "Emulated",
+ "Partitioned"
+};
+
struct vpmu_vm {
struct kvm_vm *vm;
struct kvm_vcpu *vcpu;
+ bool pmu_partitioned;
};
static struct vpmu_vm vpmu_vm;
@@ -399,7 +410,7 @@ static void guest_code(u64 expected_pmcr_n)
}
/* Create a VM that has one vCPU with PMUv3 configured. */
-static void create_vpmu_vm(void *guest_code)
+static void create_vpmu_vm(void *guest_code, enum pmu_impl impl)
{
struct kvm_vcpu_init init;
u8 pmuver, ec;
@@ -409,6 +420,13 @@ static void create_vpmu_vm(void *guest_code)
.attr = KVM_ARM_VCPU_PMU_V3_IRQ,
.addr = (u64)&irq,
};
+ u32 partition = (impl == PARTITIONED);
+ struct kvm_device_attr part_attr = {
+ .group = KVM_ARM_VCPU_PMU_V3_CTRL,
+ .attr = KVM_ARM_VCPU_PMU_V3_ENABLE_PARTITION,
+ .addr = (uint64_t)&partition
+ };
+ int ret;
/* The test creates the vpmu_vm multiple times. Ensure a clean state */
memset(&vpmu_vm, 0, sizeof(vpmu_vm));
@@ -436,6 +454,15 @@ static void create_vpmu_vm(void *guest_code)
"Unexpected PMUVER (0x%x) on the vCPU with PMUv3", pmuver);
vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &irq_attr);
+
+ ret = __vcpu_has_device_attr(
+ vpmu_vm.vcpu, KVM_ARM_VCPU_PMU_V3_CTRL, KVM_ARM_VCPU_PMU_V3_ENABLE_PARTITION);
+ if (!ret) {
+ vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &part_attr);
+ vpmu_vm.pmu_partitioned = partition;
+ pr_debug("Set PMU partitioning: %d\n", partition);
+ }
+
}
static void destroy_vpmu_vm(void)
@@ -461,13 +488,14 @@ static void run_vcpu(struct kvm_vcpu *vcpu, u64 pmcr_n)
}
}
-static void test_create_vpmu_vm_with_nr_counters(unsigned int nr_counters, bool expect_fail)
+static void test_create_vpmu_vm_with_nr_counters(
+ unsigned int nr_counters, enum pmu_impl impl, bool expect_fail)
{
struct kvm_vcpu *vcpu;
unsigned int prev;
int ret;
- create_vpmu_vm(guest_code);
+ create_vpmu_vm(guest_code, impl);
vcpu = vpmu_vm.vcpu;
prev = get_pmcr_n(vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)));
@@ -489,7 +517,7 @@ static void test_create_vpmu_vm_with_nr_counters(unsigned int nr_counters, bool
* Create a guest with one vCPU, set the PMCR_EL0.N for the vCPU to @pmcr_n,
* and run the test.
*/
-static void run_access_test(u64 pmcr_n)
+static void run_access_test(u64 pmcr_n, enum pmu_impl impl)
{
u64 sp;
struct kvm_vcpu *vcpu;
@@ -497,7 +525,7 @@ static void run_access_test(u64 pmcr_n)
pr_debug("Test with pmcr_n %lu\n", pmcr_n);
- test_create_vpmu_vm_with_nr_counters(pmcr_n, false);
+ test_create_vpmu_vm_with_nr_counters(pmcr_n, impl, false);
vcpu = vpmu_vm.vcpu;
/* Save the initial sp to restore them later to run the guest again */
@@ -531,14 +559,14 @@ static struct pmreg_sets validity_check_reg_sets[] = {
* Create a VM, and check if KVM handles the userspace accesses of
* the PMU register sets in @validity_check_reg_sets[] correctly.
*/
-static void run_pmregs_validity_test(u64 pmcr_n)
+static void run_pmregs_validity_test(u64 pmcr_n, enum pmu_impl impl)
{
int i;
struct kvm_vcpu *vcpu;
u64 set_reg_id, clr_reg_id, reg_val;
u64 valid_counters_mask, max_counters_mask;
- test_create_vpmu_vm_with_nr_counters(pmcr_n, false);
+ test_create_vpmu_vm_with_nr_counters(pmcr_n, impl, false);
vcpu = vpmu_vm.vcpu;
valid_counters_mask = get_counters_mask(pmcr_n);
@@ -588,11 +616,11 @@ static void run_pmregs_validity_test(u64 pmcr_n)
* the vCPU to @pmcr_n, which is larger than the host value.
* The attempt should fail as @pmcr_n is too big to set for the vCPU.
*/
-static void run_error_test(u64 pmcr_n)
+static void run_error_test(u64 pmcr_n, enum pmu_impl impl)
{
- pr_debug("Error test with pmcr_n %lu (larger than the host)\n", pmcr_n);
+ pr_debug("Error test with pmcr_n %lu (larger than the host allows)\n", pmcr_n);
- test_create_vpmu_vm_with_nr_counters(pmcr_n, true);
+ test_create_vpmu_vm_with_nr_counters(pmcr_n, impl, true);
destroy_vpmu_vm();
}
@@ -600,11 +628,11 @@ static void run_error_test(u64 pmcr_n)
* Return the default number of implemented PMU event counters excluding
* the cycle counter (i.e. PMCR_EL0.N value) for the guest.
*/
-static u64 get_pmcr_n_limit(void)
+static u64 get_pmcr_n_limit(enum pmu_impl impl)
{
u64 pmcr;
- create_vpmu_vm(guest_code);
+ create_vpmu_vm(guest_code, impl);
pmcr = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0));
destroy_vpmu_vm();
return get_pmcr_n(pmcr);
@@ -614,7 +642,7 @@ static bool kvm_supports_nr_counters_attr(void)
{
bool supported;
- create_vpmu_vm(NULL);
+ create_vpmu_vm(NULL, EMULATED);
supported = !__vcpu_has_device_attr(vpmu_vm.vcpu, KVM_ARM_VCPU_PMU_V3_CTRL,
KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS);
destroy_vpmu_vm();
@@ -622,22 +650,46 @@ static bool kvm_supports_nr_counters_attr(void)
return supported;
}
-int main(void)
+static bool kvm_supports_partition_attr(void)
+{
+ bool supported;
+
+ create_vpmu_vm(NULL, EMULATED);
+ supported = !__vcpu_has_device_attr(vpmu_vm.vcpu, KVM_ARM_VCPU_PMU_V3_CTRL,
+ KVM_ARM_VCPU_PMU_V3_ENABLE_PARTITION);
+ destroy_vpmu_vm();
+
+ return supported;
+}
+
+void test_pmu(enum pmu_impl impl)
{
u64 i, pmcr_n;
- TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_PMU_V3));
- TEST_REQUIRE(kvm_supports_vgic_v3());
- TEST_REQUIRE(kvm_supports_nr_counters_attr());
+ pr_info("Testing PMU: Implementation = %s\n", pmu_impl_str[impl]);
+
+ pmcr_n = get_pmcr_n_limit(impl);
+ pr_debug("PMCR_EL0.N: Limit = %lu\n", pmcr_n);
- pmcr_n = get_pmcr_n_limit();
for (i = 0; i <= pmcr_n; i++) {
- run_access_test(i);
- run_pmregs_validity_test(i);
+ run_access_test(i, impl);
+ run_pmregs_validity_test(i, impl);
}
for (i = pmcr_n + 1; i < ARMV8_PMU_MAX_COUNTERS; i++)
- run_error_test(i);
+ run_error_test(i, impl);
+}
+
+int main(void)
+{
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_PMU_V3));
+ TEST_REQUIRE(kvm_supports_vgic_v3());
+ TEST_REQUIRE(kvm_supports_nr_counters_attr());
+
+ test_pmu(EMULATED);
+
+ if (kvm_supports_partition_attr())
+ test_pmu(PARTITIONED);
return 0;
}
--
2.54.0.1136.gdb2ca164c4-goog
^ permalink raw reply related
* [PATCH 19/21] KVM: selftests: Add find_bit to KVM library
From: Colton Lewis @ 2026-06-12 19:29 UTC (permalink / raw)
To: kvm
Cc: Alexandru Elisei, Paolo Bonzini, Jonathan Corbet, Russell King,
Catalin Marinas, Will Deacon, Marc Zyngier, Oliver Upton,
Mingwei Zhang, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Mark Rutland, Shuah Khan, Ganapatrao Kulkarni, James Clark,
linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-perf-users, linux-kselftest, Colton Lewis
In-Reply-To: <20260612192909.1153907-1-coltonlewis@google.com>
Some selftests have a dependency on find_bit and weren't compiling
separately without it, so I've added it to the KVM library here using
the same method as files like rbtree.c.
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
tools/testing/selftests/kvm/Makefile.kvm | 1 +
tools/testing/selftests/kvm/lib/find_bit.c | 2 ++
2 files changed, 3 insertions(+)
create mode 100644 tools/testing/selftests/kvm/lib/find_bit.c
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 9118a5a51b89f..fa7a2746b1c13 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -5,6 +5,7 @@ all:
LIBKVM += lib/assert.c
LIBKVM += lib/elf.c
+LIBKVM += lib/find_bit.c
LIBKVM += lib/guest_modes.c
LIBKVM += lib/io.c
LIBKVM += lib/kvm_util.c
diff --git a/tools/testing/selftests/kvm/lib/find_bit.c b/tools/testing/selftests/kvm/lib/find_bit.c
new file mode 100644
index 0000000000000..5534248c663f7
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/find_bit.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../../../lib/find_bit.c"
--
2.54.0.1136.gdb2ca164c4-goog
^ permalink raw reply related
* [PATCH 18/21] KVM: arm64: Add vCPU device attr to partition the PMU
From: Colton Lewis @ 2026-06-12 19:29 UTC (permalink / raw)
To: kvm
Cc: Alexandru Elisei, Paolo Bonzini, Jonathan Corbet, Russell King,
Catalin Marinas, Will Deacon, Marc Zyngier, Oliver Upton,
Mingwei Zhang, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Mark Rutland, Shuah Khan, Ganapatrao Kulkarni, James Clark,
linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-perf-users, linux-kselftest, Colton Lewis
In-Reply-To: <20260612192909.1153907-1-coltonlewis@google.com>
Add a new PMU device attr to enable the partitioned PMU for a given
VM. This capability can be set when the PMU is initially configured
before the vCPU starts running and is allowed where PMUv3 and VHE are
supported and the host driver was configured with
arm_pmuv3.reserved_host_counters.
The enabled capability is tracked by the new flag
KVM_ARCH_FLAG_PARTITION_PMU_ENABLED.
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
arch/arm64/include/uapi/asm/kvm.h | 2 ++
arch/arm64/kvm/pmu-direct.c | 30 ++++++++++++++++++++++++++++++
arch/arm64/kvm/pmu.c | 23 +++++++++++++++++++++++
include/kvm/arm_pmu.h | 9 +++++++++
4 files changed, 64 insertions(+)
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 1c13bfa2d38aa..7f57b8c132925 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -437,6 +437,8 @@ enum {
#define KVM_ARM_VCPU_PMU_V3_FILTER 2
#define KVM_ARM_VCPU_PMU_V3_SET_PMU 3
#define KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS 4
+#define KVM_ARM_VCPU_PMU_V3_ENABLE_PARTITION 5
+
#define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c
index 0062d1d8e1999..2d2294b78ebe0 100644
--- a/arch/arm64/kvm/pmu-direct.c
+++ b/arch/arm64/kvm/pmu-direct.c
@@ -24,6 +24,36 @@ bool has_host_pmu_partition_support(void)
system_supports_pmuv3();
}
+
+/**
+ * has_kvm_pmu_partition_support() - If we can enable/disable partition
+ *
+ * Return: true if allowed, false otherwise.
+ */
+bool has_kvm_pmu_partition_support(void)
+{
+ return has_host_pmu_partition_support() &&
+ kvm_supports_guest_pmuv3() &&
+ armv8pmu_is_partitioned;
+}
+
+/**
+ * kvm_pmu_partition_enable() - Enable/disable partition flag
+ * @kvm: Pointer to struct kvm
+ * @enable: Whether to enable or disable
+ *
+ * If we want to enable the partition, the guest is free to grab
+ * hardware by accessing PMU registers. Otherwise, the host maintains
+ * control.
+ */
+void kvm_pmu_partition_enable(struct kvm *kvm, bool enable)
+{
+ if (enable)
+ set_bit(KVM_ARCH_FLAG_PARTITION_PMU_ENABLED, &kvm->arch.flags);
+ else
+ clear_bit(KVM_ARCH_FLAG_PARTITION_PMU_ENABLED, &kvm->arch.flags);
+}
+
/**
* pmu_is_partitioned() - Determine if given PMU is partitioned
* @pmu: Pointer to arm_pmu struct
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index f5ee18b4dfae7..4e15948ac2565 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -769,6 +769,28 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
return kvm_arm_pmu_v3_set_nr_counters(vcpu, n);
}
+ case KVM_ARM_VCPU_PMU_V3_ENABLE_PARTITION: {
+ unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr;
+ u32 val;
+
+ if (get_user(val, uaddr))
+ return -EFAULT;
+
+ if (!has_kvm_pmu_partition_support())
+ return -EPERM;
+
+ if (kvm_vm_has_ran_once(kvm))
+ return -EBUSY;
+
+ kvm_pmu_partition_enable(kvm, val);
+ if (val) {
+ unsigned int max_counters = kvm_arm_pmu_get_max_counters(kvm);
+
+ if (kvm->arch.nr_pmu_counters > max_counters)
+ kvm_arm_set_nr_counters(kvm, max_counters);
+ }
+ return 0;
+ }
case KVM_ARM_VCPU_PMU_V3_INIT:
return kvm_arm_pmu_v3_init(vcpu);
}
@@ -808,6 +830,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
case KVM_ARM_VCPU_PMU_V3_FILTER:
case KVM_ARM_VCPU_PMU_V3_SET_PMU:
case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS:
+ case KVM_ARM_VCPU_PMU_V3_ENABLE_PARTITION:
if (kvm_vcpu_has_pmu(vcpu))
return 0;
}
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index f72d080ee7ba2..6a5572994b7fa 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -99,6 +99,8 @@ bool kvm_pmu_part_overflow_status(struct kvm_vcpu *vcpu);
#define kvm_vcpu_has_pmu(vcpu) \
(vcpu_has_feature(vcpu, KVM_ARM_VCPU_PMU_V3))
+bool has_kvm_pmu_partition_support(void);
+void kvm_pmu_partition_enable(struct kvm *kvm, bool enable);
bool pmu_is_partitioned(struct arm_pmu *pmu);
bool kvm_pmu_is_partitioned(struct kvm *kvm);
void kvm_pmu_direct_pmcr_write(struct kvm_vcpu *vcpu, u64 val);
@@ -279,6 +281,13 @@ static inline u64 kvm_pmu_guest_counter_mask(void *kvm)
static inline void kvm_pmu_handle_guest_irq(struct arm_pmu *pmu, u64 pmovsr) {}
+static inline bool has_kvm_pmu_partition_support(void)
+{
+ return false;
+}
+
+static inline void kvm_pmu_partition_enable(struct kvm *kvm, bool enable) {}
+
#endif
#endif
--
2.54.0.1136.gdb2ca164c4-goog
^ permalink raw reply related
* [PATCH 17/21] KVM: arm64: Detect overflows for the Partitioned PMU
From: Colton Lewis @ 2026-06-12 19:29 UTC (permalink / raw)
To: kvm
Cc: Alexandru Elisei, Paolo Bonzini, Jonathan Corbet, Russell King,
Catalin Marinas, Will Deacon, Marc Zyngier, Oliver Upton,
Mingwei Zhang, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Mark Rutland, Shuah Khan, Ganapatrao Kulkarni, James Clark,
linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-perf-users, linux-kselftest, Colton Lewis
In-Reply-To: <20260612192909.1153907-1-coltonlewis@google.com>
When we re-enter the VM after handling a PMU interrupt, calculate
whether it was any of the guest counters that overflowed and inject an
interrupt into the guest if so.
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
arch/arm64/kvm/pmu-direct.c | 48 +++++++++++++++++++++++++++++++++++++
arch/arm64/kvm/pmu-emul.c | 4 ++--
arch/arm64/kvm/pmu.c | 6 ++++-
include/kvm/arm_pmu.h | 2 ++
4 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c
index 64f40cfb31012..0062d1d8e1999 100644
--- a/arch/arm64/kvm/pmu-direct.c
+++ b/arch/arm64/kvm/pmu-direct.c
@@ -426,4 +426,52 @@ void kvm_pmu_handle_guest_irq(struct arm_pmu *pmu, u64 pmovsr)
return;
__vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, govf);
+
+ if (kvm_pmu_part_overflow_status(vcpu)) {
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+
+ if (!in_nmi())
+ kvm_vcpu_kick(vcpu);
+ else
+ irq_work_queue(&vcpu->arch.pmu.overflow_work);
+ }
+}
+
+/**
+ * kvm_pmu_part_overflow_status() - Determine if any guest counters have overflowed
+ * @vcpu: Pointer to struct kvm_vcpu
+ *
+ * Determine if any guest counters have overflowed and therefore an
+ * IRQ needs to be injected into the guest. If access is still free,
+ * then the guest hasn't accessed the PMU yet so we know the guest
+ * context is not loaded onto the pCPU and an overflow is impossible.
+ *
+ * Return: True if there was an overflow, false otherwise
+ */
+bool kvm_pmu_part_overflow_status(struct kvm_vcpu *vcpu)
+{
+ struct arm_pmu *pmu;
+ u64 mask, pmovs, pmint, pmcr;
+ bool overflow;
+
+ pmu = vcpu->kvm->arch.arm_pmu;
+ mask = kvm_pmu_guest_counter_mask(pmu);
+
+ if (vcpu->arch.pmu.access == VCPU_PMU_ACCESS_FREE) {
+ pmovs = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
+ pmint = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
+ pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0);
+
+ if ((pmcr & ARMV8_PMU_PMCR_E) && (mask & pmovs & pmint))
+ kvm_pmu_set_guest_owned(vcpu);
+ else
+ return false;
+ }
+
+ pmovs = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
+ pmint = read_pmintenset();
+ pmcr = read_pmcr();
+ overflow = (pmcr & ARMV8_PMU_PMCR_E) && (mask & pmovs & pmint);
+
+ return overflow;
}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index d1110febe7436..ebc68090bdb26 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -268,7 +268,7 @@ void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val)
* counter where the values of the global enable control, PMOVSSET_EL0[n], and
* PMINTENSET_EL1[n] are all 1.
*/
-bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
+bool kvm_pmu_emul_overflow_status(struct kvm_vcpu *vcpu)
{
u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
@@ -405,7 +405,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
kvm_pmu_counter_increment(vcpu, BIT(idx + 1),
ARMV8_PMUV3_PERFCTR_CHAIN);
- if (kvm_pmu_overflow_status(vcpu)) {
+ if (kvm_pmu_emul_overflow_status(vcpu)) {
kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
if (!in_nmi())
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 55cda8021400a..f5ee18b4dfae7 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -409,7 +409,11 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
bool overflow;
- overflow = kvm_pmu_overflow_status(vcpu);
+ if (kvm_pmu_is_partitioned(vcpu->kvm))
+ overflow = kvm_pmu_part_overflow_status(vcpu);
+ else
+ overflow = kvm_pmu_emul_overflow_status(vcpu);
+
if (pmu->irq_level == overflow)
return;
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 25163a689ae80..f72d080ee7ba2 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -93,6 +93,8 @@ bool kvm_set_pmuserenr(u64 val);
void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu);
void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
void kvm_vcpu_pmu_resync_el0(void);
+bool kvm_pmu_emul_overflow_status(struct kvm_vcpu *vcpu);
+bool kvm_pmu_part_overflow_status(struct kvm_vcpu *vcpu);
#define kvm_vcpu_has_pmu(vcpu) \
(vcpu_has_feature(vcpu, KVM_ARM_VCPU_PMU_V3))
--
2.54.0.1136.gdb2ca164c4-goog
^ permalink raw reply related
* [PATCH 16/21] perf: arm_pmuv3: Handle IRQs for Partitioned PMU guest counters
From: Colton Lewis @ 2026-06-12 19:29 UTC (permalink / raw)
To: kvm
Cc: Alexandru Elisei, Paolo Bonzini, Jonathan Corbet, Russell King,
Catalin Marinas, Will Deacon, Marc Zyngier, Oliver Upton,
Mingwei Zhang, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Mark Rutland, Shuah Khan, Ganapatrao Kulkarni, James Clark,
linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-perf-users, linux-kselftest, Colton Lewis
In-Reply-To: <20260612192909.1153907-1-coltonlewis@google.com>
Because ARM hardware is not yet capable of direct PPI injection into
guests, guest counters will still trigger interrupts that need to be
handled by the host PMU interrupt handler. Clear the overflow flags in
hardware to handle the interrupt as normal, but update the virtual
overflow register so the interrupt can later be injected into the guest.
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
arch/arm/include/asm/arm_pmuv3.h | 6 ++++++
arch/arm64/include/asm/arm_pmuv3.h | 5 +++++
arch/arm64/kvm/pmu-direct.c | 22 ++++++++++++++++++++++
drivers/perf/arm_pmuv3.c | 24 +++++++++++++++++-------
include/kvm/arm_pmu.h | 3 +++
5 files changed, 53 insertions(+), 7 deletions(-)
diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h
index f6031bd522718..896fc5d6add0c 100644
--- a/arch/arm/include/asm/arm_pmuv3.h
+++ b/arch/arm/include/asm/arm_pmuv3.h
@@ -180,6 +180,11 @@ static inline void write_pmintenset(u32 val)
write_sysreg(val, PMINTENSET);
}
+static inline u32 read_pmintenset(void)
+{
+ return read_sysreg(PMINTENSET);
+}
+
static inline void write_pmintenclr(u32 val)
{
write_sysreg(val, PMINTENCLR);
@@ -239,6 +244,7 @@ static inline u64 kvm_pmu_host_counter_mask(struct arm_pmu *pmu)
{
return ~0;
}
+static inline void kvm_pmu_handle_guest_irq(struct arm_pmu *pmu, u64 pmovsr) {}
/* PMU Version in DFR Register */
#define ARMV8_PMU_DFR_VER_NI 0
diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h
index 27c4d6d47da31..69ff4d014bf39 100644
--- a/arch/arm64/include/asm/arm_pmuv3.h
+++ b/arch/arm64/include/asm/arm_pmuv3.h
@@ -110,6 +110,11 @@ static inline void write_pmintenset(u64 val)
write_sysreg(val, pmintenset_el1);
}
+static inline u64 read_pmintenset(void)
+{
+ return read_sysreg(pmintenset_el1);
+}
+
static inline void write_pmintenclr(u64 val)
{
write_sysreg(val, pmintenclr_el1);
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c
index bb1f3dca03869..64f40cfb31012 100644
--- a/arch/arm64/kvm/pmu-direct.c
+++ b/arch/arm64/kvm/pmu-direct.c
@@ -405,3 +405,25 @@ void kvm_pmu_set_guest_owned(struct kvm_vcpu *vcpu)
kvm_arm_setup_mdcr_el2(vcpu);
}
}
+
+/**
+ * kvm_pmu_handle_guest_irq() - Record IRQs in guest counters
+ * @pmu: PMU to check for overflows
+ * @pmovsr: Overflow flags reported by driver
+ *
+ * Set overflow flags in guest-reserved counters in the VCPU register
+ * for the guest to clear later.
+ */
+void kvm_pmu_handle_guest_irq(struct arm_pmu *pmu, u64 pmovsr)
+{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+ u64 mask = kvm_pmu_guest_counter_mask(pmu);
+ u64 govf = pmovsr & mask;
+
+ write_pmovsclr(govf);
+
+ if (!vcpu)
+ return;
+
+ __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, govf);
+}
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index c187397134990..6ab15a5209608 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -774,16 +774,15 @@ static void armv8pmu_disable_event_irq(struct perf_event *event)
armv8pmu_disable_intens(BIT(event->hw.idx));
}
-static u64 armv8pmu_getreset_flags(void)
+static u64 armv8pmu_getovf_flags(void)
{
u64 value;
/* Read */
value = read_pmovsclr();
- /* Write to clear flags */
- value &= ARMV8_PMU_CNT_MASK_ALL;
- write_pmovsclr(value);
+ /* Only report interrupt enabled counters. */
+ value &= read_pmintenset();
return value;
}
@@ -897,16 +896,17 @@ static void read_branch_records(struct pmu_hw_events *cpuc,
static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
{
- u64 pmovsr;
struct perf_sample_data data;
struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
struct pt_regs *regs;
+ u64 host_set = kvm_pmu_host_counter_mask(cpu_pmu);
+ u64 pmovsr;
int idx;
/*
- * Get and reset the IRQ flags
+ * Get the IRQ flags
*/
- pmovsr = armv8pmu_getreset_flags();
+ pmovsr = armv8pmu_getovf_flags();
/*
* Did an overflow occur?
@@ -914,6 +914,12 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
if (!armv8pmu_has_overflowed(pmovsr))
return IRQ_NONE;
+ /*
+ * Guest flag reset is handled by the kvm hook at the bottom of
+ * this function.
+ */
+ write_pmovsclr(pmovsr & host_set);
+
/*
* Handle the counter(s) overflow(s)
*/
@@ -955,6 +961,10 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
*/
perf_event_overflow(event, &data, regs);
}
+
+ if (pmu_is_partitioned(cpu_pmu))
+ kvm_pmu_handle_guest_irq(cpu_pmu, pmovsr);
+
armv8pmu_start(cpu_pmu);
return IRQ_HANDLED;
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index b77ddb94dc99b..25163a689ae80 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -106,6 +106,7 @@ u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu);
void kvm_pmu_load(struct kvm_vcpu *vcpu);
void kvm_pmu_put(struct kvm_vcpu *vcpu);
void kvm_pmu_set_guest_owned(struct kvm_vcpu *vcpu);
+void kvm_pmu_handle_guest_irq(struct arm_pmu *pmu, u64 pmovsr);
#define kvm_pmu_get_access(vcpu) ((vcpu)->arch.pmu.access)
@@ -274,6 +275,8 @@ static inline u64 kvm_pmu_guest_counter_mask(void *kvm)
return 0;
}
+static inline void kvm_pmu_handle_guest_irq(struct arm_pmu *pmu, u64 pmovsr) {}
+
#endif
#endif
--
2.54.0.1136.gdb2ca164c4-goog
^ permalink raw reply related
* [PATCH 15/21] KVM: arm64: Implement lazy PMU context swaps
From: Colton Lewis @ 2026-06-12 19:29 UTC (permalink / raw)
To: kvm
Cc: Alexandru Elisei, Paolo Bonzini, Jonathan Corbet, Russell King,
Catalin Marinas, Will Deacon, Marc Zyngier, Oliver Upton,
Mingwei Zhang, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Mark Rutland, Shuah Khan, Ganapatrao Kulkarni, James Clark,
linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-perf-users, linux-kselftest, Colton Lewis
In-Reply-To: <20260612192909.1153907-1-coltonlewis@google.com>
Since many guests will never touch the PMU, they need not pay the cost
of context swapping those registers.
Use an enum to implement a simple state machine for PMU register
access. The PMU is either free or guest owned. We only need to context
swap if the PMU registers are guest owned. The PMU initially starts as
free and only transitions to guest owned if a guest has touched the
PMU registers.
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
arch/arm64/include/asm/kvm_host.h | 1 +
arch/arm64/include/asm/kvm_types.h | 6 +++++-
arch/arm64/kvm/debug.c | 5 +++--
arch/arm64/kvm/pmu-direct.c | 21 +++++++++++++++++++--
arch/arm64/kvm/sys_regs.c | 29 ++++++++++++++++-------------
include/kvm/arm_pmu.h | 8 ++++++++
6 files changed, 52 insertions(+), 18 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 9c7e9b92dfbd3..32573b10d9c5b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1445,6 +1445,7 @@ static inline bool kvm_system_needs_idmapped_vectors(void)
return cpus_have_final_cap(ARM64_SPECTRE_V3A);
}
+void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu);
void kvm_init_host_debug_data(void);
void kvm_debug_init_vhe(void);
void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_types.h b/arch/arm64/include/asm/kvm_types.h
index 9a126b9e2d7c9..4e39cbc80aa0b 100644
--- a/arch/arm64/include/asm/kvm_types.h
+++ b/arch/arm64/include/asm/kvm_types.h
@@ -4,5 +4,9 @@
#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
-#endif /* _ASM_ARM64_KVM_TYPES_H */
+enum vcpu_pmu_register_access {
+ VCPU_PMU_ACCESS_FREE,
+ VCPU_PMU_ACCESS_GUEST_OWNED,
+};
+#endif /* _ASM_ARM64_KVM_TYPES_H */
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index c84321277d893..ab80325e67c5c 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -35,7 +35,7 @@ static int cpu_has_spe(u64 dfr0)
* - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
* - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB)
*/
-static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
+void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
{
preempt_disable();
@@ -63,7 +63,8 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
* fine grain traps and enforce counter access with
* HPMN.
*/
- if (!vcpu_on_unsupported_cpu(vcpu) &&
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED &&
+ !vcpu_on_unsupported_cpu(vcpu) &&
cpus_have_final_cap(ARM64_HAS_FGT) &&
(cpus_have_final_cap(ARM64_HAS_HPMN0) || nr_guest_cntr > 0)) {
vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_TPM | MDCR_EL2_TPMCR | MDCR_EL2_HPMN);
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c
index 044f011c9c84b..bb1f3dca03869 100644
--- a/arch/arm64/kvm/pmu-direct.c
+++ b/arch/arm64/kvm/pmu-direct.c
@@ -269,7 +269,7 @@ void kvm_pmu_load(struct kvm_vcpu *vcpu)
* If we aren't guest-owned then we know the guest isn't using
* the PMU anyway, so no need to bother with the swap.
*/
- if (!kvm_pmu_is_partitioned(vcpu->kvm))
+ if (vcpu->arch.pmu.access != VCPU_PMU_ACCESS_GUEST_OWNED)
return;
preempt_disable();
@@ -343,7 +343,7 @@ void kvm_pmu_put(struct kvm_vcpu *vcpu)
* accessing the PMU anyway, so no need to bother with the
* swap.
*/
- if (!kvm_pmu_is_partitioned(vcpu->kvm))
+ if (vcpu->arch.pmu.access != VCPU_PMU_ACCESS_GUEST_OWNED)
return;
preempt_disable();
@@ -388,3 +388,20 @@ void kvm_pmu_put(struct kvm_vcpu *vcpu)
kvm_pmu_set_guest_counters(pmu, 0);
preempt_enable();
}
+
+/**
+ * kvm_pmu_set_guest_owned() - Give PMU ownership to guest
+ * @vcpu: Pointer to vcpu struct
+ *
+ * Reconfigure the guest for physical access of PMU hardware if
+ * allowed. This means reconfiguring mdcr_el2.
+ *
+ */
+void kvm_pmu_set_guest_owned(struct kvm_vcpu *vcpu)
+{
+ if (kvm_pmu_is_partitioned(vcpu->kvm) &&
+ vcpu->arch.pmu.access == VCPU_PMU_ACCESS_FREE) {
+ vcpu->arch.pmu.access = VCPU_PMU_ACCESS_GUEST_OWNED;
+ kvm_arm_setup_mdcr_el2(vcpu);
+ }
+}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 94572bc52c32a..f0eebeeb5ed96 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1085,15 +1085,17 @@ static void pmu_reg_write(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, u64 val,
u64 mask;
int idx;
+ kvm_pmu_set_guest_owned(vcpu);
+
switch (reg) {
case PMCR_EL0:
- if (kvm_pmu_is_partitioned(vcpu->kvm))
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED)
kvm_pmu_direct_pmcr_write(vcpu, val);
else
kvm_pmu_handle_pmcr(vcpu, val);
break;
case PMSELR_EL0:
- if (kvm_pmu_is_partitioned(vcpu->kvm))
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED)
write_sysreg(val, pmselr_el0);
else
__vcpu_assign_sys_reg(vcpu, reg, val);
@@ -1101,7 +1103,7 @@ static void pmu_reg_write(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, u64 val,
case PMEVCNTR0_EL0 ... PMCCNTR_EL0:
idx = reg - PMEVCNTR0_EL0;
- if (kvm_pmu_is_partitioned(vcpu->kvm)) {
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED) {
if (idx == ARMV8_PMU_CYCLE_IDX)
write_sysreg(val, pmccntr_el0);
else
@@ -1122,7 +1124,7 @@ static void pmu_reg_write(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, u64 val,
}
break;
case PMCNTENSET_EL0:
- if (kvm_pmu_is_partitioned(vcpu->kvm)) {
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED) {
if (set)
write_sysreg(val, pmcntenset_el0);
else
@@ -1139,7 +1141,7 @@ static void pmu_reg_write(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, u64 val,
}
break;
case PMINTENSET_EL1:
- if (kvm_pmu_is_partitioned(vcpu->kvm)) {
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED) {
if (set)
write_sysreg(val, pmintenset_el1);
else
@@ -1166,7 +1168,7 @@ static void pmu_reg_write(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, u64 val,
local_irq_restore(flags);
break;
case PMUSERENR_EL0:
- if (kvm_pmu_is_partitioned(vcpu->kvm))
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED)
write_sysreg(val, pmuserenr_el0);
else
__vcpu_assign_sys_reg(vcpu, reg, val);
@@ -1175,7 +1177,6 @@ static void pmu_reg_write(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg, u64 val,
WARN_ON(1);
break;
}
-
}
/**
@@ -1192,15 +1193,17 @@ static u64 pmu_reg_read(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
u64 val = 0;
int idx;
+ kvm_pmu_set_guest_owned(vcpu);
+
switch (reg) {
case PMCR_EL0:
- if (kvm_pmu_is_partitioned(vcpu->kvm))
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED)
val = kvm_pmu_direct_pmcr_read(vcpu);
else
val = kvm_vcpu_read_pmcr(vcpu);
break;
case PMSELR_EL0:
- if (kvm_pmu_is_partitioned(vcpu->kvm))
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED)
val = read_sysreg(pmselr_el0);
else
val = __vcpu_sys_reg(vcpu, reg);
@@ -1208,7 +1211,7 @@ static u64 pmu_reg_read(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
case PMEVCNTR0_EL0 ... PMCCNTR_EL0:
idx = reg - PMEVCNTR0_EL0;
- if (kvm_pmu_is_partitioned(vcpu->kvm)) {
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED) {
if (idx == ARMV8_PMU_CYCLE_IDX)
val = read_sysreg(pmccntr_el0);
else
@@ -1221,7 +1224,7 @@ static u64 pmu_reg_read(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
val = __vcpu_sys_reg(vcpu, reg);
break;
case PMCNTENSET_EL0:
- if (kvm_pmu_is_partitioned(vcpu->kvm)) {
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED) {
val = read_sysreg(pmcntenset_el0);
val &= kvm_pmu_guest_counter_mask(vcpu->kvm->arch.arm_pmu);
} else {
@@ -1229,7 +1232,7 @@ static u64 pmu_reg_read(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
}
break;
case PMINTENSET_EL1:
- if (kvm_pmu_is_partitioned(vcpu->kvm)) {
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED) {
val = read_sysreg(pmintenset_el1);
val &= kvm_pmu_guest_counter_mask(vcpu->kvm->arch.arm_pmu);
} else {
@@ -1240,7 +1243,7 @@ static u64 pmu_reg_read(struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
val = __vcpu_sys_reg(vcpu, reg);
break;
case PMUSERENR_EL0:
- if (kvm_pmu_is_partitioned(vcpu->kvm))
+ if (kvm_pmu_get_access(vcpu) == VCPU_PMU_ACCESS_GUEST_OWNED)
val = read_sysreg(pmuserenr_el0);
else
val = __vcpu_sys_reg(vcpu, reg);
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 61f8d4ed35e10..b77ddb94dc99b 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -7,6 +7,7 @@
#ifndef __ASM_ARM_KVM_PMU_H
#define __ASM_ARM_KVM_PMU_H
+#include <linux/kvm_types.h>
#include <linux/perf_event.h>
#include <linux/perf/arm_pmuv3.h>
#include <linux/perf/arm_pmu.h>
@@ -43,6 +44,7 @@ struct kvm_pmu {
int irq_num;
bool created;
bool irq_level;
+ enum vcpu_pmu_register_access access;
};
struct arm_pmu_entry {
@@ -103,6 +105,9 @@ u64 kvm_pmu_host_counter_mask(struct arm_pmu *pmu);
u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu);
void kvm_pmu_load(struct kvm_vcpu *vcpu);
void kvm_pmu_put(struct kvm_vcpu *vcpu);
+void kvm_pmu_set_guest_owned(struct kvm_vcpu *vcpu);
+
+#define kvm_pmu_get_access(vcpu) ((vcpu)->arch.pmu.access)
/*
* Updates the vcpu's view of the pmu events for this cpu.
@@ -147,6 +152,8 @@ static inline bool kvm_pmu_is_partitioned(struct kvm *kvm)
{
return false;
}
+
+#define kvm_pmu_get_access(vcpu) (VCPU_PMU_ACCESS_FREE)
static inline void kvm_pmu_direct_pmcr_write(struct kvm_vcpu *vcpu, u64 val) {}
static inline u64 kvm_pmu_direct_pmcr_read(struct kvm_vcpu *vcpu)
{
@@ -154,6 +161,7 @@ static inline u64 kvm_pmu_direct_pmcr_read(struct kvm_vcpu *vcpu)
}
static inline void kvm_pmu_load(struct kvm_vcpu *vcpu) {}
static inline void kvm_pmu_put(struct kvm_vcpu *vcpu) {}
+static inline void kvm_pmu_set_guest_owned(struct kvm_vcpu *vcpu) {}
static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu,
u64 select_idx, u64 val) {}
static inline void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu,
--
2.54.0.1136.gdb2ca164c4-goog
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox