All of lore.kernel.org
 help / color / mirror / Atom feed
From: Guo Ren <guoren@kernel.org>
To: Zong Li <zong.li@sifive.com>,
	David Laight <david.laight.linux@gmail.com>
Cc: tjeznach@rivosinc.com, joro@8bytes.org, will@kernel.org,
	robin.murphy@arm.com, robh@kernel.org, pjw@kernel.org,
	palmer@dabbelt.com, aou@eecs.berkeley.edu, alex@ghiti.fr,
	mark.rutland@arm.com, conor+dt@kernel.org, krzk@kernel.org,
	guoyaxing@bosc.ac.cn, luxu.kernel@bytedance.com,
	lv.zheng@linux.spacemit.com, andrew.jones@oss.qualcomm.com,
	linux-kernel@vger.kernel.org, iommu@lists.linux.dev,
	linux-riscv@lists.infradead.org,
	linux-perf-users@vger.kernel.org
Subject: Re: [PATCH v2 1/2] drivers/perf: riscv-iommu: add risc-v iommu pmu driver
Date: Fri, 19 Jun 2026 12:05:51 -0400	[thread overview]
Message-ID: <ajVo34aTfZFBZpoe@gmail.com> (raw)
In-Reply-To: <20260208063848.3547817-2-zong.li@sifive.com>

On Sat, Feb 07, 2026 at 10:38:35PM -0800, Zong Li wrote:
> Add a new driver to support the RISC-V IOMMU PMU. This is an auxiliary
> device driver created by the parent RISC-V IOMMU driver.
> 
> The RISC-V IOMMU PMU separates the cycle counter from the event counters.
> The cycle counter is not associated with iohpmevt0, so a software-defined
> cycle event is required for the perf subsystem.
> 
> The number and width of the counters are hardware-implemented and must
> be detected at runtime.
> 
> The performance monitor provides counters with filtering support to
> collect events for specific device ID/process ID, or GSCID/PSCID.
> 
> PMU-related definitions are moved into the perf driver, where they are
> used exclusively.
> 
> Signed-off-by: Zong Li <zong.li@sifive.com>
> ---
>  drivers/iommu/riscv/iommu-bits.h |  61 ---
>  drivers/perf/Kconfig             |  12 +
>  drivers/perf/Makefile            |   1 +
>  drivers/perf/riscv_iommu_pmu.c   | 661 +++++++++++++++++++++++++++++++
>  4 files changed, 674 insertions(+), 61 deletions(-)
>  create mode 100644 drivers/perf/riscv_iommu_pmu.c
> 
> diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> index 98daf0e1a306..746cd11f4938 100644
> --- a/drivers/iommu/riscv/iommu-bits.h
> +++ b/drivers/iommu/riscv/iommu-bits.h
> @@ -189,67 +189,6 @@ enum riscv_iommu_ddtp_modes {
>  #define RISCV_IOMMU_IPSR_PMIP		BIT(RISCV_IOMMU_INTR_PM)
>  #define RISCV_IOMMU_IPSR_PIP		BIT(RISCV_IOMMU_INTR_PQ)
>  
> -/* 5.19 Performance monitoring counter overflow status (32bits) */
> -#define RISCV_IOMMU_REG_IOCOUNTOVF	0x0058
> -#define RISCV_IOMMU_IOCOUNTOVF_CY	BIT(0)
> -#define RISCV_IOMMU_IOCOUNTOVF_HPM	GENMASK_ULL(31, 1)
> -
> -/* 5.20 Performance monitoring counter inhibits (32bits) */
> -#define RISCV_IOMMU_REG_IOCOUNTINH	0x005C
> -#define RISCV_IOMMU_IOCOUNTINH_CY	BIT(0)
> -#define RISCV_IOMMU_IOCOUNTINH_HPM	GENMASK(31, 1)
> -
> -/* 5.21 Performance monitoring cycles counter (64bits) */
> -#define RISCV_IOMMU_REG_IOHPMCYCLES     0x0060
> -#define RISCV_IOMMU_IOHPMCYCLES_COUNTER	GENMASK_ULL(62, 0)
> -#define RISCV_IOMMU_IOHPMCYCLES_OF	BIT_ULL(63)
> -
> -/* 5.22 Performance monitoring event counters (31 * 64bits) */
> -#define RISCV_IOMMU_REG_IOHPMCTR_BASE	0x0068
> -#define RISCV_IOMMU_REG_IOHPMCTR(_n)	(RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
> -
> -/* 5.23 Performance monitoring event selectors (31 * 64bits) */
> -#define RISCV_IOMMU_REG_IOHPMEVT_BASE	0x0160
> -#define RISCV_IOMMU_REG_IOHPMEVT(_n)	(RISCV_IOMMU_REG_IOHPMEVT_BASE + ((_n) * 0x8))
> -#define RISCV_IOMMU_IOHPMEVT_EVENTID	GENMASK_ULL(14, 0)
> -#define RISCV_IOMMU_IOHPMEVT_DMASK	BIT_ULL(15)
> -#define RISCV_IOMMU_IOHPMEVT_PID_PSCID	GENMASK_ULL(35, 16)
> -#define RISCV_IOMMU_IOHPMEVT_DID_GSCID	GENMASK_ULL(59, 36)
> -#define RISCV_IOMMU_IOHPMEVT_PV_PSCV	BIT_ULL(60)
> -#define RISCV_IOMMU_IOHPMEVT_DV_GSCV	BIT_ULL(61)
> -#define RISCV_IOMMU_IOHPMEVT_IDT	BIT_ULL(62)
> -#define RISCV_IOMMU_IOHPMEVT_OF		BIT_ULL(63)
> -
> -/* Number of defined performance-monitoring event selectors */
> -#define RISCV_IOMMU_IOHPMEVT_CNT	31
> -
> -/**
> - * enum riscv_iommu_hpmevent_id - Performance-monitoring event identifier
> - *
> - * @RISCV_IOMMU_HPMEVENT_INVALID: Invalid event, do not count
> - * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests
> - * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests
> - * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests
> - * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses
> - * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks
> - * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks
> - * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: First-stage page table walks
> - * @RISCV_IOMMU_HPMEVENT_G_WALKS: Second-stage page table walks
> - * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs
> - */
> -enum riscv_iommu_hpmevent_id {
> -	RISCV_IOMMU_HPMEVENT_INVALID    = 0,
> -	RISCV_IOMMU_HPMEVENT_URQ        = 1,
> -	RISCV_IOMMU_HPMEVENT_TRQ        = 2,
> -	RISCV_IOMMU_HPMEVENT_ATS_RQ     = 3,
> -	RISCV_IOMMU_HPMEVENT_TLB_MISS   = 4,
> -	RISCV_IOMMU_HPMEVENT_DD_WALK    = 5,
> -	RISCV_IOMMU_HPMEVENT_PD_WALK    = 6,
> -	RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7,
> -	RISCV_IOMMU_HPMEVENT_G_WALKS    = 8,
> -	RISCV_IOMMU_HPMEVENT_MAX        = 9
> -};
> -
>  /* 5.24 Translation request IOVA (64bits) */
>  #define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
>  #define RISCV_IOMMU_TR_REQ_IOVA_VPN	GENMASK_ULL(63, 12)
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index 638321fc9800..6d0ece827501 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -105,6 +105,18 @@ config RISCV_PMU_SBI
>  	  full perf feature support i.e. counter overflow, privilege mode
>  	  filtering, counter configuration.
>  
> +config RISCV_IOMMU_PMU
> +	depends on RISCV || COMPILE_TEST
> +	depends on RISCV_IOMMU
> +	bool "RISC-V IOMMU Hardware Performance Monitor"
> +	default y
> +	help
> +	  Say Y if you want to use the RISC-V IOMMU performance monitor
> +	  implementation. The performance monitor is an optional hardware
> +	  feature, and whether it is actually enabled depends on IOMMU
> +	  hardware support. If the underlying hardware does not implement
> +	  the PMU, this option will have no effect.
> +
>  config STARFIVE_STARLINK_PMU
>  	depends on ARCH_STARFIVE || COMPILE_TEST
>  	depends on 64BIT
> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index ea52711a87e3..f64f7dc046f1 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
>  obj-$(CONFIG_RISCV_PMU) += riscv_pmu.o
>  obj-$(CONFIG_RISCV_PMU_LEGACY) += riscv_pmu_legacy.o
>  obj-$(CONFIG_RISCV_PMU_SBI) += riscv_pmu_sbi.o
> +obj-$(CONFIG_RISCV_IOMMU_PMU) += riscv_iommu_pmu.o
>  obj-$(CONFIG_STARFIVE_STARLINK_PMU) += starfive_starlink_pmu.o
>  obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
>  obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
> diff --git a/drivers/perf/riscv_iommu_pmu.c b/drivers/perf/riscv_iommu_pmu.c
> new file mode 100644
> index 000000000000..72fc4341b165
> --- /dev/null
> +++ b/drivers/perf/riscv_iommu_pmu.c
> @@ -0,0 +1,661 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2026 SiFive
> + *
> + * Authors
> + *	Zong Li <zong.li@sifive.com>
> + */
> +
> +#include <linux/auxiliary_bus.h>
> +#include <linux/io-64-nonatomic-hi-lo.h>
> +#include <linux/perf_event.h>
> +
> +#include "../iommu/riscv/iommu.h"
> +
> +/* 5.19 Performance monitoring counter overflow status (32bits) */
> +#define RISCV_IOMMU_REG_IOCOUNTOVF	0x0058
> +#define RISCV_IOMMU_IOCOUNTOVF_CY	BIT(0)
> +#define RISCV_IOMMU_IOCOUNTOVF_HPM	GENMASK_ULL(31, 1)
> +
> +/* 5.20 Performance monitoring counter inhibits (32bits) */
> +#define RISCV_IOMMU_REG_IOCOUNTINH	0x005C
> +#define RISCV_IOMMU_IOCOUNTINH_CY	BIT(0)
> +#define RISCV_IOMMU_IOCOUNTINH_HPM	GENMASK(31, 0)
> +
> +/* 5.21 Performance monitoring cycles counter (64bits) */
> +#define RISCV_IOMMU_REG_IOHPMCYCLES	0x0060
> +#define RISCV_IOMMU_IOHPMCYCLES_COUNTER	GENMASK_ULL(62, 0)
> +#define RISCV_IOMMU_IOHPMCYCLES_OF	BIT_ULL(63)
> +#define RISCV_IOMMU_REG_IOHPMCTR(_n)	(RISCV_IOMMU_REG_IOHPMCYCLES + ((_n) * 0x8))
> +
> +/* 5.22 Performance monitoring event counters (31 * 64bits) */
> +#define RISCV_IOMMU_REG_IOHPMCTR_BASE	0x0068
> +#define RISCV_IOMMU_IOHPMCTR_COUNTER	GENMASK_ULL(63, 0)
> +
> +/* 5.23 Performance monitoring event selectors (31 * 64bits) */
> +#define RISCV_IOMMU_REG_IOHPMEVT_BASE	0x0160
> +#define RISCV_IOMMU_REG_IOHPMEVT(_n)	(RISCV_IOMMU_REG_IOHPMEVT_BASE + ((_n) * 0x8))
> +#define RISCV_IOMMU_IOHPMEVT_EVENTID	GENMASK_ULL(14, 0)
> +#define RISCV_IOMMU_IOHPMEVT_DMASK	BIT_ULL(15)
> +#define RISCV_IOMMU_IOHPMEVT_PID_PSCID	GENMASK_ULL(35, 16)
> +#define RISCV_IOMMU_IOHPMEVT_DID_GSCID	GENMASK_ULL(59, 36)
> +#define RISCV_IOMMU_IOHPMEVT_PV_PSCV	BIT_ULL(60)
> +#define RISCV_IOMMU_IOHPMEVT_DV_GSCV	BIT_ULL(61)
> +#define RISCV_IOMMU_IOHPMEVT_IDT	BIT_ULL(62)
> +#define RISCV_IOMMU_IOHPMEVT_OF		BIT_ULL(63)
> +#define RISCV_IOMMU_IOHPMEVT_EVENT	GENMASK_ULL(62, 0)
> +
> +/* The total number of counters is 31 event counters plus 1 cycle counter */
> +#define RISCV_IOMMU_HPM_COUNTER_NUM	32
> +
> +static int cpuhp_state;
> +
> +/**
> + * enum riscv_iommu_hpmevent_id - Performance-monitoring event identifier
> + *
> + * @RISCV_IOMMU_HPMEVENT_CYCLE: Clock cycle counter
> + * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests
> + * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests
> + * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests
> + * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses
> + * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks
> + * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks
> + * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: First-stage page table walks
> + * @RISCV_IOMMU_HPMEVENT_G_WALKS: Second-stage page table walks
> + * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs
> + *
> + * The specification does not define an event ID for counting the
> + * number of clock cycles, meaning there is no associated 'iohpmevt0'.
> + * Event ID 0 is an invalid event and does not overlap with any valid
> + * event ID. Let's repurpose ID 0 as the cycle for perf, the cycle
> + * event is not actually written into any register, it serves solely
> + * as an identifier.
> + */
> +enum riscv_iommu_hpmevent_id {
> +	RISCV_IOMMU_HPMEVENT_CYCLE	= 0,
> +	RISCV_IOMMU_HPMEVENT_URQ        = 1,
> +	RISCV_IOMMU_HPMEVENT_TRQ        = 2,
> +	RISCV_IOMMU_HPMEVENT_ATS_RQ     = 3,
> +	RISCV_IOMMU_HPMEVENT_TLB_MISS   = 4,
> +	RISCV_IOMMU_HPMEVENT_DD_WALK    = 5,
> +	RISCV_IOMMU_HPMEVENT_PD_WALK    = 6,
> +	RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7,
> +	RISCV_IOMMU_HPMEVENT_G_WALKS    = 8,
> +	RISCV_IOMMU_HPMEVENT_MAX        = 9
> +};
> +
> +struct riscv_iommu_pmu {
> +	struct pmu pmu;
> +	struct hlist_node node;
> +	void __iomem *reg;
> +	unsigned int on_cpu;
> +	int num_counters;
> +	u64 cycle_cntr_mask;
> +	u64 event_cntr_mask;
> +	struct perf_event *events[RISCV_IOMMU_HPM_COUNTER_NUM];
> +	DECLARE_BITMAP(used_counters, RISCV_IOMMU_HPM_COUNTER_NUM);
> +};
> +
> +#define to_riscv_iommu_pmu(p) (container_of(p, struct riscv_iommu_pmu, pmu))
> +
> +#define RISCV_IOMMU_PMU_ATTR_EXTRACTOR(_name, _mask)			\
> +	static inline u32 get_##_name(struct perf_event *event)		\
> +	{								\
> +		return FIELD_GET(_mask, event->attr.config);		\
> +	}								\
> +
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(event, RISCV_IOMMU_IOHPMEVT_EVENTID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(partial_matching, RISCV_IOMMU_IOHPMEVT_DMASK);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(pid_pscid, RISCV_IOMMU_IOHPMEVT_PID_PSCID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(did_gscid, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_pid_pscid, RISCV_IOMMU_IOHPMEVT_PV_PSCV);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_did_gscid, RISCV_IOMMU_IOHPMEVT_DV_GSCV);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_id_type, RISCV_IOMMU_IOHPMEVT_IDT);
> +
> +/* Formats */
> +PMU_FORMAT_ATTR(event,            "config:0-14");
> +PMU_FORMAT_ATTR(partial_matching, "config:15");
> +PMU_FORMAT_ATTR(pid_pscid,        "config:16-35");
> +PMU_FORMAT_ATTR(did_gscid,        "config:36-59");
> +PMU_FORMAT_ATTR(filter_pid_pscid, "config:60");
> +PMU_FORMAT_ATTR(filter_did_gscid, "config:61");
> +PMU_FORMAT_ATTR(filter_id_type,   "config:62");
> +
> +static struct attribute *riscv_iommu_pmu_formats[] = {
> +	&format_attr_event.attr,
> +	&format_attr_partial_matching.attr,
> +	&format_attr_pid_pscid.attr,
> +	&format_attr_did_gscid.attr,
> +	&format_attr_filter_pid_pscid.attr,
> +	&format_attr_filter_did_gscid.attr,
> +	&format_attr_filter_id_type.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_format_group = {
> +	.name = "format",
> +	.attrs = riscv_iommu_pmu_formats,
> +};
> +
> +/* Events */
> +static ssize_t riscv_iommu_pmu_event_show(struct device *dev,
> +					  struct device_attribute *attr,
> +					  char *page)
> +{
> +	struct perf_pmu_events_attr *pmu_attr;
> +
> +	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
> +
> +	return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
> +}
> +
> +#define RISCV_IOMMU_PMU_EVENT_ATTR(name, id)			\
> +	PMU_EVENT_ATTR_ID(name, riscv_iommu_pmu_event_show, id)
> +
> +static struct attribute *riscv_iommu_pmu_events[] = {
> +	RISCV_IOMMU_PMU_EVENT_ATTR(cycle, RISCV_IOMMU_HPMEVENT_CYCLE),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(untranslated_req, RISCV_IOMMU_HPMEVENT_URQ),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(translated_req, RISCV_IOMMU_HPMEVENT_TRQ),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(ats_trans_req, RISCV_IOMMU_HPMEVENT_ATS_RQ),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(tlb_miss, RISCV_IOMMU_HPMEVENT_TLB_MISS),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(ddt_walks, RISCV_IOMMU_HPMEVENT_DD_WALK),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(pdt_walks, RISCV_IOMMU_HPMEVENT_PD_WALK),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(s_vs_pt_walks, RISCV_IOMMU_HPMEVENT_S_VS_WALKS),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(g_pt_walks, RISCV_IOMMU_HPMEVENT_G_WALKS),
> +	NULL,
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_events_group = {
> +	.name = "events",
> +	.attrs = riscv_iommu_pmu_events,
> +};
> +
> +/* cpumask */
> +static ssize_t riscv_iommu_cpumask_show(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(dev_get_drvdata(dev));
> +
> +	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pmu->on_cpu));
> +}
> +
> +static struct device_attribute riscv_iommu_cpumask_attr =
> +	__ATTR(cpumask, 0444, riscv_iommu_cpumask_show, NULL);
> +
> +static struct attribute *riscv_iommu_cpumask_attrs[] = {
> +	&riscv_iommu_cpumask_attr.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_cpumask_group = {
> +	.attrs = riscv_iommu_cpumask_attrs,
> +};
> +
> +static const struct attribute_group *riscv_iommu_pmu_attr_grps[] = {
> +	&riscv_iommu_pmu_cpumask_group,
> +	&riscv_iommu_pmu_format_group,
> +	&riscv_iommu_pmu_events_group,
> +	NULL,
> +};
> +
> +/* PMU Operations */
> +static void riscv_iommu_pmu_set_counter(struct riscv_iommu_pmu *pmu, u32 idx,
> +					u64 value)
> +{
> +	u64 counter_mask = idx ? pmu->event_cntr_mask : pmu->cycle_cntr_mask;
> +
> +	writeq(value & counter_mask, pmu->reg + RISCV_IOMMU_REG_IOHPMCTR(idx));
> +}
> +
> +static u64 riscv_iommu_pmu_get_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	u64 value, counter_mask = idx ? pmu->event_cntr_mask : pmu->cycle_cntr_mask;
> +
> +	/* Use readq to read counter would be imprecise on 32-bits system */
> +	value = readq(pmu->reg + RISCV_IOMMU_REG_IOHPMCTR(idx)) & counter_mask;
As stated in the RISC-V IOMMU Specification, Chapter 6:
  Whether an 8-byte access to an IOMMU register is single-copy atomic is
  UNSPECIFIED, and such an access may appear, internally to the IOMMU,
  as if two separate 4-byte accesses — first to the high half and second
  to the low half — were performed.

Therefore, the atomicity of 64-bit MMIO accesses is UNSPECIFIED in the
current ratified RISC-V IOMMU specification, so a single readq() of a
running counter may observe a torn value if the low half carries into
the high half between the two internal 4-byte accesses. To handle this
correctly, the Linux RISC-V IOMMU driver should fall back to 32-bit MMIO
accesses when reading the 64-bit HPM counter registers.

Additionally, David Laight has proposed an optimization that uses a
hi-lo-hi read pattern built from 32-bit MMIO accesses and does not need
a retry loop [1].

[1]: https://lore.kernel.org/linux-riscv/20260618143634.7f3dd6c5@pumpkin/

> +
> +	/* The bit 63 of cycle counter (i.e., idx == 0) is OF bit */
> +	return idx ? value : (value & ~RISCV_IOMMU_IOHPMCYCLES_OF);
> +}
> +
> +static bool is_cycle_event(u64 event)
> +{
> +	return event == RISCV_IOMMU_HPMEVENT_CYCLE;
> +}
> +
> +static void riscv_iommu_pmu_set_event(struct riscv_iommu_pmu *pmu, u32 idx,
> +				      u64 value)
> +{
> +	/* There is no associtated IOHPMEVT0 for IOHPMCYCLES */
> +	if (is_cycle_event(value))
> +		return;
> +
> +	/* Event counter start from idx 1 */
> +	writeq(FIELD_GET(RISCV_IOMMU_IOHPMEVT_EVENT, value),
> +	       pmu->reg + RISCV_IOMMU_REG_IOHPMEVT(idx - 1));
> +}
> +
> +static void riscv_iommu_pmu_enable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	u32 value = readl(addr);
> +
> +	writel(value & ~BIT(idx), addr);
> +}
> +
> +static void riscv_iommu_pmu_disable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	u32 value = readl(addr);
> +
> +	writel(value | BIT(idx), addr);
> +}
> +
> +static void riscv_iommu_pmu_start_all(struct riscv_iommu_pmu *pmu)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	u32 used_cntr = 0;
> +
> +	/* The performance-monitoring counter inhibits is a 32-bit WARL register */
> +	bitmap_to_arr32(&used_cntr, pmu->used_counters, pmu->num_counters);
> +
> +	writel(~used_cntr, addr);
> +}
> +
> +static void riscv_iommu_pmu_stop_all(struct riscv_iommu_pmu *pmu)
> +{
> +	writel(GENMASK_ULL(pmu->num_counters - 1, 0),
> +	       pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH);
> +}
> +
> +/* PMU APIs */
> +static void riscv_iommu_pmu_set_period(struct perf_event *event)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	u64 counter_mask = hwc->idx ? pmu->event_cntr_mask : pmu->cycle_cntr_mask;
> +	u64 period;
> +
> +	/*
> +	 * Limit the maximum period to prevent the counter value
> +	 * from overtaking the one we are about to program.
> +	 * In effect we are reducing max_period to account for
> +	 * interrupt latency (and we are being very conservative).
> +	 */
> +	period = counter_mask >> 1;
> +	riscv_iommu_pmu_set_counter(pmu, hwc->idx, period);
> +	local64_set(&hwc->prev_count, period);
> +}
> +
> +static int riscv_iommu_pmu_event_init(struct perf_event *event)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct perf_event *sibling;
> +	int total_event_counters = pmu->num_counters - 1;
> +	int counters = 0;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	if (hwc->sample_period)
> +		return -EOPNOTSUPP;
> +
> +	if (event->cpu < 0)
> +		return -EOPNOTSUPP;
> +
> +	event->cpu = pmu->on_cpu;
> +
> +	hwc->idx = -1;
> +	hwc->config = event->attr.config;
> +
> +	if (event->group_leader == event)
> +		return 0;
> +
> +	if (is_cycle_event(get_event(event->group_leader)))
> +		if (++counters > total_event_counters)
> +			return -EINVAL;
> +
> +	for_each_sibling_event(sibling, event->group_leader) {
> +		if (is_cycle_event(get_event(sibling)))
> +			continue;
> +
> +		if (sibling->pmu != event->pmu && !is_software_event(sibling))
> +			return -EINVAL;
> +
> +		if (++counters > total_event_counters)
> +			return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static void riscv_iommu_pmu_update(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	u64 delta, prev, now;
> +	u32 idx = hwc->idx;
> +	u64 counter_mask = idx ? pmu->event_cntr_mask : pmu->cycle_cntr_mask;
> +
> +	do {
> +		prev = local64_read(&hwc->prev_count);
> +		now = riscv_iommu_pmu_get_counter(pmu, idx);
> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> +
> +	delta = (now - prev) & counter_mask;
> +	local64_add(delta, &event->count);
> +}
> +
> +static void riscv_iommu_pmu_start(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> +		return;
> +
> +	if (flags & PERF_EF_RELOAD)
> +		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
> +
> +	hwc->state = 0;
> +	riscv_iommu_pmu_set_period(event);
> +	riscv_iommu_pmu_set_event(pmu, hwc->idx, hwc->config);
> +	riscv_iommu_pmu_enable_counter(pmu, hwc->idx);
> +
> +	perf_event_update_userpage(event);
> +}
> +
> +static void riscv_iommu_pmu_stop(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	int idx = hwc->idx;
> +
> +	if (hwc->state & PERF_HES_STOPPED)
> +		return;
> +
> +	riscv_iommu_pmu_disable_counter(pmu, idx);
> +
> +	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE))
> +		riscv_iommu_pmu_update(event);
> +
> +	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +}
> +
> +static int riscv_iommu_pmu_add(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	unsigned int num_counters = pmu->num_counters;
> +	int idx;
> +
> +	/* Reserve index zero for iohpmcycles */
> +	if (is_cycle_event(get_event(event)))
> +		idx = 0;
> +	else
> +		idx = find_next_zero_bit(pmu->used_counters, num_counters, 1);
> +
> +	/* All event counters or cycle counter are in use */
> +	if (idx == num_counters || pmu->events[idx])
> +		return -EAGAIN;
> +
> +	set_bit(idx, pmu->used_counters);
> +
> +	pmu->events[idx] = event;
> +	hwc->idx = idx;
> +	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +	local64_set(&hwc->prev_count, 0);
> +
> +	if (flags & PERF_EF_START)
> +		riscv_iommu_pmu_start(event, flags);
> +
> +	/* Propagate changes to the userspace mapping. */
> +	perf_event_update_userpage(event);
> +
> +	return 0;
> +}
> +
> +static void riscv_iommu_pmu_read(struct perf_event *event)
> +{
> +	riscv_iommu_pmu_update(event);
> +}
> +
> +static void riscv_iommu_pmu_del(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	int idx = hwc->idx;
> +
> +	riscv_iommu_pmu_stop(event, PERF_EF_UPDATE);
> +	pmu->events[idx] = NULL;
> +	clear_bit(idx, pmu->used_counters);
> +
> +	perf_event_update_userpage(event);
> +}
> +
> +static int riscv_iommu_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct riscv_iommu_pmu *iommu_pmu;
> +
> +	iommu_pmu = hlist_entry_safe(node, struct riscv_iommu_pmu, node);
> +
> +	if (iommu_pmu->on_cpu == -1)
> +		iommu_pmu->on_cpu = cpu;
> +
> +	return 0;
> +}
> +
> +static int riscv_iommu_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct riscv_iommu_pmu *iommu_pmu;
> +	unsigned int target_cpu;
> +
> +	iommu_pmu = hlist_entry_safe(node, struct riscv_iommu_pmu, node);
> +
> +	if (cpu != iommu_pmu->on_cpu)
> +		return 0;
> +
> +	iommu_pmu->on_cpu = -1;
> +
> +	target_cpu = cpumask_any_but(cpu_online_mask, cpu);
> +	if (target_cpu >= nr_cpu_ids)
> +		return 0;
> +
> +	perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target_cpu);
> +	iommu_pmu->on_cpu = target_cpu;
> +
> +	return 0;
> +}
> +
> +static irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu)
> +{
> +	u32 ovf = readl(pmu->reg + RISCV_IOMMU_REG_IOCOUNTOVF);
> +	int idx;
> +
> +	if (!ovf)
> +		return IRQ_NONE;
> +
> +	riscv_iommu_pmu_stop_all(pmu);
> +
> +	for_each_set_bit(idx, (unsigned long *)&ovf, pmu->num_counters) {
> +		struct perf_event *event = pmu->events[idx];
> +
> +		if (WARN_ON_ONCE(!event))
> +			continue;
> +
> +		riscv_iommu_pmu_update(event);
> +		riscv_iommu_pmu_set_period(event);
> +	}
> +
> +	riscv_iommu_pmu_start_all(pmu);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static irqreturn_t riscv_iommu_pmu_irq_handler(int irq, void *dev_id)
> +{
> +	struct riscv_iommu_pmu *pmu = (struct riscv_iommu_pmu *)dev_id;
> +	irqreturn_t ret;
> +
> +	/* Check whether this interrupt is for PMU */
> +	if (!(readl_relaxed(pmu->reg + RISCV_IOMMU_REG_IPSR) & RISCV_IOMMU_IPSR_PMIP))
> +		return IRQ_NONE;
> +
> +	/* Process PMU IRQ */
> +	ret = riscv_iommu_pmu_handle_irq(pmu);
> +
> +	/* Clear performance monitoring interrupt pending bit */
> +	writel_relaxed(RISCV_IOMMU_IPSR_PMIP, pmu->reg + RISCV_IOMMU_REG_IPSR);
> +
> +	return ret;
> +}
> +
> +static unsigned int riscv_iommu_pmu_get_irq_num(struct riscv_iommu_device *iommu)
> +{
> +	/* Reuse ICVEC.CIV mask for all interrupt vectors mapping */
> +	int vec = (iommu->icvec >> (RISCV_IOMMU_IPSR_PMIP * 4)) & RISCV_IOMMU_ICVEC_CIV;
> +
> +	return iommu->irqs[vec];
> +}
> +
> +static int riscv_iommu_pmu_request_irq(struct riscv_iommu_device *iommu,
> +				       struct riscv_iommu_pmu *pmu)
> +{
> +	unsigned int irq = riscv_iommu_pmu_get_irq_num(iommu);
> +
> +	/*
> +	 * Set the IRQF_ONESHOT flag because this IRQ can be shared with
> +	 * other threaded IRQs by other queues.
> +	 */
> +	return devm_request_irq(iommu->dev, irq, riscv_iommu_pmu_irq_handler,
> +				IRQF_ONESHOT | IRQF_SHARED, dev_name(iommu->dev), pmu);
> +}
> +
> +static void riscv_iommu_pmu_free_irq(struct riscv_iommu_device *iommu,
> +				     struct riscv_iommu_pmu *pmu)
> +{
> +	unsigned int irq = riscv_iommu_pmu_get_irq_num(iommu);
> +
> +	free_irq(irq, pmu);
> +}
> +
> +static int riscv_iommu_pmu_probe(struct auxiliary_device *auxdev,
> +				 const struct auxiliary_device_id *id)
> +{
> +	struct  riscv_iommu_device *iommu_dev = dev_get_platdata(&auxdev->dev);
> +	struct riscv_iommu_pmu *iommu_pmu;
> +	void __iomem *addr;
> +	char *name;
> +	int ret;
> +
> +	iommu_pmu = devm_kzalloc(&auxdev->dev, sizeof(*iommu_pmu), GFP_KERNEL);
> +	if (!iommu_pmu)
> +		return -ENOMEM;
> +
> +	iommu_pmu->reg = iommu_dev->reg;
> +
> +	/* Counter number and width are hardware-implemented. Detect them by write 1s */
> +	addr = iommu_pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	writel(RISCV_IOMMU_IOCOUNTINH_HPM, addr);
> +	iommu_pmu->num_counters = hweight32(readl(addr));
> +
> +	addr = iommu_pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> +	writeq(RISCV_IOMMU_IOHPMCYCLES_COUNTER, addr);
> +	iommu_pmu->cycle_cntr_mask = readq(addr);
> +
> +	/* Assume the width of all event counters are the same */
> +	addr = iommu_pmu->reg + RISCV_IOMMU_REG_IOHPMCTR_BASE;
> +	writeq(RISCV_IOMMU_IOHPMCTR_COUNTER, addr);
> +	iommu_pmu->event_cntr_mask = readq(addr);
> +
> +	iommu_pmu->pmu = (struct pmu) {
> +		.module		= THIS_MODULE,
> +		.parent		= &auxdev->dev,
> +		.task_ctx_nr	= perf_invalid_context,
> +		.event_init	= riscv_iommu_pmu_event_init,
> +		.add		= riscv_iommu_pmu_add,
> +		.del		= riscv_iommu_pmu_del,
> +		.start		= riscv_iommu_pmu_start,
> +		.stop		= riscv_iommu_pmu_stop,
> +		.read		= riscv_iommu_pmu_read,
> +		.attr_groups	= riscv_iommu_pmu_attr_grps,
> +		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
> +	};
> +
> +	auxiliary_set_drvdata(auxdev, iommu_pmu);
> +
> +	name = devm_kasprintf(&auxdev->dev, GFP_KERNEL,
> +			      "riscv_iommu_pmu_%s", dev_name(iommu_dev->dev));
> +	if (!name) {
> +		dev_err(&auxdev->dev, "Failed to create name riscv_iommu_pmu_%s\n",
> +			dev_name(iommu_dev->dev));
> +		return -ENOMEM;
> +	}
> +
> +	/* Bind all events to the same cpu context to avoid race enabling */
> +	iommu_pmu->on_cpu = raw_smp_processor_id();
> +
> +	ret = cpuhp_state_add_instance_nocalls(cpuhp_state, &iommu_pmu->node);
> +	if (ret) {
> +		dev_err(&auxdev->dev, "Failed to register hotplug %s: %d\n", name, ret);
> +		return ret;
> +	}
> +
> +	ret = riscv_iommu_pmu_request_irq(iommu_dev, iommu_pmu);
> +	if (ret) {
> +		dev_err(&auxdev->dev, "Failed to request irq %s: %d\n", name, ret);
> +		goto err_cpuhp_remove;
> +	}
> +
> +	ret = perf_pmu_register(&iommu_pmu->pmu, name, -1);
> +	if (ret) {
> +		dev_err(&auxdev->dev, "Failed to registe %s: %d\n", name, ret);
> +		goto err_free_irq;
> +	}
> +
> +	dev_info(&auxdev->dev, "%s: Registered with %d counters\n",
> +		 name, iommu_pmu->num_counters);
> +
> +	return 0;
> +
> +err_free_irq:
> +	riscv_iommu_pmu_free_irq(iommu_dev, iommu_pmu);
> +err_cpuhp_remove:
> +	cpuhp_state_remove_instance_nocalls(cpuhp_state, &iommu_pmu->node);
> +	return ret;
> +}
> +
> +static const struct auxiliary_device_id riscv_iommu_pmu_id_table[] = {
> +	{ .name = "iommu.pmu" },
> +	{}
> +};
> +MODULE_DEVICE_TABLE(auxiliary, riscv_iommu_pmu_id_table);
> +
> +static struct auxiliary_driver iommu_pmu_driver = {
> +	.probe		= riscv_iommu_pmu_probe,
> +	.id_table	= riscv_iommu_pmu_id_table,
> +};
> +
> +static int __init riscv_iommu_pmu_init(void)
> +{
> +	int ret;
> +
> +	cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> +					      "perf/riscv/iommu:online",
> +					      riscv_iommu_pmu_online_cpu,
> +					      riscv_iommu_pmu_offline_cpu);
> +	if (cpuhp_state < 0)
> +		return cpuhp_state;
> +
> +	ret = auxiliary_driver_register(&iommu_pmu_driver);
> +	if (ret)
> +		cpuhp_remove_multi_state(cpuhp_state);
> +
> +	return ret;
> +}
> +module_init(riscv_iommu_pmu_init);
> +
> +MODULE_DESCRIPTION("RISC-V IOMMU PMU");
> +MODULE_LICENSE("GPL");
> -- 
> 2.43.7
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
> 

WARNING: multiple messages have this Message-ID (diff)
From: Guo Ren <guoren@kernel.org>
To: Zong Li <zong.li@sifive.com>,
	David Laight <david.laight.linux@gmail.com>
Cc: tjeznach@rivosinc.com, joro@8bytes.org, will@kernel.org,
	robin.murphy@arm.com, robh@kernel.org, pjw@kernel.org,
	palmer@dabbelt.com, aou@eecs.berkeley.edu, alex@ghiti.fr,
	mark.rutland@arm.com, conor+dt@kernel.org, krzk@kernel.org,
	guoyaxing@bosc.ac.cn, luxu.kernel@bytedance.com,
	lv.zheng@linux.spacemit.com, andrew.jones@oss.qualcomm.com,
	linux-kernel@vger.kernel.org, iommu@lists.linux.dev,
	linux-riscv@lists.infradead.org,
	linux-perf-users@vger.kernel.org
Subject: Re: [PATCH v2 1/2] drivers/perf: riscv-iommu: add risc-v iommu pmu driver
Date: Fri, 19 Jun 2026 12:05:51 -0400	[thread overview]
Message-ID: <ajVo34aTfZFBZpoe@gmail.com> (raw)
In-Reply-To: <20260208063848.3547817-2-zong.li@sifive.com>

On Sat, Feb 07, 2026 at 10:38:35PM -0800, Zong Li wrote:
> Add a new driver to support the RISC-V IOMMU PMU. This is an auxiliary
> device driver created by the parent RISC-V IOMMU driver.
> 
> The RISC-V IOMMU PMU separates the cycle counter from the event counters.
> The cycle counter is not associated with iohpmevt0, so a software-defined
> cycle event is required for the perf subsystem.
> 
> The number and width of the counters are hardware-implemented and must
> be detected at runtime.
> 
> The performance monitor provides counters with filtering support to
> collect events for specific device ID/process ID, or GSCID/PSCID.
> 
> PMU-related definitions are moved into the perf driver, where they are
> used exclusively.
> 
> Signed-off-by: Zong Li <zong.li@sifive.com>
> ---
>  drivers/iommu/riscv/iommu-bits.h |  61 ---
>  drivers/perf/Kconfig             |  12 +
>  drivers/perf/Makefile            |   1 +
>  drivers/perf/riscv_iommu_pmu.c   | 661 +++++++++++++++++++++++++++++++
>  4 files changed, 674 insertions(+), 61 deletions(-)
>  create mode 100644 drivers/perf/riscv_iommu_pmu.c
> 
> diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
> index 98daf0e1a306..746cd11f4938 100644
> --- a/drivers/iommu/riscv/iommu-bits.h
> +++ b/drivers/iommu/riscv/iommu-bits.h
> @@ -189,67 +189,6 @@ enum riscv_iommu_ddtp_modes {
>  #define RISCV_IOMMU_IPSR_PMIP		BIT(RISCV_IOMMU_INTR_PM)
>  #define RISCV_IOMMU_IPSR_PIP		BIT(RISCV_IOMMU_INTR_PQ)
>  
> -/* 5.19 Performance monitoring counter overflow status (32bits) */
> -#define RISCV_IOMMU_REG_IOCOUNTOVF	0x0058
> -#define RISCV_IOMMU_IOCOUNTOVF_CY	BIT(0)
> -#define RISCV_IOMMU_IOCOUNTOVF_HPM	GENMASK_ULL(31, 1)
> -
> -/* 5.20 Performance monitoring counter inhibits (32bits) */
> -#define RISCV_IOMMU_REG_IOCOUNTINH	0x005C
> -#define RISCV_IOMMU_IOCOUNTINH_CY	BIT(0)
> -#define RISCV_IOMMU_IOCOUNTINH_HPM	GENMASK(31, 1)
> -
> -/* 5.21 Performance monitoring cycles counter (64bits) */
> -#define RISCV_IOMMU_REG_IOHPMCYCLES     0x0060
> -#define RISCV_IOMMU_IOHPMCYCLES_COUNTER	GENMASK_ULL(62, 0)
> -#define RISCV_IOMMU_IOHPMCYCLES_OF	BIT_ULL(63)
> -
> -/* 5.22 Performance monitoring event counters (31 * 64bits) */
> -#define RISCV_IOMMU_REG_IOHPMCTR_BASE	0x0068
> -#define RISCV_IOMMU_REG_IOHPMCTR(_n)	(RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
> -
> -/* 5.23 Performance monitoring event selectors (31 * 64bits) */
> -#define RISCV_IOMMU_REG_IOHPMEVT_BASE	0x0160
> -#define RISCV_IOMMU_REG_IOHPMEVT(_n)	(RISCV_IOMMU_REG_IOHPMEVT_BASE + ((_n) * 0x8))
> -#define RISCV_IOMMU_IOHPMEVT_EVENTID	GENMASK_ULL(14, 0)
> -#define RISCV_IOMMU_IOHPMEVT_DMASK	BIT_ULL(15)
> -#define RISCV_IOMMU_IOHPMEVT_PID_PSCID	GENMASK_ULL(35, 16)
> -#define RISCV_IOMMU_IOHPMEVT_DID_GSCID	GENMASK_ULL(59, 36)
> -#define RISCV_IOMMU_IOHPMEVT_PV_PSCV	BIT_ULL(60)
> -#define RISCV_IOMMU_IOHPMEVT_DV_GSCV	BIT_ULL(61)
> -#define RISCV_IOMMU_IOHPMEVT_IDT	BIT_ULL(62)
> -#define RISCV_IOMMU_IOHPMEVT_OF		BIT_ULL(63)
> -
> -/* Number of defined performance-monitoring event selectors */
> -#define RISCV_IOMMU_IOHPMEVT_CNT	31
> -
> -/**
> - * enum riscv_iommu_hpmevent_id - Performance-monitoring event identifier
> - *
> - * @RISCV_IOMMU_HPMEVENT_INVALID: Invalid event, do not count
> - * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests
> - * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests
> - * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests
> - * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses
> - * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks
> - * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks
> - * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: First-stage page table walks
> - * @RISCV_IOMMU_HPMEVENT_G_WALKS: Second-stage page table walks
> - * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs
> - */
> -enum riscv_iommu_hpmevent_id {
> -	RISCV_IOMMU_HPMEVENT_INVALID    = 0,
> -	RISCV_IOMMU_HPMEVENT_URQ        = 1,
> -	RISCV_IOMMU_HPMEVENT_TRQ        = 2,
> -	RISCV_IOMMU_HPMEVENT_ATS_RQ     = 3,
> -	RISCV_IOMMU_HPMEVENT_TLB_MISS   = 4,
> -	RISCV_IOMMU_HPMEVENT_DD_WALK    = 5,
> -	RISCV_IOMMU_HPMEVENT_PD_WALK    = 6,
> -	RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7,
> -	RISCV_IOMMU_HPMEVENT_G_WALKS    = 8,
> -	RISCV_IOMMU_HPMEVENT_MAX        = 9
> -};
> -
>  /* 5.24 Translation request IOVA (64bits) */
>  #define RISCV_IOMMU_REG_TR_REQ_IOVA     0x0258
>  #define RISCV_IOMMU_TR_REQ_IOVA_VPN	GENMASK_ULL(63, 12)
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index 638321fc9800..6d0ece827501 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -105,6 +105,18 @@ config RISCV_PMU_SBI
>  	  full perf feature support i.e. counter overflow, privilege mode
>  	  filtering, counter configuration.
>  
> +config RISCV_IOMMU_PMU
> +	depends on RISCV || COMPILE_TEST
> +	depends on RISCV_IOMMU
> +	bool "RISC-V IOMMU Hardware Performance Monitor"
> +	default y
> +	help
> +	  Say Y if you want to use the RISC-V IOMMU performance monitor
> +	  implementation. The performance monitor is an optional hardware
> +	  feature, and whether it is actually enabled depends on IOMMU
> +	  hardware support. If the underlying hardware does not implement
> +	  the PMU, this option will have no effect.
> +
>  config STARFIVE_STARLINK_PMU
>  	depends on ARCH_STARFIVE || COMPILE_TEST
>  	depends on 64BIT
> diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
> index ea52711a87e3..f64f7dc046f1 100644
> --- a/drivers/perf/Makefile
> +++ b/drivers/perf/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
>  obj-$(CONFIG_RISCV_PMU) += riscv_pmu.o
>  obj-$(CONFIG_RISCV_PMU_LEGACY) += riscv_pmu_legacy.o
>  obj-$(CONFIG_RISCV_PMU_SBI) += riscv_pmu_sbi.o
> +obj-$(CONFIG_RISCV_IOMMU_PMU) += riscv_iommu_pmu.o
>  obj-$(CONFIG_STARFIVE_STARLINK_PMU) += starfive_starlink_pmu.o
>  obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
>  obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
> diff --git a/drivers/perf/riscv_iommu_pmu.c b/drivers/perf/riscv_iommu_pmu.c
> new file mode 100644
> index 000000000000..72fc4341b165
> --- /dev/null
> +++ b/drivers/perf/riscv_iommu_pmu.c
> @@ -0,0 +1,661 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2026 SiFive
> + *
> + * Authors
> + *	Zong Li <zong.li@sifive.com>
> + */
> +
> +#include <linux/auxiliary_bus.h>
> +#include <linux/io-64-nonatomic-hi-lo.h>
> +#include <linux/perf_event.h>
> +
> +#include "../iommu/riscv/iommu.h"
> +
> +/* 5.19 Performance monitoring counter overflow status (32bits) */
> +#define RISCV_IOMMU_REG_IOCOUNTOVF	0x0058
> +#define RISCV_IOMMU_IOCOUNTOVF_CY	BIT(0)
> +#define RISCV_IOMMU_IOCOUNTOVF_HPM	GENMASK_ULL(31, 1)
> +
> +/* 5.20 Performance monitoring counter inhibits (32bits) */
> +#define RISCV_IOMMU_REG_IOCOUNTINH	0x005C
> +#define RISCV_IOMMU_IOCOUNTINH_CY	BIT(0)
> +#define RISCV_IOMMU_IOCOUNTINH_HPM	GENMASK(31, 0)
> +
> +/* 5.21 Performance monitoring cycles counter (64bits) */
> +#define RISCV_IOMMU_REG_IOHPMCYCLES	0x0060
> +#define RISCV_IOMMU_IOHPMCYCLES_COUNTER	GENMASK_ULL(62, 0)
> +#define RISCV_IOMMU_IOHPMCYCLES_OF	BIT_ULL(63)
> +#define RISCV_IOMMU_REG_IOHPMCTR(_n)	(RISCV_IOMMU_REG_IOHPMCYCLES + ((_n) * 0x8))
> +
> +/* 5.22 Performance monitoring event counters (31 * 64bits) */
> +#define RISCV_IOMMU_REG_IOHPMCTR_BASE	0x0068
> +#define RISCV_IOMMU_IOHPMCTR_COUNTER	GENMASK_ULL(63, 0)
> +
> +/* 5.23 Performance monitoring event selectors (31 * 64bits) */
> +#define RISCV_IOMMU_REG_IOHPMEVT_BASE	0x0160
> +#define RISCV_IOMMU_REG_IOHPMEVT(_n)	(RISCV_IOMMU_REG_IOHPMEVT_BASE + ((_n) * 0x8))
> +#define RISCV_IOMMU_IOHPMEVT_EVENTID	GENMASK_ULL(14, 0)
> +#define RISCV_IOMMU_IOHPMEVT_DMASK	BIT_ULL(15)
> +#define RISCV_IOMMU_IOHPMEVT_PID_PSCID	GENMASK_ULL(35, 16)
> +#define RISCV_IOMMU_IOHPMEVT_DID_GSCID	GENMASK_ULL(59, 36)
> +#define RISCV_IOMMU_IOHPMEVT_PV_PSCV	BIT_ULL(60)
> +#define RISCV_IOMMU_IOHPMEVT_DV_GSCV	BIT_ULL(61)
> +#define RISCV_IOMMU_IOHPMEVT_IDT	BIT_ULL(62)
> +#define RISCV_IOMMU_IOHPMEVT_OF		BIT_ULL(63)
> +#define RISCV_IOMMU_IOHPMEVT_EVENT	GENMASK_ULL(62, 0)
> +
> +/* The total number of counters is 31 event counters plus 1 cycle counter */
> +#define RISCV_IOMMU_HPM_COUNTER_NUM	32
> +
> +static int cpuhp_state;
> +
> +/**
> + * enum riscv_iommu_hpmevent_id - Performance-monitoring event identifier
> + *
> + * @RISCV_IOMMU_HPMEVENT_CYCLE: Clock cycle counter
> + * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests
> + * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests
> + * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests
> + * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses
> + * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks
> + * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks
> + * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: First-stage page table walks
> + * @RISCV_IOMMU_HPMEVENT_G_WALKS: Second-stage page table walks
> + * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs
> + *
> + * The specification does not define an event ID for counting the
> + * number of clock cycles, meaning there is no associated 'iohpmevt0'.
> + * Event ID 0 is an invalid event and does not overlap with any valid
> + * event ID. Let's repurpose ID 0 as the cycle for perf, the cycle
> + * event is not actually written into any register, it serves solely
> + * as an identifier.
> + */
> +enum riscv_iommu_hpmevent_id {
> +	RISCV_IOMMU_HPMEVENT_CYCLE	= 0,
> +	RISCV_IOMMU_HPMEVENT_URQ        = 1,
> +	RISCV_IOMMU_HPMEVENT_TRQ        = 2,
> +	RISCV_IOMMU_HPMEVENT_ATS_RQ     = 3,
> +	RISCV_IOMMU_HPMEVENT_TLB_MISS   = 4,
> +	RISCV_IOMMU_HPMEVENT_DD_WALK    = 5,
> +	RISCV_IOMMU_HPMEVENT_PD_WALK    = 6,
> +	RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7,
> +	RISCV_IOMMU_HPMEVENT_G_WALKS    = 8,
> +	RISCV_IOMMU_HPMEVENT_MAX        = 9
> +};
> +
> +struct riscv_iommu_pmu {
> +	struct pmu pmu;
> +	struct hlist_node node;
> +	void __iomem *reg;
> +	unsigned int on_cpu;
> +	int num_counters;
> +	u64 cycle_cntr_mask;
> +	u64 event_cntr_mask;
> +	struct perf_event *events[RISCV_IOMMU_HPM_COUNTER_NUM];
> +	DECLARE_BITMAP(used_counters, RISCV_IOMMU_HPM_COUNTER_NUM);
> +};
> +
> +#define to_riscv_iommu_pmu(p) (container_of(p, struct riscv_iommu_pmu, pmu))
> +
> +#define RISCV_IOMMU_PMU_ATTR_EXTRACTOR(_name, _mask)			\
> +	static inline u32 get_##_name(struct perf_event *event)		\
> +	{								\
> +		return FIELD_GET(_mask, event->attr.config);		\
> +	}								\
> +
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(event, RISCV_IOMMU_IOHPMEVT_EVENTID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(partial_matching, RISCV_IOMMU_IOHPMEVT_DMASK);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(pid_pscid, RISCV_IOMMU_IOHPMEVT_PID_PSCID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(did_gscid, RISCV_IOMMU_IOHPMEVT_DID_GSCID);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_pid_pscid, RISCV_IOMMU_IOHPMEVT_PV_PSCV);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_did_gscid, RISCV_IOMMU_IOHPMEVT_DV_GSCV);
> +RISCV_IOMMU_PMU_ATTR_EXTRACTOR(filter_id_type, RISCV_IOMMU_IOHPMEVT_IDT);
> +
> +/* Formats */
> +PMU_FORMAT_ATTR(event,            "config:0-14");
> +PMU_FORMAT_ATTR(partial_matching, "config:15");
> +PMU_FORMAT_ATTR(pid_pscid,        "config:16-35");
> +PMU_FORMAT_ATTR(did_gscid,        "config:36-59");
> +PMU_FORMAT_ATTR(filter_pid_pscid, "config:60");
> +PMU_FORMAT_ATTR(filter_did_gscid, "config:61");
> +PMU_FORMAT_ATTR(filter_id_type,   "config:62");
> +
> +static struct attribute *riscv_iommu_pmu_formats[] = {
> +	&format_attr_event.attr,
> +	&format_attr_partial_matching.attr,
> +	&format_attr_pid_pscid.attr,
> +	&format_attr_did_gscid.attr,
> +	&format_attr_filter_pid_pscid.attr,
> +	&format_attr_filter_did_gscid.attr,
> +	&format_attr_filter_id_type.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_format_group = {
> +	.name = "format",
> +	.attrs = riscv_iommu_pmu_formats,
> +};
> +
> +/* Events */
> +static ssize_t riscv_iommu_pmu_event_show(struct device *dev,
> +					  struct device_attribute *attr,
> +					  char *page)
> +{
> +	struct perf_pmu_events_attr *pmu_attr;
> +
> +	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
> +
> +	return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
> +}
> +
> +#define RISCV_IOMMU_PMU_EVENT_ATTR(name, id)			\
> +	PMU_EVENT_ATTR_ID(name, riscv_iommu_pmu_event_show, id)
> +
> +static struct attribute *riscv_iommu_pmu_events[] = {
> +	RISCV_IOMMU_PMU_EVENT_ATTR(cycle, RISCV_IOMMU_HPMEVENT_CYCLE),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(untranslated_req, RISCV_IOMMU_HPMEVENT_URQ),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(translated_req, RISCV_IOMMU_HPMEVENT_TRQ),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(ats_trans_req, RISCV_IOMMU_HPMEVENT_ATS_RQ),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(tlb_miss, RISCV_IOMMU_HPMEVENT_TLB_MISS),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(ddt_walks, RISCV_IOMMU_HPMEVENT_DD_WALK),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(pdt_walks, RISCV_IOMMU_HPMEVENT_PD_WALK),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(s_vs_pt_walks, RISCV_IOMMU_HPMEVENT_S_VS_WALKS),
> +	RISCV_IOMMU_PMU_EVENT_ATTR(g_pt_walks, RISCV_IOMMU_HPMEVENT_G_WALKS),
> +	NULL,
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_events_group = {
> +	.name = "events",
> +	.attrs = riscv_iommu_pmu_events,
> +};
> +
> +/* cpumask */
> +static ssize_t riscv_iommu_cpumask_show(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(dev_get_drvdata(dev));
> +
> +	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pmu->on_cpu));
> +}
> +
> +static struct device_attribute riscv_iommu_cpumask_attr =
> +	__ATTR(cpumask, 0444, riscv_iommu_cpumask_show, NULL);
> +
> +static struct attribute *riscv_iommu_cpumask_attrs[] = {
> +	&riscv_iommu_cpumask_attr.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group riscv_iommu_pmu_cpumask_group = {
> +	.attrs = riscv_iommu_cpumask_attrs,
> +};
> +
> +static const struct attribute_group *riscv_iommu_pmu_attr_grps[] = {
> +	&riscv_iommu_pmu_cpumask_group,
> +	&riscv_iommu_pmu_format_group,
> +	&riscv_iommu_pmu_events_group,
> +	NULL,
> +};
> +
> +/* PMU Operations */
> +static void riscv_iommu_pmu_set_counter(struct riscv_iommu_pmu *pmu, u32 idx,
> +					u64 value)
> +{
> +	u64 counter_mask = idx ? pmu->event_cntr_mask : pmu->cycle_cntr_mask;
> +
> +	writeq(value & counter_mask, pmu->reg + RISCV_IOMMU_REG_IOHPMCTR(idx));
> +}
> +
> +static u64 riscv_iommu_pmu_get_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	u64 value, counter_mask = idx ? pmu->event_cntr_mask : pmu->cycle_cntr_mask;
> +
> +	/* Use readq to read counter would be imprecise on 32-bits system */
> +	value = readq(pmu->reg + RISCV_IOMMU_REG_IOHPMCTR(idx)) & counter_mask;
As stated in the RISC-V IOMMU Specification, Chapter 6:
  Whether an 8-byte access to an IOMMU register is single-copy atomic is
  UNSPECIFIED, and such an access may appear, internally to the IOMMU,
  as if two separate 4-byte accesses — first to the high half and second
  to the low half — were performed.

Therefore, the atomicity of 64-bit MMIO accesses is UNSPECIFIED in the
current ratified RISC-V IOMMU specification. To handle this correctly,
the Linux RISC-V IOMMU driver should fall back to 32-bit MMIO accesses
when reading a 64-bit HPM counter register.

Additionally, David Laight has proposed an optimization that uses a
hi-lo-hi read pattern built from 32-bit MMIO accesses and needs no
retry loop [1].

[1]: https://lore.kernel.org/linux-riscv/20260618143634.7f3dd6c5@pumpkin/

> +
> +	/* The bit 63 of cycle counter (i.e., idx == 0) is OF bit */
> +	return idx ? value : (value & ~RISCV_IOMMU_IOHPMCYCLES_OF);
> +}
> +
> +static bool is_cycle_event(u64 event)
> +{
> +	return event == RISCV_IOMMU_HPMEVENT_CYCLE;
> +}
> +
> +static void riscv_iommu_pmu_set_event(struct riscv_iommu_pmu *pmu, u32 idx,
> +				      u64 value)
> +{
> +	/* There is no associtated IOHPMEVT0 for IOHPMCYCLES */
> +	if (is_cycle_event(value))
> +		return;
> +
> +	/* Event counter start from idx 1 */
> +	writeq(FIELD_GET(RISCV_IOMMU_IOHPMEVT_EVENT, value),
> +	       pmu->reg + RISCV_IOMMU_REG_IOHPMEVT(idx - 1));
> +}
> +
> +static void riscv_iommu_pmu_enable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	u32 value = readl(addr);
> +
> +	writel(value & ~BIT(idx), addr);
> +}
> +
> +static void riscv_iommu_pmu_disable_counter(struct riscv_iommu_pmu *pmu, u32 idx)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	u32 value = readl(addr);
> +
> +	writel(value | BIT(idx), addr);
> +}
> +
> +static void riscv_iommu_pmu_start_all(struct riscv_iommu_pmu *pmu)
> +{
> +	void __iomem *addr = pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	u32 used_cntr = 0;
> +
> +	/* The performance-monitoring counter inhibits is a 32-bit WARL register */
> +	bitmap_to_arr32(&used_cntr, pmu->used_counters, pmu->num_counters);
> +
> +	writel(~used_cntr, addr);
> +}
> +
> +static void riscv_iommu_pmu_stop_all(struct riscv_iommu_pmu *pmu)
> +{
> +	writel(GENMASK_ULL(pmu->num_counters - 1, 0),
> +	       pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH);
> +}
> +
> +/* PMU APIs */
> +static void riscv_iommu_pmu_set_period(struct perf_event *event)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	u64 counter_mask = hwc->idx ? pmu->event_cntr_mask : pmu->cycle_cntr_mask;
> +	u64 period;
> +
> +	/*
> +	 * Limit the maximum period to prevent the counter value
> +	 * from overtaking the one we are about to program.
> +	 * In effect we are reducing max_period to account for
> +	 * interrupt latency (and we are being very conservative).
> +	 */
> +	period = counter_mask >> 1;
> +	riscv_iommu_pmu_set_counter(pmu, hwc->idx, period);
> +	local64_set(&hwc->prev_count, period);
> +}
> +
> +static int riscv_iommu_pmu_event_init(struct perf_event *event)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct perf_event *sibling;
> +	int total_event_counters = pmu->num_counters - 1;
> +	int counters = 0;
> +
> +	if (event->attr.type != event->pmu->type)
> +		return -ENOENT;
> +
> +	if (hwc->sample_period)
> +		return -EOPNOTSUPP;
> +
> +	if (event->cpu < 0)
> +		return -EOPNOTSUPP;
> +
> +	event->cpu = pmu->on_cpu;
> +
> +	hwc->idx = -1;
> +	hwc->config = event->attr.config;
> +
> +	if (event->group_leader == event)
> +		return 0;
> +
> +	if (is_cycle_event(get_event(event->group_leader)))
> +		if (++counters > total_event_counters)
> +			return -EINVAL;
> +
> +	for_each_sibling_event(sibling, event->group_leader) {
> +		if (is_cycle_event(get_event(sibling)))
> +			continue;
> +
> +		if (sibling->pmu != event->pmu && !is_software_event(sibling))
> +			return -EINVAL;
> +
> +		if (++counters > total_event_counters)
> +			return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static void riscv_iommu_pmu_update(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	u64 delta, prev, now;
> +	u32 idx = hwc->idx;
> +	u64 counter_mask = idx ? pmu->event_cntr_mask : pmu->cycle_cntr_mask;
> +
> +	do {
> +		prev = local64_read(&hwc->prev_count);
> +		now = riscv_iommu_pmu_get_counter(pmu, idx);
> +	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
> +
> +	delta = (now - prev) & counter_mask;
> +	local64_add(delta, &event->count);
> +}
> +
> +static void riscv_iommu_pmu_start(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> +		return;
> +
> +	if (flags & PERF_EF_RELOAD)
> +		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
> +
> +	hwc->state = 0;
> +	riscv_iommu_pmu_set_period(event);
> +	riscv_iommu_pmu_set_event(pmu, hwc->idx, hwc->config);
> +	riscv_iommu_pmu_enable_counter(pmu, hwc->idx);
> +
> +	perf_event_update_userpage(event);
> +}
> +
> +static void riscv_iommu_pmu_stop(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	int idx = hwc->idx;
> +
> +	if (hwc->state & PERF_HES_STOPPED)
> +		return;
> +
> +	riscv_iommu_pmu_disable_counter(pmu, idx);
> +
> +	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE))
> +		riscv_iommu_pmu_update(event);
> +
> +	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +}
> +
> +static int riscv_iommu_pmu_add(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	unsigned int num_counters = pmu->num_counters;
> +	int idx;
> +
> +	/* Reserve index zero for iohpmcycles */
> +	if (is_cycle_event(get_event(event)))
> +		idx = 0;
> +	else
> +		idx = find_next_zero_bit(pmu->used_counters, num_counters, 1);
> +
> +	/* All event counters or cycle counter are in use */
> +	if (idx == num_counters || pmu->events[idx])
> +		return -EAGAIN;
> +
> +	set_bit(idx, pmu->used_counters);
> +
> +	pmu->events[idx] = event;
> +	hwc->idx = idx;
> +	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
> +	local64_set(&hwc->prev_count, 0);
> +
> +	if (flags & PERF_EF_START)
> +		riscv_iommu_pmu_start(event, flags);
> +
> +	/* Propagate changes to the userspace mapping. */
> +	perf_event_update_userpage(event);
> +
> +	return 0;
> +}
> +
> +static void riscv_iommu_pmu_read(struct perf_event *event)
> +{
> +	riscv_iommu_pmu_update(event);
> +}
> +
> +static void riscv_iommu_pmu_del(struct perf_event *event, int flags)
> +{
> +	struct riscv_iommu_pmu *pmu = to_riscv_iommu_pmu(event->pmu);
> +	struct hw_perf_event *hwc = &event->hw;
> +	int idx = hwc->idx;
> +
> +	riscv_iommu_pmu_stop(event, PERF_EF_UPDATE);
> +	pmu->events[idx] = NULL;
> +	clear_bit(idx, pmu->used_counters);
> +
> +	perf_event_update_userpage(event);
> +}
> +
> +static int riscv_iommu_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct riscv_iommu_pmu *iommu_pmu;
> +
> +	iommu_pmu = hlist_entry_safe(node, struct riscv_iommu_pmu, node);
> +
> +	if (iommu_pmu->on_cpu == -1)
> +		iommu_pmu->on_cpu = cpu;
> +
> +	return 0;
> +}
> +
> +static int riscv_iommu_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct riscv_iommu_pmu *iommu_pmu;
> +	unsigned int target_cpu;
> +
> +	iommu_pmu = hlist_entry_safe(node, struct riscv_iommu_pmu, node);
> +
> +	if (cpu != iommu_pmu->on_cpu)
> +		return 0;
> +
> +	iommu_pmu->on_cpu = -1;
> +
> +	target_cpu = cpumask_any_but(cpu_online_mask, cpu);
> +	if (target_cpu >= nr_cpu_ids)
> +		return 0;
> +
> +	perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target_cpu);
> +	iommu_pmu->on_cpu = target_cpu;
> +
> +	return 0;
> +}
> +
> +static irqreturn_t riscv_iommu_pmu_handle_irq(struct riscv_iommu_pmu *pmu)
> +{
> +	u32 ovf = readl(pmu->reg + RISCV_IOMMU_REG_IOCOUNTOVF);
> +	int idx;
> +
> +	if (!ovf)
> +		return IRQ_NONE;
> +
> +	riscv_iommu_pmu_stop_all(pmu);
> +
> +	for_each_set_bit(idx, (unsigned long *)&ovf, pmu->num_counters) {
> +		struct perf_event *event = pmu->events[idx];
> +
> +		if (WARN_ON_ONCE(!event))
> +			continue;
> +
> +		riscv_iommu_pmu_update(event);
> +		riscv_iommu_pmu_set_period(event);
> +	}
> +
> +	riscv_iommu_pmu_start_all(pmu);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static irqreturn_t riscv_iommu_pmu_irq_handler(int irq, void *dev_id)
> +{
> +	struct riscv_iommu_pmu *pmu = (struct riscv_iommu_pmu *)dev_id;
> +	irqreturn_t ret;
> +
> +	/* Check whether this interrupt is for PMU */
> +	if (!(readl_relaxed(pmu->reg + RISCV_IOMMU_REG_IPSR) & RISCV_IOMMU_IPSR_PMIP))
> +		return IRQ_NONE;
> +
> +	/* Process PMU IRQ */
> +	ret = riscv_iommu_pmu_handle_irq(pmu);
> +
> +	/* Clear performance monitoring interrupt pending bit */
> +	writel_relaxed(RISCV_IOMMU_IPSR_PMIP, pmu->reg + RISCV_IOMMU_REG_IPSR);
> +
> +	return ret;
> +}
> +
> +static unsigned int riscv_iommu_pmu_get_irq_num(struct riscv_iommu_device *iommu)
> +{
> +	/* Reuse ICVEC.CIV mask for all interrupt vectors mapping */
> +	int vec = (iommu->icvec >> (RISCV_IOMMU_IPSR_PMIP * 4)) & RISCV_IOMMU_ICVEC_CIV;
> +
> +	return iommu->irqs[vec];
> +}
> +
> +static int riscv_iommu_pmu_request_irq(struct riscv_iommu_device *iommu,
> +				       struct riscv_iommu_pmu *pmu)
> +{
> +	unsigned int irq = riscv_iommu_pmu_get_irq_num(iommu);
> +
> +	/*
> +	 * Set the IRQF_ONESHOT flag because this IRQ can be shared with
> +	 * other threaded IRQs by other queues.
> +	 */
> +	return devm_request_irq(iommu->dev, irq, riscv_iommu_pmu_irq_handler,
> +				IRQF_ONESHOT | IRQF_SHARED, dev_name(iommu->dev), pmu);
> +}
> +
> +static void riscv_iommu_pmu_free_irq(struct riscv_iommu_device *iommu,
> +				     struct riscv_iommu_pmu *pmu)
> +{
> +	unsigned int irq = riscv_iommu_pmu_get_irq_num(iommu);
> +
> +	free_irq(irq, pmu);
> +}
> +
> +static int riscv_iommu_pmu_probe(struct auxiliary_device *auxdev,
> +				 const struct auxiliary_device_id *id)
> +{
> +	struct  riscv_iommu_device *iommu_dev = dev_get_platdata(&auxdev->dev);
> +	struct riscv_iommu_pmu *iommu_pmu;
> +	void __iomem *addr;
> +	char *name;
> +	int ret;
> +
> +	iommu_pmu = devm_kzalloc(&auxdev->dev, sizeof(*iommu_pmu), GFP_KERNEL);
> +	if (!iommu_pmu)
> +		return -ENOMEM;
> +
> +	iommu_pmu->reg = iommu_dev->reg;
> +
> +	/* Counter number and width are hardware-implemented. Detect them by write 1s */
> +	addr = iommu_pmu->reg + RISCV_IOMMU_REG_IOCOUNTINH;
> +	writel(RISCV_IOMMU_IOCOUNTINH_HPM, addr);
> +	iommu_pmu->num_counters = hweight32(readl(addr));
> +
> +	addr = iommu_pmu->reg + RISCV_IOMMU_REG_IOHPMCYCLES;
> +	writeq(RISCV_IOMMU_IOHPMCYCLES_COUNTER, addr);
> +	iommu_pmu->cycle_cntr_mask = readq(addr);
> +
> +	/* Assume the width of all event counters are the same */
> +	addr = iommu_pmu->reg + RISCV_IOMMU_REG_IOHPMCTR_BASE;
> +	writeq(RISCV_IOMMU_IOHPMCTR_COUNTER, addr);
> +	iommu_pmu->event_cntr_mask = readq(addr);
> +
> +	iommu_pmu->pmu = (struct pmu) {
> +		.module		= THIS_MODULE,
> +		.parent		= &auxdev->dev,
> +		.task_ctx_nr	= perf_invalid_context,
> +		.event_init	= riscv_iommu_pmu_event_init,
> +		.add		= riscv_iommu_pmu_add,
> +		.del		= riscv_iommu_pmu_del,
> +		.start		= riscv_iommu_pmu_start,
> +		.stop		= riscv_iommu_pmu_stop,
> +		.read		= riscv_iommu_pmu_read,
> +		.attr_groups	= riscv_iommu_pmu_attr_grps,
> +		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
> +	};
> +
> +	auxiliary_set_drvdata(auxdev, iommu_pmu);
> +
> +	name = devm_kasprintf(&auxdev->dev, GFP_KERNEL,
> +			      "riscv_iommu_pmu_%s", dev_name(iommu_dev->dev));
> +	if (!name) {
> +		dev_err(&auxdev->dev, "Failed to create name riscv_iommu_pmu_%s\n",
> +			dev_name(iommu_dev->dev));
> +		return -ENOMEM;
> +	}
> +
> +	/* Bind all events to the same cpu context to avoid race enabling */
> +	iommu_pmu->on_cpu = raw_smp_processor_id();
> +
> +	ret = cpuhp_state_add_instance_nocalls(cpuhp_state, &iommu_pmu->node);
> +	if (ret) {
> +		dev_err(&auxdev->dev, "Failed to register hotplug %s: %d\n", name, ret);
> +		return ret;
> +	}
> +
> +	ret = riscv_iommu_pmu_request_irq(iommu_dev, iommu_pmu);
> +	if (ret) {
> +		dev_err(&auxdev->dev, "Failed to request irq %s: %d\n", name, ret);
> +		goto err_cpuhp_remove;
> +	}
> +
> +	ret = perf_pmu_register(&iommu_pmu->pmu, name, -1);
> +	if (ret) {
> +		dev_err(&auxdev->dev, "Failed to registe %s: %d\n", name, ret);
> +		goto err_free_irq;
> +	}
> +
> +	dev_info(&auxdev->dev, "%s: Registered with %d counters\n",
> +		 name, iommu_pmu->num_counters);
> +
> +	return 0;
> +
> +err_free_irq:
> +	riscv_iommu_pmu_free_irq(iommu_dev, iommu_pmu);
> +err_cpuhp_remove:
> +	cpuhp_state_remove_instance_nocalls(cpuhp_state, &iommu_pmu->node);
> +	return ret;
> +}
> +
> +static const struct auxiliary_device_id riscv_iommu_pmu_id_table[] = {
> +	{ .name = "iommu.pmu" },
> +	{}
> +};
> +MODULE_DEVICE_TABLE(auxiliary, riscv_iommu_pmu_id_table);
> +
> +static struct auxiliary_driver iommu_pmu_driver = {
> +	.probe		= riscv_iommu_pmu_probe,
> +	.id_table	= riscv_iommu_pmu_id_table,
> +};
> +
> +static int __init riscv_iommu_pmu_init(void)
> +{
> +	int ret;
> +
> +	cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> +					      "perf/riscv/iommu:online",
> +					      riscv_iommu_pmu_online_cpu,
> +					      riscv_iommu_pmu_offline_cpu);
> +	if (cpuhp_state < 0)
> +		return cpuhp_state;
> +
> +	ret = auxiliary_driver_register(&iommu_pmu_driver);
> +	if (ret)
> +		cpuhp_remove_multi_state(cpuhp_state);
> +
> +	return ret;
> +}
> +module_init(riscv_iommu_pmu_init);
> +
> +MODULE_DESCRIPTION("RISC-V IOMMU PMU");
> +MODULE_LICENSE("GPL");
> -- 
> 2.43.7
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
> 

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

  reply	other threads:[~2026-06-19 16:06 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-08  6:38 [PATCH v2 0/2] RISC-V IOMMU HPM support Zong Li
2026-02-08  6:38 ` Zong Li
2026-02-08  6:38 ` [PATCH v2 1/2] drivers/perf: riscv-iommu: add risc-v iommu pmu driver Zong Li
2026-02-08  6:38   ` Zong Li
2026-06-19 16:05   ` Guo Ren [this message]
2026-06-19 16:05     ` Guo Ren
2026-02-08  6:38 ` [PATCH v2 2/2] iommu/riscv: create a auxiliary device for HPM Zong Li
2026-02-08  6:38   ` Zong Li
2026-02-09  1:40 ` [PATCH v2 0/2] RISC-V IOMMU HPM support Lv Zheng
2026-02-09  1:40   ` Lv Zheng
2026-02-09  3:19   ` Zong Li
2026-02-09  3:19     ` Zong Li
2026-02-09  3:21   ` yaxing guo
2026-02-09  3:21     ` yaxing guo
2026-02-27  5:54     ` Lv Zheng
2026-02-27  5:54       ` Lv Zheng
2026-03-05  3:51 ` Zong Li
2026-03-05  3:51   ` Zong Li

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ajVo34aTfZFBZpoe@gmail.com \
    --to=guoren@kernel.org \
    --cc=alex@ghiti.fr \
    --cc=andrew.jones@oss.qualcomm.com \
    --cc=aou@eecs.berkeley.edu \
    --cc=conor+dt@kernel.org \
    --cc=david.laight.linux@gmail.com \
    --cc=guoyaxing@bosc.ac.cn \
    --cc=iommu@lists.linux.dev \
    --cc=joro@8bytes.org \
    --cc=krzk@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=linux-riscv@lists.infradead.org \
    --cc=luxu.kernel@bytedance.com \
    --cc=lv.zheng@linux.spacemit.com \
    --cc=mark.rutland@arm.com \
    --cc=palmer@dabbelt.com \
    --cc=pjw@kernel.org \
    --cc=robh@kernel.org \
    --cc=robin.murphy@arm.com \
    --cc=tjeznach@rivosinc.com \
    --cc=will@kernel.org \
    --cc=zong.li@sifive.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.