[RFC PATCH 5/6] mm/damon/vaddr: implement perf-event access check

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
To: sj@kernel.org, akinobu.mita@gmail.com, damon@lists.linux.dev,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org
Cc: akpm@linux-foundation.org, corbet@lwn.net, bijan311@gmail.com,
	ajayjoshi@micron.com, honggyu.kim@sk.com, yunjeong.mun@sk.com,
	ravis.opensrc@gmail.com
Subject: [RFC PATCH 5/6] mm/damon/vaddr: implement perf-event access check
Date: Fri, 29 May 2026 09:56:39 -0700	[thread overview]
Message-ID: <20260529165640.820-6-ravis.opensrc@gmail.com> (raw)
In-Reply-To: <20260529165640.820-1-ravis.opensrc@gmail.com>

Add the perf-event backend used by the substrate.  Two stateless NMI
overflow handlers are picked at perf_event_create_kernel_counter()
time (paddr- vs vaddr-keyed) and called with context = NULL, so the
NMI fast path never dereferences the per-event struct.  Each submits
a damon_access_report into the per-CPU ring.

The vaddr handler drops samples with addr == 0 or addr >= TASK_SIZE.
The paddr handler gates on data->sample_flags & PERF_SAMPLE_PHYS_ADDR
rather than testing data->phys_addr for zero (which would also drop
legitimate page 0).  AMD IBS Op only populates phys_addr when
IBS_OP_DATA3.dc_phy_addr_valid is set; gating on sample_flags is the
documented way to detect that.  is_write is derived from
data->data_src.mem_op.

cpuhp_setup_state_multi() registers one global state at subsys_initcall;
each damon_perf_event is added as an instance in damon_perf_init() so
cpuhp drives per-CPU event creation and offline-time release.  Events
are created with disabled=1 and armed by kdamond_fn() when the
substrate is ready; per-CPU init failures are surfaced via
init_complete / any_cpu_failed so damon_perf_init() rolls back the
cpuhp instance instead of leaving a half-armed event behind.

Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
---
 mm/damon/vaddr.c | 267 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 267 insertions(+)

diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index d271476035641..73fcea91afa07 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -7,11 +7,13 @@
 
 #define pr_fmt(fmt) "damon-va: " fmt
 
+#include <linux/cpu.h>
+#include <linux/cpuhotplug.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
+#include <linux/perf_event.h>
 #include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
 
@@ -957,6 +959,263 @@ static int damon_va_scheme_score(struct damon_ctx *context,
 	return DAMOS_MAX_SCORE;
 }
 
+#ifdef CONFIG_PERF_EVENTS
+
+#define DAMON_PERF_MAX_RECORDS	(1UL << 20)	/* NOTE(review): unused in this patch; confirm user */
+#define DAMON_PERF_INIT_RECORDS	(1UL << 15)	/* NOTE(review): unused in this patch; confirm user */
+
+/*
+ * NMI hot path: avoid every heap dereference.  These handlers carry no
+ * pointer back to the per-event struct because
+ * perf_event_create_kernel_counter() is called with context = NULL.
+ * Each sample is submitted via damon_report_access() into the global
+ * per-CPU SPSC ring, which kdamond_check_reported_accesses() drains.
+ */
+/*
+ * Overflow handler for virtual-address-keyed sampling events.
+ *
+ * Builds a page-granular damon_access_report from the sampled data address
+ * and hands it to damon_report_access().  The sample is dropped when no
+ * sample data is passed, when the PMU did not flag the address as valid,
+ * when the address is 0, or when it is at or above TASK_SIZE.
+ *
+ * @perf_event and @regs are unused; the handler keeps no per-event state.
+ */
+static void damon_perf_overflow_vaddr(struct perf_event *perf_event,
+		struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct damon_access_report report;
+
+	if (!data)
+		return;
+
+	/*
+	 * data->addr is only meaningful when the PMU driver set
+	 * PERF_SAMPLE_ADDR in sample_flags; without the flag the field may
+	 * hold whatever was on the stack.  Gate on the flag, the same way the
+	 * paddr handler gates on PERF_SAMPLE_PHYS_ADDR, and keep dropping
+	 * addr == 0.
+	 */
+	if (!(data->sample_flags & PERF_SAMPLE_ADDR) || !data->addr)
+		return;
+
+	/* Drop kernel-VA hits -- only user-space VAs land in damon vaddr regions. */
+	if (data->addr >= TASK_SIZE)
+		return;
+
+	report = (struct damon_access_report){
+		.vaddr = data->addr & PAGE_MASK,
+		.size = PAGE_SIZE,
+		.cpu = smp_processor_id(),
+		.tid = current->pid,
+		.tgid = current->tgid,
+		/* data_src is likewise valid only when the PMU flagged it. */
+		.is_write = (data->sample_flags & PERF_SAMPLE_DATA_SRC) &&
+			(data->data_src.mem_op & PERF_MEM_OP_STORE),
+	};
+	damon_report_access(&report);
+}
+
+/*
+ * Overflow handler for physical-address-keyed sampling events.
+ *
+ * Builds a page-granular damon_access_report from the sampled physical
+ * address and hands it to damon_report_access().  The sample is dropped
+ * when no sample data is passed or when the PMU did not flag the physical
+ * address as valid.
+ *
+ * @perf_event and @regs are unused; the handler keeps no per-event state.
+ */
+static void damon_perf_overflow_paddr(struct perf_event *perf_event,
+		struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct damon_access_report report;
+
+	if (!data)
+		return;
+
+	/*
+	 * AMD IBS Op only populates data->phys_addr when
+	 * IBS_OP_DATA3.dc_phy_addr_valid is set; otherwise the field
+	 * carries a stale value.  Gate on sample_flags rather than testing
+	 * phys_addr for zero (which would also drop legitimate page 0).
+	 */
+	if (!(data->sample_flags & PERF_SAMPLE_PHYS_ADDR))
+		return;
+
+	report = (struct damon_access_report){
+		.paddr = data->phys_addr & PAGE_MASK,
+		.size = PAGE_SIZE,
+		.cpu = smp_processor_id(),
+		/*
+		 * Apply the same sample_flags rule to data_src: only trust
+		 * mem_op when the PMU flagged PERF_SAMPLE_DATA_SRC, otherwise
+		 * report the access as a non-write.
+		 */
+		.is_write = (data->sample_flags & PERF_SAMPLE_DATA_SRC) &&
+			(data->data_src.mem_op & PERF_MEM_OP_STORE),
+	};
+	damon_report_access(&report);
+}
+
+static enum cpuhp_state damon_perf_cpuhp_state;	/* set in damon_va_initcall() */
+
+/*
+ * Fill @attr for a per-CPU kernel counter from the settings stored in
+ * @event->attr.
+ *
+ * The counter is always pinned and created disabled.  TIME, ADDR, PERIOD
+ * and DATA_SRC are always sampled; PHYS_ADDR and WEIGHT_STRUCT are added
+ * only when the corresponding @event->attr toggles are set.
+ */
+static void damon_perf_event_init_attr(struct damon_perf_event *event,
+		struct perf_event_attr *attr)
+{
+	u64 sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_ADDR |
+			PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC;
+
+	if (event->attr.sample_phys_addr)
+		sample_type |= PERF_SAMPLE_PHYS_ADDR;
+	if (event->attr.sample_weight_struct)
+		sample_type |= PERF_SAMPLE_WEIGHT_STRUCT;
+
+	*attr = (struct perf_event_attr) {
+		.type = event->attr.type,
+		.size = sizeof(*attr),
+		.config = event->attr.config,
+		.config1 = event->attr.config1,
+		.config2 = event->attr.config2,
+		.sample_type = sample_type,
+		.freq = event->attr.freq,
+		.precise_ip = event->attr.precise_ip,
+		.wakeup_events = event->attr.wakeup_events,
+		.exclude_kernel = event->attr.exclude_kernel,
+		.exclude_hv = event->attr.exclude_hv,
+		.pinned = 1,
+		.disabled = 1,
+	};
+
+	/*
+	 * sample_period and sample_freq are a union in perf_event_attr, so
+	 * exactly one of them is written, chosen by the freq toggle.  This
+	 * keeps both frequency-based callers (PEBS) and period-based callers
+	 * (AMD IBS Op MaxCnt) working.
+	 */
+	if (!event->attr.freq)
+		attr->sample_period = event->attr.sample_period;
+	else
+		attr->sample_freq = event->attr.sample_freq;
+}
+
+/*
+ * cpuhp online callback: create the kernel counter of one damon_perf_event
+ * instance on @cpu and store it in the instance's per-CPU slot.
+ *
+ * Always returns 0 so that a counter creation failure never blocks CPU
+ * online.  A failure seen before init_complete is set is recorded in
+ * any_cpu_failed instead.
+ */
+static int damon_perf_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct damon_perf_event *event;
+	struct damon_perf *perf;
+	struct perf_event_attr attr;
+	struct perf_event *counter;
+	perf_overflow_handler_t overflow_fn;
+
+	event = hlist_entry(node, struct damon_perf_event, hlist_node);
+	perf = event->priv;
+	if (!perf)
+		return 0;
+
+	damon_perf_event_init_attr(event, &attr);
+
+	/*
+	 * The paddr- or vaddr-specific handler is chosen here, at create
+	 * time, so the NMI fast path does not branch on it.  The context
+	 * argument is NULL: the handlers are stateless with respect to the
+	 * per-event struct and submit via damon_report_access() only.
+	 */
+	if (event->attr.sample_phys_addr)
+		overflow_fn = damon_perf_overflow_paddr;
+	else
+		overflow_fn = damon_perf_overflow_vaddr;
+
+	counter = perf_event_create_kernel_counter(&attr, cpu, NULL,
+			overflow_fn, NULL);
+	if (IS_ERR(counter)) {
+		pr_warn_ratelimited("damon-perf: cpu %u event create failed: %ld\n",
+				cpu, PTR_ERR(counter));
+		if (!event->init_complete)
+			event->any_cpu_failed = true;
+		return 0;	/* never block CPU online */
+	}
+	*per_cpu_ptr(perf->event, cpu) = counter;
+
+	/*
+	 * Counters are created with attr.disabled = 1.  When this CPU comes
+	 * online after the context has already been armed, enable the new
+	 * counter right away so its coverage matches the CPUs that were
+	 * online at arm time.
+	 */
+	if (!event->ctx)
+		return 0;
+	if (READ_ONCE(event->ctx->perf_events_active))
+		perf_event_enable(counter);
+	return 0;
+}
+
+/*
+ * cpuhp offline callback: disable and release the counter that one
+ * damon_perf_event instance owns on @cpu, then clear its per-CPU slot.
+ *
+ * Always returns 0; an instance without private data or without a counter
+ * on @cpu is left untouched.
+ */
+static int damon_perf_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct damon_perf_event *event;
+	struct damon_perf *perf;
+	struct perf_event **slot;
+
+	event = hlist_entry(node, struct damon_perf_event, hlist_node);
+	perf = event->priv;
+	if (!perf)
+		return 0;
+
+	slot = per_cpu_ptr(perf->event, cpu);
+	if (!*slot)
+		return 0;
+
+	perf_event_disable(*slot);
+	perf_event_release_kernel(*slot);
+	*slot = NULL;
+	return 0;
+}
+
+/*
+ * Enable the counters of @event on every online CPU.
+ *
+ * Does nothing when @event has no private data.  CPUs without a counter
+ * (NULL per-CPU slot) are skipped.
+ *
+ * NOTE(review): takes cpus_read_lock(), so the caller must be able to sleep
+ * and must not already hold the CPU hotplug lock -- confirm for all callers.
+ */
+void damon_perf_event_arm(struct damon_perf_event *event)
+{
+	struct damon_perf *perf = event->priv;
+	struct perf_event *perf_event;
+	int cpu;
+
+	if (!perf)
+		return;
+
+	/*
+	 * Hold off CPU hotplug for the walk.  Otherwise
+	 * damon_perf_cpu_offline() could release a counter between the
+	 * per-CPU slot read and perf_event_enable(), leaving a dangling
+	 * pointer here.
+	 */
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		perf_event = *per_cpu_ptr(perf->event, cpu);
+		if (perf_event)
+			perf_event_enable(perf_event);
+	}
+	cpus_read_unlock();
+}
+
+/*
+ * Disable the counters of @event on every online CPU.
+ *
+ * Does nothing when @event has no private data.  CPUs without a counter
+ * (NULL per-CPU slot) are skipped.  The counters are only disabled, not
+ * released.
+ *
+ * NOTE(review): takes cpus_read_lock(), so the caller must be able to sleep
+ * and must not already hold the CPU hotplug lock -- confirm for all callers.
+ */
+void damon_perf_event_disarm(struct damon_perf_event *event)
+{
+	struct damon_perf *perf = event->priv;
+	struct perf_event *perf_event;
+	int cpu;
+
+	if (!perf)
+		return;
+
+	/*
+	 * Hold off CPU hotplug for the walk.  Otherwise
+	 * damon_perf_cpu_offline() could release a counter between the
+	 * per-CPU slot read and perf_event_disable(), leaving a dangling
+	 * pointer here.
+	 */
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		perf_event = *per_cpu_ptr(perf->event, cpu);
+		if (perf_event)
+			perf_event_disable(perf_event);
+	}
+	cpus_read_unlock();
+}
+
+/*
+ * Allocate the private per-CPU counter table of @event, bind @event to @ctx
+ * and register it as an instance of the damon perf cpuhp state so that a
+ * counter is created on every online CPU.
+ *
+ * Return: 0 on success, -ENOMEM when an allocation fails, the error from
+ * cpuhp_state_add_instance() when registration fails, or -ENODEV when
+ * counter creation failed on at least one CPU during registration.  On
+ * failure the private data is freed and @event->priv is reset to NULL.
+ */
+int damon_perf_init(struct damon_ctx *ctx, struct damon_perf_event *event)
+{
+	struct damon_perf *perf;
+	int err = -ENOMEM;
+
+	perf = kzalloc(sizeof(*perf), GFP_KERNEL);
+	if (!perf)
+		return -ENOMEM;
+
+	perf->event = alloc_percpu(typeof(*perf->event));
+	if (!perf->event)
+		goto free_perf;
+
+	event->priv = perf;
+	event->ctx = ctx;
+	/*
+	 * Start from clean bookkeeping.  If @event went through an earlier
+	 * init (NOTE(review): confirm whether callers reuse the struct), a
+	 * stale any_cpu_failed would turn this attempt into -ENODEV, and a
+	 * stale init_complete would keep the online callback from recording
+	 * failures of this attempt.
+	 */
+	event->init_complete = false;
+	event->any_cpu_failed = false;
+	INIT_HLIST_NODE(&event->hlist_node);
+
+	/*
+	 * cpuhp_state_add_instance() invokes the online callback synchronously
+	 * for every currently-online CPU; late-online CPUs subsequently get
+	 * an event automatically and offline CPUs release theirs cleanly.
+	 */
+	err = cpuhp_state_add_instance(damon_perf_cpuhp_state,
+			&event->hlist_node);
+	if (err)
+		goto free_event;
+
+	event->init_complete = true;
+	if (event->any_cpu_failed) {
+		/*
+		 * At least one CPU has no counter.  Unregister the instance
+		 * (releasing the counters that were created) instead of
+		 * running with partial coverage.
+		 */
+		cpuhp_state_remove_instance(damon_perf_cpuhp_state,
+				&event->hlist_node);
+		err = -ENODEV;
+		goto free_event;
+	}
+
+	return 0;
+
+free_event:
+	free_percpu(perf->event);
+free_perf:
+	kfree(perf);
+	event->priv = NULL;
+	return err;
+}
+
+/*
+ * Undo damon_perf_init() for @event: unregister the cpuhp instance, free
+ * the per-CPU counter table and the private data, and reset @event->priv.
+ *
+ * Does nothing when @event has no private data.  @ctx is not used.
+ */
+void damon_perf_cleanup(struct damon_ctx *ctx, struct damon_perf_event *event)
+{
+	struct damon_perf *perf = event->priv;
+
+	if (perf) {
+		cpuhp_state_remove_instance(damon_perf_cpuhp_state,
+				&event->hlist_node);
+
+		free_percpu(perf->event);
+		kfree(perf);
+		event->priv = NULL;
+	}
+}
+
+#endif /* CONFIG_PERF_EVENTS */
+
 static int __init damon_va_initcall(void)
 {
 	struct damon_operations ops = {
@@ -979,6 +1238,14 @@ static int __init damon_va_initcall(void)
 	ops_fvaddr.init = NULL;
 	ops_fvaddr.update = NULL;
 
+#ifdef CONFIG_PERF_EVENTS
+	err = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "damon/perf:online",
+			damon_perf_cpu_online, damon_perf_cpu_offline);
+	if (err < 0)
+		return err;
+	damon_perf_cpuhp_state = err;
+#endif
+
 	err = damon_register_ops(&ops);
 	if (err)
 		return err;
-- 
2.43.0

next prev parent reply	other threads:[~2026-05-29 16:57 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-29 16:56 [RFC PATCH 0/6] mm/damon: hardware-sampled access reports Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 1/6] mm/damon: add struct damon_perf_event{,_attr} and per-ctx perf_events list Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 2/6] mm/damon/sysfs-sample: expose perf_events configuration via sysfs Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 3/6] mm/damon/sysfs: install perf_events on apply Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 4/6] mm/damon/core: per-CPU SPSC ring drain and damon_perf_event lifecycle Ravi Jonnalagadda
2026-05-29 16:56 ` Ravi Jonnalagadda [this message]
2026-05-29 16:56 ` [RFC PATCH 6/6] mm/damon: add damos_node_eligible_mem_bp tracepoint Ravi Jonnalagadda
2026-05-30  0:04 ` [RFC PATCH 0/6] mm/damon: hardware-sampled access reports SeongJae Park
2026-05-30  3:01   ` Akinobu Mita
2026-05-30  5:03     ` Ravi Jonnalagadda

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:d27147603564 dfblob:73fcea91afa0 )
 OR (
bs:"[RFC PATCH 5/6] mm/damon/vaddr: implement perf-event access check" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260529165640.820-6-ravis.opensrc@gmail.com \
    --to=ravis.opensrc@gmail.com \
    --cc=ajayjoshi@micron.com \
    --cc=akinobu.mita@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=bijan311@gmail.com \
    --cc=corbet@lwn.net \
    --cc=damon@lists.linux.dev \
    --cc=honggyu.kim@sk.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=sj@kernel.org \
    --cc=yunjeong.mun@sk.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.