From: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
To: sj@kernel.org, akinobu.mita@gmail.com, damon@lists.linux.dev,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
linux-doc@vger.kernel.org
Cc: akpm@linux-foundation.org, corbet@lwn.net, bijan311@gmail.com,
ajayjoshi@micron.com, honggyu.kim@sk.com, yunjeong.mun@sk.com,
ravis.opensrc@gmail.com
Subject: [RFC PATCH 5/6] mm/damon/vaddr: implement perf-event access check
Date: Fri, 29 May 2026 09:56:39 -0700 [thread overview]
Message-ID: <20260529165640.820-6-ravis.opensrc@gmail.com> (raw)
In-Reply-To: <20260529165640.820-1-ravis.opensrc@gmail.com>
Add the perf-event backend used by the substrate. Two stateless NMI
overflow handlers are picked at perf_event_create_kernel_counter()
time (paddr- vs vaddr-keyed) and called with context = NULL, so the
NMI fast path never dereferences the per-event struct. Each submits
a damon_access_report into the per-CPU ring.
The vaddr handler drops samples with addr == 0 or addr >= TASK_SIZE.
The paddr handler gates on data->sample_flags & PERF_SAMPLE_PHYS_ADDR
rather than testing data->phys_addr for zero (which would also drop
legitimate page 0). AMD IBS Op only populates phys_addr when
IBS_OP_DATA3.dc_phy_addr_valid is set; gating on sample_flags is the
documented way to detect that. is_write is derived from
data->data_src.mem_op.
cpuhp_setup_state_multi() registers one global state at subsys_initcall;
each damon_perf_event is added as an instance in damon_perf_init() so
cpuhp drives per-CPU event creation and offline-time release. Events
are created with disabled=1 and armed by kdamond_fn() when the
substrate is ready; per-CPU init failures are surfaced via
init_complete / any_cpu_failed so damon_perf_init() rolls back the
cpuhp instance instead of leaving a half-armed event behind.
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
---
mm/damon/vaddr.c | 267 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 267 insertions(+)
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index d271476035641..73fcea91afa07 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -7,11 +7,13 @@
#define pr_fmt(fmt) "damon-va: " fmt
+#include <linux/cpuhotplug.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/perf_event.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
@@ -957,6 +959,263 @@ static int damon_va_scheme_score(struct damon_ctx *context,
return DAMOS_MAX_SCORE;
}
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Record-count constants for the perf backend. Neither macro is referenced
+ * by the code visible in this hunk.
+ * NOTE(review): presumably upper/initial sizing bounds for a sample buffer --
+ * confirm against the consumer.
+ */
+#define DAMON_PERF_MAX_RECORDS (1UL << 20)
+#define DAMON_PERF_INIT_RECORDS (1UL << 15)
+
+/*
+ * NMI hot-path: avoid every heap dereference. These handlers carry no
+ * pointer back to the per-event struct -- perf_event_create_kernel_counter
+ * is called with context = NULL. Submission flows into the global
+ * per-CPU SPSC ring (damon_report_access -> kdamond_check_reported_accesses
+ * drains).
+ */
+/*
+ * damon_perf_overflow_vaddr() - overflow handler for virtual-address samples.
+ * @perf_event:	event that overflowed (not used).
+ * @data:	sample payload handed over by the PMU driver.
+ * @regs:	interrupted register state (not used).
+ *
+ * Builds a page-granular damon_access_report from the sampled data address
+ * and the current task's pid/tgid, then submits it via damon_report_access().
+ * The sample is dropped when @data is NULL, when the driver did not report a
+ * data address, when the address is zero, or when it is at or above
+ * TASK_SIZE.
+ */
+static void damon_perf_overflow_vaddr(struct perf_event *perf_event,
+		struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct damon_access_report report;
+	bool is_write = false;
+
+	if (!data)
+		return;
+
+	/*
+	 * data->addr is only meaningful when the PMU driver flagged it in
+	 * sample_flags. Gate on the flag -- as the paddr handler does for
+	 * PERF_SAMPLE_PHYS_ADDR -- rather than trusting the raw field, which
+	 * may hold a stale value when the driver did not fill it in.
+	 */
+	if (!(data->sample_flags & PERF_SAMPLE_ADDR) || !data->addr)
+		return;
+
+	/* Drop kernel-VA hits -- only user-space VAs land in damon vaddr regions. */
+	if (data->addr >= TASK_SIZE)
+		return;
+
+	/* data_src is likewise valid only when the driver reported it. */
+	if (data->sample_flags & PERF_SAMPLE_DATA_SRC)
+		is_write = !!(data->data_src.mem_op & PERF_MEM_OP_STORE);
+
+	report = (struct damon_access_report){
+		.vaddr = data->addr & PAGE_MASK,
+		.size = PAGE_SIZE,
+		.cpu = smp_processor_id(),
+		.tid = current->pid,
+		.tgid = current->tgid,
+		.is_write = is_write,
+	};
+	damon_report_access(&report);
+}
+
+/*
+ * damon_perf_overflow_paddr() - overflow handler for physical-address samples.
+ * @perf_event:	event that overflowed (not used).
+ * @data:	sample payload handed over by the PMU driver.
+ * @regs:	interrupted register state (not used).
+ *
+ * Builds a page-granular damon_access_report keyed by physical address and
+ * submits it via damon_report_access(). The sample is dropped when @data is
+ * NULL or when the driver did not report a physical address.
+ */
+static void damon_perf_overflow_paddr(struct perf_event *perf_event,
+		struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct damon_access_report report;
+	bool is_write = false;
+
+	if (!data)
+		return;
+
+	/*
+	 * AMD IBS Op only populates data->phys_addr when
+	 * IBS_OP_DATA3.dc_phy_addr_valid is set; otherwise the field
+	 * carries a stale value. Gate on sample_flags rather than testing
+	 * phys_addr for zero (which would also drop legitimate page 0).
+	 */
+	if (!(data->sample_flags & PERF_SAMPLE_PHYS_ADDR))
+		return;
+
+	/*
+	 * Apply the same validity rule to data_src: only derive is_write
+	 * when the driver flagged the field, otherwise report a read.
+	 */
+	if (data->sample_flags & PERF_SAMPLE_DATA_SRC)
+		is_write = !!(data->data_src.mem_op & PERF_MEM_OP_STORE);
+
+	report = (struct damon_access_report){
+		.paddr = data->phys_addr & PAGE_MASK,
+		.size = PAGE_SIZE,
+		.cpu = smp_processor_id(),
+		.is_write = is_write,
+	};
+	damon_report_access(&report);
+}
+
+/*
+ * cpuhp state shared by every damon_perf_event instance. Assigned in
+ * damon_va_initcall() from the cpuhp_setup_state_multi() return value and
+ * passed to cpuhp_state_add_instance() / cpuhp_state_remove_instance().
+ */
+static enum cpuhp_state damon_perf_cpuhp_state;
+
+/*
+ * Populate @attr for the per-CPU kernel counter backing @event.
+ *
+ * The user-supplied fields of event->attr are copied over; the counter is
+ * always requested pinned and disabled. TIME, ADDR, PERIOD and DATA_SRC are
+ * always sampled; PHYS_ADDR and WEIGHT_STRUCT are added only when the
+ * corresponding event->attr toggles are set.
+ */
+static void damon_perf_event_init_attr(struct damon_perf_event *event,
+		struct perf_event_attr *attr)
+{
+	u64 sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_ADDR |
+			  PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC;
+
+	if (event->attr.sample_phys_addr)
+		sample_type |= PERF_SAMPLE_PHYS_ADDR;
+	if (event->attr.sample_weight_struct)
+		sample_type |= PERF_SAMPLE_WEIGHT_STRUCT;
+
+	*attr = (struct perf_event_attr) {
+		.size = sizeof(*attr),
+		.type = event->attr.type,
+		.config = event->attr.config,
+		.config1 = event->attr.config1,
+		.config2 = event->attr.config2,
+		.freq = event->attr.freq,
+		.sample_type = sample_type,
+		.precise_ip = event->attr.precise_ip,
+		.pinned = 1,
+		.disabled = 1,
+		.wakeup_events = event->attr.wakeup_events,
+		.exclude_kernel = event->attr.exclude_kernel,
+		.exclude_hv = event->attr.exclude_hv,
+	};
+
+	/*
+	 * sample_period and sample_freq alias the same storage in
+	 * perf_event_attr, so exactly one of them is written, chosen by the
+	 * freq toggle. This keeps both period-based and frequency-based
+	 * configurations working.
+	 */
+	if (!event->attr.freq)
+		attr->sample_period = event->attr.sample_period;
+	else
+		attr->sample_freq = event->attr.sample_freq;
+}
+
+/*
+ * cpuhp online callback for one damon_perf_event instance.
+ *
+ * Creates the kernel counter for @cpu, stores it in the instance's per-CPU
+ * slot and, if the owning context already has perf_events_active set,
+ * enables it right away. @node is the hlist_node embedded in the
+ * damon_perf_event.
+ *
+ * Always returns 0, including when counter creation fails; the failure is
+ * reported through event->any_cpu_failed instead.
+ */
+static int damon_perf_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+ struct damon_perf_event *event = hlist_entry(node,
+ struct damon_perf_event, hlist_node);
+ struct damon_perf *perf = event->priv;
+ struct perf_event_attr attr;
+ struct perf_event *perf_event;
+ perf_overflow_handler_t handler;
+
+ /* No backend state attached to this event: nothing to create. */
+ if (!perf)
+ return 0;
+
+ damon_perf_event_init_attr(event, &attr);
+
+ /*
+ * Pick a paddr- or vaddr-specific handler at create time so the
+ * NMI fast path is statically branched. Pass NULL as context --
+ * handlers are stateless wrt the per-event struct, so the NMI
+ * fast path performs no per-event heap dereference. Submission
+ * flows into the global per-CPU SPSC ring via damon_report_access().
+ */
+ handler = event->attr.sample_phys_addr ?
+ damon_perf_overflow_paddr : damon_perf_overflow_vaddr;
+
+ perf_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
+ handler, NULL);
+ if (IS_ERR(perf_event)) {
+ pr_warn_ratelimited("damon-perf: cpu %u event create failed: %ld\n",
+ cpu, PTR_ERR(perf_event));
+ /*
+ * Only failures seen before init_complete is set are recorded;
+ * damon_perf_init() checks any_cpu_failed right after setting it.
+ */
+ if (!event->init_complete)
+ event->any_cpu_failed = true;
+ return 0; /* never block CPU online */
+ }
+ /* Publish the counter in this CPU's slot before possibly enabling it. */
+ *per_cpu_ptr(perf->event, cpu) = perf_event;
+ /*
+ * Late-online CPU after the substrate is armed: events are created
+ * with attr.disabled = 1 and would otherwise stay quiescent on this
+ * CPU until the next arm walk. Enable here so coverage matches the
+ * already-online CPUs.
+ */
+ if (event->ctx && READ_ONCE(event->ctx->perf_events_active))
+ perf_event_enable(perf_event);
+ return 0;
+}
+
+/*
+ * cpuhp offline callback for one damon_perf_event instance.
+ *
+ * If @cpu has a counter in the instance's per-CPU slot, the counter is
+ * disabled, released, and the slot is cleared. Always returns 0.
+ */
+static int damon_perf_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct damon_perf_event *event;
+	struct damon_perf *perf;
+	struct perf_event **slot;
+	struct perf_event *counter;
+
+	event = hlist_entry(node, struct damon_perf_event, hlist_node);
+	perf = event->priv;
+	if (!perf)
+		return 0;
+
+	slot = per_cpu_ptr(perf->event, cpu);
+	counter = *slot;
+	if (!counter)
+		return 0;
+
+	/* Quiesce first, then drop the counter, then forget the pointer. */
+	perf_event_disable(counter);
+	perf_event_release_kernel(counter);
+	*slot = NULL;
+	return 0;
+}
+
+/*
+ * damon_perf_event_arm() - enable the counters of @event on all online CPUs.
+ *
+ * CPUs whose per-CPU slot is empty are skipped. Does nothing when @event has
+ * no backend state attached (event->priv is NULL).
+ */
+void damon_perf_event_arm(struct damon_perf_event *event)
+{
+	struct damon_perf *perf = event->priv;
+	int cpu;
+
+	if (!perf)
+		return;
+
+	for_each_online_cpu(cpu) {
+		struct perf_event *counter = *per_cpu_ptr(perf->event, cpu);
+
+		if (!counter)
+			continue;
+		perf_event_enable(counter);
+	}
+}
+
+/*
+ * damon_perf_event_disarm() - disable the counters of @event on all online
+ * CPUs.
+ *
+ * CPUs whose per-CPU slot is empty are skipped. Does nothing when @event has
+ * no backend state attached (event->priv is NULL).
+ */
+void damon_perf_event_disarm(struct damon_perf_event *event)
+{
+	struct damon_perf *perf = event->priv;
+	int cpu;
+
+	if (!perf)
+		return;
+
+	for_each_online_cpu(cpu) {
+		struct perf_event *counter = *per_cpu_ptr(perf->event, cpu);
+
+		if (!counter)
+			continue;
+		perf_event_disable(counter);
+	}
+}
+
+/*
+ * damon_perf_init() - allocate backend state for @event and register it as
+ * a cpuhp instance.
+ * @ctx:	DAMON context that owns @event; stored in event->ctx.
+ * @event:	perf event descriptor to initialise.
+ *
+ * Return: 0 on success; -ENOMEM if an allocation fails; the error from
+ * cpuhp_state_add_instance() if registration fails; -ENODEV if the online
+ * callback flagged a counter-creation failure on any CPU, in which case the
+ * instance is removed again. On every failure path event->priv is left NULL.
+ */
+int damon_perf_init(struct damon_ctx *ctx, struct damon_perf_event *event)
+{
+	struct damon_perf *perf;
+	int err = -ENOMEM;
+
+	perf = kzalloc(sizeof(*perf), GFP_KERNEL);
+	if (!perf)
+		return -ENOMEM;
+
+	perf->event = alloc_percpu(typeof(*perf->event));
+	if (!perf->event)
+		goto free_perf;
+
+	event->priv = perf;
+	event->ctx = ctx;
+	/*
+	 * Start from a clean slate: the online callback only records
+	 * failures while init_complete is false, and the check below trusts
+	 * any_cpu_failed. Stale values from an earlier init attempt on the
+	 * same descriptor would either hide new failures or report old ones.
+	 */
+	event->init_complete = false;
+	event->any_cpu_failed = false;
+	INIT_HLIST_NODE(&event->hlist_node);
+
+	/*
+	 * cpuhp_state_add_instance() invokes the online callback synchronously
+	 * for every currently-online CPU; late-online CPUs subsequently get
+	 * an event automatically and offline CPUs release theirs cleanly.
+	 */
+	err = cpuhp_state_add_instance(damon_perf_cpuhp_state,
+				       &event->hlist_node);
+	if (err)
+		goto free_event;
+
+	event->init_complete = true;
+	if (event->any_cpu_failed) {
+		/* Roll back rather than leave a partially covered event. */
+		cpuhp_state_remove_instance(damon_perf_cpuhp_state,
+					    &event->hlist_node);
+		err = -ENODEV;
+		goto free_event;
+	}
+
+	return 0;
+
+free_event:
+	free_percpu(perf->event);
+free_perf:
+	kfree(perf);
+	event->priv = NULL;
+	return err;
+}
+
+/*
+ * damon_perf_cleanup() - undo damon_perf_init() for @event.
+ * @ctx:	owning DAMON context (not used here).
+ * @event:	perf event descriptor to tear down.
+ *
+ * Removes the cpuhp instance, frees the per-CPU slot array and the backend
+ * state, and clears event->priv. A descriptor whose event->priv is already
+ * NULL is left untouched.
+ */
+void damon_perf_cleanup(struct damon_ctx *ctx, struct damon_perf_event *event)
+{
+	struct damon_perf *backend = event->priv;
+
+	if (backend) {
+		cpuhp_state_remove_instance(damon_perf_cpuhp_state,
+					    &event->hlist_node);
+
+		free_percpu(backend->event);
+		kfree(backend);
+		event->priv = NULL;
+	}
+}
+
+#endif /* CONFIG_PERF_EVENTS */
+
static int __init damon_va_initcall(void)
{
struct damon_operations ops = {
@@ -979,6 +1238,14 @@ static int __init damon_va_initcall(void)
ops_fvaddr.init = NULL;
ops_fvaddr.update = NULL;
+#ifdef CONFIG_PERF_EVENTS
+ err = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "damon/perf:online",
+ damon_perf_cpu_online, damon_perf_cpu_offline);
+ if (err < 0)
+ return err;
+ damon_perf_cpuhp_state = err;
+#endif
+
err = damon_register_ops(&ops);
if (err)
return err;
--
2.43.0
next prev parent reply other threads:[~2026-05-29 16:57 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-29 16:56 [RFC PATCH 0/6] mm/damon: hardware-sampled access reports Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 1/6] mm/damon: add struct damon_perf_event{,_attr} and per-ctx perf_events list Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 2/6] mm/damon/sysfs-sample: expose perf_events configuration via sysfs Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 3/6] mm/damon/sysfs: install perf_events on apply Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 4/6] mm/damon/core: per-CPU SPSC ring drain and damon_perf_event lifecycle Ravi Jonnalagadda
2026-05-29 16:56 ` Ravi Jonnalagadda [this message]
2026-05-29 16:56 ` [RFC PATCH 6/6] mm/damon: add damos_node_eligible_mem_bp tracepoint Ravi Jonnalagadda
2026-05-30 0:04 ` [RFC PATCH 0/6] mm/damon: hardware-sampled access reports SeongJae Park
2026-05-30 3:01 ` Akinobu Mita
2026-05-30 5:03 ` Ravi Jonnalagadda
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260529165640.820-6-ravis.opensrc@gmail.com \
--to=ravis.opensrc@gmail.com \
--cc=ajayjoshi@micron.com \
--cc=akinobu.mita@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=bijan311@gmail.com \
--cc=corbet@lwn.net \
--cc=damon@lists.linux.dev \
--cc=honggyu.kim@sk.com \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=sj@kernel.org \
--cc=yunjeong.mun@sk.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox