* [RFC PATCH 1/6] mm/damon: add struct damon_perf_event{,_attr} and per-ctx perf_events list
2026-05-29 16:56 [RFC PATCH 0/6] mm/damon: hardware-sampled access reports Ravi Jonnalagadda
@ 2026-05-29 16:56 ` Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 2/6] mm/damon/sysfs-sample: expose perf_events configuration via sysfs Ravi Jonnalagadda
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Ravi Jonnalagadda @ 2026-05-29 16:56 UTC (permalink / raw)
To: sj, akinobu.mita, damon, linux-mm, linux-kernel, linux-doc
Cc: akpm, corbet, bijan311, ajayjoshi, honggyu.kim, yunjeong.mun,
ravis.opensrc
Introduce the substrate types for using perf events as DAMON access
check sources. struct damon_perf_event_attr carries the raw PMU attr
configurable from userspace; struct damon_perf_event is the per-event
entry on a new damon_ctx::perf_events list.
Declare damon_perf_init(), damon_perf_cleanup(), damon_perf_event_arm()
and damon_perf_event_disarm() in mm/damon/ops-common.h. When
CONFIG_PERF_EVENTS=n they fold to no-op stubs, with damon_perf_init()
returning -ENOSYS.
Suggested-by: Akinobu Mita <akinobu.mita@gmail.com>
Link: https://lore.kernel.org/20260423004211.7037-1-akinobu.mita@gmail.com
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
---
include/linux/damon.h | 80 +++++++++++++++++++++++++++++++++++++++++
mm/damon/ops-common.h | 39 ++++++++++++++++++++
mm/damon/sysfs-common.h | 6 ++++
3 files changed, 125 insertions(+)
diff --git a/include/linux/damon.h b/include/linux/damon.h
index c0375035a3a7b..11f1c1071b9ba 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -123,6 +123,7 @@ struct damon_target {
* @size: The size of the accessed address range.
* @cpu: The id of the CPU that made the access.
* @tid: The task id of the task that made the access.
+ * @tgid: Thread group id of the task that made the access.
* @is_write: Whether the access is write.
*
* Any DAMON API callers that notified access events can report the information
@@ -135,6 +136,7 @@ struct damon_access_report {
unsigned long size;
unsigned int cpu;
pid_t tid;
+ pid_t tgid;
bool is_write;
/* private: */
unsigned long report_jiffies; /* when this report is made */
@@ -501,6 +503,7 @@ struct damos_filter {
};
struct damon_ctx;
+struct damon_target_lookup;
struct damos;
/**
@@ -966,6 +969,67 @@ struct damon_sample_control {
struct list_head sample_filters;
};
+/**
+ * struct damon_perf_event_attr - raw PMU event attr for access check.
+ *
+ * @type: raw PMU event type.
+ * @config: raw PMU event config.
+ * @config1: raw PMU event config1.
+ * @config2: raw PMU event config2.
+ * @sample_phys_addr: whether to set PERF_SAMPLE_PHYS_ADDR in sample_type.
+ * @sample_weight_struct: whether to set PERF_SAMPLE_WEIGHT_STRUCT in
+ * sample_type. PMUs that do not advertise
+ * weight (e.g. AMD IBS Op) reject events with
+ * this flag set, so it must be opt-in.
+ * @exclude_kernel: exclude kernel-mode samples.
+ * @exclude_hv: exclude hypervisor samples.
+ * @freq: when true use @sample_freq, otherwise @sample_period.
+ * @sample_freq: target sample rate when @freq is true.
+ * @sample_period: sampling period (number of events between overflows) when
+ * @freq is false.
+ * @wakeup_events: perf_event_attr.wakeup_events.
+ * @precise_ip: precise sampling skid bound (PEBS-style PMUs).
+ */
+struct damon_perf_event_attr {
+ u32 type;
+ u64 config;
+ u64 config1;
+ u64 config2;
+ bool sample_phys_addr;
+ bool sample_weight_struct;
+ bool exclude_kernel;
+ bool exclude_hv;
+ bool freq;
+ u64 sample_freq;
+ u64 sample_period;
+ u32 wakeup_events;
+ u32 precise_ip;
+};
+
+/**
+ * struct damon_perf_event - perf event for access check.
+ *
+ * @attr: Per-event PMU attribute (configured via sysfs).
+ * @priv: Monitoring operations-specific data.
+ * @list: List head for &damon_ctx->perf_events siblings.
+ * @hlist_node: Tracks this event among cpuhp multi-instance entries.
+ * @init_complete: Set after the synchronous online sweep finishes; gates
+ * @any_cpu_failed writes from late hotplug callbacks.
+ * @any_cpu_failed: Set by the cpuhp online callback if perf_event creation
+ * fails on any CPU during the synchronous initial install.
+ * @ctx: Back-pointer to the owning damon_ctx; the cpu_online callback
+ * reads ctx->perf_events_active to decide whether to enable a
+ * late-onlining CPU's event immediately after create.
+ */
+struct damon_perf_event {
+ struct damon_perf_event_attr attr;
+ void *priv;
+ struct list_head list;
+ struct hlist_node hlist_node;
+ bool init_complete;
+ bool any_cpu_failed;
+ struct damon_ctx *ctx;
+};
+
/**
* struct damon_ctx - Represents a context for each monitoring. This is the
* main interface that allows users to set the attributes and get the results
@@ -991,6 +1055,11 @@ struct damon_sample_control {
* @addr_unit: Scale factor for core to ops address conversion.
* @min_region_sz: Minimum region size.
* @pause: Pause kdamond main loop.
+ * @perf_events: Head of perf events (&damon_perf_event) list.
+ * @perf_events_active: Set while kdamond_fn has the perf events armed.
+ * Cleared in the kdamond_fn done path before the events are
+ * disabled; serves as the gate for damon_commit_perf_events()
+ * and the kdamond_fn drain dispatch.
*/
struct damon_ctx {
struct damon_attrs attrs;
@@ -1046,6 +1115,9 @@ struct damon_ctx {
unsigned long min_region_sz;
bool pause;
+ struct list_head perf_events;
+ bool perf_events_active;
+
/* private: */
/* Head of monitoring targets (&damon_target) list. */
struct list_head adaptive_targets;
@@ -1054,6 +1126,14 @@ struct damon_ctx {
/* Per-ctx PRNG state for damon_rand(); kdamond is the sole consumer. */
struct rnd_state rnd_state;
+
+ /* Reusable drain-loop snapshot buffer (avoids per-tick kmalloc). */
+ struct {
+ struct damon_target_lookup *lookups;
+ unsigned int nr_lookups;
+ struct damon_region **region_buf;
+ unsigned int region_buf_cap;
+ } drain_snapshot;
};
/* Get a random number in [@l, @r) using @ctx's lockless PRNG. */
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 5efa5b5970def..35da400a67ec1 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -23,3 +23,42 @@ bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio);
unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid);
bool damos_ops_has_filter(struct damos *s);
+
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Per-event opaque allocated by damon_perf_init(). The NMI overflow
+ * handler does NOT touch this struct; submission goes through the
+ * shared per-CPU SPSC ring via damon_report_access().
+ */
+struct damon_perf {
+ struct perf_event * __percpu *event;
+};
+
+int damon_perf_init(struct damon_ctx *ctx, struct damon_perf_event *event);
+void damon_perf_cleanup(struct damon_ctx *ctx, struct damon_perf_event *event);
+void damon_perf_event_arm(struct damon_perf_event *event);
+void damon_perf_event_disarm(struct damon_perf_event *event);
+
+#else /* !CONFIG_PERF_EVENTS */
+
+static inline int damon_perf_init(struct damon_ctx *ctx,
+ struct damon_perf_event *event)
+{
+ return -ENOSYS;
+}
+
+static inline void damon_perf_cleanup(struct damon_ctx *ctx,
+ struct damon_perf_event *event)
+{
+}
+
+static inline void damon_perf_event_arm(struct damon_perf_event *event)
+{
+}
+
+static inline void damon_perf_event_disarm(struct damon_perf_event *event)
+{
+}
+
+#endif /* CONFIG_PERF_EVENTS */
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 25a6c28abdea8..67c7545fd57d0 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -66,10 +66,13 @@ int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id);
* sample directory
*/
+struct damon_sysfs_perf_events;
+
struct damon_sysfs_sample {
struct kobject kobj;
struct damon_sysfs_primitives *primitives;
struct damon_sysfs_sample_filters *filters;
+ struct damon_sysfs_perf_events *perf_events;
};
struct damon_sysfs_sample *damon_sysfs_sample_alloc(void);
@@ -82,3 +85,6 @@ extern const struct kobj_type damon_sysfs_sample_ktype;
int damon_sysfs_set_sample_control(
struct damon_sample_control *control,
struct damon_sysfs_sample *sysfs_sample);
+
+int damon_sysfs_add_perf_events(struct damon_ctx *ctx,
+ struct damon_sysfs_sample *sysfs_sample);
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [RFC PATCH 2/6] mm/damon/sysfs-sample: expose perf_events configuration via sysfs
2026-05-29 16:56 [RFC PATCH 0/6] mm/damon: hardware-sampled access reports Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 1/6] mm/damon: add struct damon_perf_event{,_attr} and per-ctx perf_events list Ravi Jonnalagadda
@ 2026-05-29 16:56 ` Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 3/6] mm/damon/sysfs: install perf_events on apply Ravi Jonnalagadda
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Ravi Jonnalagadda @ 2026-05-29 16:56 UTC (permalink / raw)
To: sj, akinobu.mita, damon, linux-mm, linux-kernel, linux-doc
Cc: akpm, corbet, bijan311, ajayjoshi, honggyu.kim, yunjeong.mun,
ravis.opensrc
Add a perf_events/ subdirectory under each context's sample/ directory.
Each numbered entry maps to one damon_perf_event and exposes its raw
PMU attr, addressing flags, and period/delivery knobs.
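The non-zero defaults (wakeup_events=1, precise_ip=2, freq=1,
exclude_kernel=1, exclude_hv=1) are chosen for Intel PEBS L3-miss
sampling; type, config{,1,2}, sample_freq and sample_period default to 0
and are left for userspace to fill in, which also overrides the rest for
other PMUs. sample_weight_struct defaults off because PMUs that do not
advertise PERF_SAMPLE_WEIGHT_STRUCT (e.g. AMD IBS Op) reject events
that request it with -EOPNOTSUPP.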
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
---
mm/damon/sysfs-sample.c | 579 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 579 insertions(+)
diff --git a/mm/damon/sysfs-sample.c b/mm/damon/sysfs-sample.c
index ffc9c85455474..0570d27a47b1c 100644
--- a/mm/damon/sysfs-sample.c
+++ b/mm/damon/sysfs-sample.c
@@ -452,6 +452,520 @@ static const struct kobj_type damon_sysfs_primitives_ktype = {
.default_groups = damon_sysfs_primitives_groups,
};
+/*
+ * perf_event_attr directory
+ */
+
+/*
+ * sysfs-side staging copy of one perf event's settings. The fields after
+ * kobj mirror struct damon_perf_event_attr one-to-one, and
+ * damon_sysfs_add_perf_event() copies them field by field, so the two
+ * structs and that copy have to be kept in sync when a field is added.
+ */
+struct damon_sysfs_perf_event_attr {
+ struct kobject kobj;
+ u32 type;
+ u64 config;
+ u64 config1;
+ u64 config2;
+ bool sample_phys_addr;
+ bool sample_weight_struct;
+ bool exclude_kernel;
+ bool exclude_hv;
+ bool freq;
+ u64 sample_freq;
+ u64 sample_period;
+ u32 wakeup_events;
+ u32 precise_ip;
+};
+
+/*
+ * Allocate one perf_event_attr sysfs entry pre-loaded with its default
+ * values. Returns NULL if the allocation fails.
+ */
+static struct damon_sysfs_perf_event_attr *
+damon_sysfs_perf_event_attr_alloc(void)
+{
+	struct damon_sysfs_perf_event_attr *new_attr;
+
+	new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
+	if (new_attr) {
+		/* Every field not listed here keeps its kzalloc() zero. */
+		new_attr->exclude_hv = true;
+		new_attr->exclude_kernel = true;
+		new_attr->freq = true;
+		new_attr->precise_ip = 2;
+		new_attr->wakeup_events = 1;
+	}
+	return new_attr;
+}
+
+static ssize_t attr_type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "0x%x\n", perf_event_attr->type);
+}
+
+static ssize_t attr_type_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ int err = kstrtou32(buf, 0, &perf_event_attr->type);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t config_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "0x%llx\n", perf_event_attr->config);
+}
+
+static ssize_t config_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ int err = kstrtou64(buf, 0, &perf_event_attr->config);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t config1_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "0x%llx\n", perf_event_attr->config1);
+}
+
+static ssize_t config1_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ int err = kstrtou64(buf, 0, &perf_event_attr->config1);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t config2_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "0x%llx\n", perf_event_attr->config2);
+}
+
+static ssize_t config2_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ int err = kstrtou64(buf, 0, &perf_event_attr->config2);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t sample_phys_addr_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "%d\n", perf_event_attr->sample_phys_addr);
+}
+
+static ssize_t sample_phys_addr_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ bool sample_phys_addr;
+ int err = kstrtobool(buf, &sample_phys_addr);
+
+ if (err)
+ return -EINVAL;
+
+ perf_event_attr->sample_phys_addr = sample_phys_addr;
+ return count;
+}
+
+static ssize_t sample_weight_struct_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "%d\n", perf_event_attr->sample_weight_struct);
+}
+
+static ssize_t sample_weight_struct_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ bool sample_weight_struct;
+ int err = kstrtobool(buf, &sample_weight_struct);
+
+ if (err)
+ return -EINVAL;
+
+ perf_event_attr->sample_weight_struct = sample_weight_struct;
+ return count;
+}
+
+static ssize_t sample_freq_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "%llu\n", perf_event_attr->sample_freq);
+}
+
+static ssize_t sample_freq_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ int err = kstrtou64(buf, 0, &perf_event_attr->sample_freq);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t wakeup_events_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "%u\n", perf_event_attr->wakeup_events);
+}
+
+static ssize_t wakeup_events_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ int err = kstrtou32(buf, 0, &perf_event_attr->wakeup_events);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t precise_ip_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "%u\n", perf_event_attr->precise_ip);
+}
+
+/*
+ * Store handler for the precise_ip file.
+ *
+ * Parses an unsigned integer (any base kstrtou32() auto-detects) and stores
+ * it only when it is a valid precise_ip level. Returns @count on success and
+ * -EINVAL for unparsable or out-of-range input; the previously stored value
+ * is left untouched on failure.
+ */
+static ssize_t precise_ip_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+			struct damon_sysfs_perf_event_attr, kobj);
+	u32 precise_ip;
+	int err = kstrtou32(buf, 0, &precise_ip);
+
+	if (err)
+		return -EINVAL;
+	/*
+	 * perf_event_attr.precise_ip is a 2-bit field, so only 0-3 are
+	 * meaningful. Reject anything larger here instead of letting it be
+	 * truncated later.
+	 * NOTE(review): assumes this value is eventually assigned to
+	 * perf_event_attr.precise_ip -- confirm against the perf setup code.
+	 */
+	if (precise_ip > 3)
+		return -EINVAL;
+
+	perf_event_attr->precise_ip = precise_ip;
+	return count;
+}
+
+static ssize_t freq_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "%d\n", perf_event_attr->freq);
+}
+
+static ssize_t freq_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ bool freq;
+ int err = kstrtobool(buf, &freq);
+
+ if (err)
+ return -EINVAL;
+ perf_event_attr->freq = freq;
+ return count;
+}
+
+static ssize_t sample_period_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "%llu\n", perf_event_attr->sample_period);
+}
+
+static ssize_t sample_period_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ int err = kstrtou64(buf, 0, &perf_event_attr->sample_period);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t exclude_kernel_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "%d\n", perf_event_attr->exclude_kernel);
+}
+
+static ssize_t exclude_kernel_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ bool v;
+ int err = kstrtobool(buf, &v);
+
+ if (err)
+ return -EINVAL;
+ perf_event_attr->exclude_kernel = v;
+ return count;
+}
+
+static ssize_t exclude_hv_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+
+ return sysfs_emit(buf, "%d\n", perf_event_attr->exclude_hv);
+}
+
+static ssize_t exclude_hv_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_perf_event_attr *perf_event_attr = container_of(kobj,
+ struct damon_sysfs_perf_event_attr, kobj);
+ bool v;
+ int err = kstrtobool(buf, &v);
+
+ if (err)
+ return -EINVAL;
+ perf_event_attr->exclude_hv = v;
+ return count;
+}
+
+static void damon_sysfs_perf_event_attr_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_perf_event_attr, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_type_attr =
+ __ATTR(type, 0600, attr_type_show, attr_type_store);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_config_attr =
+ __ATTR_RW_MODE(config, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_config1_attr =
+ __ATTR_RW_MODE(config1, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_config2_attr =
+ __ATTR_RW_MODE(config2, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_sample_phys_addr_attr =
+ __ATTR_RW_MODE(sample_phys_addr, 0600);
+
+static struct kobj_attribute
+ damon_sysfs_perf_event_attr_sample_weight_struct_attr =
+ __ATTR_RW_MODE(sample_weight_struct, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_sample_freq_attr =
+ __ATTR_RW_MODE(sample_freq, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_wakeup_events_attr =
+ __ATTR_RW_MODE(wakeup_events, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_precise_ip_attr =
+ __ATTR_RW_MODE(precise_ip, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_freq_attr =
+ __ATTR_RW_MODE(freq, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_sample_period_attr =
+ __ATTR_RW_MODE(sample_period, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_exclude_kernel_attr =
+ __ATTR_RW_MODE(exclude_kernel, 0600);
+
+static struct kobj_attribute damon_sysfs_perf_event_attr_exclude_hv_attr =
+ __ATTR_RW_MODE(exclude_hv, 0600);
+
+static struct attribute *damon_sysfs_perf_event_attr_attrs[] = {
+ &damon_sysfs_perf_event_attr_type_attr.attr,
+ &damon_sysfs_perf_event_attr_config_attr.attr,
+ &damon_sysfs_perf_event_attr_config1_attr.attr,
+ &damon_sysfs_perf_event_attr_config2_attr.attr,
+ &damon_sysfs_perf_event_attr_sample_phys_addr_attr.attr,
+ &damon_sysfs_perf_event_attr_sample_weight_struct_attr.attr,
+ &damon_sysfs_perf_event_attr_freq_attr.attr,
+ &damon_sysfs_perf_event_attr_sample_freq_attr.attr,
+ &damon_sysfs_perf_event_attr_sample_period_attr.attr,
+ &damon_sysfs_perf_event_attr_wakeup_events_attr.attr,
+ &damon_sysfs_perf_event_attr_precise_ip_attr.attr,
+ &damon_sysfs_perf_event_attr_exclude_kernel_attr.attr,
+ &damon_sysfs_perf_event_attr_exclude_hv_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_perf_event_attr);
+
+static const struct kobj_type damon_sysfs_perf_event_attr_ktype = {
+ .release = damon_sysfs_perf_event_attr_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_perf_event_attr_groups,
+};
+
+/*
+ * perf_events directory
+ */
+
+/*
+ * Cap on the number of perf events per damon_ctx, to bound the sysfs
+ * kobject footprint and prevent unbounded allocations from a careless
+ * write to nr_perf_events.
+ */
+#define DAMON_SYSFS_PERF_EVENTS_MAX 64
+
+struct damon_sysfs_perf_events {
+ struct kobject kobj;
+ struct damon_sysfs_perf_event_attr **attrs_arr;
+ int nr;
+};
+
+static struct damon_sysfs_perf_events *damon_sysfs_perf_events_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_perf_events), GFP_KERNEL);
+}
+
+/*
+ * Drop the reference on every per-event kobject under @events and free the
+ * pointer array, leaving @events empty (nr == 0, attrs_arr == NULL).
+ */
+static void damon_sysfs_perf_events_rm_dirs(
+		struct damon_sysfs_perf_events *events)
+{
+	int idx;
+
+	for (idx = 0; idx < events->nr; idx++)
+		kobject_put(&events->attrs_arr[idx]->kobj);
+
+	kfree(events->attrs_arr);
+	events->attrs_arr = NULL;
+	events->nr = 0;
+}
+
+/*
+ * Replace the entries under @events with @nr_events fresh ones named "0",
+ * "1", ... Existing entries are always removed first. On any failure the
+ * entries created so far are removed again and the error is returned.
+ */
+static int damon_sysfs_perf_events_add_dirs(
+		struct damon_sysfs_perf_events *events, int nr_events)
+{
+	struct damon_sysfs_perf_event_attr **arr, *entry;
+	int idx, ret;
+
+	damon_sysfs_perf_events_rm_dirs(events);
+	if (nr_events == 0)
+		return 0;
+
+	arr = kmalloc_array(nr_events, sizeof(*arr), GFP_KERNEL);
+	if (!arr)
+		return -ENOMEM;
+	events->attrs_arr = arr;
+
+	for (idx = 0; idx < nr_events; idx++) {
+		entry = damon_sysfs_perf_event_attr_alloc();
+		if (!entry) {
+			ret = -ENOMEM;
+			goto rollback;
+		}
+
+		ret = kobject_init_and_add(&entry->kobj,
+				&damon_sysfs_perf_event_attr_ktype,
+				&events->kobj, "%d", idx);
+		if (ret) {
+			/* Not yet in arr[], so release it by hand. */
+			kobject_put(&entry->kobj);
+			goto rollback;
+		}
+
+		/* Only count the entry once it is fully registered. */
+		arr[idx] = entry;
+		events->nr++;
+	}
+	return 0;
+
+rollback:
+	damon_sysfs_perf_events_rm_dirs(events);
+	return ret;
+}
+
+static ssize_t nr_perf_events_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_perf_events *events = container_of(kobj,
+ struct damon_sysfs_perf_events, kobj);
+
+ return sysfs_emit(buf, "%d\n", events->nr);
+}
+
+/*
+ * Store handler for nr_perf_events: rebuild the numbered entries so that
+ * exactly the written number exist. Accepts 0..DAMON_SYSFS_PERF_EVENTS_MAX.
+ * Returns @count on success, the kstrtoint() error for unparsable input,
+ * -EINVAL for an out-of-range count, -EBUSY if damon_sysfs_lock is
+ * contended, or the error from damon_sysfs_perf_events_add_dirs().
+ */
+static ssize_t nr_perf_events_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	struct damon_sysfs_perf_events *events = container_of(kobj,
+			struct damon_sysfs_perf_events, kobj);
+	int nr, err;
+
+	err = kstrtoint(buf, 0, &nr);
+	if (err)
+		return err;
+	if (!(nr >= 0 && nr <= DAMON_SYSFS_PERF_EVENTS_MAX))
+		return -EINVAL;
+
+	/* Never sleep on the lock; report contention to the writer. */
+	if (!mutex_trylock(&damon_sysfs_lock))
+		return -EBUSY;
+	err = damon_sysfs_perf_events_add_dirs(events, nr);
+	mutex_unlock(&damon_sysfs_lock);
+
+	if (!err)
+		return count;
+	return err;
+}
+
+static void damon_sysfs_perf_events_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_perf_events, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_perf_events_nr_attr =
+ __ATTR_RW_MODE(nr_perf_events, 0600);
+
+static struct attribute *damon_sysfs_perf_events_attrs[] = {
+ &damon_sysfs_perf_events_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_perf_events);
+
+static const struct kobj_type damon_sysfs_perf_events_ktype = {
+ .release = damon_sysfs_perf_events_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_perf_events_groups,
+};
+
/*
* sample directory
*/
@@ -471,6 +985,7 @@ int damon_sysfs_sample_add_dirs(struct damon_sysfs_sample *sample)
{
struct damon_sysfs_primitives *primitives;
struct damon_sysfs_sample_filters *filters;
+ struct damon_sysfs_perf_events *perf_events;
int err;
primitives = damon_sysfs_primitives_alloc(true, false);
@@ -494,7 +1009,23 @@ int damon_sysfs_sample_add_dirs(struct damon_sysfs_sample *sample)
if (err)
goto put_filters_out;
sample->filters = filters;
+
+ perf_events = damon_sysfs_perf_events_alloc();
+ if (!perf_events) {
+ err = -ENOMEM;
+ goto put_filters_out;
+ }
+ err = kobject_init_and_add(&perf_events->kobj,
+ &damon_sysfs_perf_events_ktype, &sample->kobj,
+ "perf_events");
+ if (err)
+ goto put_perf_events_out;
+ sample->perf_events = perf_events;
+
return 0;
+put_perf_events_out:
+ kobject_put(&perf_events->kobj);
+ sample->perf_events = NULL;
put_filters_out:
kobject_put(&filters->kobj);
sample->filters = NULL;
@@ -512,6 +1043,10 @@ void damon_sysfs_sample_rm_dirs(struct damon_sysfs_sample *sample)
damon_sysfs_sample_filters_rm_dirs(sample->filters);
kobject_put(&sample->filters->kobj);
}
+ if (sample->perf_events) {
+ damon_sysfs_perf_events_rm_dirs(sample->perf_events);
+ kobject_put(&sample->perf_events->kobj);
+ }
}
void damon_sysfs_sample_release(struct kobject *kobj)
@@ -596,3 +1131,47 @@ int damon_sysfs_set_sample_control(
return damon_sysfs_set_sample_filters(control,
sysfs_sample->filters);
}
+
+/*
+ * Allocate a damon_perf_event, fill its attr from the sysfs entry @sys_attr
+ * and append it to @ctx->perf_events. Returns 0 on success, or -ENOMEM if
+ * the allocation fails.
+ */
+static int damon_sysfs_add_perf_event(
+		struct damon_sysfs_perf_event_attr *sys_attr,
+		struct damon_ctx *ctx)
+{
+	struct damon_perf_event *new_event;
+
+	new_event = kzalloc(sizeof(*new_event), GFP_KERNEL);
+	if (!new_event)
+		return -ENOMEM;
+
+	/* Every member of the attr is taken from its sysfs counterpart. */
+	new_event->attr = (struct damon_perf_event_attr) {
+		.type			= sys_attr->type,
+		.config			= sys_attr->config,
+		.config1		= sys_attr->config1,
+		.config2		= sys_attr->config2,
+		.sample_phys_addr	= sys_attr->sample_phys_addr,
+		.sample_weight_struct	= sys_attr->sample_weight_struct,
+		.exclude_kernel		= sys_attr->exclude_kernel,
+		.exclude_hv		= sys_attr->exclude_hv,
+		.freq			= sys_attr->freq,
+		.sample_freq		= sys_attr->sample_freq,
+		.sample_period		= sys_attr->sample_period,
+		.wakeup_events		= sys_attr->wakeup_events,
+		.precise_ip		= sys_attr->precise_ip,
+	};
+
+	list_add_tail(&new_event->list, &ctx->perf_events);
+	return 0;
+}
+
+/*
+ * Append one damon_perf_event to @ctx->perf_events for every entry under
+ * @sysfs_sample's perf_events directory, in index order. A sample without a
+ * perf_events directory is not an error. Returns 0 on success or the first
+ * error hit; events appended before the failure are left on the list.
+ */
+int damon_sysfs_add_perf_events(struct damon_ctx *ctx,
+		struct damon_sysfs_sample *sysfs_sample)
+{
+	struct damon_sysfs_perf_events *events = sysfs_sample->perf_events;
+	int idx;
+
+	if (events == NULL)
+		return 0;
+
+	for (idx = 0; idx < events->nr; idx++) {
+		int err = damon_sysfs_add_perf_event(events->attrs_arr[idx],
+				ctx);
+
+		if (err)
+			return err;
+	}
+	return 0;
+}
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [RFC PATCH 3/6] mm/damon/sysfs: install perf_events on apply
2026-05-29 16:56 [RFC PATCH 0/6] mm/damon: hardware-sampled access reports Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 1/6] mm/damon: add struct damon_perf_event{,_attr} and per-ctx perf_events list Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 2/6] mm/damon/sysfs-sample: expose perf_events configuration via sysfs Ravi Jonnalagadda
@ 2026-05-29 16:56 ` Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 4/6] mm/damon/core: per-CPU SPSC ring drain and damon_perf_event lifecycle Ravi Jonnalagadda
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Ravi Jonnalagadda @ 2026-05-29 16:56 UTC (permalink / raw)
To: sj, akinobu.mita, damon, linux-mm, linux-kernel, linux-doc
Cc: akpm, corbet, bijan311, ajayjoshi, honggyu.kim, yunjeong.mun,
ravis.opensrc
Call damon_sysfs_add_perf_events() from damon_sysfs_apply_inputs() so
events configured under sample/perf_events/ get attached to the
damon_ctx when the kdamond starts.
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
---
mm/damon/sysfs.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 9f71871a249d8..bc4a931fe3f0a 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -2092,6 +2092,9 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
return err;
err = damon_sysfs_set_sample_control(&ctx->sample_control,
sys_ctx->attrs->sample);
+ if (err)
+ return err;
+ err = damon_sysfs_add_perf_events(ctx, sys_ctx->attrs->sample);
if (err)
return err;
err = damon_sysfs_add_targets(ctx, sys_ctx->targets);
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [RFC PATCH 4/6] mm/damon/core: per-CPU SPSC ring drain and damon_perf_event lifecycle
2026-05-29 16:56 [RFC PATCH 0/6] mm/damon: hardware-sampled access reports Ravi Jonnalagadda
` (2 preceding siblings ...)
2026-05-29 16:56 ` [RFC PATCH 3/6] mm/damon/sysfs: install perf_events on apply Ravi Jonnalagadda
@ 2026-05-29 16:56 ` Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 5/6] mm/damon/vaddr: implement perf-event access check Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 6/6] mm/damon: add damos_node_eligible_mem_bp tracepoint Ravi Jonnalagadda
5 siblings, 0 replies; 7+ messages in thread
From: Ravi Jonnalagadda @ 2026-05-29 16:56 UTC (permalink / raw)
To: sj, akinobu.mita, damon, linux-mm, linux-kernel, linux-doc
Cc: akpm, corbet, bijan311, ajayjoshi, honggyu.kim, yunjeong.mun,
ravis.opensrc
Replace the mutex-protected damon_access_reports[] single-buffer with
a per-CPU SPSC ring. The producer (damon_report_access) is called
from NMI by perf overflow handlers; the consumer
(kdamond_check_reported_accesses) runs once per sample tick.
- 256-entry ring per CPU with cache-line-aligned head/tail
- per-CPU damon_report_ring_busy guards against NMI nesting on top
of a process-context producer on the same CPU
- per-CPU damon_ring_pending bit so the consumer iterates only CPUs
that produced samples this tick
- smp_mb between flag clear and head read on the consumer side
pairs with the producer's head-publish ordering
Replace the O(N) per-region scan in kdamond_apply_access_report() with
bsearch over a per-tick per-target snapshot built into a reusable
damon_ctx::drain_snapshot buffer. The pid-based ctx early-reject is
no longer needed: kdamond_apply_access_report() already discriminates
report->vaddr vs report->paddr per ctx.
Wire the damon_perf_event lifecycle: init per attached event when
kdamond starts, teardown when the ctx is destroyed, replayed across
damon_commit_ctx. Define struct damon_target_lookup here; its forward
declaration and the drain_snapshot field on struct damon_ctx were
already added to include/linux/damon.h in patch 1/6.
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
---
include/trace/events/damon.h | 17 ++
mm/damon/core.c | 383 ++++++++++++++++++++++++++++++-----
2 files changed, 344 insertions(+), 56 deletions(-)
diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index b131bee27cc4a..e97e70579a8c8 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -74,6 +74,23 @@ TRACE_EVENT(damos_esz,
__entry->esz)
);
+/*
+ * Emitted when damon_report_access() drops a report on @cpu, either because
+ * that CPU's report ring is full or because the call nested on top of
+ * another in-progress damon_report_access() on the same CPU.
+ */
+TRACE_EVENT(damon_perf_ring_overflow,
+
+ TP_PROTO(int cpu),
+
+ TP_ARGS(cpu),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ ),
+
+ TP_printk("cpu=%d", __entry->cpu)
+);
+
TRACE_EVENT_CONDITION(damos_before_apply,
TP_PROTO(unsigned int context_idx, unsigned int scheme_idx,
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 23311189b589e..1e6966e45144f 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "damon: " fmt
#include <linux/damon.h>
+#include <asm/local.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
@@ -24,22 +25,43 @@
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
-#define DAMON_ACCESS_REPORTS_CAP 1000
+#define DAMON_REPORT_RING_SIZE 256
+#define DAMON_REPORT_RING_MASK (DAMON_REPORT_RING_SIZE - 1)
+
+/*
+ * Per-target region lookup snapshot for the drain loop.
+ *
+ * @t: the target this entry describes.
+ * @regions: start of this target's slice of region pointers, filled in
+ * damon_for_each_region() order. The binary search in
+ * kdamond_apply_access_report() relies on that order being
+ * ascending by address.
+ * @nr_regions: number of pointers in @regions.
+ */
+struct damon_target_lookup {
+ struct damon_target *t;
+ struct damon_region **regions;
+ unsigned int nr_regions;
+};
+
+/*
+ * Per-CPU ring of queued access reports.
+ *
+ * @head is the next slot the producer fills, @tail the next slot the consumer
+ * drains. The ring is empty when they are equal and is treated as full when
+ * advancing @head would make it equal to @tail, so at most
+ * DAMON_REPORT_RING_SIZE - 1 reports are queued at a time.
+ */
+struct damon_report_ring {
+ unsigned int head; /* written by producer (NMI) */
+ unsigned int tail /* written by consumer (kdamond) */
+ ____cacheline_aligned_in_smp;
+ struct damon_access_report entries[DAMON_REPORT_RING_SIZE]
+ ____cacheline_aligned_in_smp;
+};
+
+static DEFINE_PER_CPU(struct damon_report_ring, damon_report_rings);
+static DEFINE_PER_CPU(local_t, damon_report_ring_busy);
+/*
+ * Producer (NMI) sets after publishing a report; consumer (kdamond) clears
+ * before draining the corresponding ring. Per-CPU to avoid cross-CPU
+ * cacheline bouncing under sampling load on large systems.
+ */
+static DEFINE_PER_CPU(unsigned long, damon_ring_pending);
static DEFINE_MUTEX(damon_lock);
static int nr_running_ctxs;
static bool running_exclusive_ctxs;
+static struct damon_ctx *damon_perf_owner;
static DEFINE_MUTEX(damon_ops_lock);
static struct damon_operations damon_registered_ops[NR_DAMON_OPS];
static struct kmem_cache *damon_region_cache __ro_after_init;
-static DEFINE_MUTEX(damon_access_reports_lock);
-static struct damon_access_report damon_access_reports[
- DAMON_ACCESS_REPORTS_CAP];
-static int damon_access_reports_len;
-
/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */
static bool __damon_is_registered_ops(enum damon_ops_id id)
{
@@ -805,11 +827,24 @@ struct damon_ctx *damon_new_ctx(void)
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
+ INIT_LIST_HEAD(&ctx->perf_events);
+
prandom_seed_state(&ctx->rnd_state, get_random_u64());
return ctx;
}
+/*
+ * Tear down and free every perf event attached to @ctx, leaving
+ * @ctx->perf_events empty.
+ */
+static void damon_perf_destroy(struct damon_ctx *ctx)
+{
+	struct list_head *events = &ctx->perf_events;
+	struct damon_perf_event *victim;
+
+	/* Pop from the front until nothing is left. */
+	while (!list_empty(events)) {
+		victim = list_first_entry(events, struct damon_perf_event,
+				list);
+		damon_perf_cleanup(ctx, victim);
+		list_del(&victim->list);
+		kfree(victim);
+	}
+}
+
static void damon_destroy_targets(struct damon_ctx *ctx)
{
struct damon_target *t, *next_t;
@@ -835,6 +870,11 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
damon_for_each_sample_filter_safe(f, next_f, &ctx->sample_control)
damon_destroy_sample_filter(f, &ctx->sample_control);
+ damon_perf_destroy(ctx);
+
+ kfree(ctx->drain_snapshot.lookups);
+ kfree(ctx->drain_snapshot.region_buf);
+
kfree(ctx);
}
@@ -1694,6 +1734,45 @@ static int damon_commit_sample_control(
return damon_commit_sample_filters(dst, src);
}
+/*
+ * Replace @dst's perf_events list with fresh copies of @src's entries.
+ *
+ * Only the attr of each source event is copied. When @dst is running, each
+ * copy is set up with damon_perf_init() and, if @dst->perf_events_active is
+ * set, armed right away.
+ *
+ * The existing @dst events are destroyed before any copy is made, so on
+ * failure @dst is left with an empty perf_events list, not its previous one.
+ *
+ * Returns 0 on success, -ENOMEM or the damon_perf_init() error otherwise.
+ */
+static int damon_commit_perf_events(struct damon_ctx *dst,
+ struct damon_ctx *src)
+{
+ struct damon_perf_event *src_event, *new_event;
+ int err = 0;
+
+ /* Drop whatever @dst had; the list is rebuilt from @src below. */
+ damon_perf_destroy(dst);
+
+ list_for_each_entry(src_event, &src->perf_events, list) {
+ new_event = kzalloc(sizeof(*new_event), GFP_KERNEL);
+ if (!new_event) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ new_event->attr = src_event->attr;
+
+ if (damon_is_running(dst)) {
+ err = damon_perf_init(dst, new_event);
+ if (err) {
+ /* Not on the list yet, so free it here. */
+ kfree(new_event);
+ goto out;
+ }
+ /*
+ * Events are created with attr.disabled=1 and only fire while
+ * the kdamond runs. Arm now if we are committing into a
+ * running ctx whose substrate is already armed.
+ */
+ if (dst->perf_events_active)
+ damon_perf_event_arm(new_event);
+ }
+ list_add_tail(&new_event->list, &dst->perf_events);
+ }
+ return 0;
+out:
+ /* Undo the copies made so far. */
+ damon_perf_destroy(dst);
+ return err;
+}
+
static int __damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
{
int err;
@@ -1742,6 +1821,9 @@ static int __damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
return err;
err = damon_commit_sample_control(&dst->sample_control,
&src->sample_control);
+ if (err)
+ return err;
+ err = damon_commit_perf_events(dst, src);
if (err)
return err;
dst->addr_unit = src->addr_unit;
@@ -1929,12 +2011,40 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive)
return -EBUSY;
}
+ /*
+ * The per-CPU PMU events backing the perf-event substrate are a single
+ * shared resource; only one ctx may own them. Reject the start if
+ * another already-running ctx owns the substrate, or if more than one
+ * ctx in this batch wants it.
+ */
+ for (i = 0; i < nr_ctxs; i++) {
+ if (!list_empty(&ctxs[i]->perf_events)) {
+ int j;
+
+ if (damon_perf_owner) {
+ mutex_unlock(&damon_lock);
+ return -EBUSY;
+ }
+ for (j = i + 1; j < nr_ctxs; j++) {
+ if (!list_empty(&ctxs[j]->perf_events)) {
+ mutex_unlock(&damon_lock);
+ return -EBUSY;
+ }
+ }
+ damon_perf_owner = ctxs[i];
+ break;
+ }
+ }
+
for (i = 0; i < nr_ctxs; i++) {
err = __damon_start(ctxs[i]);
if (err)
break;
nr_running_ctxs++;
}
+ if (err && damon_perf_owner &&
+ !damon_perf_owner->kdamond)
+ damon_perf_owner = NULL;
if (exclusive && nr_running_ctxs)
running_exclusive_ctxs = true;
mutex_unlock(&damon_lock);
@@ -2113,29 +2223,47 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
* damon_report_access() - Report identified access events to DAMON.
* @report: The reporting access information.
*
- * Report access events to DAMON.
+ * Report access events to DAMON via a per-CPU SPSC lockless ring. Producer
+ * is the local CPU (typically NMI from a hardware-sampling backend);
+ * consumer is the kdamond drain in kdamond_check_reported_accesses().
*
- * Context: May sleep.
+ * Context: any (NMI-safe). An NMI nesting on top of a process-context
+ * producer on the same CPU would otherwise stomp the same entries[head]
+ * slot; the busy guard detects and drops in that case.
*
- * NOTE: we may be able to implement this as a lockless queue, and allow any
- * context. As the overhead is unknown, and region-based DAMON logics would
- * guarantee the reports would be not made that frequently, let's start with
- * this simple implementation.
+ * If the ring is full, the sample is dropped and the
+ * damon_perf_ring_overflow tracepoint is emitted.
*/
void damon_report_access(struct damon_access_report *report)
{
- struct damon_access_report *dst;
+ struct damon_report_ring *ring;
+ unsigned int head, next;
- /* silently fail for races */
- if (!mutex_trylock(&damon_access_reports_lock))
- return;
- dst = &damon_access_reports[damon_access_reports_len++];
- /* just drop all existing reports in favor of simplicity. */
- if (damon_access_reports_len == DAMON_ACCESS_REPORTS_CAP)
- damon_access_reports_len = 0;
- *dst = *report;
- dst->report_jiffies = jiffies;
- mutex_unlock(&damon_access_reports_lock);
+ /* Pin to a CPU so the SPSC invariant holds for preemptible callers. */
+ preempt_disable();
+ if (local_inc_return(this_cpu_ptr(&damon_report_ring_busy)) != 1) {
+ /* NMI nested on a process-context producer; drop. */
+ trace_damon_perf_ring_overflow(smp_processor_id());
+ goto out;
+ }
+
+ ring = this_cpu_ptr(&damon_report_rings);
+ head = ring->head;
+ next = (head + 1) & DAMON_REPORT_RING_MASK;
+
+ if (next == READ_ONCE(ring->tail)) {
+ trace_damon_perf_ring_overflow(smp_processor_id());
+ goto out;
+ }
+
+ ring->entries[head] = *report;
+ ring->entries[head].report_jiffies = jiffies;
+ smp_wmb(); /* publish entry before head advance */
+ WRITE_ONCE(ring->head, next);
+ WRITE_ONCE(*this_cpu_ptr(&damon_ring_pending), 1);
+out:
+ local_dec(this_cpu_ptr(&damon_report_ring_busy));
+ preempt_enable();
}
#ifdef CONFIG_MMU
@@ -2145,7 +2273,8 @@ void damon_report_page_fault(struct vm_fault *vmf, bool huge_pmd)
.vaddr = vmf->address,
.size = 1, /* todo: set appripriately */
.cpu = smp_processor_id(),
- .tid = task_pid_vnr(current),
+ .tid = current->pid,
+ .tgid = task_tgid_nr(current),
.is_write = vmf->flags & FAULT_FLAG_WRITE,
};
@@ -3700,6 +3829,7 @@ static void kdamond_init_ctx(struct damon_ctx *ctx)
unsigned long sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
struct damos *scheme;
+ struct damon_perf_event *event, *next;
ctx->passed_sample_intervals = 0;
ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
@@ -3713,6 +3843,15 @@ static void kdamond_init_ctx(struct damon_ctx *ctx)
damos_set_next_apply_sis(scheme, ctx);
damos_set_filters_default_reject(scheme);
}
+
+ list_for_each_entry_safe(event, next, &ctx->perf_events, list) {
+ int err = damon_perf_init(ctx, event);
+
+ if (err) {
+ list_del(&event->list);
+ kfree(event);
+ }
+ }
}
static bool damon_sample_filter_matching(struct damon_access_report *report,
@@ -3759,26 +3898,46 @@ static bool damon_sample_filter_out(struct damon_access_report *report,
}
static void kdamond_apply_access_report(struct damon_access_report *report,
- struct damon_target *t, struct damon_ctx *ctx)
+ struct damon_target *t,
+ struct damon_region **regions, unsigned int nr_regions,
+ struct damon_ctx *ctx)
{
struct damon_region *r;
unsigned long addr;
+ int left, right, mid;
- if (damon_sample_filter_out(report, &ctx->sample_control))
- return;
- if (damon_target_has_pid(ctx))
+ if (damon_target_has_pid(ctx)) {
+ if (pid_nr(t->pid) != report->tgid)
+ return;
addr = report->vaddr;
- else
+ } else {
addr = report->paddr;
+ }
- /* todo: make search faster, e.g., binary search? */
- damon_for_each_region(r, t) {
- if (addr < r->ar.start)
- continue;
- if (r->ar.end < addr + report->size)
- continue;
- if (!r->access_reported)
- damon_update_region_access_rate(r, true, &ctx->attrs);
+ /* Binary search the snapshot for the region containing addr. */
+ left = 0;
+ right = nr_regions - 1;
+ r = NULL;
+ while (left <= right) {
+ /* Avoid (left + right) overflow at large nr_regions. */
+ mid = left + (right - left) / 2;
+ if (addr < regions[mid]->ar.start)
+ right = mid - 1;
+ else if (addr >= regions[mid]->ar.end)
+ left = mid + 1;
+ else {
+ r = regions[mid];
+ break;
+ }
+ }
+
+ if (!r)
+ return;
+ /* Reject reports straddling a region boundary. */
+ if (addr + report->size > r->ar.end)
+ return;
+ if (!r->access_reported) {
+ damon_update_region_access_rate(r, true, &ctx->attrs);
r->access_reported = true;
}
}
@@ -3802,28 +3961,120 @@ static unsigned int kdamond_apply_zero_access_report(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static unsigned int kdamond_check_reported_accesses(struct damon_ctx *ctx)
+/*
+ * Build a snapshot of the ctx's targets and their region arrays for use
+ * by the ring drain loop. The snapshot buffers are reused across ticks and
+ * grown via krealloc_array() only when a new high water mark is reached.
+ *
+ * The two-pass walk over adaptive_targets is safe even though
+ * krealloc_array() may sleep: target list mutation is funneled through
+ * damon_call onto the kdamond itself, so no other thread can mutate the
+ * list while kdamond is running this function.
+ *
+ * @ctx: context whose targets are snapshotted.
+ * @nr_targets_out: set to the number of valid table entries on success.
+ *
+ * Returns the lookup table, or NULL only when an allocation failed. A ctx
+ * without targets yields a valid table with *@nr_targets_out == 0, so the
+ * caller can still drain (and discard) queued reports instead of treating
+ * the empty case as an allocation failure.
+ */
+static struct damon_target_lookup *damon_build_target_lookup(
+		struct damon_ctx *ctx, unsigned int *nr_targets_out)
+{
+	struct damon_target *t;
+	struct damon_target_lookup *tbl;
+	unsigned int nr_targets = 0, total_regions = 0, ti = 0, ri = 0;
+	unsigned int nr_slots;
+
+	/* Pass 1: size the table and the shared region-pointer buffer. */
+	damon_for_each_target(t, ctx) {
+		nr_targets++;
+		total_regions += damon_nr_regions(t);
+	}
+
+	/* Keep at least one slot so "no targets" never returns NULL. */
+	nr_slots = nr_targets ? nr_targets : 1;
+	if (nr_slots > ctx->drain_snapshot.nr_lookups) {
+		tbl = krealloc_array(ctx->drain_snapshot.lookups,
+				nr_slots, sizeof(*tbl), GFP_KERNEL);
+		if (!tbl)
+			return NULL;
+		ctx->drain_snapshot.lookups = tbl;
+		ctx->drain_snapshot.nr_lookups = nr_slots;
+	}
+	tbl = ctx->drain_snapshot.lookups;
+
+	if (total_regions > ctx->drain_snapshot.region_buf_cap) {
+		struct damon_region **buf;
+
+		buf = krealloc_array(ctx->drain_snapshot.region_buf,
+				total_regions, sizeof(*buf), GFP_KERNEL);
+		if (!buf)
+			return NULL;
+		ctx->drain_snapshot.region_buf = buf;
+		ctx->drain_snapshot.region_buf_cap = total_regions;
+	}
+
+	/* Pass 2: each target gets a contiguous slice of region_buf. */
+	damon_for_each_target(t, ctx) {
+		struct damon_region *r;
+
+		tbl[ti].t = t;
+		tbl[ti].regions = &ctx->drain_snapshot.region_buf[ri];
+		tbl[ti].nr_regions = damon_nr_regions(t);
+		damon_for_each_region(r, t)
+			ctx->drain_snapshot.region_buf[ri++] = r;
+		ti++;
+	}
+
+	*nr_targets_out = nr_targets;
+	return tbl;
+}
+
+
+/*
+ * Drain the per-CPU report rings and apply each queued report to @ctx.
+ *
+ * Only CPUs whose damon_ring_pending flag is set are visited. Reports older
+ * than one sample interval, or rejected by the sample filters, are consumed
+ * without being applied. If the target lookup table cannot be built, the
+ * rings are left untouched for this call.
+ *
+ * Returns the value of kdamond_apply_zero_access_report().
+ */
+static unsigned int kdamond_check_reported_accesses(struct damon_ctx *ctx)
+{
+ int cpu;
+ struct damon_target_lookup *tbl;
+ unsigned int nr_targets = 0;
+ unsigned int i;
+
+ tbl = damon_build_target_lookup(ctx, &nr_targets);
+ if (!tbl) {
+ /* pr_fmt already prepends "damon: "; do not repeat it here. */
+ pr_warn_ratelimited(
+ "target-lookup alloc failed; ring drain skipped this tick\n");
+ return kdamond_apply_zero_access_report(ctx);
+ }
+
+ for_each_online_cpu(cpu) {
+ struct damon_report_ring *ring;
+ unsigned int head, tail;
+
+ if (!READ_ONCE(*per_cpu_ptr(&damon_ring_pending, cpu)))
continue;
- damon_for_each_target(t, ctx)
- kdamond_apply_access_report(report, t, ctx);
+ ring = per_cpu_ptr(&damon_report_rings, cpu);
+
+ WRITE_ONCE(*per_cpu_ptr(&damon_ring_pending, cpu), 0);
+ /*
+ * Pair with the producer's smp_wmb between entry and head
+ * publish: order our flag clear before the head read so that
+ * a producer publishing between our clear and READ_ONCE(head)
+ * is observed via the flag it re-sets, not lost as a
+ * stale-head drain.
+ */
+ smp_mb();
+ head = READ_ONCE(ring->head);
+ smp_rmb(); /* pair with smp_wmb in producer */
+ tail = ring->tail;
+
+ while (tail != head) {
+ struct damon_access_report *report =
+ &ring->entries[tail];
+
+ if (time_before(report->report_jiffies,
+ jiffies - usecs_to_jiffies(
+ ctx->attrs.sample_interval)))
+ goto next;
+ if (damon_sample_filter_out(report,
+ &ctx->sample_control))
+ goto next;
+ for (i = 0; i < nr_targets; i++)
+ kdamond_apply_access_report(report,
+ tbl[i].t,
+ tbl[i].regions,
+ tbl[i].nr_regions, ctx);
+next:
+ tail = (tail + 1) & DAMON_REPORT_RING_MASK;
+ }
+ WRITE_ONCE(ring->tail, tail);
+ }
- mutex_unlock(&damon_access_reports_lock);
/* For nr_accesses_bp, absence of access should also be reported. */
return kdamond_apply_zero_access_report(ctx);
}
@@ -3848,6 +4099,14 @@ static int kdamond_fn(void *data)
complete(&ctx->kdamond_started);
kdamond_init_ctx(ctx);
+ if (!list_empty(&ctx->perf_events)) {
+ struct damon_perf_event *event;
+
+ WRITE_ONCE(ctx->perf_events_active, true);
+ list_for_each_entry(event, &ctx->perf_events, list)
+ damon_perf_event_arm(event);
+ }
+
if (ctx->ops.init)
ctx->ops.init(ctx);
ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1,
@@ -3871,14 +4130,15 @@ static int kdamond_fn(void *data)
if (kdamond_wait_activation(ctx))
break;
- if (ctx->ops.prepare_access_checks)
+ if (list_empty(&ctx->perf_events) &&
+ ctx->ops.prepare_access_checks)
ctx->ops.prepare_access_checks(ctx);
kdamond_usleep(sample_interval);
ctx->passed_sample_intervals++;
- /* todo: make these non-exclusive */
- if (ctx->sample_control.primitives_enabled.page_fault)
+ if (!list_empty(&ctx->perf_events) ||
+ ctx->sample_control.primitives_enabled.page_fault)
max_nr_accesses = kdamond_check_reported_accesses(ctx);
else if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
@@ -3965,6 +4225,15 @@ static int kdamond_fn(void *data)
}
}
done:
+ if (ctx->perf_events_active) {
+ struct damon_perf_event *event;
+
+ WRITE_ONCE(ctx->perf_events_active, false);
+ list_for_each_entry(event, &ctx->perf_events, list)
+ damon_perf_event_disarm(event);
+ /* Drain any in-flight reports queued before disarm took effect. */
+ kdamond_check_reported_accesses(ctx);
+ }
damon_destroy_targets(ctx);
kfree(ctx->regions_score_histogram);
@@ -3986,6 +4255,8 @@ static int kdamond_fn(void *data)
nr_running_ctxs--;
if (!nr_running_ctxs && running_exclusive_ctxs)
running_exclusive_ctxs = false;
+ if (damon_perf_owner == ctx)
+ damon_perf_owner = NULL;
mutex_unlock(&damon_lock);
return 0;
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [RFC PATCH 5/6] mm/damon/vaddr: implement perf-event access check
2026-05-29 16:56 [RFC PATCH 0/6] mm/damon: hardware-sampled access reports Ravi Jonnalagadda
` (3 preceding siblings ...)
2026-05-29 16:56 ` [RFC PATCH 4/6] mm/damon/core: per-CPU SPSC ring drain and damon_perf_event lifecycle Ravi Jonnalagadda
@ 2026-05-29 16:56 ` Ravi Jonnalagadda
2026-05-29 16:56 ` [RFC PATCH 6/6] mm/damon: add damos_node_eligible_mem_bp tracepoint Ravi Jonnalagadda
5 siblings, 0 replies; 7+ messages in thread
From: Ravi Jonnalagadda @ 2026-05-29 16:56 UTC (permalink / raw)
To: sj, akinobu.mita, damon, linux-mm, linux-kernel, linux-doc
Cc: akpm, corbet, bijan311, ajayjoshi, honggyu.kim, yunjeong.mun,
ravis.opensrc
Add the perf-event backend used by the substrate. Two stateless NMI
overflow handlers are picked at perf_event_create_kernel_counter()
time (paddr- vs vaddr-keyed) and called with context = NULL, so the
NMI fast path never dereferences the per-event struct. Each submits
a damon_access_report into the per-CPU ring.
The vaddr handler drops samples with addr == 0 or addr >= TASK_SIZE.
The paddr handler gates on data->sample_flags & PERF_SAMPLE_PHYS_ADDR
rather than testing data->phys_addr for zero (which would also drop
legitimate page 0). AMD IBS Op only populates phys_addr when
IBS_OP_DATA3.dc_phy_addr_valid is set; gating on sample_flags is the
documented way to detect that. is_write is derived from
data->data_src.mem_op.
cpuhp_setup_state_multi() registers one global state at subsys_initcall;
each damon_perf_event is added as an instance in damon_perf_init() so
cpuhp drives per-CPU event creation and offline-time release. Events
are created with disabled=1 and armed by kdamond_fn() when the
substrate is ready; per-CPU init failures are surfaced via
init_complete / any_cpu_failed so damon_perf_init() rolls back the
cpuhp instance instead of leaving a half-armed event behind.
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
---
mm/damon/vaddr.c | 267 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 267 insertions(+)
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index d271476035641..73fcea91afa07 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -7,11 +7,13 @@
#define pr_fmt(fmt) "damon-va: " fmt
+#include <linux/cpu.h>
+#include <linux/cpuhotplug.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/pagewalk.h>
+#include <linux/perf_event.h>
#include <linux/sched/mm.h>
@@ -957,6 +959,263 @@ static int damon_va_scheme_score(struct damon_ctx *context,
return DAMOS_MAX_SCORE;
}
+#ifdef CONFIG_PERF_EVENTS
+
+#define DAMON_PERF_MAX_RECORDS (1UL << 20)
+#define DAMON_PERF_INIT_RECORDS (1UL << 15)
+
+/*
+ * NMI hot-path: avoid every heap dereference. These handlers carry no
+ * pointer back to the per-event struct -- perf_event_create_kernel_counter
+ * is called with context = NULL. Submission flows into the global
+ * per-CPU SPSC ring (damon_report_access -> kdamond_check_reported_accesses
+ * drains).
+ */
+/*
+ * Overflow handler for virtual-address sampling: report the sampled page as
+ * accessed by the current task. @perf_event and @regs are unused.
+ */
+static void damon_perf_overflow_vaddr(struct perf_event *perf_event,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ struct damon_access_report report;
+
+ /* No sample data, or a zero sampled address: nothing to report. */
+ if (!data || !data->addr)
+ return;
+
+ /* Drop kernel-VA hits -- only user-space VAs land in damon vaddr regions. */
+ if (data->addr >= TASK_SIZE)
+ return;
+
+ /* Page-granular report; fields not named here are zero-initialized. */
+ report = (struct damon_access_report){
+ .vaddr = data->addr & PAGE_MASK,
+ .size = PAGE_SIZE,
+ .cpu = smp_processor_id(),
+ .tid = current->pid,
+ .tgid = current->tgid,
+ .is_write = !!(data->data_src.mem_op & PERF_MEM_OP_STORE),
+ };
+ damon_report_access(&report);
+}
+
+/*
+ * Overflow handler for physical-address sampling: report the sampled
+ * physical page. @perf_event and @regs are unused.
+ */
+static void damon_perf_overflow_paddr(struct perf_event *perf_event,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ struct damon_access_report report;
+
+ if (!data)
+ return;
+
+ /*
+ * AMD IBS Op only populates data->phys_addr when
+ * IBS_OP_DATA3.dc_phy_addr_valid is set; otherwise the field
+ * carries a stale value. Gate on sample_flags rather than testing
+ * phys_addr for zero (which would also drop legitimate page 0).
+ */
+ if (!(data->sample_flags & PERF_SAMPLE_PHYS_ADDR))
+ return;
+
+ /*
+ * Page-granular report; tid, tgid and vaddr are not set here and so
+ * stay zero.
+ */
+ report = (struct damon_access_report){
+ .paddr = data->phys_addr & PAGE_MASK,
+ .size = PAGE_SIZE,
+ .cpu = smp_processor_id(),
+ .is_write = !!(data->data_src.mem_op & PERF_MEM_OP_STORE),
+ };
+ damon_report_access(&report);
+}
+
+static enum cpuhp_state damon_perf_cpuhp_state;
+
+/*
+ * Fill @attr from the configurable fields of @event->attr.
+ *
+ * The event is always set up pinned and disabled. sample_type always carries
+ * TIME, ADDR, PERIOD and DATA_SRC; PHYS_ADDR and WEIGHT_STRUCT are added only
+ * when the matching @event->attr flag is set.
+ */
+static void damon_perf_event_init_attr(struct damon_perf_event *event,
+ struct perf_event_attr *attr)
+{
+ *attr = (struct perf_event_attr) {
+ .size = sizeof(*attr),
+ .type = event->attr.type,
+ .config = event->attr.config,
+ .config1 = event->attr.config1,
+ .config2 = event->attr.config2,
+ .freq = event->attr.freq,
+ .sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_ADDR |
+ PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
+ (event->attr.sample_phys_addr ?
+ PERF_SAMPLE_PHYS_ADDR : 0) |
+ (event->attr.sample_weight_struct ?
+ PERF_SAMPLE_WEIGHT_STRUCT : 0),
+ .precise_ip = event->attr.precise_ip,
+ .pinned = 1,
+ .disabled = 1,
+ .wakeup_events = event->attr.wakeup_events,
+ .exclude_kernel = event->attr.exclude_kernel,
+ .exclude_hv = event->attr.exclude_hv,
+ };
+
+ /*
+ * sample_period and sample_freq share storage in the kernel
+ * perf_event_attr (union). Select based on the freq toggle so
+ * frequency-based callers (PEBS) and period-based callers
+ * (AMD IBS Op MaxCnt) both work correctly.
+ */
+ if (event->attr.freq)
+ attr->sample_freq = event->attr.sample_freq;
+ else
+ attr->sample_period = event->attr.sample_period;
+}
+
+/*
+ * cpuhp online callback: create @node's event counter on @cpu.
+ *
+ * Always returns 0. A creation failure is logged and, until the owning
+ * event's init_complete flag is set, also recorded in any_cpu_failed so that
+ * damon_perf_init() can notice it.
+ */
+static int damon_perf_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+ struct damon_perf_event *event = hlist_entry(node,
+ struct damon_perf_event, hlist_node);
+ struct damon_perf *perf = event->priv;
+ struct perf_event_attr attr;
+ struct perf_event *perf_event;
+ perf_overflow_handler_t handler;
+
+ /* No private state attached to this event: nothing to create. */
+ if (!perf)
+ return 0;
+
+ damon_perf_event_init_attr(event, &attr);
+
+ /*
+ * Pick a paddr- or vaddr-specific handler at create time so the
+ * NMI fast path is statically branched. Pass NULL as context --
+ * handlers are stateless wrt the per-event struct, so the NMI
+ * fast path performs no per-event heap dereference. Submission
+ * flows into the global per-CPU SPSC ring via damon_report_access().
+ */
+ handler = event->attr.sample_phys_addr ?
+ damon_perf_overflow_paddr : damon_perf_overflow_vaddr;
+
+ perf_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
+ handler, NULL);
+ if (IS_ERR(perf_event)) {
+ pr_warn_ratelimited("damon-perf: cpu %u event create failed: %ld\n",
+ cpu, PTR_ERR(perf_event));
+ if (!event->init_complete)
+ event->any_cpu_failed = true;
+ return 0; /* never block CPU online */
+ }
+ *per_cpu_ptr(perf->event, cpu) = perf_event;
+ /*
+ * Late-online CPU after the substrate is armed: events are created
+ * with attr.disabled = 1 and would otherwise stay quiescent on this
+ * CPU until the next arm walk. Enable here so coverage matches the
+ * already-online CPUs.
+ */
+ if (event->ctx && READ_ONCE(event->ctx->perf_events_active))
+ perf_event_enable(perf_event);
+ return 0;
+}
+
+/*
+ * cpuhp offline callback: disable and release @node's event counter on @cpu,
+ * if one exists, and clear the per-CPU slot. Always returns 0.
+ */
+static int damon_perf_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+ struct damon_perf_event *event = hlist_entry(node,
+ struct damon_perf_event, hlist_node);
+ struct damon_perf *perf = event->priv;
+ struct perf_event *perf_event;
+
+ if (!perf)
+ return 0;
+
+ perf_event = per_cpu(*perf->event, cpu);
+ if (perf_event) {
+ perf_event_disable(perf_event);
+ perf_event_release_kernel(perf_event);
+ /* Clear the slot so later walks skip this CPU. */
+ *per_cpu_ptr(perf->event, cpu) = NULL;
+ }
+ return 0;
+}
+
+/*
+ * Enable @event's counter on every online CPU that has one.
+ *
+ * The walk holds the CPU hotplug read lock so that damon_perf_cpu_offline()
+ * cannot release a counter between the per-CPU pointer load and the
+ * perf_event_enable() call. Does nothing if @event has no private state.
+ *
+ * Context: may sleep (takes the CPU hotplug read lock).
+ */
+void damon_perf_event_arm(struct damon_perf_event *event)
+{
+	struct damon_perf *perf = event->priv;
+	struct perf_event *perf_event;
+	int cpu;
+
+	if (!perf)
+		return;
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		perf_event = *per_cpu_ptr(perf->event, cpu);
+		/* Creation may have failed on this CPU; skip empty slots. */
+		if (perf_event)
+			perf_event_enable(perf_event);
+	}
+	cpus_read_unlock();
+}
+
+/*
+ * Disable @event's counter on every online CPU that has one.
+ *
+ * The walk holds the CPU hotplug read lock so that damon_perf_cpu_offline()
+ * cannot release a counter between the per-CPU pointer load and the
+ * perf_event_disable() call. Does nothing if @event has no private state.
+ *
+ * Context: may sleep (takes the CPU hotplug read lock).
+ */
+void damon_perf_event_disarm(struct damon_perf_event *event)
+{
+	struct damon_perf *perf = event->priv;
+	struct perf_event *perf_event;
+	int cpu;
+
+	if (!perf)
+		return;
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		perf_event = *per_cpu_ptr(perf->event, cpu);
+		/* Creation may have failed on this CPU; skip empty slots. */
+		if (perf_event)
+			perf_event_disable(perf_event);
+	}
+	cpus_read_unlock();
+}
+
+/*
+ * Set up the per-CPU perf counters backing @event on behalf of @ctx.
+ *
+ * Allocates the private state, registers @event as an instance of the
+ * damon/perf cpuhp state, and fails if counter creation failed on any CPU
+ * during that registration.
+ *
+ * Calling this on an event that is already set up is a no-op: kdamond start
+ * calls it for every attached event, and re-adding the same hlist node would
+ * corrupt the cpuhp instance list and leak the previous private state.
+ *
+ * Returns 0 on success (or if already set up), -ENOMEM on allocation failure,
+ * the cpuhp_state_add_instance() error, or -ENODEV if a CPU failed to create
+ * its counter. On failure @event is left without private state.
+ */
+int damon_perf_init(struct damon_ctx *ctx, struct damon_perf_event *event)
+{
+	struct damon_perf *perf;
+	int err = -ENOMEM;
+
+	if (event->priv)
+		return 0;
+
+	perf = kzalloc(sizeof(*perf), GFP_KERNEL);
+	if (!perf)
+		return -ENOMEM;
+
+	perf->event = alloc_percpu(typeof(*perf->event));
+	if (!perf->event)
+		goto free_perf;
+
+	event->priv = perf;
+	event->ctx = ctx;
+	/* Do not let the outcome of an earlier attempt leak into this one. */
+	event->init_complete = false;
+	event->any_cpu_failed = false;
+	INIT_HLIST_NODE(&event->hlist_node);
+
+	/*
+	 * cpuhp_state_add_instance() invokes the online callback synchronously
+	 * for every currently-online CPU; late-online CPUs subsequently get
+	 * an event automatically and offline CPUs release theirs cleanly.
+	 */
+	err = cpuhp_state_add_instance(damon_perf_cpuhp_state,
+			&event->hlist_node);
+	if (err)
+		goto free_event;
+
+	event->init_complete = true;
+	if (event->any_cpu_failed) {
+		/* Roll back so no half-initialized event is left behind. */
+		cpuhp_state_remove_instance(damon_perf_cpuhp_state,
+				&event->hlist_node);
+		err = -ENODEV;
+		goto free_event;
+	}
+
+	return 0;
+
+free_event:
+	free_percpu(perf->event);
+free_perf:
+	kfree(perf);
+	event->priv = NULL;
+	return err;
+}
+
+/*
+ * Undo damon_perf_init(): remove @event's cpuhp instance and free its private
+ * state. @ctx is unused. Does nothing if @event has no private state.
+ */
+void damon_perf_cleanup(struct damon_ctx *ctx, struct damon_perf_event *event)
+{
+ struct damon_perf *perf = event->priv;
+
+ if (!perf)
+ return;
+
+ /*
+ * NOTE(review): this relies on instance removal running
+ * damon_perf_cpu_offline() on each online CPU to release the
+ * counters before the per-CPU slots are freed -- confirm.
+ */
+ cpuhp_state_remove_instance(damon_perf_cpuhp_state,
+ &event->hlist_node);
+
+ free_percpu(perf->event);
+ kfree(perf);
+ event->priv = NULL;
+}
+
+#endif /* CONFIG_PERF_EVENTS */
+
static int __init damon_va_initcall(void)
{
struct damon_operations ops = {
@@ -979,6 +1238,14 @@ static int __init damon_va_initcall(void)
ops_fvaddr.init = NULL;
ops_fvaddr.update = NULL;
+#ifdef CONFIG_PERF_EVENTS
+ err = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "damon/perf:online",
+ damon_perf_cpu_online, damon_perf_cpu_offline);
+ if (err < 0)
+ return err;
+ damon_perf_cpuhp_state = err;
+#endif
+
err = damon_register_ops(&ops);
if (err)
return err;
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [RFC PATCH 6/6] mm/damon: add damos_node_eligible_mem_bp tracepoint
2026-05-29 16:56 [RFC PATCH 0/6] mm/damon: hardware-sampled access reports Ravi Jonnalagadda
` (4 preceding siblings ...)
2026-05-29 16:56 ` [RFC PATCH 5/6] mm/damon/vaddr: implement perf-event access check Ravi Jonnalagadda
@ 2026-05-29 16:56 ` Ravi Jonnalagadda
5 siblings, 0 replies; 7+ messages in thread
From: Ravi Jonnalagadda @ 2026-05-29 16:56 UTC (permalink / raw)
To: sj, akinobu.mita, damon, linux-mm, linux-kernel, linux-doc
Cc: akpm, corbet, bijan311, ajayjoshi, honggyu.kim, yunjeong.mun,
ravis.opensrc
Fire a tracepoint at every DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP goal
evaluation, exposing (context_idx, scheme_idx, nid, target_value,
current_value). context_idx is currently always reported as 0. This
gives userspace observability into goal-tracking without polling sysfs.
The trace_..._enabled() guard avoids the damon_for_each_scheme()
iteration cost when nothing is listening.
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@gmail.com>
---
include/trace/events/damon.h | 32 ++++++++++++++++++++++++++++++++
mm/damon/core.c | 20 ++++++++++++++++++++
2 files changed, 52 insertions(+)
diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index e97e70579a8c8..877627c9a1a18 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -91,6 +91,38 @@ TRACE_EVENT(damon_perf_ring_overflow,
TP_printk("cpu=%d", __entry->cpu)
);
+/*
+ * Emitted from damos_quota_score() for each
+ * DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP goal it evaluates, while this tracepoint
+ * is enabled.
+ *
+ * @context_idx: index of the context; the caller currently always passes 0.
+ * @scheme_idx: position of the scheme in the context's scheme list.
+ * @nid: the goal's node id.
+ * @target_value: the goal's target value.
+ * @current_value: the goal's current value.
+ */
+TRACE_EVENT(damos_node_eligible_mem_bp,
+
+ TP_PROTO(unsigned int context_idx, unsigned int scheme_idx,
+ int nid,
+ unsigned long target_value, unsigned long current_value),
+
+ TP_ARGS(context_idx, scheme_idx, nid, target_value, current_value),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, context_idx)
+ __field(unsigned int, scheme_idx)
+ __field(int, nid)
+ __field(unsigned long, target_value)
+ __field(unsigned long, current_value)
+ ),
+
+ TP_fast_assign(
+ __entry->context_idx = context_idx;
+ __entry->scheme_idx = scheme_idx;
+ __entry->nid = nid;
+ __entry->target_value = target_value;
+ __entry->current_value = current_value;
+ ),
+
+ TP_printk("ctx_idx=%u scheme_idx=%u nid=%d "
+ "target_value=%lu current_value=%lu",
+ __entry->context_idx, __entry->scheme_idx,
+ __entry->nid,
+ __entry->target_value, __entry->current_value)
+);
+
TRACE_EVENT_CONDITION(damos_before_apply,
TP_PROTO(unsigned int context_idx, unsigned int scheme_idx,
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 1e6966e45144f..609d627e2b33e 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -3203,6 +3203,26 @@ static unsigned long damos_quota_score(struct damon_ctx *c, struct damos *s)
highest_score = max(highest_score,
mult_frac(goal->current_value, 10000,
goal->target_value));
+
+ /*
+ * Per-tick visibility of NODE_ELIGIBLE_MEM_BP goal evaluation
+ * for userspace convergence-detection.
+ */
+ if (goal->metric == DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP &&
+ trace_damos_node_eligible_mem_bp_enabled()) {
+ unsigned int cidx = 0, sidx = 0;
+ struct damos *siter;
+
+ damon_for_each_scheme(siter, c) {
+ if (siter == s)
+ break;
+ sidx++;
+ }
+ trace_damos_node_eligible_mem_bp(cidx, sidx,
+ goal->nid,
+ goal->target_value,
+ goal->current_value);
+ }
}
return highest_score;
--
2.43.0
^ permalink raw reply related [flat|nested] 7+ messages in thread