* [RFC PATCH v2 01/10] rv/da: fix monitor start ordering and memory ordering for monitoring flag
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
@ 2026-05-11 18:24 ` wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 02/10] rv/da: fix per-task da_monitor_destroy() ordering and sync wen.yang
` (8 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
da_monitor_start() sets monitoring=1 before calling da_monitor_init_hook(),
which may race with the sched_switch handler:
da_monitor_start() sched_switch handler
------------------------- ---------------------------------
da_mon->monitoring = 1;
if (da_monitoring(da_mon)) /* true */
ha_start_timer_ns(...);
/* hrtimer->base == NULL, crash */
da_monitor_init_hook(da_mon);
/* hrtimer_setup() sets base */
Fix the ordering and pair with release/acquire semantics:
da_monitor_init_hook(da_mon);
smp_store_release(&da_mon->monitoring, 1); /* da_monitor_start() */
return smp_load_acquire(&da_mon->monitoring); /* da_monitoring() */
On ARM64 a plain STR + LDR does not form a release-acquire pair, so
the load can observe monitoring=1 while hrtimer->base is still NULL.
The plain accesses are also data races under KCSAN.
Use WRITE_ONCE for the monitoring=0 store in da_monitor_reset() to
cover the reset path.
Fixes: 792575348ff7 ("rv/include: Add deterministic automata monitor definition via C macros")
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
include/rv/da_monitor.h | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 39765ff6f098..00ded3d5ab3f 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -82,7 +82,7 @@ static void react(enum states curr_state, enum events event)
static inline void da_monitor_reset(struct da_monitor *da_mon)
{
da_monitor_reset_hook(da_mon);
- da_mon->monitoring = 0;
+ WRITE_ONCE(da_mon->monitoring, 0);
da_mon->curr_state = model_get_initial_state();
}
@@ -95,8 +95,9 @@ static inline void da_monitor_reset(struct da_monitor *da_mon)
static inline void da_monitor_start(struct da_monitor *da_mon)
{
da_mon->curr_state = model_get_initial_state();
- da_mon->monitoring = 1;
da_monitor_init_hook(da_mon);
+ /* Pairs with smp_load_acquire in da_monitoring(). */
+ smp_store_release(&da_mon->monitoring, 1);
}
/*
@@ -104,7 +105,8 @@ static inline void da_monitor_start(struct da_monitor *da_mon)
*/
static inline bool da_monitoring(struct da_monitor *da_mon)
{
- return da_mon->monitoring;
+ /* Pairs with smp_store_release in da_monitor_start(). */
+ return smp_load_acquire(&da_mon->monitoring);
}
/*
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [RFC PATCH v2 02/10] rv/da: fix per-task da_monitor_destroy() ordering and sync
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 01/10] rv/da: fix monitor start ordering and memory ordering for monitoring flag wen.yang
@ 2026-05-11 18:24 ` wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 03/10] selftests/verification: fix verificationtest-ktap for out-of-tree execution wen.yang
` (7 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
The following two paths race:
CPU 0 (disable_stall/__rv_disable_monitor) CPU 1 (wwnr probe handler)
------------------------------------------ -----------------------------
disable_stall()
da_monitor_destroy()
da_monitor_reset_all() <------ [task T: monitoring=0]
da_monitor_start(&T->rv[n])
/* no timer_setup */
monitoring=1 <----
tracepoint_synchronize_unregister()
// CPU 1 probe has already returned; sync returns
Later, enable_stall() acquires the same slot and calls da_monitor_init():
da_monitor_reset_all()
da_monitor_reset(&T->rv[slot]) // monitoring=1, timer.function==0
ha_monitor_reset_env()
ha_cancel_timer()
timer_delete(&ha_mon->timer) // ODEBUG: timer never initialised
ODEBUG: assert_init not available (active state 0)
object type: timer_list
Call trace: timer_delete <- da_monitor_reset_all <- enable_stall
Call tracepoint_synchronize_unregister() inside da_monitor_destroy()
before da_monitor_reset_all(). The unregister_trace_xxx() calls in the
monitor's disable() have already disconnected the tracepoints; the sync
here drains any handler still in flight, so no new monitoring=1 can
appear after da_monitor_reset_all() clears the slot.
Also fix the slot release ordering: release the slot only after
reset_all() to avoid accessing rv[] with an out-of-bounds index.
Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type")
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
include/rv/da_monitor.h | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 00ded3d5ab3f..d04bb3229c75 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -304,6 +304,20 @@ static int da_monitor_init(void)
/*
* da_monitor_destroy - return the allocated slot
+ *
+ * Call tracepoint_synchronize_unregister() before reset_all() to close
+ * the race where an in-flight non-HA probe handler sets monitoring=1
+ * (without calling timer_setup()) after da_monitor_reset_all() has
+ * already cleared the slot but before the caller's own sync completes.
+ * Without this barrier, an HA_TIMER_WHEEL monitor that later acquires
+ * the same slot would call timer_delete() on a never-initialised
+ * timer_list, triggering ODEBUG warnings.
+ *
+ * Note: tracepoint_synchronize_unregister() is a system-wide barrier
+ * that waits for all CPUs to finish any in-flight tracepoint handlers.
+ * The caller's own __rv_disable_monitor() issues a second sync after
+ * returning from disable(); that redundant call is harmless on the
+ * infrequent admin (enable/disable) path.
*/
static inline void da_monitor_destroy(void)
{
@@ -311,10 +325,10 @@ static inline void da_monitor_destroy(void)
WARN_ONCE(1, "Disabling a disabled monitor: " __stringify(MONITOR_NAME));
return;
}
+ tracepoint_synchronize_unregister();
+ da_monitor_reset_all();
rv_put_task_monitor_slot(task_mon_slot);
task_mon_slot = RV_PER_TASK_MONITOR_INIT;
-
- da_monitor_reset_all();
}
#elif RV_MON_TYPE == RV_MON_PER_OBJ
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [RFC PATCH v2 03/10] selftests/verification: fix verificationtest-ktap for out-of-tree execution
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 01/10] rv/da: fix monitor start ordering and memory ordering for monitoring flag wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 02/10] rv/da: fix per-task da_monitor_destroy() ordering and sync wen.yang
@ 2026-05-11 18:24 ` wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 04/10] rv/da: add pre-allocated storage pool for per-object monitors wen.yang
` (6 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
verificationtest-ktap used a CWD-relative path (../ftrace/ftracetest)
and a relative argument (../verification) for --rv. This works when
the shell changes into the verification directory first, but breaks
when the script is invoked directly - e.g. by the kselftest runner or
vng - because the working directory is the kernel source root, not the
script's own directory.
Fix this by computing the script's directory from $0 with cd/dirname/pwd
and using absolute paths for both the ftracetest invocation and the --rv
argument. Also export the directory to PATH so that check_requires in
the ftracetest framework can locate helper binaries.
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
tools/testing/selftests/verification/verificationtest-ktap | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/verification/verificationtest-ktap b/tools/testing/selftests/verification/verificationtest-ktap
index 18f7fe324e2f..456b8578a307 100755
--- a/tools/testing/selftests/verification/verificationtest-ktap
+++ b/tools/testing/selftests/verification/verificationtest-ktap
@@ -5,4 +5,6 @@
#
# Copyright (C) Arm Ltd., 2023
-../ftrace/ftracetest -K -v --rv ../verification
+dir=$(cd "$(dirname "$0")" && pwd)
+export PATH="$dir:$PATH"
+"$dir/../ftrace/ftracetest" -K -v --rv "$dir"
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [RFC PATCH v2 04/10] rv/da: add pre-allocated storage pool for per-object monitors
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
` (2 preceding siblings ...)
2026-05-11 18:24 ` [RFC PATCH v2 03/10] selftests/verification: fix verificationtest-ktap for out-of-tree execution wen.yang
@ 2026-05-11 18:24 ` wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 05/10] rv: add generic uprobe infrastructure for RV monitors wen.yang
` (5 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
da_create_empty_storage() uses kmalloc_nolock(), which requires
CONFIG_HAVE_ALIGNED_STRUCT_PAGE; on UML and some PREEMPT_RT
configurations it always returns NULL. Calling kmalloc from scheduler
tracepoint handlers also adds unwanted latency and can fail under
memory pressure.
Add da_monitor_init_prealloc(N) as an opt-in alternative to
da_monitor_init(). It allocates N da_monitor_storage slots with
GFP_KERNEL up-front and manages them on a LIFO free-stack protected
by a spinlock, so da_create_or_get() never calls kmalloc on the hot
path.
Monitors that do not call da_monitor_init_prealloc() are unaffected.
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
include/rv/da_monitor.h | 208 +++++++++++++++++++++++++++++++++++-----
1 file changed, 186 insertions(+), 22 deletions(-)
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index d04bb3229c75..7d6f62766251 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -433,18 +433,6 @@ static inline da_id_type da_get_id(struct da_monitor *da_mon)
return container_of(da_mon, struct da_monitor_storage, rv.da_mon)->id;
}
-/*
- * da_create_or_get - create the per-object storage if not already there
- *
- * This needs a lookup so should be guarded by RCU, the condition is checked
- * directly in da_create_storage()
- */
-static inline void da_create_or_get(da_id_type id, monitor_target target)
-{
- guard(rcu)();
- da_create_storage(id, target, da_get_monitor(id, target));
-}
-
/*
* da_fill_empty_storage - store the target in a pre-allocated storage
*
@@ -475,15 +463,121 @@ static inline monitor_target da_get_target_by_id(da_id_type id)
return mon_storage->target;
}
+/*
+ * Per-object pool state.
+ *
+ * Zero-initialised by default (storage == NULL ⟹ kmalloc mode). A monitor
+ * opts into pool mode by calling da_monitor_init_prealloc(N) instead of
+ * da_monitor_init(), which sets storage to a non-NULL kcalloc'd array.
+ *
+ * Because every field is wrapped in this struct and the struct itself is a
+ * per-TU static, each monitor that includes this header gets a completely
+ * independent pool. A kmalloc monitor (e.g. nomiss) and a pool monitor
+ * (e.g. tlob) therefore coexist without any interference.
+ *
+ * da_pool_return_cb runs from softirq on non-PREEMPT_RT, so irqsave is
+ * required to prevent deadlock with task-context callers. On PREEMPT_RT
+ * it runs from an rcuc kthread where spinlock_t is a sleeping lock.
+ */
+struct da_per_obj_pool {
+ struct da_monitor_storage *storage; /* non-NULL ⟹ pool mode */
+ struct da_monitor_storage **free; /* kmalloc'd pointer stack */
+ unsigned int free_top;
+ spinlock_t lock;
+};
+
+static struct da_per_obj_pool da_pool = {
+ .lock = __SPIN_LOCK_UNLOCKED(da_pool.lock),
+};
+
+static void da_pool_return_cb(struct rcu_head *head)
+{
+ struct da_monitor_storage *ms =
+ container_of(head, struct da_monitor_storage, rcu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&da_pool.lock, flags);
+ da_pool.free[da_pool.free_top++] = ms;
+ spin_unlock_irqrestore(&da_pool.lock, flags);
+}
+
+/* Pops a slot from the pre-allocated pool; returns -ENOSPC if exhausted. */
+static inline int da_create_or_get_pool(da_id_type id, monitor_target target)
+{
+ struct da_monitor_storage *mon_storage;
+ unsigned long flags;
+
+ spin_lock_irqsave(&da_pool.lock, flags);
+ if (!da_pool.free_top) {
+ spin_unlock_irqrestore(&da_pool.lock, flags);
+ return -ENOSPC;
+ }
+ mon_storage = da_pool.free[--da_pool.free_top];
+ spin_unlock_irqrestore(&da_pool.lock, flags);
+
+ mon_storage->id = id;
+ mon_storage->target = target;
+ guard(rcu)();
+ hash_add_rcu(da_monitor_ht, &mon_storage->node, id);
+ return 0;
+}
+
+/*
+ * Tries da_create_storage() first (lock-free via kmalloc_nolock); falls back
+ * to kmalloc(GFP_KERNEL). Must be called from task context.
+ */
+static inline int da_create_or_get_kmalloc(da_id_type id, monitor_target target)
+{
+ struct da_monitor_storage *mon_storage;
+
+ scoped_guard(rcu) {
+ if (da_create_storage(id, target, da_get_monitor(id, target)))
+ return 0;
+ }
+
+ /*
+ * da_create_storage() failed because kmalloc_nolock() returned NULL.
+ * Allocate with GFP_KERNEL outside the RCU read section: GFP_KERNEL
+ * may sleep for memory reclaim, which is illegal while the RCU read
+ * lock is held (preemption disabled on !PREEMPT_RT).
+ */
+ mon_storage = kmalloc_obj(*mon_storage, GFP_KERNEL | __GFP_ZERO);
+ if (!mon_storage)
+ return -ENOMEM;
+ mon_storage->id = id;
+ mon_storage->target = target;
+
+ /*
+ * Re-check for a concurrent insertion before linking: another
+ * caller may have succeeded while we slept in kmalloc().
+ * Discard our allocation and let the winner's entry stand.
+ */
+ scoped_guard(rcu) {
+ if (da_get_monitor(id, target)) {
+ kfree(mon_storage);
+ return 0;
+ }
+ hash_add_rcu(da_monitor_ht, &mon_storage->node, id);
+ }
+ return 0;
+}
+
+/* Create the per-object storage if not already there. */
+static inline int da_create_or_get(da_id_type id, monitor_target target)
+{
+ if (da_pool.storage)
+ return da_create_or_get_pool(id, target);
+ return da_create_or_get_kmalloc(id, target);
+}
+
/*
* da_destroy_storage - destroy the per-object storage
*
- * The caller is responsible to synchronise writers, either with locks or
- * implicitly. For instance, if da_destroy_storage is called at sched_exit and
- * da_create_storage can never occur after that, it's safe to call this without
- * locks.
- * This function includes an RCU read-side critical section to synchronise
- * against da_monitor_destroy().
+ * Pool mode: removes from hash and returns the slot via call_rcu().
+ * Kmalloc mode: removes from hash and frees via kfree_rcu().
+ *
+ * Includes an RCU read-side critical section to synchronise against
+ * da_monitor_destroy().
*/
static inline void da_destroy_storage(da_id_type id)
{
@@ -491,15 +585,17 @@ static inline void da_destroy_storage(da_id_type id)
guard(rcu)();
mon_storage = __da_get_mon_storage(id);
-
if (!mon_storage)
return;
da_monitor_reset_hook(&mon_storage->rv.da_mon);
hash_del_rcu(&mon_storage->node);
- kfree_rcu(mon_storage, rcu);
+ if (da_pool.storage)
+ call_rcu(&mon_storage->rcu, da_pool_return_cb);
+ else
+ kfree_rcu(mon_storage, rcu);
}
-static void da_monitor_reset_all(void)
+static __maybe_unused void da_monitor_reset_all(void)
{
struct da_monitor_storage *mon_storage;
int bkt;
@@ -510,13 +606,65 @@ static void da_monitor_reset_all(void)
rcu_read_unlock();
}
+/*
+ * da_monitor_init_prealloc - initialise with a pre-allocated storage pool
+ *
+ * Allocates @prealloc_count storage slots up-front so that da_create_or_get()
+ * and da_destroy_storage() never call kmalloc/kfree. Must be called instead
+ * of da_monitor_init() for monitors that require pool mode.
+ */
+static inline int da_monitor_init_prealloc(unsigned int prealloc_count)
+{
+ hash_init(da_monitor_ht);
+
+ da_pool.storage = kcalloc(prealloc_count, sizeof(*da_pool.storage),
+ GFP_KERNEL);
+ if (!da_pool.storage)
+ return -ENOMEM;
+
+ da_pool.free = kmalloc_array(prealloc_count, sizeof(*da_pool.free),
+ GFP_KERNEL);
+ if (!da_pool.free) {
+ kfree(da_pool.storage);
+ da_pool.storage = NULL;
+ return -ENOMEM;
+ }
+
+ da_pool.free_top = 0;
+ for (unsigned int i = 0; i < prealloc_count; i++)
+ da_pool.free[da_pool.free_top++] = &da_pool.storage[i];
+ return 0;
+}
+
+/*
+ * da_monitor_init - initialise in kmalloc mode (no pre-allocation)
+ */
static inline int da_monitor_init(void)
{
hash_init(da_monitor_ht);
return 0;
}
-static inline void da_monitor_destroy(void)
+static inline void da_monitor_destroy_pool(void)
+{
+ WARN_ON_ONCE(!hash_empty(da_monitor_ht));
+ /*
+ * Wait for all in-flight da_pool_return_cb() callbacks to
+ * complete before freeing da_pool.free. synchronize_rcu() is
+ * not sufficient: it only waits for callbacks registered before
+ * it was called, but call_rcu() from concurrent da_destroy_storage()
+ * calls may have been enqueued later. rcu_barrier() drains every
+ * pending callback.
+ */
+ rcu_barrier();
+ kfree(da_pool.storage);
+ da_pool.storage = NULL;
+ kfree(da_pool.free);
+ da_pool.free = NULL;
+ da_pool.free_top = 0;
+}
+
+static inline void da_monitor_destroy_kmalloc(void)
{
struct da_monitor_storage *mon_storage;
struct hlist_node *tmp;
@@ -534,6 +682,22 @@ static inline void da_monitor_destroy(void)
}
}
+/*
+ * da_monitor_destroy - tear down the per-object monitor
+ *
+ * Pool mode: the hash must already be empty (caller must have drained all
+ * tasks first); calls rcu_barrier() to drain all pending da_pool_return_cb()
+ * callbacks before freeing pool arrays.
+ * Kmalloc mode: drains any remaining entries after synchronize_rcu().
+ */
+static inline void da_monitor_destroy(void)
+{
+ if (da_pool.storage)
+ da_monitor_destroy_pool();
+ else
+ da_monitor_destroy_kmalloc();
+}
+
/*
* Allow the per-object monitors to run allocation manually, necessary if the
* start condition is in a context problematic for allocation (e.g. scheduling).
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [RFC PATCH v2 05/10] rv: add generic uprobe infrastructure for RV monitors
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
` (3 preceding siblings ...)
2026-05-11 18:24 ` [RFC PATCH v2 04/10] rv/da: add pre-allocated storage pool for per-object monitors wen.yang
@ 2026-05-11 18:24 ` wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 06/10] rvgen: support reset() on the __init arrow for global-window HA clocks wen.yang
` (4 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
Introduce rv_uprobe, a thin wrapper around uprobe_consumer providing
rv_uprobe_attach_path(), rv_uprobe_attach(), and rv_uprobe_detach()
for RV monitors. An opaque priv pointer is forwarded unchanged to
entry/return handlers so monitors can carry per-binding state (e.g. a
latency threshold) to the hot path without any global lookup.
rv_uprobe_detach() is fully synchronous (nosync + sync + path_put +
kfree), closing the use-after-free window present in open-coded
patterns where kfree() precedes uprobe_unregister_sync().
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
include/rv/rv_uprobe.h | 87 ++++++++++++++++++++
kernel/trace/rv/Kconfig | 4 +
kernel/trace/rv/Makefile | 1 +
kernel/trace/rv/rv_uprobe.c | 153 ++++++++++++++++++++++++++++++++++++
4 files changed, 245 insertions(+)
create mode 100644 include/rv/rv_uprobe.h
create mode 100644 kernel/trace/rv/rv_uprobe.c
diff --git a/include/rv/rv_uprobe.h b/include/rv/rv_uprobe.h
new file mode 100644
index 000000000000..084cdb36a2ff
--- /dev/null
+++ b/include/rv/rv_uprobe.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Generic uprobe infrastructure for RV monitors.
+ *
+ */
+
+#ifndef _RV_UPROBE_H
+#define _RV_UPROBE_H
+
+#include <linux/path.h>
+#include <linux/types.h>
+
+struct pt_regs;
+
+/**
+ * struct rv_uprobe - a single uprobe registered on behalf of an RV monitor
+ *
+ * @offset: byte offset within the ELF binary where the probe is installed
+ * @priv: monitor-private pointer; set at attach time, never touched by
+ * this layer; passed unchanged to entry_fn / ret_fn
+ * @path: resolved path of the probed binary (read-only after attach);
+ * callers may use path.dentry for identity comparisons
+ *
+ * The implementation fields (uprobe_consumer, uprobe handle, callbacks) are
+ * private to rv_uprobe.c and are not exposed here; monitors must not access
+ * them directly.
+ */
+struct rv_uprobe {
+ /* public: read-only after rv_uprobe_attach*() */
+ loff_t offset;
+ void *priv;
+ struct path path;
+};
+
+/**
+ * rv_uprobe_attach_path - register an uprobe given an already-resolved path
+ * @path: path of the target binary; rv_uprobe takes its own reference
+ * @offset: byte offset within the binary
+ * @entry_fn: called on probe hit (entry); may be NULL
+ * @ret_fn: called on function return (uretprobe); may be NULL
+ * @priv: opaque pointer forwarded to callbacks unchanged
+ *
+ * Use this variant when the caller has already resolved the path (e.g. to
+ * register multiple probes on the same binary with a single kern_path call).
+ * The inode is derived internally via d_real_inode(), so inode and path are
+ * always consistent.
+ *
+ * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure.
+ */
+struct rv_uprobe *rv_uprobe_attach_path(struct path *path, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv);
+
+/**
+ * rv_uprobe_attach - resolve binpath and register an uprobe
+ * @binpath: absolute path to the target binary
+ * @offset: byte offset within the binary
+ * @entry_fn: called on probe hit (entry); may be NULL
+ * @ret_fn: called on function return (uretprobe); may be NULL
+ * @priv: opaque pointer forwarded to callbacks unchanged
+ *
+ * Resolves binpath via kern_path(), then delegates to rv_uprobe_attach_path().
+ *
+ * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure.
+ */
+struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv);
+
+/**
+ * rv_uprobe_detach - synchronously unregister an uprobe and free it
+ * @p: probe to detach; may be NULL (no-op)
+ *
+ * Calls uprobe_unregister_nosync(), then uprobe_unregister_sync() to wait
+ * for any in-progress handler to finish, then releases the path reference
+ * and frees the rv_uprobe struct. The caller's priv data is NOT freed.
+ *
+ * Safe to call from process context only (uprobe_unregister_sync() may
+ * schedule).
+ */
+void rv_uprobe_detach(struct rv_uprobe *p);
+
+#endif /* _RV_UPROBE_H */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 3884b14df375..e2e0033a00b9 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -59,6 +59,10 @@ config RV_PER_TASK_MONITORS
This option configures the maximum number of per-task RV monitors that can run
simultaneously.
+config RV_UPROBE
+ bool
+ depends on RV && UPROBES
+
source "kernel/trace/rv/monitors/wip/Kconfig"
source "kernel/trace/rv/monitors/wwnr/Kconfig"
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 94498da35b37..f139b904bea3 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
# Add new monitors here
+obj-$(CONFIG_RV_UPROBE) += rv_uprobe.o
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o
diff --git a/kernel/trace/rv/rv_uprobe.c b/kernel/trace/rv/rv_uprobe.c
new file mode 100644
index 000000000000..bc28399cfd4b
--- /dev/null
+++ b/kernel/trace/rv/rv_uprobe.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic uprobe infrastructure for RV monitors.
+ *
+ */
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/uprobes.h>
+#include <rv/rv_uprobe.h>
+
+/*
+ * Private extension of struct rv_uprobe. Allocated by rv_uprobe_attach*()
+ * and returned to callers as &impl->pub.
+ */
+struct rv_uprobe_impl {
+ struct rv_uprobe pub; /* must be first; callers hold &pub */
+ struct uprobe_consumer uc;
+ struct uprobe *uprobe;
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data);
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data);
+};
+
+static int rv_uprobe_handler(struct uprobe_consumer *uc,
+ struct pt_regs *regs, __u64 *data)
+{
+ struct rv_uprobe_impl *impl = container_of(uc, struct rv_uprobe_impl, uc);
+
+ if (impl->entry_fn)
+ return impl->entry_fn(&impl->pub, regs, data);
+ return 0;
+}
+
+static int rv_uprobe_ret_handler(struct uprobe_consumer *uc,
+ unsigned long func,
+ struct pt_regs *regs, __u64 *data)
+{
+ struct rv_uprobe_impl *impl = container_of(uc, struct rv_uprobe_impl, uc);
+
+ if (impl->ret_fn)
+ return impl->ret_fn(&impl->pub, func, regs, data);
+ return 0;
+}
+
+static struct rv_uprobe *
+__rv_uprobe_attach(struct inode *inode, struct path *path, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv)
+{
+ struct rv_uprobe_impl *impl;
+ int ret;
+
+ if (!entry_fn && !ret_fn)
+ return ERR_PTR(-EINVAL);
+
+ impl = kzalloc_obj(*impl, GFP_KERNEL);
+ if (!impl)
+ return ERR_PTR(-ENOMEM);
+
+ impl->pub.offset = offset;
+ impl->pub.priv = priv;
+ impl->entry_fn = entry_fn;
+ impl->ret_fn = ret_fn;
+ path_get(path);
+ impl->pub.path = *path;
+
+ if (entry_fn)
+ impl->uc.handler = rv_uprobe_handler;
+ if (ret_fn)
+ impl->uc.ret_handler = rv_uprobe_ret_handler;
+
+ impl->uprobe = uprobe_register(inode, offset, 0, &impl->uc);
+ if (IS_ERR(impl->uprobe)) {
+ ret = PTR_ERR(impl->uprobe);
+ path_put(&impl->pub.path);
+ kfree(impl);
+ return ERR_PTR(ret);
+ }
+
+ return &impl->pub;
+}
+
+/**
+ * rv_uprobe_attach_path - register an uprobe given an already-resolved path
+ */
+struct rv_uprobe *rv_uprobe_attach_path(struct path *path, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv)
+{
+ struct inode *inode = d_real_inode(path->dentry);
+
+ return __rv_uprobe_attach(inode, path, offset, entry_fn, ret_fn, priv);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_attach_path);
+
+/**
+ * rv_uprobe_attach - resolve binpath and register an uprobe
+ */
+struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv)
+{
+ struct rv_uprobe *p;
+ struct path path;
+ int ret;
+
+ ret = kern_path(binpath, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!d_is_reg(path.dentry)) {
+ path_put(&path);
+ return ERR_PTR(-EINVAL);
+ }
+
+ p = rv_uprobe_attach_path(&path, offset, entry_fn, ret_fn, priv);
+ path_put(&path);
+ return p;
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_attach);
+
+/**
+ * rv_uprobe_detach - synchronously unregister an uprobe and free it
+ */
+void rv_uprobe_detach(struct rv_uprobe *p)
+{
+ struct rv_uprobe_impl *impl;
+
+ if (!p)
+ return;
+
+ impl = container_of(p, struct rv_uprobe_impl, pub);
+ uprobe_unregister_nosync(impl->uprobe, &impl->uc);
+ /*
+ * uprobe_unregister_sync() is a global barrier: it waits for all
+ * in-flight uprobe handlers across the entire system to complete,
+ * not just handlers for this probe. This is intentional — it
+ * guarantees that no handler touching impl->pub.priv is running by
+ * the time we return, even if the caller immediately frees priv.
+ */
+ uprobe_unregister_sync();
+ path_put(&p->path);
+ kfree(impl);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_detach);
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [RFC PATCH v2 06/10] rvgen: support reset() on the __init arrow for global-window HA clocks
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
` (4 preceding siblings ...)
2026-05-11 18:24 ` [RFC PATCH v2 05/10] rv: add generic uprobe infrastructure for RV monitors wen.yang
@ 2026-05-11 18:24 ` wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 07/10] rv/tlob: add tlob model DOT file wen.yang
` (3 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
rvgen rejects a state invariant when its env is never reset on any
state-transition edge. This prevents expressing monitors where a clock
tracks the full monitoring window — reset once at object creation,
active in all states.
Allow reset() annotations on the __init_STATE -> STATE arrow.
automata.py adds listed envs to the new env_init_started set (and to
env_stored so the HA framework allocates per-object storage). dot2k.py
uses env_init_started for three purposes:
- Generate a handle_monitor_start() skeleton that resets the env and
arms the timer after the caller sets up DA storage and initial state.
- Guard ha_inv_to_guard calls with !ha_monitor_env_invalid() for these
envs: a concurrent DA event between da_handle_start_event() and
ha_reset_env() would otherwise store U64_MAX - BUDGET as the guard
anchor, silently disabling enforcement.
- Always generate ha_verify_guards() for monitors with invariants,
providing a stable extension point for future per-event guards.
Models without __init resets (e.g. stall.dot) are unaffected.
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
tools/verification/rvgen/rvgen/automata.py | 26 ++++++
tools/verification/rvgen/rvgen/dot2k.py | 100 +++++++++++++++++++--
2 files changed, 119 insertions(+), 7 deletions(-)
diff --git a/tools/verification/rvgen/rvgen/automata.py b/tools/verification/rvgen/rvgen/automata.py
index b9f8149f7118..178a1a4ffd8a 100644
--- a/tools/verification/rvgen/rvgen/automata.py
+++ b/tools/verification/rvgen/rvgen/automata.py
@@ -69,15 +69,41 @@ class Automata:
self.states, self.initial_state, self.final_states = self.__get_state_variables()
self.env_types = {}
self.env_stored = set()
+ self.env_init_started = set()
self.constraint_vars = set()
self.self_loop_reset_events = set()
self.events, self.envs = self.__get_event_variables()
+ self.__parse_init_resets()
self.function, self.constraints = self.__create_matrix()
self.events_start, self.events_start_run = self.__store_init_events()
self.env_stored = sorted(self.env_stored)
+ self.env_init_started = sorted(self.env_init_started)
self.constraint_vars = sorted(self.constraint_vars)
self.self_loop_reset_events = sorted(self.self_loop_reset_events)
+ def __parse_init_resets(self) -> None:
+ """Parse reset() annotations on the __init_STATE -> STATE arrow.
+
+ Adds each listed env to env_stored (HA framework allocates per-object
+ storage) and env_init_started (ha2k generates handle_monitor_start()).
+ """
+ init_prefix = f'"{self.init_marker}'
+ for line in map(str.lstrip, self.__dot_lines):
+ if not line.startswith(init_prefix):
+ continue
+ split_line = line.split()
+ if len(split_line) < 3 or split_line[1] != "->":
+ continue
+ if "label" not in line:
+ continue
+ label = "".join(split_line[split_line.index("label") + 2:-1]).replace('"', '')
+ for part in label.split(";"):
+ reset_m = self.constraint_reset.search(part.strip())
+ if reset_m:
+ env = reset_m["env"]
+ self.env_stored.add(env)
+ self.env_init_started.add(env)
+
def __get_model_name(self) -> str:
basename = ntpath.basename(self.__dot_path)
if not basename.endswith(".dot") and not basename.endswith(".gv"):
diff --git a/tools/verification/rvgen/rvgen/dot2k.py b/tools/verification/rvgen/rvgen/dot2k.py
index e6f476b903b0..e8066260c0af 100644
--- a/tools/verification/rvgen/rvgen/dot2k.py
+++ b/tools/verification/rvgen/rvgen/dot2k.py
@@ -366,7 +366,18 @@ f"""static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
conf_g = [e for s, e in conflict_guards if s == state]
if not conf_i and not conf_g:
continue
- buff.append(f"\t{_else}if (curr_state == {self.states[state]}{self.enum_suffix})")
+
+ state_name = f"{self.states[state]}{self.enum_suffix}"
+ env_full = self.__get_constraint_env(constr)
+ env_bare = env_full[:-len(self.enum_suffix)]
+ if env_bare in self.env_init_started:
+ # env_store is ENV_INVALID_VALUE until handle_monitor_start();
+ # skip ha_inv_to_guard during the init race window.
+ cont = "\t\t " if _else else "\t "
+ buff.append(f"\t{_else}if (curr_state == {state_name} &&")
+ buff.append(f"{cont}!ha_monitor_env_invalid(ha_mon, {env_full}))")
+ else:
+ buff.append(f"\t{_else}if (curr_state == {state_name})")
buff.append(f"\t\t{self.__start_to_conv(constr)};")
_else = "else "
@@ -376,16 +387,22 @@ f"""static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
def __fill_verify_guards_func(self) -> list[str]:
buff = []
- if not self.guards:
+ # Always generate for monitors with invariants: stable extension
+ # point for future guard conditions.
+ if not self.guards and not self.invariants:
return []
buff.append(
f"""static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
\t\t\t\t enum {self.enum_states_def} curr_state, enum {self.enum_events_def} event,
\t\t\t\t enum {self.enum_states_def} next_state, u64 time_ns)
-{{
-\tbool res = true;
-""")
+{{""")
+
+ if not self.guards:
+ buff.append("\treturn true;\n}\n")
+ return buff
+
+ buff.append("\tbool res = true;\n")
_else = ""
for edge, constr in sorted(self.guards.items()):
@@ -522,7 +539,7 @@ f"""static bool ha_verify_constraint(struct ha_monitor *ha_mon,
buff.append("\tha_convert_inv_guard(ha_mon, curr_state, event, "
"next_state, time_ns);\n")
- if self.guards:
+ if self.guards or self.invariants:
buff.append("\tif (!ha_verify_guards(ha_mon, curr_state, event, "
"next_state, time_ns))\n\t\treturn false;\n")
@@ -599,8 +616,77 @@ f"""static bool ha_verify_constraint(struct ha_monitor *ha_mon,
buff.append("}\n")
return buff
+ def __fill_init_start_helper(self) -> list[str]:
+ """Generate handle_monitor_start() for envs reset on the __init arrow.
+
+ env_store is invalid inside da_handle_start_event(); this helper must
+ be called after DA storage is allocated and initial state is set.
+ """
+ if not self.env_init_started:
+ return []
+
+ # Collect the ha_start_timer call for each init-started env from the
+ # first state invariant that references it.
+ timer_calls: dict[str, str] = {}
+ for env in self.env_init_started:
+ env_full = f"{env}{self.enum_suffix}"
+ for constr in self.invariants.values():
+ if env_full in constr:
+ timer_calls[env] = constr
+ break
+
+ buff = []
+ buff.append(
+"""/*
+ * handle_monitor_start - reset per-object clock(s) and arm the timer.
+ *
+ * env_store is invalid inside da_handle_start_event(); call this helper
+ * after allocating DA storage and setting the initial DA state.
+ *
+ * XXX: replace the placeholders with the actual logic for your monitor.
+ */""")
+
+ if self.monitor_type == "per_obj":
+ buff.append("static int handle_monitor_start(int id, monitor_target t)")
+ buff.append("{")
+ buff.append("\tstruct ha_monitor *ha_mon;")
+ buff.append("\tu64 time_ns = ktime_get_ns();\n")
+ buff.append("\t/* XXX: allocate DA storage, e.g. da_create_or_get(id, t) */")
+ buff.append("\t/* XXX: set initial DA state, e.g. da_handle_start_event(id, t, <event>) */")
+ buff.append("\tha_mon = /* XXX: retrieve ha_monitor for (id, t) */;")
+ elif self.monitor_type == "per_task":
+ buff.append("static int handle_monitor_start(struct task_struct *p)")
+ buff.append("{")
+ buff.append("\tstruct ha_monitor *ha_mon;")
+ buff.append("\tu64 time_ns = ktime_get_ns();\n")
+ buff.append("\t/* XXX: allocate DA storage, e.g. da_create_or_get(p->pid, p) */")
+ buff.append("\t/* XXX: set initial DA state, e.g. da_handle_start_event(p->pid, p, <event>) */")
+ buff.append("\tha_mon = /* XXX: retrieve ha_monitor for p */;")
+ else:
+ buff.append("static int handle_monitor_start(void)")
+ buff.append("{")
+ buff.append("\tstruct ha_monitor *ha_mon;")
+ buff.append("\tu64 time_ns = ktime_get_ns();\n")
+ buff.append("\tha_mon = /* XXX: retrieve global ha_monitor */;")
+
+ buff.append("\tif (!ha_mon)")
+ buff.append("\t\treturn -ENOENT;")
+
+ for env in self.env_init_started:
+ buff.append(f"\tha_reset_env(ha_mon, {env}{self.enum_suffix}, time_ns);")
+ if env in timer_calls:
+ buff.append(f"\t{timer_calls[env]};")
+ else:
+ buff.append(f"\t/* XXX: arm timer for {env} */")
+
+ buff.append("\treturn 0;")
+ buff.append("}\n")
+ return buff
+
def _fill_hybrid_definitions(self) -> list[str]:
- return self.__fill_hybrid_get_reset_functions() + self.__fill_constr_func()
+ return (self.__fill_hybrid_get_reset_functions() +
+ self.__fill_init_start_helper() +
+ self.__fill_constr_func())
def _fill_timer_type(self) -> list:
if self.invariants:
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread

* [RFC PATCH v2 07/10] rv/tlob: add tlob model DOT file
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
` (5 preceding siblings ...)
2026-05-11 18:24 ` [RFC PATCH v2 06/10] rvgen: support reset() on the __init arrow for global-window HA clocks wen.yang
@ 2026-05-11 18:24 ` wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 08/10] rv/tlob: add tlob hybrid automaton monitor wen.yang
` (2 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
Add the Graphviz DOT specification for the tlob (task latency over
budget) hybrid automaton.
The model defines three states: running (initial), waiting
(in the scheduler runqueue), and sleeping (blocked on a
resource), with the transitions:
running --(sleep)-------> sleeping
running --(preempt)-----> waiting
sleeping --(wakeup)------> waiting
waiting --(switch_in)--> running
A single clock invariant clk_elapsed < BUDGET_NS() is active in all
three states. The HA framework enforces it via a per-task hrtimer;
expiry emits error_env_tlob and resets the monitor automatically.
Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
MAINTAINERS | 3 +++
tools/verification/models/tlob.dot | 21 +++++++++++++++++++++
2 files changed, 24 insertions(+)
create mode 100644 tools/verification/models/tlob.dot
diff --git a/MAINTAINERS b/MAINTAINERS
index 74c86cf9bc65..beb7224d08ef 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23317,7 +23317,10 @@ S: Maintained
F: Documentation/trace/rv/
F: include/linux/rv.h
F: include/rv/
+F: include/uapi/linux/rv.h
F: kernel/trace/rv/
+F: samples/rv/
+F: tools/testing/selftests/rv/
F: tools/testing/selftests/verification/
F: tools/verification/
diff --git a/tools/verification/models/tlob.dot b/tools/verification/models/tlob.dot
new file mode 100644
index 000000000000..8421b1120e80
--- /dev/null
+++ b/tools/verification/models/tlob.dot
@@ -0,0 +1,21 @@
+digraph state_automaton {
+ center = true;
+ size = "7,11";
+ {node [shape = plaintext, style=invis, label=""] "__init_running"};
+ {node [shape = ellipse] "running"};
+ {node [shape = plaintext] "running"};
+ {node [shape = plaintext] "waiting"};
+ {node [shape = plaintext] "sleeping"};
+ "__init_running" -> "running" [ label = "reset(clk_elapsed)" ];
+ "running" [label = "running\nclk_elapsed < BUDGET_NS()", color = green3];
+ "waiting" [label = "waiting\nclk_elapsed < BUDGET_NS()"];
+ "sleeping" [label = "sleeping\nclk_elapsed < BUDGET_NS()"];
+ "running" -> "sleeping" [ label = "sleep" ];
+ "running" -> "waiting" [ label = "preempt" ];
+ "waiting" -> "running" [ label = "switch_in" ];
+ "sleeping" -> "waiting" [ label = "wakeup" ];
+ { rank = min ;
+ "__init_running";
+ "running";
+ }
+}
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread

* [RFC PATCH v2 08/10] rv/tlob: add tlob hybrid automaton monitor
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
` (6 preceding siblings ...)
2026-05-11 18:24 ` [RFC PATCH v2 07/10] rv/tlob: add tlob model DOT file wen.yang
@ 2026-05-11 18:24 ` wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 09/10] rv/tlob: add KUnit tests for the tlob monitor wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 10/10] selftests/verification: add tlob selftests wen.yang
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
Introduce tlob (task latency over budget), a per-task hybrid-automaton
RV monitor that measures elapsed time (CLOCK_MONOTONIC) across
a user-delimited code section and fires an error_env_tlob tracepoint
when the elapsed time exceeds a configurable per-invocation budget.
The monitor is built on RV_MON_PER_OBJ with HA_TIMER_HRTIMER. Three
states track the scheduler status of the monitored task:
running --(sleep)-------> sleeping
running --(preempt)-----> waiting
sleeping --(wakeup)------> waiting
waiting --(switch_in)--> running
A single clock invariant clk_elapsed < BUDGET_NS() is active in all
three states. The budget hrtimer is rearmed on each DA transition for
the remaining budget, keeping the absolute deadline fixed at
start_time + BUDGET_NS.
Per-task state is stored in the DA framework's hash table keyed by
task->pid. Storage is pre-allocated by tlob_start_task() with
GFP_KERNEL via da_create_or_get() before the scheduler tracepoints
can fire, using DA_SKIP_AUTO_ALLOC so that no kmalloc occurs on the
tracepoint hot path. This avoids both the kmalloc_nolock() restriction
(requires HAVE_ALIGNED_STRUCT_PAGE) and latency issues under PREEMPT_RT.
Nested monitoring is handled by nest_depth: tlob_start_task() on an
already-monitored pid returns -EALREADY and increments nest_depth without
disturbing the outer window; only the outermost tlob_stop_task()
performs real cleanup.
Two userspace interfaces are provided. The ioctl interface exposes
in-process self-instrumentation via /dev/rv with TLOB_IOCTL_TRACE_START
and TLOB_IOCTL_TRACE_STOP. The uprobe interface enables external
monitoring of unmodified binaries via tracefs:
echo "p PATH:OFFSET_START OFFSET_STOP threshold=NS" \
> /sys/kernel/tracing/rv/monitors/tlob/monitor
Violations are reported via error_env_tlob (HA clock-invariant)
regardless of which interface triggered them.
Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
Documentation/trace/rv/index.rst | 1 +
Documentation/trace/rv/monitor_tlob.rst | 213 ++++
include/linux/rv.h | 45 +
include/rv/automata.h | 15 +
include/rv/ha_monitor.h | 33 +-
include/rv/rv_uprobe.h | 32 +
include/uapi/linux/rv.h | 86 ++
kernel/trace/rv/Kconfig | 2 +
kernel/trace/rv/Makefile | 4 +-
kernel/trace/rv/monitors/tlob/Kconfig | 69 ++
kernel/trace/rv/monitors/tlob/tlob.c | 1307 ++++++++++++++++++++
kernel/trace/rv/monitors/tlob/tlob.h | 171 +++
kernel/trace/rv/monitors/tlob/tlob_trace.h | 58 +
kernel/trace/rv/rv.c | 38 +
kernel/trace/rv/rv.h | 2 +
kernel/trace/rv/rv_chardev.c | 201 +++
kernel/trace/rv/rv_trace.h | 1 +
kernel/trace/rv/rv_uprobe.c | 46 +-
tools/include/uapi/linux/rv.h | 86 ++
19 files changed, 2400 insertions(+), 10 deletions(-)
create mode 100644 Documentation/trace/rv/monitor_tlob.rst
create mode 100644 include/uapi/linux/rv.h
create mode 100644 kernel/trace/rv/monitors/tlob/Kconfig
create mode 100644 kernel/trace/rv/monitors/tlob/tlob.c
create mode 100644 kernel/trace/rv/monitors/tlob/tlob.h
create mode 100644 kernel/trace/rv/monitors/tlob/tlob_trace.h
create mode 100644 kernel/trace/rv/rv_chardev.c
create mode 100644 tools/include/uapi/linux/rv.h
diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst
index 29769f06bb0f..1501545b5f08 100644
--- a/Documentation/trace/rv/index.rst
+++ b/Documentation/trace/rv/index.rst
@@ -16,5 +16,6 @@ Runtime Verification
monitor_wwnr.rst
monitor_sched.rst
monitor_rtapp.rst
+ monitor_tlob.rst
monitor_stall.rst
monitor_deadline.rst
diff --git a/Documentation/trace/rv/monitor_tlob.rst b/Documentation/trace/rv/monitor_tlob.rst
new file mode 100644
index 000000000000..91b592630b3f
--- /dev/null
+++ b/Documentation/trace/rv/monitor_tlob.rst
@@ -0,0 +1,213 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Monitor tlob
+============
+
+- Name: tlob - task latency over budget
+- Type: per-object hybrid automaton (RV_MON_PER_OBJ)
+- Author: Wen Yang <wen.yang@linux.dev>
+
+Description
+-----------
+
+The tlob monitor tracks per-task elapsed wall-clock time (CLOCK_MONOTONIC,
+spanning running, waiting, and sleeping states) and reports a violation when
+the monitored task exceeds a configurable per-invocation budget threshold.
+
+The monitor implements a three-state hybrid automaton with a single clock
+environment variable ``clk_elapsed``. The clock invariant
+``clk_elapsed < BUDGET_NS()`` is active in all three states; when it is
+violated the HA timer fires and the framework emits ``error_env_tlob``
+then calls ``da_monitor_reset()`` automatically::
+
+ | (initial, via task_start)
+ v
+ +--------------+
+ | running | <-----------+
+ +--------------+ |
+ | | |
+ sleep preempt switch_in
+ | | |
+ v v |
+ +---------+ +---------+ |
+ | sleeping| | waiting | -------+
+ +---------+ +---------+
+ | ^
+ +---wakeup---+
+
+ Key transitions:
+ running --(sleep)------> sleeping (task blocks waiting for a resource)
+ running --(preempt)----> waiting (task preempted, back in runqueue)
+ sleeping --(wakeup)-----> waiting (resource available, enters runqueue)
+ waiting --(switch_in)--> running (scheduler picks task, back on CPU)
+
+ ``task_start`` calls ``da_handle_start_event()`` with the synthetic event
+ ``switch_in_tlob`` to force the initial DA state to ``running`` (since
+ ``switch_in`` transitions waiting→running), then resets ``clk_elapsed`` and
+ arms the budget timer directly via ``ha_reset_clk_ns()`` + ``ha_start_timer_ns()``.
+ ``task_stop`` cancels the HA timer synchronously via
+ ``ha_cancel_timer_sync()`` then calls ``da_monitor_reset()`` directly.
+
+The non-running condition (monitor not yet started or reset after a
+stop/violation) is handled implicitly by the RV framework
+(``da_mon->monitoring == 0``) — it is not an explicit DA state.
+
+Per-task state lives in ``struct tlob_task_state`` which is stored as
+``monitor_target`` in the framework's ``da_monitor_storage``, indexed by
+pid. The per-invocation ``threshold_us`` is read via
+``ha_get_target(ha_mon)->threshold_us`` inside the HA constraint functions,
+following the same pattern as the ``nomiss`` monitor.
+
+Usage
+-----
+
+tracefs interface (uprobe-based external monitoring)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``monitor`` tracefs file instruments an unmodified binary via uprobes.
+The format follows the ftrace ``uprobe_events`` convention (``PATH:OFFSET``
+for the probe location, ``key=value`` for configuration parameters)::
+
+ p PATH:OFFSET_START OFFSET_STOP threshold=US
+
+The uprobe at ``OFFSET_START`` fires ``tlob_start_task()``; the uprobe at
+``OFFSET_STOP`` fires ``tlob_stop_task()``. Both offsets are ELF file
+offsets of entry points in ``PATH``. ``PATH`` may contain ``:``; the last
+``:`` in the ``PATH:OFFSET_START`` token is the separator.
+
+To remove a binding, use ``-PATH:OFFSET_START``::
+
+ echo 1 > /sys/kernel/tracing/rv/monitors/tlob/enable
+
+ echo "p /usr/bin/myapp:0x12a0 0x12f0 threshold=5000" \
+ > /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+ # Remove a binding
+ echo "-/usr/bin/myapp:0x12a0" > /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+ # List registered bindings
+ cat /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+ # Read violations from the trace buffer
+ cat /sys/kernel/tracing/trace
+
+ioctl self-instrumentation (/dev/rv)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``/dev/rv`` is a shared RV character device. Before using any monitor-specific
+ioctl, the fd must be bound to a monitor via ``RV_IOCTL_BIND_MONITOR``. Each
+open fd has independent per-fd monitoring state::
+
+ int fd = open("/dev/rv", O_RDWR);
+
+ /* Bind this fd to the tlob monitor. */
+ struct rv_bind_args bind = { .monitor_name = "tlob" };
+ ioctl(fd, RV_IOCTL_BIND_MONITOR, &bind);
+
+ struct tlob_start_args args = {
+ .threshold_us = 50000, /* 50 ms in microseconds */
+ };
+ ioctl(fd, TLOB_IOCTL_TRACE_START, &args);
+
+ /* ... code path under observation ... */
+
+ int ret = ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ /* ret == 0: within budget */
+ /* ret == -EOVERFLOW: budget exceeded */
+
+ close(fd);
+
+``TRACE_STOP`` returns ``-EOVERFLOW`` whenever the budget was exceeded.
+The HA timer calls ``da_monitor_reset()`` (storage remains); the
+synchronous ``ha_cancel_timer_sync()`` in ``tlob_stop_task()`` ensures the
+callback has completed before checking ``da_monitoring()``.
+
+Violation events
+~~~~~~~~~~~~~~~~
+
+Budget violations are always reported via the ``error_env_tlob`` RV
+tracepoint (HA clock-invariant violation), regardless of which interface
+triggered them::
+
+ cat /sys/kernel/tracing/trace
+
+To capture violations in a file::
+
+ trace-cmd record -e error_env_tlob &
+ # ... run workload ...
+ trace-cmd report
+
+tracefs files
+-------------
+
+The following files are created under
+``/sys/kernel/tracing/rv/monitors/tlob/``:
+
+``enable`` (rw)
+ Write ``1`` to enable the monitor; write ``0`` to disable it.
+
+``desc`` (ro)
+ Human-readable description of the monitor.
+
+``monitor`` (rw)
+ Write ``p PATH:OFFSET_START OFFSET_STOP threshold=US``
+ to bind two entry uprobes. Write ``-PATH:OFFSET_START`` to remove a
+ binding. Read to list registered bindings in the same format.
+
+Kernel API
+----------
+
+.. kernel-doc:: kernel/trace/rv/monitors/tlob/tlob.c
+ :functions: tlob_start_task tlob_stop_task
+
+``tlob_start_task(task, threshold_us)``
+ Begin monitoring *task* with a total latency budget of *threshold_us*
+ microseconds. Allocates per-task state, sets initial DA state to
+ ``running``, resets ``clk_elapsed``, and arms the HA budget timer.
+ Returns 0, -ENODEV (monitor disabled), -ERANGE (zero threshold),
+ -EALREADY (already monitoring), -ENOSPC (at capacity), or -ENOMEM.
+
+``tlob_stop_task(task)``
+ Stop monitoring *task*. Synchronously cancels the HA timer via
+ ``ha_cancel_timer_sync()``, checks ``da_monitoring()`` to determine outcome.
+ Returns 0 (clean stop, within budget), -EOVERFLOW (budget was exceeded),
+ -ESRCH (not monitored), or -EAGAIN (concurrent stop racing).
+
+Design notes
+------------
+
+State transitions are driven by two tracepoints:
+
+- ``sched_switch``: ``prev_state == 0`` (``TASK_RUNNING``, preempted,
+ stays on runqueue) → running→waiting; ``prev_state != 0`` (voluntarily
+ blocked, leaves runqueue) → running→sleeping; ``next`` pointer →
+ waiting→running.
+- ``sched_wakeup``: task moves back onto the runqueue → sleeping→waiting.
+
+No ``waiting → sleeping`` edge exists because a task can only block
+itself while executing on CPU. ``try_to_wake_up()`` is also a no-op
+when ``__state == TASK_RUNNING``, so ``sched_wakeup`` never fires while
+the task is in ``waiting`` state.
+
+Limitations:
+
+- The initial DA state is always ``running``, set by feeding the synthetic
+ event ``switch_in_tlob`` to ``da_handle_start_event()``. Monitoring a non-current
+ task that is already in waiting or sleeping state at call time misclassifies
+ the first interval as ``running_ns``.
+- ``TASK_STOPPED`` and ``TASK_TRACED`` carry ``prev_state != 0`` and are
+ therefore counted as ``sleeping_ns``, indistinguishable from
+ I/O-blocked time.
+- ``sched_wakeup_new`` is not hooked. In practice this is not an issue
+ because ``tlob_start_task`` is always called from a running context.
+
+Specification
+-------------
+
+Graphviz DOT file in tools/verification/models/tlob.dot.
+
+KUnit tests under ``kernel/trace/rv/monitors/tlob/tlob_kunit.c``
+(CONFIG_TLOB_KUNIT_TEST).
+
+User-space integration tests under ``tools/testing/selftests/verification/``
+(requires CONFIG_RV_MON_TLOB=y and root).
diff --git a/include/linux/rv.h b/include/linux/rv.h
index 541ba404926a..1ea91bb3f1c2 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -21,6 +21,13 @@
#include <linux/list.h>
#include <linux/types.h>
+/* Forward declaration: poll_table is only needed by rv_chardev_ops::poll.
+ * Avoid pulling in <linux/poll.h> from rv.h — that header is included by
+ * sched.h, and poll.h → fs.h → rcupdate.h creates a header-ordering cycle
+ * with migrate_disable() on UML/non-SMP targets.
+ */
+struct poll_table_struct;
+
/*
* Deterministic automaton per-object variables.
*/
@@ -158,6 +165,44 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent);
int rv_get_task_monitor_slot(void);
void rv_put_task_monitor_slot(int slot);
+/**
+ * struct rv_chardev_ops - per-monitor callbacks for the /dev/rv chardev
+ *
+ * Monitors that want to expose an ioctl self-instrumentation interface
+ * register an instance of this struct with rv_chardev_register_monitor().
+ *
+ * @owner: Module that owns this ops struct. Set to THIS_MODULE.
+ * The chardev holds a module reference for every bound fd so
+ * the module cannot be unloaded while any fd remains open.
+ * @bind: Called when userspace issues RV_IOCTL_BIND_MONITOR. Should
+ * allocate and return per-fd private data (opaque pointer), or
+ * ERR_PTR(errno) on failure.
+ * @ioctl: Called for every monitor-specific ioctl after binding. @priv
+ * is the pointer returned by @bind.
+ * @poll: Optional. Called from the fd's poll() / epoll_wait() path.
+ * Should call poll_wait(@file, wq, @wait) on the monitor's internal
+ * wait queue and return the current event mask (EPOLLIN | EPOLLRDNORM
+ * when an event is pending, 0 otherwise). If NULL, poll() always
+ * returns 0 (no events).
+ * @release: Called when the fd is closed. Must free @priv.
+ */
+struct rv_chardev_ops {
+ struct module *owner;
+ void *(*bind)(void);
+ long (*ioctl)(void *priv, unsigned int cmd, unsigned long arg);
+ __poll_t (*poll)(void *priv, struct file *file, struct poll_table_struct *wait);
+ void (*release)(void *priv);
+};
+
+int rv_chardev_register_monitor(const char *name,
+ const struct rv_chardev_ops *ops);
+void rv_chardev_unregister_monitor(const char *name);
+
+#if IS_ENABLED(CONFIG_KUNIT)
+void rv_kunit_monitoring_on(void);
+void rv_kunit_monitoring_off(void);
+#endif
+
#ifdef CONFIG_RV_REACTORS
int rv_unregister_reactor(struct rv_reactor *reactor);
int rv_register_reactor(struct rv_reactor *reactor);
diff --git a/include/rv/automata.h b/include/rv/automata.h
index 4a4eb40cf09a..ae819638d85a 100644
--- a/include/rv/automata.h
+++ b/include/rv/automata.h
@@ -41,6 +41,21 @@ static char *model_get_event_name(enum events event)
return RV_AUTOMATON_NAME.event_names[event];
}
+/*
+ * model_get_timer_event_name - label used when the HA timer fires (no event).
+ *
+ * Monitors may define MONITOR_TIMER_EVENT_NAME before including the model
+ * header to give the timer-fired violation a semantically meaningful label
+ * (e.g. "budget_exceeded" for tlob). Defaults to "none".
+ */
+#ifndef MONITOR_TIMER_EVENT_NAME
+#define MONITOR_TIMER_EVENT_NAME "none"
+#endif
+static inline char *model_get_timer_event_name(void)
+{
+ return MONITOR_TIMER_EVENT_NAME;
+}
+
/*
* model_get_initial_state - return the automaton's initial state
*/
diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
index d59507e8cb30..dfc993774089 100644
--- a/include/rv/ha_monitor.h
+++ b/include/rv/ha_monitor.h
@@ -28,6 +28,7 @@ static inline void ha_monitor_init_env(struct da_monitor *da_mon);
static inline void ha_monitor_reset_env(struct da_monitor *da_mon);
static inline void ha_setup_timer(struct ha_monitor *ha_mon);
static inline bool ha_cancel_timer(struct ha_monitor *ha_mon);
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon);
static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
enum states curr_state,
enum events event,
@@ -35,7 +36,10 @@ static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
da_id_type id);
#define da_monitor_event_hook ha_monitor_handle_constraint
#define da_monitor_init_hook ha_monitor_init_env
+/* Allow monitors to override da_monitor_reset_hook before including this header. */
+#ifndef da_monitor_reset_hook
#define da_monitor_reset_hook ha_monitor_reset_env
+#endif
#include <rv/da_monitor.h>
#include <linux/seq_buf.h>
@@ -70,7 +74,7 @@ static void ha_react(enum states curr_state, enum events event, char *env)
rv_react(&rv_this,
"rv: monitor %s does not allow event %s on state %s with env %s\n",
__stringify(MONITOR_NAME),
- event == EVENT_NONE ? EVENT_NONE_LBL : model_get_event_name(event),
+ event == EVENT_NONE ? model_get_timer_event_name() : model_get_event_name(event),
model_get_state_name(curr_state), env);
}
@@ -246,7 +250,7 @@ static inline void __ha_monitor_timer_callback(struct ha_monitor *ha_mon)
ha_get_env_string(&env_string, ha_mon, time_ns);
ha_react(curr_state, EVENT_NONE, env_string.buffer);
ha_trace_error_env(ha_mon, model_get_state_name(curr_state),
- EVENT_NONE_LBL, env_string.buffer,
+ model_get_timer_event_name(), env_string.buffer,
da_get_id(&ha_mon->da_mon));
da_monitor_reset(&ha_mon->da_mon);
@@ -412,6 +416,14 @@ static inline bool ha_cancel_timer(struct ha_monitor *ha_mon)
{
return timer_delete(&ha_mon->timer);
}
+/*
+ * ha_cancel_timer_sync - Cancel the timer, blocking until any running
+ * callback has completed.
+ */
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon)
+{
+ timer_delete_sync(&ha_mon->timer);
+}
#elif HA_TIMER_TYPE == HA_TIMER_HRTIMER
/*
* Helper functions to handle the monitor timer.
@@ -432,12 +444,12 @@ static enum hrtimer_restart ha_monitor_timer_callback(struct hrtimer *hrtimer)
static inline void ha_setup_timer(struct ha_monitor *ha_mon)
{
hrtimer_setup(&ha_mon->hrtimer, ha_monitor_timer_callback,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
}
static inline void ha_start_timer_ns(struct ha_monitor *ha_mon, enum envs env,
u64 expire, u64 time_ns)
{
- int mode = HRTIMER_MODE_REL_HARD;
+ int mode = HRTIMER_MODE_REL_SOFT;
u64 passed = ha_invariant_passed_ns(ha_mon, env, expire, time_ns);
if (RV_MON_TYPE == RV_MON_PER_CPU)
@@ -463,6 +475,18 @@ static inline bool ha_cancel_timer(struct ha_monitor *ha_mon)
{
return hrtimer_try_to_cancel(&ha_mon->hrtimer) == 1;
}
+/*
+ * ha_cancel_timer_sync - Cancel the timer, blocking until any running
+ * callback has completed.
+ *
+ * Use in teardown paths (e.g. stop_task) where the caller must know the
+ * callback has finished before inspecting or freeing monitor state.
+ * Must not be called from atomic context or within the timer callback.
+ */
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon)
+{
+ hrtimer_cancel(&ha_mon->hrtimer);
+}
#else /* HA_TIMER_NONE */
/*
* Start function is intentionally not defined, monitors using timers must
@@ -473,6 +497,7 @@ static inline bool ha_cancel_timer(struct ha_monitor *ha_mon)
{
return false;
}
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon) { }
#endif
#endif
diff --git a/include/rv/rv_uprobe.h b/include/rv/rv_uprobe.h
index 084cdb36a2ff..9106c5c9275e 100644
--- a/include/rv/rv_uprobe.h
+++ b/include/rv/rv_uprobe.h
@@ -79,9 +79,41 @@ struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
* for any in-progress handler to finish, then releases the path reference
* and frees the rv_uprobe struct. The caller's priv data is NOT freed.
*
+ * When removing a single probe, prefer this over the three-phase API.
* Safe to call from process context only (uprobe_unregister_sync() may
* schedule).
*/
void rv_uprobe_detach(struct rv_uprobe *p);
+/**
+ * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting
+ * @p: probe to dequeue; may be NULL (no-op)
+ *
+ * Removes the uprobe from the uprobe subsystem but does NOT wait for
+ * in-flight handlers to complete. The caller must call rv_uprobe_sync()
+ * before calling rv_uprobe_free() on the same probe.
+ *
+ * Use this to batch multiple deregistrations before a single rv_uprobe_sync().
+ */
+void rv_uprobe_unregister_nosync(struct rv_uprobe *p);
+
+/**
+ * rv_uprobe_sync - wait for all in-flight uprobe handlers to complete
+ *
+ * Global barrier: waits for every in-flight uprobe handler across the system
+ * to finish. Call once after a batch of rv_uprobe_unregister_nosync() calls
+ * and before any rv_uprobe_free() call.
+ */
+void rv_uprobe_sync(void);
+
+/**
+ * rv_uprobe_free - release resources of a previously deregistered probe
+ * @p: probe to free; may be NULL (no-op)
+ *
+ * Releases the path reference and frees the rv_uprobe struct. Must only
+ * be called after rv_uprobe_sync() has returned. The caller's priv data
+ * is NOT freed.
+ */
+void rv_uprobe_free(struct rv_uprobe *p);
+
#endif /* _RV_UPROBE_H */
diff --git a/include/uapi/linux/rv.h b/include/uapi/linux/rv.h
new file mode 100644
index 000000000000..a34e5426393b
--- /dev/null
+++ b/include/uapi/linux/rv.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * UAPI definitions for Runtime Verification (RV) monitors.
+ *
+ * All RV monitors that expose an ioctl self-instrumentation interface
+ * share the magic byte RV_IOC_MAGIC ('r').
+ *
+ * Usage examples and design rationale are in:
+ * Documentation/trace/rv/monitor_tlob.rst
+ */
+
+#ifndef _UAPI_LINUX_RV_H
+#define _UAPI_LINUX_RV_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/* Magic byte shared by all RV monitor ioctls. */
+#define RV_IOC_MAGIC 'r'
+
+/* Maximum monitor name length (including NUL terminator). */
+#define RV_MONITOR_NAME_MAX 32
+
+/* Generic /dev/rv ioctls (ioctl number 0 is reserved for the core) */
+
+/**
+ * struct rv_bind_args - arguments for RV_IOCTL_BIND_MONITOR
+ * @monitor_name: NUL-terminated name of the monitor to bind (e.g. "tlob").
+ */
+struct rv_bind_args {
+ char monitor_name[RV_MONITOR_NAME_MAX];
+};
+
+/*
+ * RV_IOCTL_BIND_MONITOR - associate this fd with a specific RV monitor.
+ *
+ * Must be called once after open() and before any monitor-specific ioctl.
+ *
+ * Returns 0 on success.
+ * Returns -EBUSY if this fd is already bound to a monitor.
+ * Returns -ENOENT if the requested monitor is not registered.
+ * Returns -ENOMEM on allocation failure.
+ */
+#define RV_IOCTL_BIND_MONITOR _IOW(RV_IOC_MAGIC, 0, struct rv_bind_args)
+
+/* tlob: task latency over budget monitor (ioctl numbers 1–15) */
+
+/**
+ * struct tlob_start_args - arguments for TLOB_IOCTL_TRACE_START
+ * @threshold_us: Total latency budget for this window, in microseconds.
+ * Must be greater than zero. Both on-CPU and off-CPU time
+ * (including runqueue wait) count toward this budget.
+ */
+struct tlob_start_args {
+ __u64 threshold_us;
+};
+
+/*
+ * TLOB_IOCTL_TRACE_START - begin monitoring the calling task.
+ *
+ * Arms a per-task hrtimer for threshold_us microseconds (CLOCK_MONOTONIC,
+ * so both on-CPU and off-CPU time count toward the budget).
+ *
+ * Returns 0 on success.
+ * Returns -EEXIST if TRACE_START was already called on this fd.
+ * Returns -ENOSPC if TLOB_MAX_MONITORED tasks are already being tracked.
+ * Returns -ENOMEM on allocation failure.
+ * Returns -ENODEV if the tlob monitor is not enabled.
+ * Returns -ERANGE if threshold_us is 0.
+ */
+#define TLOB_IOCTL_TRACE_START _IOW(RV_IOC_MAGIC, 1, struct tlob_start_args)
+
+/*
+ * TLOB_IOCTL_TRACE_STOP - end monitoring the calling task.
+ *
+ * Returns 0 if within budget.
+ * Returns -EOVERFLOW if the latency budget was exceeded.
+ * Returns -EINVAL if TLOB_IOCTL_TRACE_START was not called on this fd.
+ *
+ * poll/epoll: after TRACE_START the fd becomes readable (EPOLLIN) when the
+ * budget is exceeded. The caller may then issue TRACE_STOP to retrieve the
+ * result, or simply close the fd to clean up.
+ */
+#define TLOB_IOCTL_TRACE_STOP _IO(RV_IOC_MAGIC, 2)
+
+#endif /* _UAPI_LINUX_RV_H */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index e2e0033a00b9..1c36939db8e5 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -87,6 +87,8 @@ source "kernel/trace/rv/monitors/deadline/Kconfig"
source "kernel/trace/rv/monitors/nomiss/Kconfig"
# Add new deadline monitors here
+source "kernel/trace/rv/monitors/tlob/Kconfig"
+
# Add new monitors here
config RV_REACTORS
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index f139b904bea3..8a5b5c84aff9 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -2,7 +2,7 @@
ccflags-y += -I $(src) # needed for trace events
-obj-$(CONFIG_RV) += rv.o
+obj-$(CONFIG_RV) += rv.o rv_chardev.o
obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o
@@ -17,6 +17,8 @@ obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o
obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o
obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o
obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
+obj-$(CONFIG_RV_MON_TLOB) += monitors/tlob/tlob.o
+obj-$(CONFIG_TLOB_KUNIT_TEST) += monitors/tlob/tlob_kunit.o
obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
diff --git a/kernel/trace/rv/monitors/tlob/Kconfig b/kernel/trace/rv/monitors/tlob/Kconfig
new file mode 100644
index 000000000000..82e521891496
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/Kconfig
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_TLOB
+ depends on RV
+ select RV_UPROBE
+ select HA_MON_EVENTS_ID
+ bool "tlob monitor"
+ help
+ Enable the tlob (task latency over budget) monitor. This monitor
+ tracks the elapsed time (CLOCK_MONOTONIC) of a marked code path
+ within a task (including both on-CPU and off-CPU time) and reports
+ a violation when the elapsed time exceeds a configurable budget.
+
+ The monitor uses a three-state hybrid automaton (running, waiting,
+ sleeping) stored per object using RV_MON_PER_OBJ. A single HA
+ clock invariant (clk_elapsed < BUDGET_NS) is enforced in all three
+ states via a per-task hrtimer.
+
+ States: running (initial, on-CPU), waiting (in runqueue, off-CPU),
+ sleeping (blocked on resource, off-CPU).
+ Key transitions:
+ running --(sleep)------> sleeping
+ running --(preempt)----> waiting
+ sleeping --(wakeup)-----> waiting
+ waiting --(switch_in)--> running
+ task_start calls da_handle_start_event() to set the initial state,
+ then arms the budget timer directly via ha_reset_clk_ns() +
+ ha_start_timer_ns(). task_stop cancels the timer synchronously via
+ ha_cancel_timer_sync() then calls da_monitor_reset().
+
+ Two userspace interfaces are provided:
+
+ tracefs uprobe binding (external, unmodified binaries):
+	    echo "p PATH:OFFSET_START OFFSET_STOP threshold=US" \
+ > /sys/kernel/tracing/rv/monitors/tlob/monitor
+ The uprobe at offset_start fires tlob_start_task(); the uprobe at
+ offset_stop fires tlob_stop_task(). Both are plain entry uprobes
+ so a mistyped offset cannot corrupt the call stack.
+
+ /dev/rv ioctl (in-process self-instrumentation):
+ ioctl(fd, TLOB_IOCTL_TRACE_START, &args);
+ do_critical_work();
+ ret = ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ /* ret == -EOVERFLOW when budget exceeded */
+ Allows conditional monitoring, sub-function granularity, and
+ inline reaction to violations without polling the trace buffer.
+
+ Up to TLOB_MAX_MONITORED tasks may be monitored simultaneously.
+
+ Violations are always reported via the standard error_env_tlob RV
+ tracepoint regardless of which interface triggered them. The
+ tracefs interface requires only tracefs write permissions, avoiding
+ the CAP_BPF privilege needed for equivalent eBPF-based approaches.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_tlob.rst
+
+config TLOB_KUNIT_TEST
+ tristate "KUnit tests for tlob monitor" if !KUNIT_ALL_TESTS
+ depends on RV_MON_TLOB && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ Enable KUnit in-kernel unit tests for the tlob RV monitor.
+
+ Tests cover automaton state transitions, the start/stop task
+ interface, scheduler context-switch accounting, and the uprobe
+ format string parser.
+
+ Say Y or M here to run the tlob KUnit test suite; otherwise say N.
diff --git a/kernel/trace/rv/monitors/tlob/tlob.c b/kernel/trace/rv/monitors/tlob/tlob.c
new file mode 100644
index 000000000000..475e972ae9aa
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob.c
@@ -0,0 +1,1307 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob: task latency over budget monitor
+ *
+ * Track the elapsed wall-clock time of a marked code path and detect when
+ * a monitored task exceeds its per-task latency budget. CLOCK_MONOTONIC
+ * is used so both on-CPU and off-CPU time count toward the budget.
+ *
+ * On a budget violation, two tracepoints are emitted from the hrtimer
+ * callback: error_env_tlob signals the violation, and detail_env_tlob
+ * provides a per-state time breakdown (running_ns, waiting_ns, sleeping_ns)
+ * that pinpoints whether the overrun occurred in running, waiting, or sleeping state.
+ *
+ * The monitor uses RV_MON_PER_OBJ: per-task state (struct tlob_task_state)
+ * is stored as monitor_target in the framework's hash table.
+ *
+ * One HA clock invariant is enforced:
+ * clk_elapsed < BUDGET_NS() (active in all states)
+ *
+ * task_start uses da_handle_start_event() to set the initial state, then
+ * calls ha_reset_clk_ns() + ha_start_timer_ns() directly to initialise the
+ * clock and arm the budget timer. No synthetic event is needed.
+ * The HA timer is cancelled synchronously by ha_cancel_timer_sync() in
+ * tlob_stop_task().
+ *
+ * Copyright (C) 2026 Wen Yang <wen.yang@linux.dev>
+ */
+#include <linux/completion.h>
+#include <linux/hrtimer.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/refcount.h>
+#include <linux/rv.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <kunit/visibility.h>
+#include <rv/instrumentation.h>
+#include <rv/rv_uprobe.h>
+#include <uapi/linux/rv.h>
+#include "../../rv.h"
+
+#define MODULE_NAME "tlob"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+
+/*
+ * Per-fd private data; one instance per open /dev/rv fd.
+ * monitoring: set while TRACE_START is active; cleared at TRACE_STOP.
+ * budget_exceeded: set by hrtimer callback; read at TRACE_STOP to report
+ * -EOVERFLOW even when cleanup was claimed by a concurrent stop_all or
+ * a task-exit handler.
+ */
+struct tlob_fpriv {
+ struct task_struct *task;
+ bool monitoring;
+ bool budget_exceeded;
+};
+
+/*
+ * Per-task latency monitoring state. One instance per monitoring window.
+ * Stored as monitor_target in da_monitor_storage; freed via call_rcu.
+ */
+struct tlob_task_state {
+ struct task_struct *task; /* via get_task_struct */
+ u64 threshold_us; /* budget in microseconds */
+
+ /* 1 = cleanup claimed; ha_setup_invariants won't restart the timer. */
+ atomic_t stopping;
+
+ /* Serialises the ns accumulators; held briefly (hardirq-safe). */
+ raw_spinlock_t entry_lock;
+ u64 running_ns; /* time in running state */
+ u64 waiting_ns; /* time in waiting state */
+ u64 sleeping_ns; /* time in sleeping state */
+ ktime_t last_ts;
+
+ /* store-release in TRACE_START ioctl, load-acquire in reset_notify. */
+ struct tlob_fpriv *fpriv;
+
+ struct rcu_head rcu; /* for call_rcu() teardown */
+};
+
+#define RV_MON_TYPE RV_MON_PER_OBJ
+#define HA_TIMER_TYPE HA_TIMER_HRTIMER
+/* Pool mode: da_handle_start_event uses da_fill_empty_storage, not kmalloc. */
+#define DA_SKIP_AUTO_ALLOC
+
+/* Type for da_monitor_storage.target; must be defined before the includes. */
+typedef struct tlob_task_state *monitor_target;
+
+/* Forward-declared so da_monitor_reset_hook works before ha_monitor.h. */
+static inline void tlob_reset_notify(struct da_monitor *da_mon);
+#define da_monitor_reset_hook tlob_reset_notify
+
+/*
+ * When the hrtimer fires (budget elapsed), the HA framework emits
+ * error_env_tlob with this label instead of the generic "none".
+ */
+#define MONITOR_TIMER_EVENT_NAME "budget_exceeded"
+
+#include "tlob.h"
+#include <rv/ha_monitor.h>
+
+/*
+ * Called from da_monitor_reset() on both normal stop and hrtimer expiry.
+ * On violation (stopping==0), emits detail_env_tlob.
+ */
+static inline void tlob_reset_notify(struct da_monitor *da_mon)
+{
+ struct ha_monitor *ha_mon = to_ha_monitor(da_mon);
+ struct tlob_task_state *ws;
+
+ ha_monitor_reset_env(da_mon);
+
+ ws = ha_get_target(ha_mon);
+ if (!ws)
+ return;
+
+ /*
+ * Emit per-state breakdown on budget violation only.
+ * stopping==0: timer callback owns this path (genuine overrun).
+ * stopping==1: normal stop claimed ownership first; skip.
+ */
+ if (!atomic_read(&ws->stopping)) {
+ unsigned int curr_state = READ_ONCE(da_mon->curr_state);
+ u64 running_ns, waiting_ns, sleeping_ns, partial_ns;
+ struct tlob_fpriv *fp;
+ unsigned long flags;
+
+ /*
+ * Snapshot accumulators; partial_ns covers curr_state time
+ * not yet folded in (transition-out pending).
+ */
+ raw_spin_lock_irqsave(&ws->entry_lock, flags);
+ partial_ns = ktime_get_ns() - ktime_to_ns(ws->last_ts);
+ running_ns = ws->running_ns +
+ (curr_state == running_tlob ? partial_ns : 0);
+ waiting_ns = ws->waiting_ns +
+ (curr_state == waiting_tlob ? partial_ns : 0);
+ sleeping_ns = ws->sleeping_ns +
+ (curr_state == sleeping_tlob ? partial_ns : 0);
+ raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+
+ trace_detail_env_tlob(da_get_id(da_mon), ws->threshold_us,
+ running_ns, waiting_ns, sleeping_ns);
+
+ /*
+ * Latch violation in the fd so TRACE_STOP can return -EOVERFLOW
+ * even if a concurrent stop_all or task-exit handler claims
+ * cleanup first. Pairs with smp_store_release in TRACE_START.
+ */
+ fp = smp_load_acquire(&ws->fpriv);
+ if (fp)
+ WRITE_ONCE(fp->budget_exceeded, true);
+ }
+}
+
+#define BUDGET_US(ha_mon) (ha_get_target(ha_mon)->threshold_us)
+#define BUDGET_NS(ha_mon) (BUDGET_US(ha_mon) * 1000ULL)
+
+/* HA constraint functions (called by ha_monitor_handle_constraint) */
+
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_tlob env, u64 time_ns)
+{
+ if (env == clk_elapsed_tlob)
+ return ha_get_clk_ns(ha_mon, env, time_ns);
+ return ENV_INVALID_VALUE;
+}
+
+static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_tlob env, u64 time_ns)
+{
+ if (env == clk_elapsed_tlob)
+ ha_reset_clk_ns(ha_mon, env, time_ns);
+}
+
+/*
+ * ha_verify_invariants - clk_elapsed < BUDGET_NS must hold in all states.
+ */
+static inline bool ha_verify_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == running_tlob)
+ return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns);
+ else if (curr_state == sleeping_tlob)
+ return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns);
+ else if (curr_state == waiting_tlob)
+ return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns);
+ return true;
+}
+
+/*
+ * Convert invariant (deadline) to guard (reset anchor) on state transitions.
+ * Skip if uninitialised (ENV_INVALID_VALUE): the race between
+ * da_handle_start_event() and ha_reset_clk_ns() would give U64_MAX - BUDGET_NS.
+ */
+static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == next_state)
+ return;
+ if (curr_state == running_tlob &&
+ !ha_monitor_env_invalid(ha_mon, clk_elapsed_tlob))
+ ha_inv_to_guard(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+ else if (curr_state == sleeping_tlob &&
+ !ha_monitor_env_invalid(ha_mon, clk_elapsed_tlob))
+ ha_inv_to_guard(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+ else if (curr_state == waiting_tlob &&
+ !ha_monitor_env_invalid(ha_mon, clk_elapsed_tlob))
+ ha_inv_to_guard(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+}
+
+/* No per-event guard conditions for tlob; invariants suffice. */
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ return true;
+}
+
+/*
+ * Arm or cancel the HA budget timer on state transitions.
+ * Guard on stopping: sched_switch events can arrive after ha_cancel_timer_sync,
+ * restarting the timer and triggering an ODEBUG "activate active" splat.
+ */
+static inline void ha_setup_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (next_state == curr_state)
+ return;
+ if (next_state == running_tlob) {
+ if (!atomic_read_acquire(&ha_get_target(ha_mon)->stopping))
+ ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+ } else if (next_state == sleeping_tlob) {
+ if (!atomic_read_acquire(&ha_get_target(ha_mon)->stopping))
+ ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+ } else if (next_state == waiting_tlob) {
+ if (!atomic_read_acquire(&ha_get_target(ha_mon)->stopping))
+ ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+ } else if (curr_state == running_tlob)
+ ha_cancel_timer(ha_mon);
+ else if (curr_state == waiting_tlob)
+ ha_cancel_timer(ha_mon);
+ else if (curr_state == sleeping_tlob)
+ ha_cancel_timer(ha_mon);
+}
+
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns);
+
+ if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+ return false;
+
+ ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns);
+
+ return true;
+}
+
+static struct kmem_cache *tlob_state_cache;
+
+static atomic_t tlob_num_monitored = ATOMIC_INIT(0);
+
+/* Uprobe binding list; protected by tlob_uprobe_mutex. */
+static LIST_HEAD(tlob_uprobe_list);
+static DEFINE_MUTEX(tlob_uprobe_mutex);
+
+/*
+ * Serialises duplicate-check + da_create_or_get() to prevent two concurrent
+ * callers for the same pid from both inserting into the hash table.
+ */
+static DEFINE_MUTEX(tlob_start_mutex);
+
+/*
+ * Counts open /dev/rv fds plus one synthetic ref held while enabled.
+ * __tlob_destroy_monitor() drops the synthetic ref and waits for zero
+ * before teardown, preventing kmem_cache_zalloc() on a destroyed cache.
+ */
+static refcount_t tlob_fd_refcount = REFCOUNT_INIT(0);
+static DECLARE_COMPLETION(tlob_fd_released);
+
+/* Per-uprobe-binding state: a start + stop probe pair for one binary region. */
+struct tlob_uprobe_binding {
+ struct list_head list;
+ u64 threshold_us;
+ char binpath[TLOB_MAX_PATH];
+ loff_t offset_start;
+ loff_t offset_stop;
+ struct rv_uprobe *start_probe;
+ struct rv_uprobe *stop_probe;
+};
+
+/* RCU callback: free the slab once no readers remain. */
+static void tlob_free_rcu(struct rcu_head *head)
+{
+ struct tlob_task_state *ws =
+ container_of(head, struct tlob_task_state, rcu);
+ kmem_cache_free(tlob_state_cache, ws);
+}
+
+/*
+ * handle_sched_switch - advance the DA on every context switch.
+ *
+ * Generates three DA events:
+ * prev, prev_state != 0 -> sleep_tlob (running -> sleeping)
+ * prev, prev_state == 0 -> preempt_tlob (running -> waiting)
+ * next -> switch_in_tlob (waiting -> running)
+ */
+static void handle_sched_switch(void *data, bool preempt_unused,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ struct tlob_task_state *ws;
+ unsigned long flags;
+ bool do_prev = false, do_next = false;
+ bool prev_preempted;
+ ktime_t now;
+
+ rcu_read_lock();
+
+ ws = da_get_target_by_id(prev->pid);
+ if (ws) {
+ raw_spin_lock_irqsave(&ws->entry_lock, flags);
+ now = ktime_get();
+ ws->running_ns += ktime_to_ns(ktime_sub(now, ws->last_ts));
+ ws->last_ts = now;
+ /* prev_state == 0: TASK_RUNNING (preempted); != 0: sleeping. */
+ prev_preempted = (prev_state == 0);
+ do_prev = true;
+ raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+ }
+
+ ws = da_get_target_by_id(next->pid);
+ if (ws) {
+ raw_spin_lock_irqsave(&ws->entry_lock, flags);
+ now = ktime_get();
+ ws->waiting_ns += ktime_to_ns(ktime_sub(now, ws->last_ts));
+ ws->last_ts = now;
+ do_next = true;
+ raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+ }
+
+ rcu_read_unlock();
+
+ if (do_prev)
+ da_handle_event(prev->pid, NULL,
+ prev_preempted ? preempt_tlob : sleep_tlob);
+ if (do_next)
+ da_handle_event(next->pid, NULL, switch_in_tlob);
+}
+
+/*
+ * handle_sched_wakeup - sleeping -> waiting transition.
+ *
+ * try_to_wake_up() skips TASK_RUNNING tasks, so this never fires for a
+ * task already in running or waiting state.
+ */
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+ struct tlob_task_state *ws;
+ unsigned long flags;
+ bool found = false;
+
+ rcu_read_lock();
+ ws = da_get_target_by_id(p->pid);
+ if (ws) {
+ ktime_t now = ktime_get();
+
+ raw_spin_lock_irqsave(&ws->entry_lock, flags);
+ ws->sleeping_ns += ktime_to_ns(ktime_sub(now, ws->last_ts));
+ ws->last_ts = now;
+ raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+ found = true;
+ }
+ rcu_read_unlock();
+
+ if (found)
+ da_handle_event(p->pid, NULL, wakeup_tlob);
+}
+
+/*
+ * handle_sched_process_exit - clean up if a task exits without TRACE_STOP.
+ *
+ * Called in do_exit() context; the task still has a valid pid here.
+ */
+static void handle_sched_process_exit(void *data, struct task_struct *p,
+ bool group_dead)
+{
+ struct tlob_task_state *ws;
+ bool found = false;
+
+ rcu_read_lock();
+ ws = da_get_target_by_id(p->pid);
+ found = !!ws;
+ rcu_read_unlock();
+
+ if (found)
+ tlob_stop_task(p);
+}
+
+
+
+/**
+ * tlob_start_task - begin monitoring @task with budget @threshold_us us.
+ * @task: Task to monitor; may be current or another task.
+ * @threshold_us: Latency budget in microseconds (wall-clock; running + waiting + sleeping). > 0.
+ *
+ * Returns 0, -ENODEV, -ERANGE, -EALREADY, -ENOSPC, or -ENOMEM.
+ */
+int tlob_start_task(struct task_struct *task, u64 threshold_us)
+{
+ struct tlob_task_state *ws_existing;
+ struct tlob_task_state *ws;
+ struct da_monitor *da_mon;
+ struct ha_monitor *ha_mon;
+ u64 now_ns;
+ int ret;
+
+ if (!da_monitor_enabled())
+ return -ENODEV;
+
+ if (threshold_us == 0)
+ return -ERANGE;
+
+ /* Serialise duplicate-check + da_create_or_get for the same pid. */
+ guard(mutex)(&tlob_start_mutex);
+
+ rcu_read_lock();
+ ws_existing = da_get_target_by_id(task->pid);
+ if (ws_existing) {
+ rcu_read_unlock();
+ return -EALREADY;
+ }
+ rcu_read_unlock();
+
+ ws = kmem_cache_zalloc(tlob_state_cache, GFP_KERNEL);
+ if (!ws)
+ return -ENOMEM;
+
+ ws->task = task;
+ get_task_struct(task);
+ ws->threshold_us = threshold_us;
+ ws->last_ts = ktime_get();
+ raw_spin_lock_init(&ws->entry_lock);
+
+ /* Claim a pool slot (no kmalloc; DA_SKIP_AUTO_ALLOC + prealloc). */
+ ret = da_create_or_get(task->pid, ws);
+ if (ret) {
+ put_task_struct(task);
+ kmem_cache_free(tlob_state_cache, ws);
+ return ret;
+ }
+
+ atomic_inc(&tlob_num_monitored);
+
+ /* Hold RCU across handle + timer setup to keep da_mon valid. */
+ rcu_read_lock();
+ da_handle_start_event(task->pid, ws, switch_in_tlob);
+ da_mon = da_get_monitor(task->pid, NULL);
+ if (unlikely(!da_mon)) {
+ /* Slot registered; missing da_mon means concurrent destroy. */
+ rcu_read_unlock();
+ da_destroy_storage(task->pid);
+ atomic_dec(&tlob_num_monitored);
+ put_task_struct(task);
+ kmem_cache_free(tlob_state_cache, ws);
+ return -ENOMEM;
+ }
+ ha_mon = to_ha_monitor(da_mon);
+ now_ns = ktime_get_ns();
+ ha_reset_env(ha_mon, clk_elapsed_tlob, now_ns);
+ ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), now_ns);
+ rcu_read_unlock();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tlob_start_task);
+
+/**
+ * tlob_stop_task - stop monitoring @task.
+ * @task: Task to stop.
+ *
+ * CAS on ws->stopping (0->1) under RCU claims cleanup ownership;
+ * the winner cancels the timer synchronously and frees all resources.
+ *
+ * Returns 0, -EOVERFLOW (budget exceeded), -ESRCH (not monitored),
+ * or -EAGAIN (concurrent caller claimed cleanup).
+ */
+int tlob_stop_task(struct task_struct *task)
+{
+ struct da_monitor *da_mon;
+ struct ha_monitor *ha_mon;
+ struct tlob_task_state *ws;
+ bool budget_exceeded;
+
+ rcu_read_lock();
+ ws = da_get_target_by_id(task->pid);
+ if (!ws) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ da_mon = da_get_monitor(task->pid, NULL);
+ if (unlikely(!da_mon)) {
+ /* ws in hash but da_mon gone; internal inconsistency. */
+ rcu_read_unlock();
+ WARN_ON_ONCE(1);
+ return -ESRCH;
+ }
+
+ ha_mon = to_ha_monitor(da_mon);
+
+ /*
+ * CAS (0->1) claims cleanup ownership under RCU (ws guaranteed valid).
+ * _release pairs with atomic_read_acquire in ha_setup_invariants.
+ */
+ if (atomic_cmpxchg_release(&ws->stopping, 0, 1) != 0) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
+
+ rcu_read_unlock();
+
+ /* Wait for in-flight timer callback before reading da_monitoring. */
+ ha_cancel_timer_sync(ha_mon);
+
+ /* Timer fired first -> budget exceeded; otherwise reset normally. */
+ rcu_read_lock();
+ budget_exceeded = !da_monitoring(da_mon);
+ if (!budget_exceeded)
+ da_monitor_reset(da_mon);
+ rcu_read_unlock();
+ da_destroy_storage(task->pid);
+ atomic_dec(&tlob_num_monitored);
+
+ put_task_struct(ws->task);
+ call_rcu(&ws->rcu, tlob_free_rcu);
+ return budget_exceeded ? -EOVERFLOW : 0;
+}
+EXPORT_SYMBOL_GPL(tlob_stop_task);
+
+static void tlob_stop_all(void)
+{
+ struct da_monitor_storage *ms;
+ pid_t pids[TLOB_MAX_MONITORED];
+ int bkt, n = 0;
+
+ /* Snapshot pids under RCU; re-derive ws under a fresh lock below. */
+ rcu_read_lock();
+ hash_for_each_rcu(da_monitor_ht, bkt, ms, node) {
+ if (ms->target && n < TLOB_MAX_MONITORED)
+ pids[n++] = ms->id;
+ }
+ rcu_read_unlock();
+
+ for (int i = 0; i < n; i++) {
+ pid_t pid = pids[i];
+ struct da_monitor *da_mon;
+ struct ha_monitor *ha_mon;
+ struct tlob_task_state *ws;
+
+ rcu_read_lock();
+ da_mon = da_get_monitor(pid, NULL);
+ if (!da_mon) {
+ /* Cleaned up by tlob_stop_task or exit handler. */
+ rcu_read_unlock();
+ continue;
+ }
+
+ ws = da_get_target(da_mon);
+ ha_mon = to_ha_monitor(da_mon);
+
+ /* CAS (0->1) claims ownership; skip if another caller won. */
+ if (atomic_cmpxchg_release(&ws->stopping, 0, 1) != 0) {
+ rcu_read_unlock();
+ continue;
+ }
+ rcu_read_unlock();
+
+ ha_cancel_timer_sync(ha_mon);
+
+ scoped_guard(rcu) {
+ da_monitor_reset(da_mon);
+ }
+ da_destroy_storage(pid);
+ atomic_dec(&tlob_num_monitored);
+ put_task_struct(ws->task);
+ call_rcu(&ws->rcu, tlob_free_rcu);
+ }
+}
+
+static int tlob_uprobe_entry_handler(struct rv_uprobe *p, struct pt_regs *regs,
+ __u64 *data)
+{
+ struct tlob_uprobe_binding *b = p->priv;
+
+ tlob_start_task(current, b->threshold_us);
+ return 0;
+}
+
+static int tlob_uprobe_stop_handler(struct rv_uprobe *p, struct pt_regs *regs,
+ __u64 *data)
+{
+ tlob_stop_task(current);
+ return 0;
+}
+
+/*
+ * Register start + stop entry uprobes for a binding.
+ * Called with tlob_uprobe_mutex held.
+ */
+static int tlob_add_uprobe(u64 threshold_us, const char *binpath,
+ loff_t offset_start, loff_t offset_stop)
+{
+ struct tlob_uprobe_binding *b, *tmp_b;
+ char pathbuf[TLOB_MAX_PATH];
+ struct path path;
+ char *canon;
+ int ret;
+
+ if (binpath[0] != '/')
+ return -EINVAL;
+
+ b = kzalloc_obj(*b, GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ b->threshold_us = threshold_us;
+ b->offset_start = offset_start;
+ b->offset_stop = offset_stop;
+
+ ret = kern_path(binpath, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto err_free;
+
+ if (!d_is_reg(path.dentry)) {
+ ret = -EINVAL;
+ goto err_path;
+ }
+
+ /* Reject duplicate start offset for the same binary. */
+ list_for_each_entry(tmp_b, &tlob_uprobe_list, list) {
+ if (tmp_b->offset_start == offset_start &&
+ tmp_b->start_probe->path.dentry == path.dentry) {
+ ret = -EEXIST;
+ goto err_path;
+ }
+ }
+
+ canon = d_path(&path, pathbuf, sizeof(pathbuf));
+ if (IS_ERR(canon)) {
+ ret = PTR_ERR(canon);
+ goto err_path;
+ }
+ strscpy(b->binpath, canon, sizeof(b->binpath));
+
+ /* Both probes share b (priv) and path; attach_path refs path itself. */
+ b->start_probe = rv_uprobe_attach_path(&path, offset_start,
+ tlob_uprobe_entry_handler, NULL, b);
+ if (IS_ERR(b->start_probe)) {
+ ret = PTR_ERR(b->start_probe);
+ b->start_probe = NULL;
+ goto err_path;
+ }
+
+ b->stop_probe = rv_uprobe_attach_path(&path, offset_stop,
+ tlob_uprobe_stop_handler, NULL, b);
+ if (IS_ERR(b->stop_probe)) {
+ ret = PTR_ERR(b->stop_probe);
+ b->stop_probe = NULL;
+ goto err_start;
+ }
+
+ path_put(&path);
+ list_add_tail(&b->list, &tlob_uprobe_list);
+ return 0;
+
+err_start:
+ rv_uprobe_detach(b->start_probe);
+err_path:
+ path_put(&path);
+err_free:
+ kfree(b);
+ return ret;
+}
+
+static int tlob_remove_uprobe_by_key(loff_t offset_start, const char *binpath)
+{
+ struct tlob_uprobe_binding *b, *tmp;
+ struct path remove_path;
+ int ret;
+
+ ret = kern_path(binpath, LOOKUP_FOLLOW, &remove_path);
+ if (ret)
+ return ret;
+
+ ret = -ENOENT;
+ list_for_each_entry_safe(b, tmp, &tlob_uprobe_list, list) {
+ if (b->offset_start != offset_start)
+ continue;
+ if (b->start_probe->path.dentry != remove_path.dentry)
+ continue;
+ list_del(&b->list);
+ rv_uprobe_detach(b->start_probe);
+ rv_uprobe_detach(b->stop_probe);
+ kfree(b);
+ ret = 0;
+ break;
+ }
+
+ path_put(&remove_path);
+ return ret;
+}
+
+static void tlob_remove_all_uprobes(void)
+{
+ struct tlob_uprobe_binding *b, *tmp;
+ LIST_HEAD(pending);
+
+ mutex_lock(&tlob_uprobe_mutex);
+ list_for_each_entry_safe(b, tmp, &tlob_uprobe_list, list) {
+ list_move(&b->list, &pending);
+ rv_uprobe_unregister_nosync(b->start_probe);
+ rv_uprobe_unregister_nosync(b->stop_probe);
+ }
+ mutex_unlock(&tlob_uprobe_mutex);
+
+ if (list_empty(&pending))
+ return;
+
+ /*
+ * One global barrier for all probes dequeued above; no new handlers
+ * for any of them can fire after this returns.
+ */
+ rv_uprobe_sync();
+
+ list_for_each_entry_safe(b, tmp, &pending, list) {
+ rv_uprobe_free(b->start_probe);
+ rv_uprobe_free(b->stop_probe);
+ kfree(b);
+ }
+}
+
+static ssize_t tlob_monitor_read(struct file *file,
+ char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ const int line_sz = TLOB_MAX_PATH + 128;
+ struct tlob_uprobe_binding *b;
+ char *buf, *p;
+ int n = 0, buf_sz, pos = 0;
+ ssize_t ret;
+
+ mutex_lock(&tlob_uprobe_mutex);
+ list_for_each_entry(b, &tlob_uprobe_list, list)
+ n++;
+
+ buf_sz = (n ? n : 1) * line_sz + 1;
+ buf = kmalloc(buf_sz, GFP_KERNEL);
+ if (!buf) {
+ mutex_unlock(&tlob_uprobe_mutex);
+ return -ENOMEM;
+ }
+
+ list_for_each_entry(b, &tlob_uprobe_list, list) {
+ p = b->binpath;
+ pos += scnprintf(buf + pos, buf_sz - pos,
+ "p %s:0x%llx 0x%llx threshold=%llu\n",
+ p,
+ (unsigned long long)b->offset_start,
+ (unsigned long long)b->offset_stop,
+ b->threshold_us);
+ }
+ mutex_unlock(&tlob_uprobe_mutex);
+
+ ret = simple_read_from_buffer(ubuf, count, ppos, buf, pos);
+ kfree(buf);
+ return ret;
+}
+
+/*
+ * Parse "p PATH:OFFSET_START OFFSET_STOP threshold=US".
+ * PATH may contain ':'; the last ':' separates path from offset.
+ * Returns 0 or -EINVAL.
+ */
+static int tlob_parse_uprobe_line(char *buf, u64 *thr_out,
+ char **path_out,
+ loff_t *start_out, loff_t *stop_out)
+{
+ unsigned long long thr = 0, stop_val = 0;
+ long long start_val;
+ char *p, *path_token, *token, *colon;
+ bool got_stop = false, got_thr = false;
+ int n;
+
+ /* Must start with "p " */
+ if (buf[0] != 'p' || buf[1] != ' ')
+ return -EINVAL;
+
+ p = buf + 2;
+ while (*p == ' ')
+ p++;
+
+ /* First space-delimited token is PATH:OFFSET_START */
+ path_token = strsep(&p, " \t");
+ if (!path_token || !*path_token)
+ return -EINVAL;
+
+ /* Split at last ':' to handle paths that contain ':'. */
+ colon = strrchr(path_token, ':');
+ if (!colon || colon - path_token < 2)
+ return -EINVAL;
+ *colon = '\0';
+
+ if (path_token[0] != '/')
+ return -EINVAL;
+
+ n = 0;
+ if (sscanf(colon + 1, "%lli%n", &start_val, &n) != 1 || n == 0)
+ return -EINVAL;
+ if (start_val < 0)
+ return -EINVAL;
+
+ /* Remaining tokens: OFFSET_STOP threshold=US */
+ while (p && (token = strsep(&p, " \t")) != NULL) {
+ if (!*token)
+ continue;
+ if (strncmp(token, "threshold=", 10) == 0) {
+ if (kstrtoull(token + 10, 0, &thr))
+ return -EINVAL;
+ got_thr = true;
+ } else if (!got_stop) {
+ long long sv;
+
+ n = 0;
+ if (sscanf(token, "%lli%n", &sv, &n) != 1 || n == 0)
+ return -EINVAL;
+ if (sv < 0)
+ return -EINVAL;
+ stop_val = (unsigned long long)sv;
+ got_stop = true;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ if (!got_stop || !got_thr || thr == 0)
+ return -EINVAL;
+ if (start_val == (long long)stop_val)
+ return -EINVAL;
+
+ *thr_out = thr;
+ *path_out = path_token;
+ *start_out = (loff_t)start_val;
+ *stop_out = (loff_t)stop_val;
+ return 0;
+}
+
+/* Parse "-PATH:OFFSET_START" (ftrace uprobe_events removal convention). */
+static int tlob_parse_remove_line(char *buf, char **path_out, loff_t *start_out)
+{
+	char *binpath, *colon;
+	long long off;
+	int n = 0;
+
+	if (buf[0] != '-')
+		return -EINVAL;
+	binpath = buf + 1;
+	if (binpath[0] != '/')
+		return -EINVAL;
+	colon = strrchr(binpath, ':');
+	if (!colon || colon - binpath < 2)
+		return -EINVAL;
+	*colon = '\0';
+	/*
+	 * %n + NUL check rejects trailing garbage after the offset,
+	 * mirroring the validation in tlob_parse_uprobe_line().
+	 */
+	if (sscanf(colon + 1, "%lli%n", &off, &n) != 1 || n == 0 ||
+	    colon[1 + n] != '\0')
+		return -EINVAL;
+	/* The add parser rejects negative offsets; be consistent here. */
+	if (off < 0)
+		return -EINVAL;
+	*path_out = binpath;
+	*start_out = (loff_t)off;
+	return 0;
+}
+
+/**
+ * tlob_create_or_delete_uprobe - process one command line from the
+ *				  "monitor" tracefs file
+ * @buf: NUL-terminated, newline-stripped line written by user space
+ *
+ * A leading '-' removes the uprobe pair keyed by (path, offset_start),
+ * following the ftrace uprobe_events removal convention; otherwise the
+ * line is parsed as "p PATH:OFFSET_START OFFSET_STOP threshold=US" and
+ * a new pair is added.  Registry mutation runs under tlob_uprobe_mutex.
+ *
+ * Returns 0 on success or a negative errno from parsing or add/remove.
+ */
+VISIBLE_IF_KUNIT int tlob_create_or_delete_uprobe(char *buf)
+{
+	loff_t offset_start, offset_stop;
+	u64 threshold_us;
+	char *binpath;
+	int ret;
+
+	if (buf[0] == '-') {
+		ret = tlob_parse_remove_line(buf, &binpath, &offset_start);
+		if (ret)
+			return ret;
+		mutex_lock(&tlob_uprobe_mutex);
+		ret = tlob_remove_uprobe_by_key(offset_start, binpath);
+		mutex_unlock(&tlob_uprobe_mutex);
+		return ret;
+	}
+	ret = tlob_parse_uprobe_line(buf, &threshold_us, &binpath,
+				     &offset_start, &offset_stop);
+	if (ret)
+		return ret;
+	mutex_lock(&tlob_uprobe_mutex);
+	ret = tlob_add_uprobe(threshold_us, binpath, offset_start, offset_stop);
+	mutex_unlock(&tlob_uprobe_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_create_or_delete_uprobe);
+
+/*
+ * Write handler for the "monitor" tracefs file: one command per write().
+ * The stack buffer bounds the command at TLOB_MAX_PATH plus slack for
+ * the offsets and threshold= token; oversized writes fail with -EINVAL
+ * instead of being split.  A single trailing newline (as produced by
+ * `echo`) is stripped before parsing.
+ */
+static ssize_t tlob_monitor_write(struct file *file,
+				  const char __user *ubuf,
+				  size_t count, loff_t *ppos)
+{
+	char buf[TLOB_MAX_PATH + 128];
+
+	if (count >= sizeof(buf))
+		return -EINVAL;
+	if (copy_from_user(buf, ubuf, count))
+		return -EFAULT;
+	buf[count] = '\0';
+	if (count > 0 && buf[count - 1] == '\n')
+		buf[count - 1] = '\0';
+	/* On success report the whole write as consumed. */
+	return tlob_create_or_delete_uprobe(buf) ?: (ssize_t)count;
+}
+
+static const struct file_operations tlob_monitor_fops = {
+ .open = simple_open,
+ .read = tlob_monitor_read,
+ .write = tlob_monitor_write,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Allocate per-monitor resources and mark the monitor enabled.
+ *
+ * Order matters: the task-state cache and the preallocated da_monitor
+ * pool must exist before rv_this.enabled is set, since start paths
+ * allocate from both.  The synthetic tlob_fd_refcount reference taken
+ * here is dropped by __tlob_destroy_monitor(), which then waits for
+ * every open fd to release its own pin.
+ *
+ * NOTE(review): callers appear to serialize via rv_interface_lock (the
+ * KUnit wrapper takes it explicitly); confirm the rv enable path holds
+ * it as well.
+ */
+static int __tlob_init_monitor(void)
+{
+	int retval;
+
+	tlob_state_cache = kmem_cache_create("tlob_task_state",
+					     sizeof(struct tlob_task_state),
+					     0, 0, NULL);
+	if (!tlob_state_cache)
+		return -ENOMEM;
+
+	atomic_set(&tlob_num_monitored, 0);
+
+	retval = da_monitor_init_prealloc(TLOB_MAX_MONITORED);
+	if (retval) {
+		kmem_cache_destroy(tlob_state_cache);
+		tlob_state_cache = NULL;
+		return retval;
+	}
+
+	/* Synthetic reference: held while the monitor is enabled. */
+	reinit_completion(&tlob_fd_released);
+	refcount_set(&tlob_fd_refcount, 1);
+
+	rv_this.enabled = 1;
+	return 0;
+}
+
+/*
+ * Tear down in reverse dependency order of __tlob_init_monitor():
+ * disable, detach uprobes, stop monitored tasks, drain RCU callbacks,
+ * wait out open fds, then free the pool and the state cache.
+ */
+static void __tlob_destroy_monitor(void)
+{
+	rv_this.enabled = 0;
+	/*
+	 * Remove uprobes first so stop_task can't race with tlob_stop_all().
+	 * rv_uprobe_sync() inside ensures all in-flight handlers have finished.
+	 */
+	tlob_remove_all_uprobes();
+	tlob_stop_all();
+	/* Wait for tlob_free_rcu and da_pool_return_cb before pool teardown. */
+	synchronize_rcu();
+
+	/*
+	 * Drop the synthetic ref and wait for all open fds to close before
+	 * teardown; prevents kmem_cache_zalloc() on the destroyed cache.
+	 */
+	if (!refcount_dec_and_test(&tlob_fd_refcount))
+		wait_for_completion(&tlob_fd_released);
+
+	da_monitor_destroy();
+	kmem_cache_destroy(tlob_state_cache);
+	tlob_state_cache = NULL;
+}
+
+/* KUnit wrappers that acquire rv_interface_lock around monitor init/destroy. */
+#if IS_ENABLED(CONFIG_KUNIT)
+/* KUnit entry point: monitor init under the rv interface lock. */
+int tlob_init_monitor(void)
+{
+	guard(mutex)(&rv_interface_lock);
+
+	return __tlob_init_monitor();
+}
+EXPORT_SYMBOL_GPL(tlob_init_monitor);
+
+/* KUnit entry point: monitor teardown under the rv interface lock. */
+void tlob_destroy_monitor(void)
+{
+	guard(mutex)(&rv_interface_lock);
+
+	__tlob_destroy_monitor();
+}
+EXPORT_SYMBOL_GPL(tlob_destroy_monitor);
+
+int tlob_num_monitored_read(void)
+{
+ return atomic_read(&tlob_num_monitored);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_num_monitored_read);
+
+/* Tracepoint probes for KUnit; rv_trace.h is only included here. */
+static struct tlob_captured_event tlob_kunit_last_event;
+static struct tlob_captured_error_env tlob_kunit_last_error_env;
+static atomic_t tlob_kunit_event_cnt = ATOMIC_INIT(0);
+static atomic_t tlob_kunit_error_env_cnt = ATOMIC_INIT(0);
+
+static void tlob_kunit_event_probe(void *data, int id, char *state, char *event,
+ char *next_state, bool final_state)
+{
+ tlob_kunit_last_event.id = id;
+ strscpy(tlob_kunit_last_event.state, state,
+ sizeof(tlob_kunit_last_event.state));
+ strscpy(tlob_kunit_last_event.event, event,
+ sizeof(tlob_kunit_last_event.event));
+ strscpy(tlob_kunit_last_event.next_state, next_state,
+ sizeof(tlob_kunit_last_event.next_state));
+ tlob_kunit_last_event.final_state = final_state;
+ atomic_inc(&tlob_kunit_event_cnt);
+}
+
+static void tlob_kunit_error_env_probe(void *data, int id, char *state,
+ char *event, char *env)
+{
+ tlob_kunit_last_error_env.id = id;
+ strscpy(tlob_kunit_last_error_env.state, state,
+ sizeof(tlob_kunit_last_error_env.state));
+ strscpy(tlob_kunit_last_error_env.event, event,
+ sizeof(tlob_kunit_last_error_env.event));
+ strscpy(tlob_kunit_last_error_env.env, env,
+ sizeof(tlob_kunit_last_error_env.env));
+ atomic_inc(&tlob_kunit_error_env_cnt);
+}
+
+int tlob_register_kunit_probes(void)
+{
+ int ret;
+
+ atomic_set(&tlob_kunit_event_cnt, 0);
+ atomic_set(&tlob_kunit_error_env_cnt, 0);
+
+ ret = register_trace_event_tlob(tlob_kunit_event_probe, NULL);
+ if (ret)
+ return ret;
+ ret = register_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL);
+ if (ret) {
+ unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_register_kunit_probes);
+
+void tlob_unregister_kunit_probes(void)
+{
+ unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
+ unregister_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL);
+ tracepoint_synchronize_unregister();
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_unregister_kunit_probes);
+
+int tlob_event_count_read(void)
+{
+ return atomic_read(&tlob_kunit_event_cnt);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_event_count_read);
+
+void tlob_event_count_reset(void)
+{
+ atomic_set(&tlob_kunit_event_cnt, 0);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_event_count_reset);
+
+int tlob_error_env_count_read(void)
+{
+ return atomic_read(&tlob_kunit_error_env_cnt);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_error_env_count_read);
+
+void tlob_error_env_count_reset(void)
+{
+ atomic_set(&tlob_kunit_error_env_cnt, 0);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_error_env_count_reset);
+
+const struct tlob_captured_event *tlob_last_event_read(void)
+{
+ return &tlob_kunit_last_event;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_last_event_read);
+
+const struct tlob_captured_error_env *tlob_last_error_env_read(void)
+{
+ return &tlob_kunit_last_error_env;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_last_error_env_read);
+
+#endif /* CONFIG_KUNIT */
+
+VISIBLE_IF_KUNIT int tlob_enable_hooks(void)
+{
+ rv_attach_trace_probe("tlob", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup);
+ rv_attach_trace_probe("tlob", sched_process_exit, handle_sched_process_exit);
+ return 0;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_enable_hooks);
+
+VISIBLE_IF_KUNIT void tlob_disable_hooks(void)
+{
+ rv_detach_trace_probe("tlob", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup);
+ rv_detach_trace_probe("tlob", sched_process_exit, handle_sched_process_exit);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_disable_hooks);
+
+/*
+ * rv_monitor .enable callback: bring up the monitor, then attach the
+ * scheduler tracepoints.  If probe attachment ever starts failing,
+ * unwind the monitor state instead of leaking an enabled-but-unhooked
+ * monitor (tlob_enable_hooks() currently always returns 0, but the
+ * error path should still be correct).
+ */
+static int enable_tlob(void)
+{
+	int retval;
+
+	retval = __tlob_init_monitor();
+	if (retval)
+		return retval;
+
+	retval = tlob_enable_hooks();
+	if (retval)
+		__tlob_destroy_monitor();
+	return retval;
+}
+
+static void disable_tlob(void)
+{
+ tlob_disable_hooks();
+ __tlob_destroy_monitor();
+}
+
+static struct rv_monitor rv_this = {
+ .name = "tlob",
+ .description = "Per-task latency-over-budget monitor.",
+ .enable = enable_tlob,
+ .disable = disable_tlob,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+static void *tlob_chardev_bind(void)
+{
+ struct tlob_fpriv *fp;
+
+ fp = kzalloc_obj(*fp, GFP_KERNEL);
+ if (!fp)
+ return ERR_PTR(-ENOMEM);
+
+ /* Pin cache/pool for fd lifetime; balanced in tlob_chardev_release.
+ * If the synthetic ref has already been dropped (__tlob_destroy_monitor
+ * ran to completion), reject the bind so the caller gets ENODEV instead
+ * of corrupting a zero refcount.
+ */
+ if (!refcount_inc_not_zero(&tlob_fd_refcount)) {
+ kfree(fp);
+ return ERR_PTR(-ENODEV);
+ }
+ return fp;
+}
+
+static void tlob_chardev_release(void *priv)
+{
+ struct tlob_fpriv *fp = priv;
+
+ if (fp->monitoring) {
+ /* All return values are safe on close. */
+ (void)tlob_stop_task(fp->task);
+ put_task_struct(fp->task);
+ }
+
+ kfree(fp);
+
+ /* Release fd's pin; if last, wake __tlob_destroy_monitor. */
+ if (refcount_dec_and_test(&tlob_fd_refcount))
+ complete(&tlob_fd_released);
+}
+
+/*
+ * Monitor-specific ioctls, dispatched from rv_dev_ioctl() after bind.
+ * TRACE_START monitors the *calling* task; TRACE_STOP reports whether
+ * the latency budget was exceeded.
+ *
+ * NOTE(review): fp fields are accessed without a lock, so concurrent
+ * ioctls on one fd would race — presumably single-threaded
+ * self-instrumentation is the intended model; confirm.
+ */
+static long tlob_chardev_ioctl(void *priv, unsigned int cmd, unsigned long arg)
+{
+	struct tlob_fpriv *fp = priv;
+	struct tlob_start_args args;
+	struct task_struct *task;
+	int ret;
+
+	switch (cmd) {
+	case TLOB_IOCTL_TRACE_START:
+		if (fp->monitoring)
+			return -EALREADY;
+
+		if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
+			return -EFAULT;
+
+		ret = tlob_start_task(current, args.threshold_us);
+		if (ret)
+			return ret;
+
+		/* Pin the task for the duration of monitoring. */
+		fp->task = current;
+		get_task_struct(current);
+		fp->budget_exceeded = false;
+
+		/* Link fd so hrtimer callback can latch budget_exceeded. */
+		scoped_guard(rcu) {
+			struct tlob_task_state *ws = da_get_target_by_id(current->pid);
+
+			/*
+			 * Release store publishes fp after its fields above
+			 * are initialized; presumably paired with an acquire
+			 * load in the timer callback — confirm.
+			 */
+			if (ws)
+				smp_store_release(&ws->fpriv, fp);
+		}
+
+		fp->monitoring = true;
+		return 0;
+
+	case TLOB_IOCTL_TRACE_STOP:
+		if (!fp->monitoring)
+			return -EINVAL;
+
+		task = fp->task;
+		fp->monitoring = false;
+		fp->task = NULL;
+
+		ret = tlob_stop_task(task);
+		put_task_struct(task);
+
+		/*
+		 * -EOVERFLOW: budget exceeded; propagate to caller.
+		 * -EAGAIN: concurrent stop_all claimed cleanup; fall through to
+		 * budget_exceeded latch set by the hrtimer callback.
+		 * -ESRCH: task exited before TRACE_STOP (process-exit handler
+		 * claimed cleanup); same latch applies. Not an internal error.
+		 */
+		if (ret == -EAGAIN || ret == -ESRCH)
+			return READ_ONCE(fp->budget_exceeded) ? -EOVERFLOW : 0;
+		return ret;
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+static const struct rv_chardev_ops tlob_chardev_ops = {
+ .owner = THIS_MODULE,
+ .bind = tlob_chardev_bind,
+ .ioctl = tlob_chardev_ioctl,
+ .release = tlob_chardev_release,
+};
+
+static int __init register_tlob(void)
+{
+ int ret;
+
+ ret = rv_chardev_register_monitor("tlob", &tlob_chardev_ops);
+ if (ret)
+ return ret;
+
+ ret = rv_register_monitor(&rv_this, NULL);
+ if (ret) {
+ rv_chardev_unregister_monitor("tlob");
+ return ret;
+ }
+
+ if (rv_this.root_d) {
+ if (!tracefs_create_file("monitor", 0644, rv_this.root_d, NULL,
+ &tlob_monitor_fops)) {
+ rv_unregister_monitor(&rv_this);
+ rv_chardev_unregister_monitor("tlob");
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static void __exit unregister_tlob(void)
+{
+ rv_chardev_unregister_monitor("tlob");
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_tlob);
+module_exit(unregister_tlob);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wen Yang <wen.yang@linux.dev>");
+MODULE_DESCRIPTION("tlob: task latency over budget per-task monitor.");
diff --git a/kernel/trace/rv/monitors/tlob/tlob.h b/kernel/trace/rv/monitors/tlob/tlob.h
new file mode 100644
index 000000000000..71c1735d27d2
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RV_TLOB_H
+#define _RV_TLOB_H
+
+/*
+ * C representation of the tlob hybrid automaton.
+ *
+ * Three-state HA following sched_stat / wwnr monitor naming conventions:
+ *
+ * running (initial) - task is executing on CPU [sched_stat: runtime]
+ * waiting - task is in runqueue, awaiting CPU [sched_stat: wait ]
+ * sleeping - task is blocked, awaiting resource[sched_stat: sleep ]
+ *
+ * Events (derived from sched_switch / sched_wakeup tracepoints):
+ * sleep - sched_switch, prev_state != 0 running → sleeping
+ * preempt - sched_switch, prev_state == 0 running → waiting
+ * wakeup - sched_wakeup sleeping → waiting
+ * switch_in - sched_switch, next == task waiting → running
+ *
+ * One HA clock invariant:
+ * clk_elapsed < BUDGET_NS() active in all states (total latency budget)
+ *
+ * task_start and task_stop are NOT DA events:
+ * task_start calls da_handle_start_event() to set initial state, then
+ * ha_reset_clk_ns() + ha_start_timer_ns() to initialise the clock and arm
+ * the timer directly.
+ * task_stop calls hrtimer_cancel() + da_monitor_reset() directly.
+ *
+ * For the format description see:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#include <linux/rv.h>
+#include <linux/sched.h>
+
+#define MONITOR_NAME tlob
+
+enum states_tlob {
+ running_tlob,
+ waiting_tlob,
+ sleeping_tlob,
+ state_max_tlob,
+};
+
+#define INVALID_STATE state_max_tlob
+
+enum events_tlob {
+ sleep_tlob,
+ preempt_tlob,
+ wakeup_tlob,
+ switch_in_tlob,
+ event_max_tlob,
+};
+
+/*
+ * HA environment variable: clk_elapsed is the only clock.
+ * It measures wall-clock time since task_start and is active in all states.
+ */
+enum envs_tlob {
+ clk_elapsed_tlob,
+ env_max_tlob,
+ env_max_stored_tlob = env_max_tlob,
+};
+
+_Static_assert(env_max_stored_tlob <= MAX_HA_ENV_LEN, "Not enough slots");
+#define HA_CLK_NS
+
+struct automaton_tlob {
+ char *state_names[state_max_tlob];
+ char *event_names[event_max_tlob];
+ char *env_names[env_max_tlob];
+ unsigned char function[state_max_tlob][event_max_tlob];
+ unsigned char initial_state;
+ bool final_states[state_max_tlob];
+};
+
+static const struct automaton_tlob automaton_tlob = {
+ .state_names = {
+ "running",
+ "waiting",
+ "sleeping",
+ },
+ .event_names = {
+ "sleep",
+ "preempt",
+ "wakeup",
+ "switch_in",
+ },
+ .env_names = {
+ "clk_elapsed",
+ },
+ .function = {
+ /* running */
+ {
+ sleeping_tlob, /* sleep (sched_switch, prev_state != 0) */
+ waiting_tlob, /* preempt (sched_switch, prev_state == 0) */
+ INVALID_STATE, /* wakeup (TASK_RUNNING can't be woken) */
+ INVALID_STATE, /* switch_in (already on CPU) */
+ },
+ /* waiting */
+ {
+ INVALID_STATE, /* sleep (not on CPU) */
+ INVALID_STATE, /* preempt (not on CPU) */
+ INVALID_STATE, /* wakeup (already TASK_RUNNING) */
+ running_tlob, /* switch_in */
+ },
+ /* sleeping */
+ {
+ INVALID_STATE, /* sleep (already sleeping) */
+ INVALID_STATE, /* preempt (not on CPU) */
+ waiting_tlob, /* wakeup */
+ INVALID_STATE, /* switch_in (must go through waiting first) */
+ },
+ },
+ .initial_state = running_tlob,
+ .final_states = { 1, 0, 0 },
+};
+
+/* Maximum number of concurrently monitored tasks. */
+#define TLOB_MAX_MONITORED 64U
+
+/* Maximum binary path length for uprobe binding. */
+#define TLOB_MAX_PATH 256
+
+/* Exported to ioctl/uprobe layers and KUnit */
+int tlob_start_task(struct task_struct *task, u64 threshold_us);
+int tlob_stop_task(struct task_struct *task);
+
+#if IS_ENABLED(CONFIG_KUNIT)
+int tlob_init_monitor(void);
+void tlob_destroy_monitor(void);
+int tlob_enable_hooks(void);
+void tlob_disable_hooks(void);
+int tlob_create_or_delete_uprobe(char *buf);
+int tlob_num_monitored_read(void);
+
+struct tlob_captured_event {
+ int id;
+ char state[16];
+ char event[16];
+ char next_state[16];
+ bool final_state;
+};
+
+struct tlob_captured_error_env {
+ int id;
+ char state[16];
+ char event[16];
+ char env[64];
+};
+
+struct tlob_captured_detail {
+ int pid;
+ u64 threshold_us;
+ u64 running_ns;
+ u64 waiting_ns;
+ u64 sleeping_ns;
+};
+
+int tlob_register_kunit_probes(void);
+void tlob_unregister_kunit_probes(void);
+int tlob_event_count_read(void);
+void tlob_event_count_reset(void);
+int tlob_error_env_count_read(void);
+void tlob_error_env_count_reset(void);
+const struct tlob_captured_event *tlob_last_event_read(void);
+const struct tlob_captured_error_env *tlob_last_error_env_read(void);
+const struct tlob_captured_detail *tlob_last_detail_read(void);
+#endif /* CONFIG_KUNIT */
+
+#endif /* _RV_TLOB_H */
diff --git a/kernel/trace/rv/monitors/tlob/tlob_trace.h b/kernel/trace/rv/monitors/tlob/tlob_trace.h
new file mode 100644
index 000000000000..08d34e1b0ab8
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob_trace.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h for tlob tracepoints.
+ *
+ * event_tlob and error_tlob are defined on the event_da_monitor_id and
+ * error_da_monitor_id classes, following the same pattern as nomiss.
+ * error_env_tlob carries the environment variable name that caused the
+ * clock-invariant violation (budget exceeded).
+ * The id field carries the pid of the monitored task.
+ */
+
+#ifdef CONFIG_RV_MON_TLOB
+/* id is the pid of the monitored task */
+DEFINE_EVENT(event_da_monitor_id, event_tlob,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_tlob,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+
+DEFINE_EVENT(error_env_da_monitor_id, error_env_tlob,
+ TP_PROTO(int id, char *state, char *event, char *env),
+ TP_ARGS(id, state, event, env));
+
+/*
+ * detail_env_tlob - per-state time breakdown emitted alongside error_env_tlob.
+ *
+ * Fired once per budget violation, immediately after error_env_tlob, from
+ * the hrtimer callback (hardirq context). The three _ns fields sum to
+ * approximately threshold_us * 1000; any rounding comes from the partial
+ * time accumulated in the current state since the last transition.
+ */
+TRACE_EVENT(detail_env_tlob,
+ TP_PROTO(int pid, u64 threshold_us,
+ u64 running_ns, u64 waiting_ns, u64 sleeping_ns),
+ TP_ARGS(pid, threshold_us, running_ns, waiting_ns, sleeping_ns),
+ TP_STRUCT__entry(
+ __field(int, pid)
+ __field(u64, threshold_us)
+ __field(u64, running_ns)
+ __field(u64, waiting_ns)
+ __field(u64, sleeping_ns)
+ ),
+ TP_fast_assign(
+ __entry->pid = pid;
+ __entry->threshold_us = threshold_us;
+ __entry->running_ns = running_ns;
+ __entry->waiting_ns = waiting_ns;
+ __entry->sleeping_ns = sleeping_ns;
+ ),
+ TP_printk("pid=%d threshold_us=%llu running_ns=%llu waiting_ns=%llu sleeping_ns=%llu",
+ __entry->pid, __entry->threshold_us,
+ __entry->running_ns, __entry->waiting_ns,
+ __entry->sleeping_ns)
+);
+#endif /* CONFIG_RV_MON_TLOB */
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index ee4e68102f17..a45c4763dbe5 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -142,10 +142,17 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <kunit/visibility.h>
#ifdef CONFIG_RV_MON_EVENTS
#define CREATE_TRACE_POINTS
#include <rv_trace.h>
+
+#ifdef CONFIG_RV_MON_TLOB
+EXPORT_TRACEPOINT_SYMBOL_GPL(error_tlob);
+EXPORT_TRACEPOINT_SYMBOL_GPL(event_tlob);
+EXPORT_TRACEPOINT_SYMBOL_GPL(error_env_tlob);
+#endif
#endif
#include "rv.h"
@@ -696,6 +703,33 @@ static void turn_monitoring_on(void)
WRITE_ONCE(monitoring_on, true);
}
+#if IS_ENABLED(CONFIG_KUNIT)
+/**
+ * rv_kunit_monitoring_on - enable the global monitoring_on flag for KUnit tests.
+ *
+ * KUnit test suite_init functions must call this before initialising any
+ * monitor, mirroring the turn_monitoring_on() call in rv_init_interface().
+ * The matching rv_kunit_monitoring_off() must be called in suite_exit to
+ * restore the flag so that test suites do not interfere with each other.
+ */
+void rv_kunit_monitoring_on(void)
+{
+ turn_monitoring_on();
+}
+EXPORT_SYMBOL_IF_KUNIT(rv_kunit_monitoring_on);
+
+/**
+ * rv_kunit_monitoring_off - disable the global monitoring_on flag for KUnit tests.
+ *
+ * Must be called in suite_exit to restore global state after rv_kunit_monitoring_on().
+ */
+void rv_kunit_monitoring_off(void)
+{
+ turn_monitoring_off();
+}
+EXPORT_SYMBOL_IF_KUNIT(rv_kunit_monitoring_off);
+#endif /* CONFIG_KUNIT */
+
static void turn_monitoring_on_with_reset(void)
{
lockdep_assert_held(&rv_interface_lock);
@@ -846,6 +880,10 @@ int __init rv_init_interface(void)
if (retval)
return 1;
+ retval = rv_chardev_init();
+ if (retval)
+ return 1;
+
turn_monitoring_on();
rv_root.root_dir = no_free_ptr(root_dir);
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index 2c0f51ff9d5c..82c9a2b57596 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -31,6 +31,8 @@ int rv_enable_monitor(struct rv_monitor *mon);
bool rv_is_container_monitor(struct rv_monitor *mon);
bool rv_is_nested_monitor(struct rv_monitor *mon);
+int rv_chardev_init(void);
+
#ifdef CONFIG_RV_REACTORS
int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root);
int init_rv_reactors(struct dentry *root_dir);
diff --git a/kernel/trace/rv/rv_chardev.c b/kernel/trace/rv/rv_chardev.c
new file mode 100644
index 000000000000..1fba1642ebc1
--- /dev/null
+++ b/kernel/trace/rv/rv_chardev.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/rv.h>
+#include <uapi/linux/rv.h>
+
+#include "rv.h"
+
+static_assert(MAX_RV_MONITOR_NAME_SIZE == RV_MONITOR_NAME_MAX,
+ "RV internal and UAPI monitor name size constants must match");
+
+struct rv_fd_priv {
+ const struct rv_chardev_ops *ops;
+ void *monitor_priv;
+};
+
+struct rv_chardev_entry {
+ char name[MAX_RV_MONITOR_NAME_SIZE];
+ const struct rv_chardev_ops *ops;
+ struct list_head list;
+};
+
+/* Protected by rv_interface_lock (from rv.h / rv.c). */
+static LIST_HEAD(rv_chardev_list);
+
+/**
+ * rv_chardev_register_monitor - expose a monitor via /dev/rv
+ * @name: Monitor name, must match the rv_monitor .name field.
+ * @ops: Callbacks providing bind / ioctl / release.
+ *
+ * Returns 0 on success, -EINVAL if @name is too long, -EEXIST if @name is
+ * already registered, -ENOMEM on OOM.
+ */
+int rv_chardev_register_monitor(const char *name,
+ const struct rv_chardev_ops *ops)
+{
+ struct rv_chardev_entry *e, *existing;
+
+ if (strlen(name) >= MAX_RV_MONITOR_NAME_SIZE)
+ return -EINVAL;
+
+ e = kmalloc_obj(*e, GFP_KERNEL);
+ if (!e)
+ return -ENOMEM;
+
+ strscpy(e->name, name, sizeof(e->name));
+ e->ops = ops;
+
+ guard(mutex)(&rv_interface_lock);
+ list_for_each_entry(existing, &rv_chardev_list, list) {
+ if (strcmp(existing->name, name) == 0) {
+ kfree(e);
+ return -EEXIST;
+ }
+ }
+ list_add_tail(&e->list, &rv_chardev_list);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rv_chardev_register_monitor);
+
+/**
+ * rv_chardev_unregister_monitor - remove a monitor from the /dev/rv registry
+ * @name: Monitor name previously passed to rv_chardev_register_monitor().
+ *
+ * Existing bound fds remain valid; their ops pointer is stable until the
+ * fd is closed. The caller must ensure no new binds to this monitor can
+ * succeed after unregistration — typically by unregistering before unloading
+ * the module that provides the ops.
+ */
+void rv_chardev_unregister_monitor(const char *name)
+{
+ struct rv_chardev_entry *e, *tmp;
+
+ guard(mutex)(&rv_interface_lock);
+ list_for_each_entry_safe(e, tmp, &rv_chardev_list, list) {
+ if (strcmp(e->name, name) == 0) {
+ list_del(&e->list);
+ kfree(e);
+ return;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(rv_chardev_unregister_monitor);
+
+static int rv_dev_open(struct inode *inode, struct file *file)
+{
+ struct rv_fd_priv *fp;
+
+ fp = kzalloc_obj(*fp, GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ file->private_data = fp;
+ return 0;
+}
+
+static int rv_dev_release(struct inode *inode, struct file *file)
+{
+ struct rv_fd_priv *fp = file->private_data;
+
+ if (fp->ops) {
+ fp->ops->release(fp->monitor_priv);
+ module_put(fp->ops->owner);
+ }
+ kfree(fp);
+ return 0;
+}
+
+static int rv_bind_monitor(struct rv_fd_priv *fp, const char __user *uarg)
+{
+ const struct rv_chardev_ops *ops = NULL;
+ struct rv_bind_args args;
+ void *priv;
+
+ if (fp->ops)
+ return -EBUSY;
+
+ if (copy_from_user(&args, uarg, sizeof(args)))
+ return -EFAULT;
+
+ args.monitor_name[RV_MONITOR_NAME_MAX - 1] = '\0';
+
+ /*
+ * Pin the owning module while the list entry is still valid under
+ * rv_interface_lock, preventing a concurrent rmmod from completing
+ * between lookup and reference acquisition. bind() may sleep
+ * (GFP_KERNEL inside), so it runs after the lock is dropped.
+ */
+ scoped_guard(mutex, &rv_interface_lock) {
+ struct rv_chardev_entry *e;
+
+ list_for_each_entry(e, &rv_chardev_list, list) {
+ if (strcmp(e->name, args.monitor_name) != 0)
+ continue;
+ if (!try_module_get(e->ops->owner))
+ return -ENODEV;
+ ops = e->ops;
+ break;
+ }
+ }
+
+ if (!ops)
+ return -ENOENT;
+
+ priv = ops->bind();
+ if (IS_ERR(priv)) {
+ module_put(ops->owner);
+ return PTR_ERR(priv);
+ }
+
+ fp->ops = ops;
+ fp->monitor_priv = priv;
+ return 0;
+}
+
+static long rv_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct rv_fd_priv *fp = file->private_data;
+
+ if (cmd == RV_IOCTL_BIND_MONITOR)
+ return rv_bind_monitor(fp, (const char __user *)arg);
+
+ if (!fp->ops)
+ return -ENXIO;
+
+ return fp->ops->ioctl(fp->monitor_priv, cmd, arg);
+}
+
+static __poll_t rv_dev_poll(struct file *file, poll_table *wait)
+{
+ struct rv_fd_priv *fp = file->private_data;
+
+ if (!fp->ops || !fp->ops->poll)
+ return 0;
+
+ return fp->ops->poll(fp->monitor_priv, file, wait);
+}
+
+static const struct file_operations rv_dev_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rv_dev_open,
+	.release	= rv_dev_release,
+	.unlocked_ioctl	= rv_dev_ioctl,
+	/*
+	 * compat_ptr_ioctl() converts the 32-bit user pointer (required on
+	 * s390) before calling ->unlocked_ioctl; pointing .compat_ioctl at
+	 * rv_dev_ioctl directly would pass an unconverted pointer.
+	 */
+	.compat_ioctl	= compat_ptr_ioctl,
+	.poll		= rv_dev_poll,
+};
+
+static struct miscdevice rv_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "rv",
+ .fops = &rv_dev_fops,
+};
+
+int __init rv_chardev_init(void)
+{
+ return misc_register(&rv_miscdev);
+}
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 9622c269789c..a4bc215c1f15 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -189,6 +189,7 @@ DECLARE_EVENT_CLASS(error_env_da_monitor_id,
#include <monitors/stall/stall_trace.h>
#include <monitors/nomiss/nomiss_trace.h>
+#include <monitors/tlob/tlob_trace.h>
// Add new monitors based on CONFIG_HA_MON_EVENTS_ID here
#endif
diff --git a/kernel/trace/rv/rv_uprobe.c b/kernel/trace/rv/rv_uprobe.c
index bc28399cfd4b..1ba7b80c1d87 100644
--- a/kernel/trace/rv/rv_uprobe.c
+++ b/kernel/trace/rv/rv_uprobe.c
@@ -132,13 +132,10 @@ EXPORT_SYMBOL_GPL(rv_uprobe_attach);
*/
void rv_uprobe_detach(struct rv_uprobe *p)
{
- struct rv_uprobe_impl *impl;
-
if (!p)
return;
- impl = container_of(p, struct rv_uprobe_impl, pub);
- uprobe_unregister_nosync(impl->uprobe, &impl->uc);
+ rv_uprobe_unregister_nosync(p);
/*
* uprobe_unregister_sync() is a global barrier: it waits for all
* in-flight uprobe handlers across the entire system to complete,
@@ -146,8 +143,47 @@ void rv_uprobe_detach(struct rv_uprobe *p)
* guarantees that no handler touching impl->pub.priv is running by
* the time we return, even if the caller immediately frees priv.
*/
+ rv_uprobe_sync();
+ rv_uprobe_free(p);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_detach);
+
+/**
+ * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting
+ */
+void rv_uprobe_unregister_nosync(struct rv_uprobe *p)
+{
+ struct rv_uprobe_impl *impl;
+
+ if (!p)
+ return;
+
+ impl = container_of(p, struct rv_uprobe_impl, pub);
+ uprobe_unregister_nosync(impl->uprobe, &impl->uc);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_unregister_nosync);
+
+/**
+ * rv_uprobe_sync - wait for all in-flight uprobe handlers to complete
+ */
+void rv_uprobe_sync(void)
+{
uprobe_unregister_sync();
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_sync);
+
+/**
+ * rv_uprobe_free - release resources of a previously deregistered probe
+ */
+void rv_uprobe_free(struct rv_uprobe *p)
+{
+ struct rv_uprobe_impl *impl;
+
+ if (!p)
+ return;
+
+ impl = container_of(p, struct rv_uprobe_impl, pub);
path_put(&p->path);
kfree(impl);
}
-EXPORT_SYMBOL_GPL(rv_uprobe_detach);
+EXPORT_SYMBOL_GPL(rv_uprobe_free);
diff --git a/tools/include/uapi/linux/rv.h b/tools/include/uapi/linux/rv.h
new file mode 100644
index 000000000000..a34e5426393b
--- /dev/null
+++ b/tools/include/uapi/linux/rv.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * UAPI definitions for Runtime Verification (RV) monitors.
+ *
+ * All RV monitors that expose an ioctl self-instrumentation interface
+ * share the magic byte RV_IOC_MAGIC ('r').
+ *
+ * Usage examples and design rationale are in:
+ * Documentation/trace/rv/monitor_tlob.rst
+ */
+
+#ifndef _UAPI_LINUX_RV_H
+#define _UAPI_LINUX_RV_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/* Magic byte shared by all RV monitor ioctls. */
+#define RV_IOC_MAGIC 'r'
+
+/* Maximum monitor name length (including NUL terminator). */
+#define RV_MONITOR_NAME_MAX 32
+
+/* Generic /dev/rv ioctls (ioctl numbers 0–15 are reserved for the core) */
+
+/**
+ * struct rv_bind_args - arguments for RV_IOCTL_BIND_MONITOR
+ * @monitor_name: NUL-terminated name of the monitor to bind (e.g. "tlob").
+ */
+struct rv_bind_args {
+ char monitor_name[RV_MONITOR_NAME_MAX];
+};
+
+/*
+ * RV_IOCTL_BIND_MONITOR - associate this fd with a specific RV monitor.
+ *
+ * Must be called once after open() and before any monitor-specific ioctl.
+ *
+ * Returns 0 on success.
+ * Returns -EBUSY if this fd is already bound to a monitor.
+ * Returns -ENOENT if the requested monitor is not registered.
+ * Returns -ENOMEM on allocation failure.
+ */
+#define RV_IOCTL_BIND_MONITOR _IOW(RV_IOC_MAGIC, 0, struct rv_bind_args)
+
+/* tlob: task latency over budget monitor (ioctl numbers 1–15) */
+
+/**
+ * struct tlob_start_args - arguments for TLOB_IOCTL_TRACE_START
+ * @threshold_us: Total latency budget for this window, in microseconds.
+ * Must be greater than zero. Both on-CPU and off-CPU time
+ * (including runqueue wait) count toward this budget.
+ */
+struct tlob_start_args {
+ __u64 threshold_us;
+};
+
+/*
+ * TLOB_IOCTL_TRACE_START - begin monitoring the calling task.
+ *
+ * Arms a per-task hrtimer for threshold_us microseconds (CLOCK_MONOTONIC,
+ * so both on-CPU and off-CPU time count toward the budget).
+ *
+ * Returns 0 on success.
+ * Returns -EALREADY if TRACE_START was already called on this fd.
+ * Returns -ENOSPC if TLOB_MAX_MONITORED tasks are already being tracked.
+ * Returns -ENOMEM on allocation failure.
+ * Returns -ENODEV if the tlob monitor is not enabled.
+ * Returns -ERANGE if threshold_us is 0.
+ */
+#define TLOB_IOCTL_TRACE_START _IOW(RV_IOC_MAGIC, 1, struct tlob_start_args)
+
+/*
+ * TLOB_IOCTL_TRACE_STOP - end monitoring the calling task.
+ *
+ * Returns 0 if within budget.
+ * Returns -EOVERFLOW if the latency budget was exceeded.
+ * Returns -EINVAL if TLOB_IOCTL_TRACE_START was not called on this fd.
+ *
+ * poll/epoll: after TRACE_START the fd becomes readable (EPOLLIN) when the
+ * budget is exceeded. The caller may then issue TRACE_STOP to retrieve the
+ * result, or simply close the fd to clean up.
+ */
+#define TLOB_IOCTL_TRACE_STOP _IO(RV_IOC_MAGIC, 2)
+
+#endif /* _UAPI_LINUX_RV_H */
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [RFC PATCH v2 09/10] rv/tlob: add KUnit tests for the tlob monitor
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
` (7 preceding siblings ...)
2026-05-11 18:24 ` [RFC PATCH v2 08/10] rv/tlob: add tlob hybrid automaton monitor wen.yang
@ 2026-05-11 18:24 ` wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 10/10] selftests/verification: add tlob selftests wen.yang
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
Add five KUnit test suites gated behind CONFIG_TLOB_KUNIT_TEST
(depends on RV_MON_TLOB && KUNIT; default KUNIT_ALL_TESTS) with a
.kunitconfig fragment for the kunit.py runner.
tlob_task_api tests the start/stop API, error returns (-EALREADY,
-ESRCH, -EOVERFLOW, -ENOSPC, -ERANGE).
tlob_sched_integration covers context-switch accounting and monitoring
a kthread. tlob_uprobe_format exercises the uprobe line parser.
tlob_trace_output checks event_tlob and error_env_tlob field layout.
tlob_violation_react verifies error_env_tlob fires once on budget
expiry and zero times when the budget is not exceeded.
Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
kernel/trace/rv/monitors/tlob/.kunitconfig | 5 +
kernel/trace/rv/monitors/tlob/tlob.c | 26 +
kernel/trace/rv/monitors/tlob/tlob_kunit.c | 881 +++++++++++++++++++++
3 files changed, 912 insertions(+)
create mode 100644 kernel/trace/rv/monitors/tlob/.kunitconfig
create mode 100644 kernel/trace/rv/monitors/tlob/tlob_kunit.c
diff --git a/kernel/trace/rv/monitors/tlob/.kunitconfig b/kernel/trace/rv/monitors/tlob/.kunitconfig
new file mode 100644
index 000000000000..977c58601ab7
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/.kunitconfig
@@ -0,0 +1,5 @@
+CONFIG_FTRACE=y
+CONFIG_KUNIT=y
+CONFIG_RV=y
+CONFIG_RV_MON_TLOB=y
+CONFIG_TLOB_KUNIT_TEST=y
diff --git a/kernel/trace/rv/monitors/tlob/tlob.c b/kernel/trace/rv/monitors/tlob/tlob.c
index 475e972ae9aa..90e7035a0b55 100644
--- a/kernel/trace/rv/monitors/tlob/tlob.c
+++ b/kernel/trace/rv/monitors/tlob/tlob.c
@@ -1024,6 +1024,7 @@ EXPORT_SYMBOL_IF_KUNIT(tlob_num_monitored_read);
/* Tracepoint probes for KUnit; rv_trace.h is only included here. */
static struct tlob_captured_event tlob_kunit_last_event;
static struct tlob_captured_error_env tlob_kunit_last_error_env;
+static struct tlob_captured_detail tlob_kunit_last_detail;
static atomic_t tlob_kunit_event_cnt = ATOMIC_INIT(0);
static atomic_t tlob_kunit_error_env_cnt = ATOMIC_INIT(0);
@@ -1054,6 +1055,17 @@ static void tlob_kunit_error_env_probe(void *data, int id, char *state,
atomic_inc(&tlob_kunit_error_env_cnt);
}
+static void tlob_kunit_detail_probe(void *data, int pid, u64 threshold_us,
+ u64 running_ns, u64 waiting_ns,
+ u64 sleeping_ns)
+{
+ tlob_kunit_last_detail.pid = pid;
+ tlob_kunit_last_detail.threshold_us = threshold_us;
+ tlob_kunit_last_detail.running_ns = running_ns;
+ tlob_kunit_last_detail.waiting_ns = waiting_ns;
+ tlob_kunit_last_detail.sleeping_ns = sleeping_ns;
+}
+
int tlob_register_kunit_probes(void)
{
int ret;
@@ -1069,6 +1081,12 @@ int tlob_register_kunit_probes(void)
unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
return ret;
}
+ ret = register_trace_detail_env_tlob(tlob_kunit_detail_probe, NULL);
+ if (ret) {
+ unregister_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL);
+ unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
+ return ret;
+ }
return 0;
}
EXPORT_SYMBOL_IF_KUNIT(tlob_register_kunit_probes);
@@ -1077,6 +1095,7 @@ void tlob_unregister_kunit_probes(void)
{
unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
unregister_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL);
+ unregister_trace_detail_env_tlob(tlob_kunit_detail_probe, NULL);
tracepoint_synchronize_unregister();
}
EXPORT_SYMBOL_IF_KUNIT(tlob_unregister_kunit_probes);
@@ -1105,6 +1124,7 @@ void tlob_error_env_count_reset(void)
}
EXPORT_SYMBOL_IF_KUNIT(tlob_error_env_count_reset);
+
const struct tlob_captured_event *tlob_last_event_read(void)
{
return &tlob_kunit_last_event;
@@ -1117,6 +1137,12 @@ const struct tlob_captured_error_env *tlob_last_error_env_read(void)
}
EXPORT_SYMBOL_IF_KUNIT(tlob_last_error_env_read);
+const struct tlob_captured_detail *tlob_last_detail_read(void)
+{
+ return &tlob_kunit_last_detail;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_last_detail_read);
+
#endif /* CONFIG_KUNIT */
VISIBLE_IF_KUNIT int tlob_enable_hooks(void)
diff --git a/kernel/trace/rv/monitors/tlob/tlob_kunit.c b/kernel/trace/rv/monitors/tlob/tlob_kunit.c
new file mode 100644
index 000000000000..ed2e7c7abaf8
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob_kunit.c
@@ -0,0 +1,881 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for the tlob RV monitor.
+ *
+ * tlob_task_api: start/stop lifecycle, error paths, violations.
+ * tlob_sched_integration: per-state accounting across real context switches.
+ * tlob_uprobe_format: uprobe binding format; add/remove acceptance and rejection.
+ * tlob_trace_output: trace event format for event_tlob, error_env_tlob.
+ * tlob_violation_react: error count per budget expiry; per-state breakdown.
+ *
+ * tlob_add_uprobe() duplicate-(binary, offset_start) constraint is not covered
+ * here: kern_path() requires a real filesystem; see selftests instead.
+ */
+#include <kunit/test.h>
+#include <linux/atomic.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+
+#include "tlob.h"
+
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+
+/*
+ * Kthread cleanup guard: registers a kunit action that stops a kthread on
+ * test exit, even when a KUNIT_ASSERT fires before normal teardown code runs.
+ *
+ * Caller must call get_task_struct() before registering the guard.
+ * Set guard->task = NULL before normal-path teardown to prevent double-stop.
+ * Pass the completion to unblock on early exit, or NULL if not needed.
+ */
+struct tlob_kthread_guard {
+ struct task_struct *task;
+ struct completion *unblock;
+};
+
+static void kthread_guard_fn(void *arg)
+{
+ struct tlob_kthread_guard *g = arg;
+
+ if (!g->task)
+ return;
+ if (g->unblock)
+ complete(g->unblock);
+ kthread_stop(g->task);
+ put_task_struct(g->task);
+}
+
+static struct tlob_kthread_guard *
+tlob_guard_kthread(struct kunit *test, struct task_struct *task,
+ struct completion *unblock)
+{
+ struct tlob_kthread_guard *g;
+
+ g = kunit_kzalloc(test, sizeof(*g), GFP_KERNEL);
+ if (!g)
+ return NULL;
+ g->task = task;
+ g->unblock = unblock;
+ if (kunit_add_action_or_reset(test, kthread_guard_fn, g))
+ return NULL;
+ return g;
+}
+
+/* Suite 1: task API - lifecycle, error paths, violations. */
+
+/* Basic start/stop cycle */
+static void tlob_start_stop_ok(struct kunit *test)
+{
+ int ret;
+
+ ret = tlob_start_task(current, 10000000ULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0);
+ KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0);
+}
+
+/* Double start must return -EALREADY; double stop must return -ESRCH. */
+static void tlob_double_start(struct kunit *test)
+{
+ KUNIT_ASSERT_EQ(test, tlob_start_task(current, 10000000ULL), 0);
+ KUNIT_EXPECT_EQ(test, tlob_start_task(current, 10000000ULL), -EALREADY);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+ KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0);
+}
+
+/* Stop without start must return -ESRCH. */
+static void tlob_stop_without_start(struct kunit *test)
+{
+ tlob_stop_task(current);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+ KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0);
+}
+
+/* threshold_us == 0 is invalid and must return -ERANGE. */
+static void tlob_zero_threshold(struct kunit *test)
+{
+ KUNIT_EXPECT_EQ(test, tlob_start_task(current, 0), -ERANGE);
+}
+
+/* 1 us budget: timer almost certainly fires before tlob_stop_task(). */
+static void tlob_immediate_deadline(struct kunit *test)
+{
+ int ret = tlob_start_task(current, 1);
+
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ udelay(100);
+ /* timer fired -> -EOVERFLOW; if we won the race, 0 is also valid */
+ ret = tlob_stop_task(current);
+ KUNIT_EXPECT_TRUE(test, ret == 0 || ret == -EOVERFLOW);
+ KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0);
+}
+
+/*
+ * kthreads provide distinct task_structs; fill to TLOB_MAX_MONITORED,
+ * then verify -ENOSPC.
+ */
+struct tlob_waiter_ctx {
+ struct completion start;
+ struct completion done;
+};
+
+static int tlob_waiter_fn(void *arg)
+{
+ struct tlob_waiter_ctx *ctx = arg;
+
+ wait_for_completion(&ctx->start);
+ complete(&ctx->done);
+ return 0;
+}
+
+static void tlob_enospc(struct kunit *test)
+{
+ struct tlob_waiter_ctx *ctxs;
+ struct task_struct **threads;
+ int i, ret;
+
+ ctxs = kunit_kcalloc(test, TLOB_MAX_MONITORED,
+ sizeof(*ctxs), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctxs);
+
+ threads = kunit_kcalloc(test, TLOB_MAX_MONITORED,
+ sizeof(*threads), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, threads);
+
+ KUNIT_ASSERT_EQ(test, tlob_num_monitored_read(), 0);
+
+ for (i = 0; i < TLOB_MAX_MONITORED; i++) {
+ init_completion(&ctxs[i].start);
+ init_completion(&ctxs[i].done);
+
+ threads[i] = kthread_run(tlob_waiter_fn, &ctxs[i],
+ "tlob_waiter_%d", i);
+ if (IS_ERR(threads[i])) {
+ KUNIT_FAIL(test, "kthread_run failed at i=%d", i);
+ threads[i] = NULL;
+ goto cleanup;
+ }
+ get_task_struct(threads[i]);
+
+ ret = tlob_start_task(threads[i], 10000000ULL);
+ if (ret != 0) {
+ KUNIT_FAIL(test, "tlob_start_task failed at i=%d: %d",
+ i, ret);
+ put_task_struct(threads[i]);
+ complete(&ctxs[i].start);
+ threads[i] = NULL;
+ goto cleanup;
+ }
+ }
+
+ ret = tlob_start_task(current, 10000000ULL);
+ KUNIT_EXPECT_EQ(test, ret, -ENOSPC);
+
+cleanup:
+ /* cancel monitoring and unblock first, then wait for full exit */
+ for (i = 0; i < TLOB_MAX_MONITORED; i++) {
+ if (!threads[i])
+ break;
+ tlob_stop_task(threads[i]);
+ complete(&ctxs[i].start);
+ }
+ for (i = 0; i < TLOB_MAX_MONITORED; i++) {
+ if (!threads[i])
+ break;
+ kthread_stop(threads[i]);
+ put_task_struct(threads[i]);
+ }
+}
+
+/*
+ * Holder kthread holds a mutex for 80 ms; arm a 10 ms budget, burn ~1 ms
+ * on-CPU, then block on the mutex; timer fires while sleeping -> -EOVERFLOW.
+ */
+struct tlob_holder_ctx {
+ struct mutex lock;
+ struct completion ready;
+ unsigned int hold_ms;
+};
+
+static int tlob_holder_fn(void *arg)
+{
+ struct tlob_holder_ctx *ctx = arg;
+
+ mutex_lock(&ctx->lock);
+ complete(&ctx->ready);
+ msleep(ctx->hold_ms);
+ mutex_unlock(&ctx->lock);
+ return 0;
+}
+
+static void tlob_deadline_fires_sleeping(struct kunit *test)
+{
+ struct tlob_holder_ctx *ctx;
+ struct tlob_kthread_guard *guard;
+ struct task_struct *holder;
+ ktime_t t0;
+ int ret;
+
+ ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctx);
+ ctx->hold_ms = 80;
+ mutex_init(&ctx->lock);
+ init_completion(&ctx->ready);
+
+ holder = kthread_run(tlob_holder_fn, ctx, "tlob_holder_kunit");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, holder);
+ get_task_struct(holder);
+
+ guard = tlob_guard_kthread(test, holder, NULL);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ wait_for_completion(&ctx->ready);
+
+ ret = tlob_start_task(current, 10000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 1000)
+ cpu_relax();
+
+ /* block on mutex: running->sleeping; timer fires while sleeping */
+ mutex_lock(&ctx->lock);
+ mutex_unlock(&ctx->lock);
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -EOVERFLOW);
+
+ guard->task = NULL;
+ kthread_stop(holder);
+ put_task_struct(holder);
+}
+
+/*
+ * yield() triggers a preempt sched_switch (prev_state==0): running->waiting.
+ * Busy-spin 50 ms so the 2 ms budget fires regardless of scheduler timing.
+ */
+static void tlob_deadline_fires_waiting(struct kunit *test)
+{
+ ktime_t t0;
+ int ret;
+
+ ret = tlob_start_task(current, 2000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ yield();
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 50000)
+ cpu_relax();
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -EOVERFLOW);
+}
+
+/* Arm a 1 ms budget and busy-spin for 50 ms; timer fires in running state. */
+static void tlob_deadline_fires_running(struct kunit *test)
+{
+ ktime_t t0;
+ int ret;
+
+ ret = tlob_start_task(current, 1000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 50000)
+ cpu_relax();
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -EOVERFLOW);
+}
+
+/* Start three tasks, reinit monitor, verify all entries are gone. */
+static int tlob_dummy_fn(void *arg)
+{
+ wait_for_completion((struct completion *)arg);
+ return 0;
+}
+
+static void tlob_reinit_clears_all(struct kunit *test)
+{
+ struct completion *done1, *done2;
+ struct tlob_kthread_guard *guard1, *guard2;
+ struct task_struct *t1, *t2;
+ int ret;
+
+ done1 = kunit_kzalloc(test, sizeof(*done1), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, done1);
+ done2 = kunit_kzalloc(test, sizeof(*done2), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, done2);
+
+ init_completion(done1);
+ init_completion(done2);
+
+ t1 = kthread_run(tlob_dummy_fn, done1, "tlob_dummy1");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t1);
+ get_task_struct(t1);
+ guard1 = tlob_guard_kthread(test, t1, done1);
+ KUNIT_ASSERT_NOT_NULL(test, guard1);
+
+ t2 = kthread_run(tlob_dummy_fn, done2, "tlob_dummy2");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t2);
+ get_task_struct(t2);
+ guard2 = tlob_guard_kthread(test, t2, done2);
+ KUNIT_ASSERT_NOT_NULL(test, guard2);
+
+ KUNIT_ASSERT_EQ(test, tlob_start_task(current, 10000000ULL), 0);
+ KUNIT_ASSERT_EQ(test, tlob_start_task(t1, 10000000ULL), 0);
+ KUNIT_ASSERT_EQ(test, tlob_start_task(t2, 10000000ULL), 0);
+
+ tlob_destroy_monitor();
+ ret = tlob_init_monitor();
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(t1), -ESRCH);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(t2), -ESRCH);
+
+ /* null guards before teardown to prevent double-stop */
+ guard1->task = NULL;
+ guard2->task = NULL;
+ complete(done1);
+ complete(done2);
+ kthread_stop(t1);
+ kthread_stop(t2);
+ put_task_struct(t1);
+ put_task_struct(t2);
+}
+
+static int tlob_task_api_suite_init(struct kunit_suite *suite)
+{
+ rv_kunit_monitoring_on();
+ return tlob_init_monitor();
+}
+
+static void tlob_task_api_suite_exit(struct kunit_suite *suite)
+{
+ tlob_destroy_monitor();
+ rv_kunit_monitoring_off();
+}
+
+static void tlob_task_api_exit(struct kunit *test)
+{
+ /*
+ * tlob_stop_task() returns pool slots via call_rcu (da_pool_return_cb).
+ * Wait for all pending callbacks so each test starts with a full pool.
+ */
+ rcu_barrier();
+}
+
+static struct kunit_case tlob_task_api_cases[] = {
+ KUNIT_CASE(tlob_start_stop_ok),
+ KUNIT_CASE(tlob_double_start),
+ KUNIT_CASE(tlob_stop_without_start),
+ KUNIT_CASE(tlob_zero_threshold),
+ KUNIT_CASE(tlob_immediate_deadline),
+ KUNIT_CASE(tlob_enospc),
+ KUNIT_CASE(tlob_deadline_fires_sleeping),
+ KUNIT_CASE(tlob_deadline_fires_waiting),
+ KUNIT_CASE(tlob_deadline_fires_running),
+ KUNIT_CASE(tlob_reinit_clears_all),
+ {}
+};
+
+static struct kunit_suite tlob_task_api_suite = {
+ .name = "tlob_task_api",
+ .suite_init = tlob_task_api_suite_init,
+ .suite_exit = tlob_task_api_suite_exit,
+ .exit = tlob_task_api_exit,
+ .test_cases = tlob_task_api_cases,
+};
+
+/* Suite 2: sched integration - per-state ns accounting. */
+
+struct tlob_ping_ctx {
+ struct completion ping;
+ struct completion pong;
+};
+
+static int tlob_ping_fn(void *arg)
+{
+ struct tlob_ping_ctx *ctx = arg;
+
+ wait_for_completion(&ctx->ping);
+ complete(&ctx->pong);
+ return 0;
+}
+
+/* Force two context switches and verify stop returns 0 (within budget). */
+static void tlob_sched_switch_accounting(struct kunit *test)
+{
+ struct tlob_ping_ctx *ctx;
+ struct tlob_kthread_guard *guard;
+ struct task_struct *peer;
+ int ret;
+
+ ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctx);
+ init_completion(&ctx->ping);
+ init_completion(&ctx->pong);
+
+ peer = kthread_run(tlob_ping_fn, ctx, "tlob_ping_kunit");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, peer);
+ get_task_struct(peer);
+
+ guard = tlob_guard_kthread(test, peer, &ctx->ping);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ ret = tlob_start_task(current, 5000000ULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* complete(ping) -> peer runs, forcing a context switch out and back */
+ complete(&ctx->ping);
+ wait_for_completion(&ctx->pong);
+
+ ret = tlob_stop_task(current);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ guard->task = NULL;
+ kthread_stop(peer);
+ put_task_struct(peer);
+}
+
+/* start/stop monitoring a kthread other than current */
+static int tlob_block_fn(void *arg)
+{
+ struct completion *done = arg;
+
+ msleep(20);
+ complete(done);
+ return 0;
+}
+
+static void tlob_monitor_other_task(struct kunit *test)
+{
+ struct completion *done;
+ struct tlob_kthread_guard *guard;
+ struct task_struct *target;
+ int ret;
+
+ done = kunit_kzalloc(test, sizeof(*done), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, done);
+ init_completion(done);
+
+ target = kthread_run(tlob_block_fn, done, "tlob_target_kunit");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, target);
+ get_task_struct(target);
+
+ guard = tlob_guard_kthread(test, target, NULL);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ ret = tlob_start_task(target, 5000000ULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ wait_for_completion(done);
+
+ /* 5 s budget won't fire in 20 ms; 0 or -EOVERFLOW are both valid */
+ ret = tlob_stop_task(target);
+ KUNIT_EXPECT_TRUE(test, ret == 0 || ret == -EOVERFLOW);
+
+ guard->task = NULL;
+ kthread_stop(target);
+ put_task_struct(target);
+}
+
+static int tlob_sched_suite_init(struct kunit_suite *suite)
+{
+ rv_kunit_monitoring_on();
+ return tlob_init_monitor();
+}
+
+static void tlob_sched_suite_exit(struct kunit_suite *suite)
+{
+ tlob_destroy_monitor();
+ rv_kunit_monitoring_off();
+}
+
+static struct kunit_case tlob_sched_integration_cases[] = {
+ KUNIT_CASE(tlob_sched_switch_accounting),
+ KUNIT_CASE(tlob_monitor_other_task),
+ {}
+};
+
+static struct kunit_suite tlob_sched_integration_suite = {
+ .name = "tlob_sched_integration",
+ .suite_init = tlob_sched_suite_init,
+ .suite_exit = tlob_sched_suite_exit,
+ .test_cases = tlob_sched_integration_cases,
+};
+
+/* Suite 3: uprobe binding format - add/remove acceptance and rejection. */
+
+static const char * const tlob_format_valid[] = {
+ "p /usr/bin/myapp:4768 4848 threshold=5000",
+ "p /usr/bin/myapp:0x12a0 0x12f0 threshold=10000",
+ "p /opt/my:app/bin:0x100 0x200 threshold=1000",
+};
+
+static const char * const tlob_format_invalid[] = {
+ /* add: malformed */
+ "p /usr/bin/myapp:0x100 0x200 threshold=0",
+ "p :0x100 0x200 threshold=5000",
+ "p /usr/bin/myapp:0x100 threshold=5000",
+ "p /usr/bin/myapp:-1 0x200 threshold=5000",
+ "p /usr/bin/myapp:0x100 0x200",
+ "p /usr/bin/myapp:0x100 0x100 threshold=5000",
+ /* remove: malformed */
+ "-usr/bin/myapp:0x100",
+ "-/usr/bin/myapp",
+ "-/:0x100",
+ "-/usr/bin/myapp:abc",
+};
+
+/*
+ * Valid add lines return -ENOENT (path does not exist in the test environment)
+ * rather than 0; a non-(-EINVAL) return confirms the format was accepted.
+ */
+static void tlob_format_accepted(struct kunit *test)
+{
+ char buf[128];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tlob_format_valid); i++) {
+ strscpy(buf, tlob_format_valid[i], sizeof(buf));
+ KUNIT_EXPECT_NE(test, tlob_create_or_delete_uprobe(buf), -EINVAL);
+ }
+}
+
+static void tlob_format_rejected(struct kunit *test)
+{
+ char buf[128];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tlob_format_invalid); i++) {
+ strscpy(buf, tlob_format_invalid[i], sizeof(buf));
+ KUNIT_EXPECT_EQ(test, tlob_create_or_delete_uprobe(buf), -EINVAL);
+ }
+}
+
+static struct kunit_case tlob_uprobe_format_cases[] = {
+ KUNIT_CASE(tlob_format_accepted),
+ KUNIT_CASE(tlob_format_rejected),
+ {}
+};
+
+static struct kunit_suite tlob_uprobe_format_suite = {
+ .name = "tlob_uprobe_format",
+ .test_cases = tlob_uprobe_format_cases,
+};
+
+/* Suite 4: trace output - verify event_tlob and error_env_tlob field values. */
+
+static void tlob_trace_event_format(struct kunit *test)
+{
+ const struct tlob_captured_event *ev;
+ int pid = current->pid;
+ int ret;
+
+ tlob_event_count_reset();
+ ret = tlob_start_task(current, 5000000ULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* sleep/wakeup/switch_in: running->sleeping->waiting->running */
+ msleep(20);
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0);
+
+ KUNIT_EXPECT_GE(test, tlob_event_count_read(), 3);
+
+ ev = tlob_last_event_read();
+ KUNIT_EXPECT_EQ(test, ev->id, pid);
+ KUNIT_EXPECT_STREQ(test, ev->state, "waiting");
+ KUNIT_EXPECT_STREQ(test, ev->event, "switch_in");
+ KUNIT_EXPECT_STREQ(test, ev->next_state, "running");
+ KUNIT_EXPECT_TRUE(test, ev->final_state);
+}
+
+static void tlob_trace_error_env_format(struct kunit *test)
+{
+ const struct tlob_captured_error_env *err;
+ ktime_t t0;
+ int pid = current->pid;
+ int ret;
+
+ tlob_error_env_count_reset();
+ ret = tlob_start_task(current, 1000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 50000)
+ cpu_relax();
+
+ tlob_stop_task(current);
+
+ KUNIT_ASSERT_GE(test, tlob_error_env_count_read(), 1);
+
+ err = tlob_last_error_env_read();
+ KUNIT_EXPECT_EQ(test, err->id, pid);
+ KUNIT_EXPECT_STREQ(test, err->state, "running");
+ KUNIT_EXPECT_STREQ(test, err->event, "budget_exceeded");
+ KUNIT_EXPECT_TRUE(test, strncmp(err->env, "clk_elapsed=", 12) == 0);
+}
+
+static int tlob_trace_suite_init(struct kunit_suite *suite)
+{
+ int ret;
+
+ rv_kunit_monitoring_on();
+ ret = tlob_init_monitor();
+ if (ret)
+ goto err_mon_off;
+ ret = tlob_register_kunit_probes();
+ if (ret)
+ goto err_destroy;
+ ret = tlob_enable_hooks();
+ if (ret)
+ goto err_probes;
+ return 0;
+
+err_probes:
+ tlob_unregister_kunit_probes();
+err_destroy:
+ tlob_destroy_monitor();
+err_mon_off:
+ rv_kunit_monitoring_off();
+ return ret;
+}
+
+static void tlob_trace_suite_exit(struct kunit_suite *suite)
+{
+ tlob_disable_hooks();
+ tlob_unregister_kunit_probes();
+ tlob_destroy_monitor();
+ rv_kunit_monitoring_off();
+}
+
+static struct kunit_case tlob_trace_output_cases[] = {
+ KUNIT_CASE(tlob_trace_event_format),
+ KUNIT_CASE(tlob_trace_error_env_format),
+ {}
+};
+
+static struct kunit_suite tlob_trace_output_suite = {
+ .name = "tlob_trace_output",
+ .suite_init = tlob_trace_suite_init,
+ .suite_exit = tlob_trace_suite_exit,
+ .test_cases = tlob_trace_output_cases,
+};
+
+/*
+ * Suite 5: violation reaction - complement to Suite 4.
+ * Suite 4 checks trace field values; Suite 5 checks semantics:
+ * error count per budget expiry and per-state ns breakdown.
+ */
+
+/* generous budget; usleep forces state transitions; no error must fire */
+static void tlob_no_error_within_budget(struct kunit *test)
+{
+ tlob_error_env_count_reset();
+ tlob_event_count_reset();
+
+ KUNIT_ASSERT_EQ(test, tlob_start_task(current, 10000000ULL), 0);
+ usleep_range(5000, 10000);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0);
+ KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 0);
+ KUNIT_EXPECT_GE(test, tlob_event_count_read(), 2);
+}
+
+/* busy-spin 50 ms >> 1 ms budget; running_ns must dominate */
+static void tlob_detail_running_dominates(struct kunit *test)
+{
+ const struct tlob_captured_detail *d;
+ u64 total_ns;
+ ktime_t t0;
+ int ret;
+
+ tlob_error_env_count_reset();
+
+ ret = tlob_start_task(current, 1000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 50000)
+ cpu_relax();
+
+ tlob_stop_task(current);
+
+ KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 1);
+ d = tlob_last_detail_read();
+ KUNIT_EXPECT_EQ(test, d->pid, current->pid);
+ KUNIT_EXPECT_EQ(test, d->threshold_us, 1000ULL);
+ total_ns = d->running_ns + d->waiting_ns + d->sleeping_ns;
+ KUNIT_EXPECT_GE(test, total_ns, 1000ULL * 1000);
+ KUNIT_EXPECT_GT(test, d->running_ns, d->sleeping_ns + d->waiting_ns);
+}
+
+struct tlob_hog_ctx {
+ int spin_ms;
+};
+
+static int tlob_hog_fn(void *arg)
+{
+ struct tlob_hog_ctx *ctx = arg;
+ ktime_t t0 = ktime_get();
+
+ while (!kthread_should_stop() &&
+ ktime_ms_delta(ktime_get(), t0) < ctx->spin_ms)
+ cpu_relax();
+ return 0;
+}
+
+/*
+ * SCHED_FIFO kthread bound to the same CPU preempts the monitored task
+ * (sched_switch prev_state == 0: running->waiting) and holds the CPU for
+ * 80 ms >> 10 ms budget, guaranteeing the timer fires in waiting state.
+ */
+static void tlob_detail_waiting_dominates(struct kunit *test)
+{
+ struct tlob_hog_ctx *ctx;
+ struct task_struct *hog;
+ struct tlob_kthread_guard *guard;
+ const struct tlob_captured_detail *d;
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ int ret;
+
+ tlob_error_env_count_reset();
+
+ ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctx);
+ ctx->spin_ms = 80;
+
+ hog = kthread_create(tlob_hog_fn, ctx, "tlob_s5_hog");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, hog);
+ get_task_struct(hog);
+
+ kthread_bind(hog, smp_processor_id());
+ sched_setscheduler_nocheck(hog, SCHED_FIFO, &param);
+
+ guard = tlob_guard_kthread(test, hog, NULL);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ ret = tlob_start_task(current, 10000); /* 10 ms budget */
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ wake_up_process(hog);
+ yield(); /* sched_switch prev_state == 0: running->waiting */
+
+ tlob_stop_task(current);
+
+ KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 1);
+ d = tlob_last_detail_read();
+ KUNIT_EXPECT_EQ(test, d->sleeping_ns, 0ULL);
+ KUNIT_EXPECT_GT(test, d->waiting_ns, d->running_ns + d->sleeping_ns);
+
+ guard->task = NULL;
+ kthread_stop(hog);
+ put_task_struct(hog);
+}
+
+/* block on mutex for 80 ms >> 10 ms budget; sleeping_ns must dominate */
+static void tlob_detail_sleeping_dominates(struct kunit *test)
+{
+ struct tlob_holder_ctx *ctx;
+ struct tlob_kthread_guard *guard;
+ struct task_struct *holder;
+ const struct tlob_captured_detail *d;
+ int ret;
+
+ tlob_error_env_count_reset();
+
+ ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctx);
+ ctx->hold_ms = 80;
+ mutex_init(&ctx->lock);
+ init_completion(&ctx->ready);
+
+ holder = kthread_run(tlob_holder_fn, ctx, "tlob_s5_detail");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, holder);
+ get_task_struct(holder);
+
+ guard = tlob_guard_kthread(test, holder, NULL);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ wait_for_completion(&ctx->ready);
+
+ ret = tlob_start_task(current, 10000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ mutex_lock(&ctx->lock);
+ mutex_unlock(&ctx->lock);
+
+ tlob_stop_task(current);
+
+ KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 1);
+ d = tlob_last_detail_read();
+ KUNIT_EXPECT_GT(test, d->sleeping_ns, d->running_ns + d->waiting_ns);
+
+ guard->task = NULL;
+ kthread_stop(holder);
+ put_task_struct(holder);
+}
+
+static int tlob_violation_suite_init(struct kunit_suite *suite)
+{
+ int ret;
+
+ rv_kunit_monitoring_on();
+ ret = tlob_init_monitor();
+ if (ret)
+ goto err_mon_off;
+ ret = tlob_register_kunit_probes();
+ if (ret)
+ goto err_destroy;
+ ret = tlob_enable_hooks();
+ if (ret)
+ goto err_probes;
+ return 0;
+
+err_probes:
+ tlob_unregister_kunit_probes();
+err_destroy:
+ tlob_destroy_monitor();
+err_mon_off:
+ rv_kunit_monitoring_off();
+ return ret;
+}
+
+static void tlob_violation_suite_exit(struct kunit_suite *suite)
+{
+ tlob_disable_hooks();
+ tlob_unregister_kunit_probes();
+ tlob_destroy_monitor();
+ rv_kunit_monitoring_off();
+}
+
+static struct kunit_case tlob_violation_react_cases[] = {
+ KUNIT_CASE(tlob_no_error_within_budget),
+ KUNIT_CASE(tlob_detail_running_dominates),
+ KUNIT_CASE(tlob_detail_sleeping_dominates),
+ KUNIT_CASE(tlob_detail_waiting_dominates),
+ {}
+};
+
+static struct kunit_suite tlob_violation_react_suite = {
+ .name = "tlob_violation_react",
+ .suite_init = tlob_violation_suite_init,
+ .suite_exit = tlob_violation_suite_exit,
+ .test_cases = tlob_violation_react_cases,
+};
+
+kunit_test_suites(&tlob_task_api_suite,
+ &tlob_sched_integration_suite,
+ &tlob_uprobe_format_suite,
+ &tlob_trace_output_suite,
+ &tlob_violation_react_suite);
+
+MODULE_DESCRIPTION("KUnit tests for the tlob RV monitor");
+MODULE_LICENSE("GPL");
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread* [RFC PATCH v2 10/10] selftests/verification: add tlob selftests
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
` (8 preceding siblings ...)
2026-05-11 18:24 ` [RFC PATCH v2 09/10] rv/tlob: add KUnit tests for the tlob monitor wen.yang
@ 2026-05-11 18:24 ` wen.yang
9 siblings, 0 replies; 11+ messages in thread
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
Add selftest coverage for the tlob RV monitor in
tools/testing/selftests/verification/.
Two helper binaries are built by tlob/Makefile: tlob_helper for the
ioctl interface (/dev/rv) and tlob_uprobe_target for the uprobe tests.
The top-level Makefile delegates to tlob/ via a generic MONITOR_SUBDIRS
pattern so monitor-specific build details stay within each monitor's
own subdirectory.
Eight test files cover the tracefs control interface (tracefs.tc), the
ioctl self-instrumentation interface (ioctl.tc, 8 scenarios), and the
uprobe external monitoring interface (uprobe_bind.tc, uprobe_violation.tc,
uprobe_no_event.tc, uprobe_multi.tc, uprobe_detail_sleeping.tc,
uprobe_detail_waiting.tc).
Tested on x86_64 with vng (virtme-ng):
TAP version 13
1..12
ok 1 Test monitor enable/disable
ok 2 Test monitor reactor setting
ok 3 Check available monitors
ok 4 Test wwnr monitor with printk reactor
ok 5 Test tlob ioctl self-instrumentation (within/over-budget, error paths)
ok 6 Test tlob monitor tracefs interface (enable/disable and files)
ok 7 uprobe binding: visible in monitor file, removable, duplicate offset rejected
ok 8 uprobe detail sleeping: sleeping_ns dominates when task blocks between probes
ok 9 uprobe detail waiting: waiting_ns dominates when task is preempted between probes
ok 10 Two bindings on same binary with different offsets and budgets fire independently
ok 11 Verify no spurious error_env_tlob events without an active uprobe binding
ok 12 uprobe violation: error_env_tlob and detail_env_tlob fire with correct fields
# Totals: pass:12 fail:0 xfail:0 xpass:0 skip:0 error:0
Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
tools/testing/selftests/verification/Makefile | 21 +-
.../verification/test.d/tlob/ioctl.tc | 36 +
.../verification/test.d/tlob/tracefs.tc | 17 +
.../verification/test.d/tlob/uprobe_bind.tc | 34 +
.../test.d/tlob/uprobe_detail_sleeping.tc | 47 ++
.../test.d/tlob/uprobe_detail_waiting.tc | 60 ++
.../verification/test.d/tlob/uprobe_multi.tc | 60 ++
.../test.d/tlob/uprobe_no_event.tc | 19 +
.../test.d/tlob/uprobe_violation.tc | 60 ++
.../selftests/verification/tlob/Makefile | 21 +
.../selftests/verification/tlob/tlob_ioctl.c | 626 ++++++++++++++++++
.../selftests/verification/tlob/tlob_target.c | 138 ++++
12 files changed, 1138 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/verification/test.d/tlob/ioctl.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/tracefs.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
create mode 100644 tools/testing/selftests/verification/tlob/Makefile
create mode 100644 tools/testing/selftests/verification/tlob/tlob_ioctl.c
create mode 100644 tools/testing/selftests/verification/tlob/tlob_target.c
diff --git a/tools/testing/selftests/verification/Makefile b/tools/testing/selftests/verification/Makefile
index aa8790c22a71..b5584fd3762d 100644
--- a/tools/testing/selftests/verification/Makefile
+++ b/tools/testing/selftests/verification/Makefile
@@ -1,8 +1,27 @@
# SPDX-License-Identifier: GPL-2.0
-all:
TEST_PROGS := verificationtest-ktap
TEST_FILES := test.d settings
EXTRA_CLEAN := $(OUTPUT)/logs/*
+# Subdirectories that provide helper binaries for the test runner.
+# Each entry must contain a Makefile that accepts OUTDIR= and deposits
+# its binaries there; verificationtest-ktap adds OUTDIR to PATH so
+# the ftracetest require-checks resolve the binaries by name.
+MONITOR_SUBDIRS := tlob
+
include ../lib.mk
+
+# Build and clean each monitor subdirectory.
+all: $(patsubst %,_build_%,$(MONITOR_SUBDIRS))
+
+clean: $(patsubst %,_clean_%,$(MONITOR_SUBDIRS))
+
+.PHONY: $(patsubst %,_build_%,$(MONITOR_SUBDIRS)) \
+ $(patsubst %,_clean_%,$(MONITOR_SUBDIRS))
+
+$(patsubst %,_build_%,$(MONITOR_SUBDIRS)): _build_%:
+ $(MAKE) -C $* OUTDIR="$(OUTPUT)" TOOLS_INCLUDES="$(TOOLS_INCLUDES)"
+
+$(patsubst %,_clean_%,$(MONITOR_SUBDIRS)): _clean_%:
+ $(MAKE) -C $* OUTDIR="$(OUTPUT)" clean
diff --git a/tools/testing/selftests/verification/test.d/tlob/ioctl.tc b/tools/testing/selftests/verification/test.d/tlob/ioctl.tc
new file mode 100644
index 000000000000..54ae249af9a6
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/ioctl.tc
@@ -0,0 +1,36 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob ioctl self-instrumentation (within/over-budget, error paths)
+# requires: tlob:monitor tlob_ioctl:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+
+[ -c /dev/rv ] || exit_unsupported
+
+echo 1 > monitors/tlob/enable
+
+# within budget: 50 ms threshold, 10 ms workload
+"$TLOB_HELPER" within_budget
+
+# over budget in running state: 1 ms threshold, 100 ms busy-spin
+"$TLOB_HELPER" over_budget_running
+
+# over budget in sleeping state: 3 ms threshold, 50 ms sleep
+"$TLOB_HELPER" over_budget_sleeping
+
+# over budget in waiting state: 1 us threshold, sched_yield
+"$TLOB_HELPER" over_budget_waiting
+
+# error paths
+"$TLOB_HELPER" double_start
+"$TLOB_HELPER" stop_no_start
+
+# per-thread isolation
+"$TLOB_HELPER" multi_thread
+
+# bind against disabled monitor must return ENODEV, not crash
+echo 0 > monitors/tlob/enable
+"$TLOB_HELPER" not_enabled
+echo 1 > monitors/tlob/enable
+
+echo 0 > monitors/tlob/enable
diff --git a/tools/testing/selftests/verification/test.d/tlob/tracefs.tc b/tools/testing/selftests/verification/test.d/tlob/tracefs.tc
new file mode 100644
index 000000000000..5d1e7cc02498
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/tracefs.tc
@@ -0,0 +1,17 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob monitor tracefs interface (enable/disable and files)
+# requires: tlob:monitor
+
+check_requires monitors/tlob/enable monitors/tlob/desc monitors/tlob/monitor
+
+# enable / disable via the enable file
+echo 1 > monitors/tlob/enable
+grep -q 1 monitors/tlob/enable
+echo "tlob" >> enabled_monitors
+grep -q tlob enabled_monitors
+
+echo 0 > monitors/tlob/enable
+grep -q 0 monitors/tlob/enable
+echo "!tlob" >> enabled_monitors
+! grep -q "^tlob$" enabled_monitors
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
new file mode 100644
index 000000000000..41e20d593855
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
@@ -0,0 +1,34 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test uprobe binding (visible in monitor file, removable, duplicate rejected)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+busy_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+stop_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+[ -n "$busy_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 30000 &
+busy_pid=$!
+sleep 0.05
+
+echo 1 > monitors/tlob/enable
+echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=5000000" > "$TLOB_MONITOR"
+
+# Binding must appear in monitor file with canonical hex-offset format.
+grep -qE "^p ${UPROBE_TARGET}:0x[0-9a-f]+ 0x[0-9a-f]+ threshold=[0-9]+$" "$TLOB_MONITOR"
+grep -q "threshold=5000000" "$TLOB_MONITOR"
+
+# Duplicate offset_start must be rejected.
+! echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=9999" > "$TLOB_MONITOR" 2>/dev/null
+
+# Remove the binding; it must no longer appear.
+echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR"
+! grep -q "^p .*:0x${busy_offset#0x} " "$TLOB_MONITOR"
+
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+echo 0 > monitors/tlob/enable
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
new file mode 100644
index 000000000000..2b8656e0fef1
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
@@ -0,0 +1,47 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test uprobe detail sleeping (sleeping_ns dominates when task blocks between probes)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+start_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work 2>/dev/null)
+stop_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work_done 2>/dev/null)
+[ -n "$start_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 5000 sleep &
+busy_pid=$!
+sleep 0.05
+
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# 50 ms budget; task sleeps 200 ms per iteration -> sleeping_ns dominates.
+echo "p ${UPROBE_TARGET}:${start_offset} ${stop_offset} threshold=50000" > "$TLOB_MONITOR"
+
+found=0; i=0
+while [ "$i" -lt 30 ]; do
+ sleep 0.1
+ grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+ i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${start_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]
+
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+waiting=$(echo "$line" | sed 's/.*waiting_ns=\([0-9]*\).*/\1/')
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+[ "$sleeping" -gt "$((running + waiting))" ]
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
new file mode 100644
index 000000000000..0705854f24df
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
@@ -0,0 +1,60 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test uprobe detail waiting (waiting_ns dominates when task is preempted between probes)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+command -v chrt > /dev/null || exit_unsupported
+command -v taskset > /dev/null || exit_unsupported
+
+start_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_preempt_work 2>/dev/null)
+stop_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_preempt_work_done 2>/dev/null)
+[ -n "$start_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+cpu=0
+
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# Register probe before the target starts so the start uprobe fires on the
+# first entry to tlob_preempt_work. Budget: 500 ms.
+echo "p ${UPROBE_TARGET}:${start_offset} ${stop_offset} threshold=500000" > "$TLOB_MONITOR"
+
+# Target starts; start probe fires on tlob_preempt_work entry.
+taskset -c "$cpu" "$UPROBE_TARGET" 5000 preempt &
+busy_pid=$!
+sleep 0.05
+
+# RT hog on the same CPU preempts the target; target stays in waiting state
+# (runnable, off-CPU) until the budget expires -> waiting_ns dominates.
+chrt -f 99 taskset -c "$cpu" sh -c 'while true; do :; done' 2>/dev/null &
+hog_pid=$!
+
+found=0; i=0
+while [ "$i" -lt 30 ]; do
+ sleep 0.1
+ grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+ i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${start_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$hog_pid" 2>/dev/null; wait "$hog_pid" 2>/dev/null || true
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]
+
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+waiting=$(echo "$line" | sed 's/.*waiting_ns=\([0-9]*\).*/\1/')
+[ "$waiting" -gt "$((running + sleeping))" ]
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
new file mode 100644
index 000000000000..c4b8f7108ae9
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
@@ -0,0 +1,60 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test two uprobe bindings on same binary (different offsets fire independently)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+busy_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+busy_stop=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+sleep_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work 2>/dev/null)
+sleep_stop=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work_done 2>/dev/null)
+[ -n "$busy_offset" ] || exit_unsupported
+[ -n "$busy_stop" ] || exit_unsupported
+[ -n "$sleep_offset" ] || exit_unsupported
+[ -n "$sleep_stop" ] || exit_unsupported
+
+"$UPROBE_TARGET" 30000 & # busy mode: tlob_busy_work fires every 200 ms
+busy_pid=$!
+"$UPROBE_TARGET" 30000 sleep & # sleep mode: tlob_sleep_work fires every 200 ms
+sleep_pid=$!
+sleep 0.05
+
+echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# Binding A: 5 s budget on the busy probe - must not fire in 200 ms loops.
+echo "p ${UPROBE_TARGET}:${busy_offset} ${busy_stop} threshold=5000000" > "$TLOB_MONITOR"
+# Binding B: 10 us budget on the sleep probe - fires on first invocation.
+echo "p ${UPROBE_TARGET}:${sleep_offset} ${sleep_stop} threshold=10" > "$TLOB_MONITOR"
+
+# Wait up to 2 s for error_env_tlob from binding B.
+found=0; i=0
+while [ "$i" -lt 20 ]; do
+ sleep 0.1
+ grep -q "error_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+ i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR" 2>/dev/null
+echo "-${UPROBE_TARGET}:${sleep_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$sleep_pid" 2>/dev/null; wait "$sleep_pid" 2>/dev/null || true
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+
+echo 0 > monitors/tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+
+[ "$found" = "1" ]
+# error_env_tlob payload: label and clock variable must be present.
+grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "budget_exceeded"
+grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "clk_elapsed="
+# detail_env_tlob must appear alongside the error.
+grep -q "detail_env_tlob" /sys/kernel/tracing/trace
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
new file mode 100644
index 000000000000..4a74853346e3
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
@@ -0,0 +1,19 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test no spurious error_env_tlob events without an active uprobe binding
+# requires: tlob:monitor
+
+TLOB_MONITOR=monitors/tlob/monitor
+
+echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+sleep 0.5
+
+! grep -q "error_env_tlob" /sys/kernel/tracing/trace
+
+echo 0 > monitors/tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
new file mode 100644
index 000000000000..624fdb950f6b
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
@@ -0,0 +1,60 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test uprobe violation (error_env_tlob and detail_env_tlob fire with correct fields)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+busy_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+stop_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+[ -n "$busy_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 30000 &
+busy_pid=$!
+sleep 0.05
+
+echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# 10 us budget - fires almost immediately; task is busy-spinning on-CPU.
+echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=10" > "$TLOB_MONITOR"
+
+# wait up to 2 s for detail_env_tlob
+found=0; i=0
+while [ "$i" -lt 20 ]; do
+ sleep 0.1
+ grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+ i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]
+
+# error_env_tlob event label must be budget_exceeded
+grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "budget_exceeded"
+
+# detail_env_tlob must have all five fields with the correct threshold
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+echo "$line" | grep -q "pid="
+echo "$line" | grep -q "threshold_us=10"
+echo "$line" | grep -q "running_ns="
+echo "$line" | grep -q "waiting_ns="
+echo "$line" | grep -q "sleeping_ns="
+
+# Busy-spin keeps the task on-CPU: running_ns must exceed sleeping_ns.
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+[ "$running" -gt "$sleeping" ]
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/tlob/Makefile b/tools/testing/selftests/verification/tlob/Makefile
new file mode 100644
index 000000000000..1bedf946cb34
--- /dev/null
+++ b/tools/testing/selftests/verification/tlob/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+# Builds tlob selftest helper binaries.
+#
+# Invoked by ../Makefile; pass OUTDIR to control the output directory
+# and TOOLS_INCLUDES for the in-tree UAPI -isystem flag.
+
+OUTDIR ?= $(CURDIR)/..
+CFLAGS += $(TOOLS_INCLUDES)
+
+.PHONY: all
+all: $(OUTDIR)/tlob_ioctl $(OUTDIR)/tlob_target
+
+$(OUTDIR)/tlob_ioctl: tlob_ioctl.c
+ $(CC) $(CFLAGS) -o $@ $< -lpthread
+
+$(OUTDIR)/tlob_target: tlob_target.c
+ $(CC) $(CFLAGS) -o $@ $<
+
+.PHONY: clean
+clean:
+ $(RM) $(OUTDIR)/tlob_ioctl $(OUTDIR)/tlob_target
diff --git a/tools/testing/selftests/verification/tlob/tlob_ioctl.c b/tools/testing/selftests/verification/tlob/tlob_ioctl.c
new file mode 100644
index 000000000000..abb4e2e80a2c
--- /dev/null
+++ b/tools/testing/selftests/verification/tlob/tlob_ioctl.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob_ioctl.c - ioctl test driver and ELF utility for tlob selftests
+ *
+ * Usage: tlob_ioctl <subcommand> [args...]
+ *
+ * not_enabled - TRACE_START without monitor enabled -> ENODEV
+ * within_budget - sleep within budget -> 0
+ * over_budget_running - busy-spin past budget -> EOVERFLOW
+ * over_budget_sleeping - sleep past budget -> EOVERFLOW
+ * over_budget_waiting - sched_yield into waiting state -> EOVERFLOW
+ * double_start - two starts without stop -> EALREADY
+ * stop_no_start - stop without start -> EINVAL
+ * multi_thread - two fds: thread A within budget, thread B over
+ * bench - TRACE_START/STOP latency (TAP output, always passes)
+ * sym_offset <binary> <symbol> - print ELF file offset of symbol
+ *
+ * Exit: 0 = pass, 1 = fail, 2 = skip (device not available).
+ */
+#define _GNU_SOURCE
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/rv.h>
+
+static int rv_fd = -1;
+
+static int open_rv(void)
+{
+ struct rv_bind_args bind = { .monitor_name = "tlob" };
+
+ rv_fd = open("/dev/rv", O_RDWR);
+ if (rv_fd < 0) {
+ fprintf(stderr, "open /dev/rv: %s\n", strerror(errno));
+ return -1;
+ }
+ if (ioctl(rv_fd, RV_IOCTL_BIND_MONITOR, &bind) < 0) {
+ fprintf(stderr, "bind tlob: %s\n", strerror(errno));
+ close(rv_fd);
+ rv_fd = -1;
+ return -1;
+ }
+ return 0;
+}
+
+static void busy_spin_us(unsigned long us)
+{
+ struct timespec start, now;
+ unsigned long elapsed;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ do {
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ elapsed = (unsigned long)(now.tv_sec - start.tv_sec)
+ * 1000000000UL
+ + (unsigned long)(now.tv_nsec - start.tv_nsec);
+ } while (elapsed < us * 1000UL);
+}
+
+static int trace_start(uint64_t threshold_us)
+{
+ struct tlob_start_args args = {
+ .threshold_us = threshold_us,
+ };
+
+ return ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+}
+
+static int trace_stop(void)
+{
+ return ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+}
+
+/* Synchronous TRACE_START / TRACE_STOP tests */
+
+/* Bind to a disabled monitor must return ENODEV without crashing */
+static int test_not_enabled(void)
+{
+ struct rv_bind_args bind = { .monitor_name = "tlob" };
+ int fd;
+ int ret;
+
+ fd = open("/dev/rv", O_RDWR);
+ if (fd < 0) {
+ fprintf(stderr, "open /dev/rv: %s\n", strerror(errno));
+ return 2; /* skip */
+ }
+
+ ret = ioctl(fd, RV_IOCTL_BIND_MONITOR, &bind);
+ close(fd);
+
+ if (ret == 0) {
+ fprintf(stderr, "RV_IOCTL_BIND_MONITOR: expected ENODEV, got success\n");
+ return 1;
+ }
+ if (errno != ENODEV) {
+ fprintf(stderr, "RV_IOCTL_BIND_MONITOR: expected ENODEV, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+static int test_within_budget(void)
+{
+ int ret;
+
+ /* 50 ms budget */
+ if (trace_start(50000) < 0) {
+ fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ usleep(10000); /* 10 ms */
+ ret = trace_stop();
+ if (ret != 0) {
+ fprintf(stderr, "TRACE_STOP: expected 0, got %d errno=%s\n",
+ ret, strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+static int test_over_budget_running(void)
+{
+ int ret;
+
+ /* 1 ms budget */
+ if (trace_start(1000) < 0) {
+ fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ busy_spin_us(100000); /* 100 ms */
+ ret = trace_stop();
+ if (ret == 0) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n");
+ return 1;
+ }
+ if (errno != EOVERFLOW) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+static int test_over_budget_sleeping(void)
+{
+ int ret;
+
+ /* 3 ms budget */
+ if (trace_start(3000) < 0) {
+ fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ usleep(50000); /* 50 ms; sleeping time counts toward budget */
+ ret = trace_stop();
+ if (ret == 0) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n");
+ return 1;
+ }
+ if (errno != EOVERFLOW) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+static int test_over_budget_waiting(void)
+{
+ int ret;
+
+ /* 1 us budget */
+ if (trace_start(1) < 0) {
+ fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ sched_yield(); /* running -> waiting -> running */
+ busy_spin_us(10); /* 10 us >> 1 us budget; hrtimer fires during spin */
+ ret = trace_stop();
+ if (ret == 0) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n");
+ return 1;
+ }
+ if (errno != EOVERFLOW) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+/* Error-handling tests */
+
+static int test_double_start(void)
+{
+ int ret;
+
+ /* 10 s: large enough the hrtimer won't fire during the test */
+ if (trace_start(10000000ULL) < 0) {
+ fprintf(stderr, "first TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ ret = trace_start(10000000);
+ if (ret == 0) {
+ fprintf(stderr, "second TRACE_START: expected EALREADY, got 0\n");
+ trace_stop();
+ return 1;
+ }
+ if (errno != EALREADY) {
+ fprintf(stderr, "second TRACE_START: expected EALREADY, got %s\n",
+ strerror(errno));
+ trace_stop();
+ return 1;
+ }
+ trace_stop();
+ return 0;
+}
+
+static int test_stop_no_start(void)
+{
+ int ret;
+
+ /* Ensure clean state: ignore error from a stale entry */
+ trace_stop();
+
+ ret = trace_stop();
+ if (ret == 0) {
+ fprintf(stderr, "TRACE_STOP: expected EINVAL, got 0\n");
+ return 1;
+ }
+ if (errno != EINVAL) {
+ fprintf(stderr, "TRACE_STOP: expected EINVAL, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+/* Two threads, each with its own fd: A within budget, B over budget. */
+
+struct mt_thread_args {
+ uint64_t threshold_us;
+ unsigned long workload_us;
+ int busy;
+ int expect_eoverflow;
+ int result;
+};
+
+static void *mt_thread_fn(void *arg)
+{
+ struct mt_thread_args *a = arg;
+ struct tlob_start_args args = { .threshold_us = a->threshold_us };
+ struct rv_bind_args bind = { .monitor_name = "tlob" };
+ int fd;
+ int ret;
+
+ fd = open("/dev/rv", O_RDWR);
+ if (fd < 0) {
+ fprintf(stderr, "thread open /dev/rv: %s\n", strerror(errno));
+ a->result = 1;
+ return NULL;
+ }
+ if (ioctl(fd, RV_IOCTL_BIND_MONITOR, &bind) < 0) {
+ fprintf(stderr, "thread bind tlob: %s\n", strerror(errno));
+ close(fd);
+ a->result = 1;
+ return NULL;
+ }
+
+ ret = ioctl(fd, TLOB_IOCTL_TRACE_START, &args);
+ if (ret < 0) {
+ fprintf(stderr, "thread TRACE_START: %s\n", strerror(errno));
+ close(fd);
+ a->result = 1;
+ return NULL;
+ }
+
+ if (a->busy)
+ busy_spin_us(a->workload_us);
+ else
+ usleep(a->workload_us);
+
+ ret = ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ if (a->expect_eoverflow) {
+ if (ret == 0 || errno != EOVERFLOW) {
+ fprintf(stderr, "thread: expected EOVERFLOW, got ret=%d errno=%s\n",
+ ret, strerror(errno));
+ close(fd);
+ a->result = 1;
+ return NULL;
+ }
+ } else {
+ if (ret != 0) {
+ fprintf(stderr, "thread: expected 0, got ret=%d errno=%s\n",
+ ret, strerror(errno));
+ close(fd);
+ a->result = 1;
+ return NULL;
+ }
+ }
+ close(fd);
+ a->result = 0;
+ return NULL;
+}
+
+static int test_multi_thread(void)
+{
+ pthread_t ta, tb;
+ struct mt_thread_args a = {
+ .threshold_us = 20000, /* 20 ms */
+ .workload_us = 5000, /* 5 ms sleep -> within budget */
+ .busy = 0,
+ .expect_eoverflow = 0,
+ };
+ struct mt_thread_args b = {
+ .threshold_us = 3000, /* 3 ms */
+ .workload_us = 30000, /* 30 ms spin -> over budget */
+ .busy = 1,
+ .expect_eoverflow = 1,
+ };
+
+ pthread_create(&ta, NULL, mt_thread_fn, &a);
+ pthread_create(&tb, NULL, mt_thread_fn, &b);
+ pthread_join(ta, NULL);
+ pthread_join(tb, NULL);
+
+ return (a.result || b.result) ? 1 : 0;
+}
+
+/*
+ * Benchmark TRACE_START, TRACE_STOP, and round-trip ioctls.
+ * Output uses TAP '#' prefix; always returns 0.
+ */
+#define BENCH_WARMUP 32
+#define BENCH_N 1000
+
+static long long timespec_diff_ns(const struct timespec *a,
+ const struct timespec *b)
+{
+ return (long long)(b->tv_sec - a->tv_sec) * 1000000000LL
+ + (b->tv_nsec - a->tv_nsec);
+}
+
+static int test_bench(void)
+{
+ struct tlob_start_args args = {
+ .threshold_us = 10000000ULL, /* 10 s */
+ };
+ struct timespec t0, t1;
+ long long total_start_ns = 0, total_stop_ns = 0, total_rt_ns = 0;
+ int i;
+
+ /* warm up */
+ for (i = 0; i < BENCH_WARMUP; i++) {
+ if (ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args) == 0)
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ }
+
+ /* start only */
+ for (i = 0; i < BENCH_N; i++) {
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ total_start_ns += timespec_diff_ns(&t0, &t1);
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ }
+
+ /* stop only */
+ for (i = 0; i < BENCH_N; i++) {
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ total_stop_ns += timespec_diff_ns(&t0, &t1);
+ }
+
+ /* round-trip */
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ for (i = 0; i < BENCH_N; i++) {
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ }
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ total_rt_ns = timespec_diff_ns(&t0, &t1);
+
+ printf("# start ioctl only: %lld ns/iter (N=%d, includes syscall)\n",
+ total_start_ns / BENCH_N, BENCH_N);
+ printf("# stop ioctl only: %lld ns/iter (N=%d, includes syscall)\n",
+ total_stop_ns / BENCH_N, BENCH_N);
+ printf("# start+stop roundtrip: %lld ns/iter (N=%d, includes 2 syscalls)\n",
+ total_rt_ns / BENCH_N, BENCH_N);
+ return 0;
+}
+
+/*
+ * Print the ELF file offset of <symname> in <binary>. Walks .symtab
+ * (falling back to .dynsym) and converts vaddr to file offset via PT_LOAD.
+ * Supports 32- and 64-bit ELF.
+ */
+static int sym_offset(const char *binary, const char *symname)
+{
+ int fd;
+ struct stat st;
+ void *map;
+ Elf64_Ehdr *ehdr;
+ Elf32_Ehdr *ehdr32;
+ int is64;
+ uint64_t sym_vaddr = 0;
+ int found = 0;
+ uint64_t file_offset = 0;
+
+ fd = open(binary, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "open %s: %s\n", binary, strerror(errno));
+ return 1;
+ }
+ if (fstat(fd, &st) < 0) {
+ close(fd);
+ return 1;
+ }
+ map = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ close(fd);
+ if (map == MAP_FAILED) {
+ fprintf(stderr, "mmap: %s\n", strerror(errno));
+ return 1;
+ }
+
+ ehdr = (Elf64_Ehdr *)map;
+ ehdr32 = (Elf32_Ehdr *)map;
+ if (st.st_size < 4 ||
+ ehdr->e_ident[EI_MAG0] != ELFMAG0 ||
+ ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
+ ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
+ ehdr->e_ident[EI_MAG3] != ELFMAG3) {
+ fprintf(stderr, "%s: not an ELF file\n", binary);
+ munmap(map, (size_t)st.st_size);
+ return 1;
+ }
+ is64 = (ehdr->e_ident[EI_CLASS] == ELFCLASS64);
+
+ if (is64) {
+ Elf64_Shdr *shdrs = (Elf64_Shdr *)((char *)map + ehdr->e_shoff);
+ Elf64_Shdr *shstrtab_hdr = &shdrs[ehdr->e_shstrndx];
+ const char *shstrtab = (char *)map + shstrtab_hdr->sh_offset;
+ int si;
+
+ /* prefer .symtab; fall back to .dynsym */
+ for (int pass = 0; pass < 2 && !found; pass++) {
+ const char *target = pass ? ".dynsym" : ".symtab";
+
+ for (si = 0; si < ehdr->e_shnum && !found; si++) {
+ Elf64_Shdr *sh = &shdrs[si];
+ const char *name = shstrtab + sh->sh_name;
+
+ if (strcmp(name, target) != 0)
+ continue;
+
+ Elf64_Shdr *strtab_sh = &shdrs[sh->sh_link];
+ const char *strtab = (char *)map + strtab_sh->sh_offset;
+ Elf64_Sym *syms = (Elf64_Sym *)((char *)map + sh->sh_offset);
+ uint64_t nsyms = sh->sh_size / sizeof(Elf64_Sym);
+ uint64_t j;
+
+ for (j = 0; j < nsyms; j++) {
+ if (strcmp(strtab + syms[j].st_name, symname) == 0) {
+ sym_vaddr = syms[j].st_value;
+ found = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!found) {
+ fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary);
+ munmap(map, (size_t)st.st_size);
+ return 1;
+ }
+
+ /* Convert vaddr to file offset via PT_LOAD segments */
+ Elf64_Phdr *phdrs = (Elf64_Phdr *)((char *)map + ehdr->e_phoff);
+ int pi;
+
+ for (pi = 0; pi < ehdr->e_phnum; pi++) {
+ Elf64_Phdr *ph = &phdrs[pi];
+
+ if (ph->p_type != PT_LOAD)
+ continue;
+ if (sym_vaddr >= ph->p_vaddr &&
+ sym_vaddr < ph->p_vaddr + ph->p_filesz) {
+ file_offset = sym_vaddr - ph->p_vaddr + ph->p_offset;
+ break;
+ }
+ }
+ } else {
+ /* 32-bit ELF */
+ Elf32_Shdr *shdrs = (Elf32_Shdr *)((char *)map + ehdr32->e_shoff);
+ Elf32_Shdr *shstrtab_hdr = &shdrs[ehdr32->e_shstrndx];
+ const char *shstrtab = (char *)map + shstrtab_hdr->sh_offset;
+ int si;
+ uint32_t sym_vaddr32 = 0;
+
+ for (int pass = 0; pass < 2 && !found; pass++) {
+ const char *target = pass ? ".dynsym" : ".symtab";
+
+ for (si = 0; si < ehdr32->e_shnum && !found; si++) {
+ Elf32_Shdr *sh = &shdrs[si];
+ const char *name = shstrtab + sh->sh_name;
+
+ if (strcmp(name, target) != 0)
+ continue;
+
+ Elf32_Shdr *strtab_sh = &shdrs[sh->sh_link];
+ const char *strtab = (char *)map + strtab_sh->sh_offset;
+ Elf32_Sym *syms = (Elf32_Sym *)((char *)map + sh->sh_offset);
+ uint32_t nsyms = sh->sh_size / sizeof(Elf32_Sym);
+ uint32_t j;
+
+ for (j = 0; j < nsyms; j++) {
+ if (strcmp(strtab + syms[j].st_name, symname) == 0) {
+ sym_vaddr32 = syms[j].st_value;
+ found = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!found) {
+ fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary);
+ munmap(map, (size_t)st.st_size);
+ return 1;
+ }
+
+ Elf32_Phdr *phdrs = (Elf32_Phdr *)((char *)map + ehdr32->e_phoff);
+ int pi;
+
+ for (pi = 0; pi < ehdr32->e_phnum; pi++) {
+ Elf32_Phdr *ph = &phdrs[pi];
+
+ if (ph->p_type != PT_LOAD)
+ continue;
+ if (sym_vaddr32 >= ph->p_vaddr &&
+ sym_vaddr32 < ph->p_vaddr + ph->p_filesz) {
+ file_offset = sym_vaddr32 - ph->p_vaddr + ph->p_offset;
+ break;
+ }
+ }
+ sym_vaddr = sym_vaddr32;
+ }
+
+ munmap(map, (size_t)st.st_size);
+
+ if (!file_offset && sym_vaddr) {
+ fprintf(stderr, "could not map vaddr 0x%lx to file offset\n",
+ (unsigned long)sym_vaddr);
+ return 1;
+ }
+
+ printf("0x%lx\n", (unsigned long)file_offset);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int rc;
+
+ if (argc < 2) {
+ fprintf(stderr, "Usage: %s <subcommand> [args...]\n", argv[0]);
+ return 1;
+ }
+
+ /* sym_offset does not need /dev/rv */
+ if (strcmp(argv[1], "sym_offset") == 0) {
+ if (argc < 4) {
+ fprintf(stderr, "Usage: %s sym_offset <binary> <symbol>\n",
+ argv[0]);
+ return 1;
+ }
+ return sym_offset(argv[2], argv[3]);
+ }
+
+ /* not_enabled: monitor is disabled; bind must return ENODEV without open_rv() */
+ if (strcmp(argv[1], "not_enabled") == 0)
+ return test_not_enabled();
+
+ if (open_rv() < 0)
+ return 2; /* skip */
+
+ if (strcmp(argv[1], "bench") == 0)
+ rc = test_bench();
+ else if (strcmp(argv[1], "within_budget") == 0)
+ rc = test_within_budget();
+ else if (strcmp(argv[1], "over_budget_running") == 0)
+ rc = test_over_budget_running();
+ else if (strcmp(argv[1], "over_budget_sleeping") == 0)
+ rc = test_over_budget_sleeping();
+ else if (strcmp(argv[1], "over_budget_waiting") == 0)
+ rc = test_over_budget_waiting();
+ else if (strcmp(argv[1], "double_start") == 0)
+ rc = test_double_start();
+ else if (strcmp(argv[1], "stop_no_start") == 0)
+ rc = test_stop_no_start();
+ else if (strcmp(argv[1], "multi_thread") == 0)
+ rc = test_multi_thread();
+ else {
+ fprintf(stderr, "Unknown test: %s\n", argv[1]);
+ rc = 1;
+ }
+
+ close(rv_fd);
+ return rc;
+}
diff --git a/tools/testing/selftests/verification/tlob/tlob_target.c b/tools/testing/selftests/verification/tlob/tlob_target.c
new file mode 100644
index 000000000000..0fdbc575d71d
--- /dev/null
+++ b/tools/testing/selftests/verification/tlob/tlob_target.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob_target.c - uprobe target binary for tlob selftests.
+ *
+ * Provides three start/stop probe pairs, each designed to exercise a
+ * different dominant component of the detail_env_tlob ns breakdown:
+ *
+ * tlob_busy_work / tlob_busy_work_done - busy-spin: running_ns dominates
+ * tlob_sleep_work / tlob_sleep_work_done - nanosleep: sleeping_ns dominates
+ * tlob_preempt_work / tlob_preempt_work_done - busy-spin: waiting_ns dominates
+ * (needs an RT competitor on the same CPU)
+ *
+ * Usage: tlob_target <duration_ms> [mode]
+ *
+ * mode is one of: busy (default), sleep, preempt.
+ * Loops in 200 ms iterations until <duration_ms> has elapsed
+ * (0 = run for ~24 hours).
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+
+/* Return nonzero if *a is strictly earlier than *b. */
+static inline int timespec_before(const struct timespec *a,
+				  const struct timespec *b)
+{
+	return a->tv_sec < b->tv_sec ||
+	       (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
+}
+
+/*
+ * Advance *ts by ms milliseconds, keeping tv_nsec normalized to [0, 1e9).
+ * A single carry is sufficient: ms % 1000 contributes less than 1e9 ns
+ * and a normalized incoming tv_nsec is already below 1e9.
+ */
+static void timespec_add_ms(struct timespec *ts, unsigned long ms)
+{
+	ts->tv_sec += ms / 1000;
+	ts->tv_nsec += (long)(ms % 1000) * 1000000L;
+	if (ts->tv_nsec >= 1000000000L) {
+		ts->tv_sec++;
+		ts->tv_nsec -= 1000000000L;
+	}
+}
+
+/* stop probe; noinline keeps the entry point visible to uprobes */
+noinline void tlob_busy_work_done(void)
+{
+	/* intentionally empty: the uprobe fires on function entry */
+}
+
+/* start probe; busy-spin for duration_ns so running_ns dominates */
+noinline void tlob_busy_work(unsigned long duration_ns)
+{
+	struct timespec start, now;
+	uint64_t elapsed;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	do {
+		clock_gettime(CLOCK_MONOTONIC, &now);
+		/*
+		 * Use 64-bit math: on ILP32 targets unsigned long is 32 bits
+		 * and sec_diff * 1e9 overflows after ~4.29 s of spinning.
+		 * A negative tv_nsec difference wraps and cancels correctly
+		 * in modular uint64_t arithmetic.
+		 */
+		elapsed = (uint64_t)(now.tv_sec - start.tv_sec) * 1000000000ULL
+			+ (uint64_t)(now.tv_nsec - start.tv_nsec);
+	} while (elapsed < duration_ns);
+
+	tlob_busy_work_done();
+}
+
+/* stop probe; noinline keeps the entry point visible to uprobes */
+noinline void tlob_sleep_work_done(void)
+{
+	/* intentionally empty: the uprobe fires on function entry */
+}
+
+/* start probe; sleep for duration_ms so sleeping_ns dominates */
+noinline void tlob_sleep_work(unsigned long duration_ms)
+{
+	struct timespec ts = {
+		.tv_sec = duration_ms / 1000,
+		.tv_nsec = (long)(duration_ms % 1000) * 1000000L,
+	};
+
+	/*
+	 * Retry on signal interruption so the full interval elapses;
+	 * nanosleep() stores the remaining time back into ts, so the
+	 * sleeping_ns measurement is not cut short by stray signals.
+	 */
+	while (nanosleep(&ts, &ts) == -1 && errno == EINTR)
+		;
+	tlob_sleep_work_done();
+}
+
+/* stop probe; noinline keeps the entry point visible to uprobes */
+noinline void tlob_preempt_work_done(void)
+{
+	/* intentionally empty: the uprobe fires on function entry */
+}
+
+/*
+ * start probe; busy-spin so an RT competitor on the same CPU drives
+ * waiting_ns (prev_state==0 -> preempt event, task stays runnable off-CPU).
+ */
+noinline void tlob_preempt_work(unsigned long duration_ms)
+{
+	struct timespec start, now;
+	uint64_t elapsed;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	do {
+		clock_gettime(CLOCK_MONOTONIC, &now);
+		/*
+		 * Use 64-bit math: on ILP32 targets unsigned long is 32 bits,
+		 * so sec_diff * 1e9 overflows after ~4.29 s and the
+		 * duration_ms -> ns conversion overflows for large inputs.
+		 */
+		elapsed = (uint64_t)(now.tv_sec - start.tv_sec) * 1000000000ULL
+			+ (uint64_t)(now.tv_nsec - start.tv_nsec);
+	} while (elapsed < (uint64_t)duration_ms * 1000000ULL);
+
+	tlob_preempt_work_done();
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long run_ms = 0;
+	const char *mode = "busy";
+	struct timespec stop_at, cur;
+
+	if (argc > 1)
+		run_ms = strtoul(argv[1], NULL, 10);
+	if (argc > 2)
+		mode = argv[2];
+
+	/* 0 means "run for ~24 hours" */
+	if (!run_ms)
+		run_ms = 86400000UL;
+
+	clock_gettime(CLOCK_MONOTONIC, &stop_at);
+	timespec_add_ms(&stop_at, run_ms);
+
+	/* loop in 200 ms work iterations until the deadline passes */
+	for (;;) {
+		if (!strcmp(mode, "sleep"))
+			tlob_sleep_work(200);
+		else if (!strcmp(mode, "preempt"))
+			tlob_preempt_work(200);
+		else
+			tlob_busy_work(200 * 1000000UL);
+		clock_gettime(CLOCK_MONOTONIC, &cur);
+		if (!timespec_before(&cur, &stop_at))
+			break;
+	}
+
+	return 0;
+}
--
2.25.1
^ permalink raw reply related [flat|nested] 11+ messages in thread