Linux Trace Kernel
 help / color / mirror / Atom feed
* Re: [PATCH 04/61] ext4: Prefer IS_ERR_OR_NULL over manual NULL check
From: Theodore Ts'o @ 2026-04-10 15:18 UTC (permalink / raw)
  To: amd-gfx, apparmor, bpf, ceph-devel, cocci, dm-devel, dri-devel,
	gfs2, intel-gfx, intel-wired-lan, iommu, kvm, linux-arm-kernel,
	linux-block, linux-bluetooth, linux-btrfs, linux-cifs, linux-clk,
	linux-erofs, linux-ext4, linux-fsdevel, linux-gpio, linux-hyperv,
	linux-input, linux-kernel, linux-leds, linux-media, linux-mips,
	linux-mm, linux-modules, linux-mtd, linux-nfs, linux-omap,
	linux-phy, linux-pm, linux-rockchip, linux-s390, linux-scsi,
	linux-sctp, linux-security-module, linux-sh, linux-sound,
	linux-stm32, linux-trace-kernel, linux-usb, linux-wireless,
	netdev, ntfs3, samba-technical, sched-ext, target-devel,
	tipc-discussion, v9fs, Philipp Hahn
  Cc: Theodore Ts'o, Andreas Dilger
In-Reply-To: <20260310-b4-is_err_or_null-v1-4-bd63b656022d@avm.de>


On Tue, 10 Mar 2026 12:48:30 +0100, Philipp Hahn wrote:
> Prefer using IS_ERR_OR_NULL() over using IS_ERR() and a manual NULL
> check.
> 
> Change generated with coccinelle.

Applied, thanks!

[04/61] ext4: Prefer IS_ERR_OR_NULL over manual NULL check
        commit: 1d749e110277ce4103f27bd60d6181e52c0cc1e3

Best regards,
-- 
Theodore Ts'o <tytso@mit.edu>

^ permalink raw reply

* [PATCH v4 0/3] tracing/fprobe: Fix fprobe_ip_table related bugs
From: Masami Hiramatsu (Google) @ 2026-04-10 17:11 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel

Here are patches to fix bugs in fprobe.

The previous version is here.

https://lore.kernel.org/all/177581370903.617881.3002655215679528157.stgit@mhiramat.tok.corp.google.com/

In this version, I fixed some issues on the previous version.

Patch 1/3 updates:
      - Remove short-cut case because we always need to update ftrace_ops.
      - Use guard(mutex) in register_fprobe_ips() to unlock it correctly.
      - Remove redundant !ret check in register_fprobe_ips().
      - Do not set hlist_array->size in the failure case; instead,
        hlist_array->array[i].fp is set only when the insertion succeeds.
Patch 2/3 updates:
      - fix a build error typo in case of CONFIG_DYNAMIC_FTRACE=n.

Thank you!


---

Masami Hiramatsu (Google) (3):
      tracing/fprobe: Remove fprobe from hash in failure path
      tracing/fprobe: Avoid kcalloc() in rcu_read_lock section
      tracing/fprobe: Check the same type fprobe on table as the unregistered one


 kernel/trace/fprobe.c |  236 ++++++++++++++++++++++++++++---------------------
 1 file changed, 135 insertions(+), 101 deletions(-)

--
Signature

^ permalink raw reply

* [PATCH v4 1/3] tracing/fprobe: Remove fprobe from hash in failure path
From: Masami Hiramatsu (Google) @ 2026-04-10 17:11 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177584108931.388483.11311214679686745474.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

When register_fprobe_ips() fails, it tries to remove a list of
fprobe_hash_node from fprobe_ip_table, but it fails to remove the
fprobe itself from fprobe_table. Moreover, when removing
a fprobe_hash_node which has already been added to the rhltable, it
must use kfree_rcu() after removing it from the rhltable.

To fix these issues, this reuses the unregister_fprobe() internal
code to roll back the half-way registered fprobe.

Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v4:
  - Remove short-cut case because we always need to update ftrace_ops.
  - Use guard(mutex) in register_fprobe_ips() to unlock it correctly.
  - Remove redundant !ret check in register_fprobe_ips().
  - Do not set hlist_array->size in the failure case; instead,
    hlist_array->array[i].fp is set only when the insertion succeeds.
  Changes in v3:
  - Newly added.
---
 kernel/trace/fprobe.c |   95 +++++++++++++++++++++++++------------------------
 1 file changed, 48 insertions(+), 47 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index dcadf1d23b8a..a7c0d5f9016b 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -4,6 +4,7 @@
  */
 #define pr_fmt(fmt) "fprobe: " fmt
 
+#include <linux/cleanup.h>
 #include <linux/err.h>
 #include <linux/fprobe.h>
 #include <linux/kallsyms.h>
@@ -78,20 +79,27 @@ static const struct rhashtable_params fprobe_rht_params = {
 };
 
 /* Node insertion and deletion requires the fprobe_mutex */
-static int insert_fprobe_node(struct fprobe_hlist_node *node)
+static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
 {
+	int ret;
+
 	lockdep_assert_held(&fprobe_mutex);
 
-	return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
+	ret = rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
+	/* Set the fprobe pointer if insertion was successful. */
+	if (!ret)
+		WRITE_ONCE(node->fp, fp);
+	return ret;
 }
 
 /* Return true if there are synonims */
 static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 {
-	lockdep_assert_held(&fprobe_mutex);
 	bool ret;
 
-	/* Avoid double deleting */
+	lockdep_assert_held(&fprobe_mutex);
+
+	/* Avoid double deleting and non-inserted nodes */
 	if (READ_ONCE(node->fp) != NULL) {
 		WRITE_ONCE(node->fp, NULL);
 		rhltable_remove(&fprobe_ip_table, &node->hlist,
@@ -759,7 +767,6 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
 	fp->hlist_array = hlist_array;
 	hlist_array->fp = fp;
 	for (i = 0; i < num; i++) {
-		hlist_array->array[i].fp = fp;
 		addr = ftrace_location(addrs[i]);
 		if (!addr) {
 			fprobe_fail_cleanup(fp);
@@ -823,6 +830,8 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter
 }
 EXPORT_SYMBOL_GPL(register_fprobe);
 
+static int unregister_fprobe_nolock(struct fprobe *fp);
+
 /**
  * register_fprobe_ips() - Register fprobe to ftrace by address.
  * @fp: A fprobe data structure to be registered.
@@ -845,31 +854,26 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 	if (ret)
 		return ret;
 
-	mutex_lock(&fprobe_mutex);
+	guard(mutex)(&fprobe_mutex);
 
-	hlist_array = fp->hlist_array;
 	if (fprobe_is_ftrace(fp))
 		ret = fprobe_ftrace_add_ips(addrs, num);
 	else
 		ret = fprobe_graph_add_ips(addrs, num);
+	if (ret) {
+		fprobe_fail_cleanup(fp);
+		return ret;
+	}
 
-	if (!ret) {
-		add_fprobe_hash(fp);
-		for (i = 0; i < hlist_array->size; i++) {
-			ret = insert_fprobe_node(&hlist_array->array[i]);
-			if (ret)
-				break;
-		}
-		/* fallback on insert error */
+	hlist_array = fp->hlist_array;
+	add_fprobe_hash(fp);
+	for (i = 0; i < hlist_array->size; i++) {
+		ret = insert_fprobe_node(&hlist_array->array[i], fp);
 		if (ret) {
-			for (i--; i >= 0; i--)
-				delete_fprobe_node(&hlist_array->array[i]);
+			unregister_fprobe_nolock(fp);
+			break;
 		}
 	}
-	mutex_unlock(&fprobe_mutex);
-
-	if (ret)
-		fprobe_fail_cleanup(fp);
 
 	return ret;
 }
@@ -913,32 +917,15 @@ bool fprobe_is_registered(struct fprobe *fp)
 	return true;
 }
 
-/**
- * unregister_fprobe() - Unregister fprobe.
- * @fp: A fprobe data structure to be unregistered.
- *
- * Unregister fprobe (and remove ftrace hooks from the function entries).
- *
- * Return 0 if @fp is unregistered successfully, -errno if not.
- */
-int unregister_fprobe(struct fprobe *fp)
+static int unregister_fprobe_nolock(struct fprobe *fp)
 {
-	struct fprobe_hlist *hlist_array;
+	struct fprobe_hlist *hlist_array = fp->hlist_array;
 	unsigned long *addrs = NULL;
-	int ret = 0, i, count;
+	int i, count;
 
-	mutex_lock(&fprobe_mutex);
-	if (!fp || !is_fprobe_still_exist(fp)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	hlist_array = fp->hlist_array;
 	addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
-	if (!addrs) {
-		ret = -ENOMEM;	/* TODO: Fallback to one-by-one loop */
-		goto out;
-	}
+	if (!addrs)
+		return -ENOMEM;	/* TODO: Fallback to one-by-one loop */
 
 	/* Remove non-synonim ips from table and hash */
 	count = 0;
@@ -955,12 +942,26 @@ int unregister_fprobe(struct fprobe *fp)
 
 	kfree_rcu(hlist_array, rcu);
 	fp->hlist_array = NULL;
+	kfree(addrs);
 
-out:
-	mutex_unlock(&fprobe_mutex);
+	return 0;
+}
 
-	kfree(addrs);
-	return ret;
+/**
+ * unregister_fprobe() - Unregister fprobe.
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+	guard(mutex)(&fprobe_mutex);
+	if (!fp || !is_fprobe_still_exist(fp))
+		return -EINVAL;
+
+	return unregister_fprobe_nolock(fp);
 }
 EXPORT_SYMBOL_GPL(unregister_fprobe);
 


^ permalink raw reply related

* [PATCH v4 2/3] tracing/fprobe: Avoid kcalloc() in rcu_read_lock section
From: Masami Hiramatsu (Google) @ 2026-04-10 17:11 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177584108931.388483.11311214679686745474.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

fprobe_remove_node_in_module() is called under the RCU read lock, but
it invokes kcalloc() if there are more than 8 fprobes installed
on the module. Sashiko warns about this because kcalloc() can sleep [1].

 [1] https://sashiko.dev/#/patchset/177552432201.853249.5125045538812833325.stgit%40mhiramat.tok.corp.google.com

To fix this issue, expand the batch size to 128 and do not expand
the fprobe_addr_list, but just cancel walking on fprobe_ip_table,
update fgraph/ftrace_ops and retry the loop again.

Fixes: 0de4c70d04a4 ("tracing: fprobe: use rhltable for fprobe_ip_table")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v4:
  - fix a build error typo in case of CONFIG_DYNAMIC_FTRACE=n.
 Changes in v3:
  - Retry inside rhltable_walk_enter/exit().
  - Rename fprobe_set_ips() to fprobe_remove_ips().
  - Rename 'retry' label to 'again'.
---
 kernel/trace/fprobe.c |   75 ++++++++++++++++++++-----------------------------
 1 file changed, 30 insertions(+), 45 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index a7c0d5f9016b..799332f865f8 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -346,11 +346,10 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 }
 
 #ifdef CONFIG_MODULES
-static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
-			   int reset)
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
-	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
-	ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset);
+	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
+	ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0);
 }
 #endif
 #else
@@ -369,10 +368,9 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 }
 
 #ifdef CONFIG_MODULES
-static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
-			   int reset)
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
-	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
+	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
 }
 #endif
 #endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */
@@ -546,7 +544,7 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
 
 #ifdef CONFIG_MODULES
 
-#define FPROBE_IPS_BATCH_INIT 8
+#define FPROBE_IPS_BATCH_INIT 128
 /* instruction pointer address list */
 struct fprobe_addr_list {
 	int index;
@@ -554,45 +552,21 @@ struct fprobe_addr_list {
 	unsigned long *addrs;
 };
 
-static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr)
+static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
+					 struct fprobe_addr_list *alist)
 {
-	unsigned long *addrs;
-
-	/* Previously we failed to expand the list. */
-	if (alist->index == alist->size)
-		return -ENOSPC;
-
-	alist->addrs[alist->index++] = addr;
-	if (alist->index < alist->size)
+	if (!within_module(node->addr, mod))
 		return 0;
 
-	/* Expand the address list */
-	addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL);
-	if (!addrs)
-		return -ENOMEM;
-
-	memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs));
-	alist->size *= 2;
-	kfree(alist->addrs);
-	alist->addrs = addrs;
+	if (delete_fprobe_node(node))
+		return 0;
 
+	alist->addrs[alist->index++] = node->addr;
+	if (alist->index == alist->size)
+		return -ENOSPC;
 	return 0;
 }
 
-static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
-					 struct fprobe_addr_list *alist)
-{
-	if (!within_module(node->addr, mod))
-		return;
-	if (delete_fprobe_node(node))
-		return;
-	/*
-	 * If failed to update alist, just continue to update hlist.
-	 * Therefore, at list user handler will not hit anymore.
-	 */
-	fprobe_addr_list_add(alist, node->addr);
-}
-
 /* Handle module unloading to manage fprobe_ip_table. */
 static int fprobe_module_callback(struct notifier_block *nb,
 				  unsigned long val, void *data)
@@ -601,6 +575,7 @@ static int fprobe_module_callback(struct notifier_block *nb,
 	struct fprobe_hlist_node *node;
 	struct rhashtable_iter iter;
 	struct module *mod = data;
+	bool retry;
 
 	if (val != MODULE_STATE_GOING)
 		return NOTIFY_DONE;
@@ -612,18 +587,28 @@ static int fprobe_module_callback(struct notifier_block *nb,
 
 	mutex_lock(&fprobe_mutex);
 	rhltable_walk_enter(&fprobe_ip_table, &iter);
+again:
+	retry = false;
+	alist.index = 0;
 	do {
 		rhashtable_walk_start(&iter);
 
 		while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
-			fprobe_remove_node_in_module(mod, node, &alist);
+			if (fprobe_remove_node_in_module(mod, node, &alist) < 0) {
+				retry = true;
+				break;
+			}
 
 		rhashtable_walk_stop(&iter);
-	} while (node == ERR_PTR(-EAGAIN));
-	rhashtable_walk_exit(&iter);
+	} while (node == ERR_PTR(-EAGAIN) && !retry);
+	/* Remove any ips from hash table(s) */
+	if (alist.index > 0) {
+		fprobe_remove_ips(alist.addrs, alist.index);
+		if (retry)
+			goto again;
+	}
 
-	if (alist.index > 0)
-		fprobe_set_ips(alist.addrs, alist.index, 1, 0);
+	rhashtable_walk_exit(&iter);
 	mutex_unlock(&fprobe_mutex);
 
 	kfree(alist.addrs);


^ permalink raw reply related

* [PATCH v4 3/3] tracing/fprobe: Check the same type fprobe on table as the unregistered one
From: Masami Hiramatsu (Google) @ 2026-04-10 17:11 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177584108931.388483.11311214679686745474.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Commit 2c67dc457bc6 ("tracing: fprobe: optimization for entry only case")
introduced a different ftrace_ops for entry-only fprobes.

However, when unregistering an fprobe, the kernel only checks if another
fprobe exists at the same address, without checking which type of fprobe
it is.
If different fprobes are registered at the same address, the same address
will be registered in both fgraph_ops and ftrace_ops, but only one of
them will be deleted when unregistering. (the one removed first will not
be deleted from the ops).

This results in junk entries remaining in either fgraph_ops or ftrace_ops.
For example:
 =======
 cd /sys/kernel/tracing

 # 'Add entry and exit events on the same place'
 echo 'f:event1 vfs_read' >> dynamic_events
 echo 'f:event2 vfs_read%return' >> dynamic_events

 # 'Enable both of them'
 echo 1 > events/fprobes/enable
 cat enabled_functions
vfs_read (2)            ->arch_ftrace_ops_list_func+0x0/0x210

 # 'Disable and remove exit event'
 echo 0 > events/fprobes/event2/enable
 echo -:event2 >> dynamic_events

 # 'Disable and remove all events'
 echo 0 > events/fprobes/enable
 echo > dynamic_events

 # 'Add another event'
 echo 'f:event3 vfs_open%return' > dynamic_events
 cat dynamic_events
f:fprobes/event3 vfs_open%return

 echo 1 > events/fprobes/enable
 cat enabled_functions
vfs_open (1)            tramp: 0xffffffffa0001000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60    subops: {ent:fprobe_fgraph_entry+0x0/0x620 ret:fprobe_return+0x0/0x150}
vfs_read (1)            tramp: 0xffffffffa0001000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60    subops: {ent:fprobe_fgraph_entry+0x0/0x620 ret:fprobe_return+0x0/0x150}
 =======

As you can see, an entry for the vfs_read remains.

To fix this issue, when unregistering, the kernel should also check
whether fprobes of the same type still exist at the same address, and
if not, delete its entry from either fgraph_ops or ftrace_ops.

Fixes: 2c67dc457bc6 ("tracing: fprobe: optimization for entry only case")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/fprobe.c |   82 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 65 insertions(+), 17 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 799332f865f8..2cac2252f78f 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -92,11 +92,8 @@ static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
 	return ret;
 }
 
-/* Return true if there are synonims */
-static bool delete_fprobe_node(struct fprobe_hlist_node *node)
+static void delete_fprobe_node(struct fprobe_hlist_node *node)
 {
-	bool ret;
-
 	lockdep_assert_held(&fprobe_mutex);
 
 	/* Avoid double deleting and non-inserted nodes */
@@ -105,13 +102,6 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 		rhltable_remove(&fprobe_ip_table, &node->hlist,
 				fprobe_rht_params);
 	}
-
-	rcu_read_lock();
-	ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr,
-				fprobe_rht_params);
-	rcu_read_unlock();
-
-	return ret;
 }
 
 /* Check existence of the fprobe */
@@ -345,6 +335,32 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 	return !fp->exit_handler;
 }
 
+static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace)
+{
+	struct rhlist_head *head, *pos;
+	struct fprobe_hlist_node *node;
+	struct fprobe *fp;
+
+	guard(rcu)();
+	head = rhltable_lookup(&fprobe_ip_table, &ip,
+				fprobe_rht_params);
+	if (!head)
+		return false;
+	/* We have to check the same type on the list. */
+	rhl_for_each_entry_rcu(node, pos, head, hlist) {
+		if (node->addr != ip)
+			break;
+		fp = READ_ONCE(node->fp);
+		if (likely(fp)) {
+			if ((!ftrace && fp->exit_handler) ||
+			    (ftrace && !fp->exit_handler))
+				return true;
+		}
+	}
+
+	return false;
+}
+
 #ifdef CONFIG_MODULES
 static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
@@ -367,6 +383,29 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 	return false;
 }
 
+static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace __maybe_unused)
+{
+	struct rhlist_head *head, *pos;
+	struct fprobe_hlist_node *node;
+	struct fprobe *fp;
+
+	guard(rcu)();
+	head = rhltable_lookup(&fprobe_ip_table, &ip,
+				fprobe_rht_params);
+	if (!head)
+		return false;
+	/* We only need to check fp is there. */
+	rhl_for_each_entry_rcu(node, pos, head, hlist) {
+		if (node->addr != ip)
+			break;
+		fp = READ_ONCE(node->fp);
+		if (likely(fp))
+			return true;
+	}
+
+	return false;
+}
+
 #ifdef CONFIG_MODULES
 static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
@@ -555,15 +594,22 @@ struct fprobe_addr_list {
 static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
 					 struct fprobe_addr_list *alist)
 {
+	lockdep_assert_in_rcu_read_lock();
+
 	if (!within_module(node->addr, mod))
 		return 0;
 
-	if (delete_fprobe_node(node))
-		return 0;
+	delete_fprobe_node(node);
+	/*
+	 * Don't care the type here, because all fprobes on the same
+	 * address must be removed eventually.
+	 */
+	if (!rhltable_lookup(&fprobe_ip_table, &node->addr, fprobe_rht_params)) {
+		alist->addrs[alist->index++] = node->addr;
+		if (alist->index == alist->size)
+			return -ENOSPC;
+	}
 
-	alist->addrs[alist->index++] = node->addr;
-	if (alist->index == alist->size)
-		return -ENOSPC;
 	return 0;
 }
 
@@ -915,7 +961,9 @@ static int unregister_fprobe_nolock(struct fprobe *fp)
 	/* Remove non-synonim ips from table and hash */
 	count = 0;
 	for (i = 0; i < hlist_array->size; i++) {
-		if (!delete_fprobe_node(&hlist_array->array[i]))
+		delete_fprobe_node(&hlist_array->array[i]);
+		if (!fprobe_exists_on_hash(hlist_array->array[i].addr,
+					   fprobe_is_ftrace(fp)))
 			addrs[count++] = hlist_array->array[i].addr;
 	}
 	del_fprobe_hash(fp);


^ permalink raw reply related

* Re: [PATCH v3 09/11] dt-bindings: input: Document hid-over-spi DT schema
From: Conor Dooley @ 2026-04-10 17:35 UTC (permalink / raw)
  To: Dmitry Torokhov
  Cc: Jingyuan Liang, Jiri Kosina, Benjamin Tissoires, Jonathan Corbet,
	Mark Brown, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, linux-input,
	linux-doc, linux-kernel, linux-spi, linux-trace-kernel,
	devicetree, hbarnor, tfiga, Dmitry Antipov, Jarrett Schultz
In-Reply-To: <adfdkwq_bF9dirAq@google.com>

[-- Attachment #1: Type: text/plain, Size: 6021 bytes --]

On Thu, Apr 09, 2026 at 10:16:46AM -0700, Dmitry Torokhov wrote:
> On Thu, Apr 09, 2026 at 05:02:11PM +0100, Conor Dooley wrote:
> > On Thu, Apr 02, 2026 at 01:59:46AM +0000, Jingyuan Liang wrote:
> > > Documentation describes the required and optional properties for
> > > implementing Device Tree for a Microsoft G6 Touch Digitizer that
> > > supports HID over SPI Protocol 1.0 specification.
> > > 
> > > The properties are common to HID over SPI.
> > > 
> > > Signed-off-by: Dmitry Antipov <dmanti@microsoft.com>
> > > Signed-off-by: Jarrett Schultz <jaschultz@microsoft.com>
> > > Signed-off-by: Jingyuan Liang <jingyliang@chromium.org>
> > > ---
> > >  .../devicetree/bindings/input/hid-over-spi.yaml    | 126 +++++++++++++++++++++
> > >  1 file changed, 126 insertions(+)
> > > 
> > > diff --git a/Documentation/devicetree/bindings/input/hid-over-spi.yaml b/Documentation/devicetree/bindings/input/hid-over-spi.yaml
> > > new file mode 100644
> > > index 000000000000..d1b0a2e26c32
> > > --- /dev/null
> > > +++ b/Documentation/devicetree/bindings/input/hid-over-spi.yaml
> > > @@ -0,0 +1,126 @@
> > > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> > > +%YAML 1.2
> > > +---
> > > +$id: http://devicetree.org/schemas/input/hid-over-spi.yaml#
> > > +$schema: http://devicetree.org/meta-schemas/core.yaml#
> > > +
> > > +title: HID over SPI Devices
> > > +
> > > +maintainers:
> > > +  - Benjamin Tissoires <benjamin.tissoires@redhat.com>
> > > +  - Jiri Kosina <jkosina@suse.cz>
> > 
> > Why them and not you, the developers of the series?
> > 
> > > +
> > > +description: |+
> > > +  HID over SPI provides support for various Human Interface Devices over the
> > > +  SPI bus. These devices can be for example touchpads, keyboards, touch screens
> > > +  or sensors.
> > > +
> > > +  The specification has been written by Microsoft and is currently available
> > > +  here: https://www.microsoft.com/en-us/download/details.aspx?id=103325
> > > +
> > > +  If this binding is used, the kernel module spi-hid will handle the
> > > +  communication with the device and the generic hid core layer will handle the
> > > +  protocol.
> > 
> > This is not relevant to the binding, please remove it.
> > 
> > > +
> > > +allOf:
> > > +  - $ref: /schemas/input/touchscreen/touchscreen.yaml#
> > > +
> > > +properties:
> > > +  compatible:
> > > +    oneOf:
> > > +      - items:
> > > +          - enum:
> > > +              - microsoft,g6-touch-digitizer
> > > +          - const: hid-over-spi
> > > +      - description: Just "hid-over-spi" alone is allowed, but not recommended.
> > > +        const: hid-over-spi
> > 
> > Why is it allowed but not recommended? Seems to me like we should
> > require device-specific compatibles.
> 
> Why would we want to change the driver code to add a new compatible each
> time a vendor decides to create a chip that is fully hid-spi-protocol
> compliant? Or is the plan to still allow "hid-over-spi" fallback but
> require device-specific compatible that will be ignored unless there is
> device-specific quirk needed?

This has nothing to do with the driver, just the oddity of having a
comment saying that not having a device specific compatible was
permitted but not recommended in a binding. Requiring device-specific
compatibles is the norm after all and a comment like this draws
more attention to the fact that this is abnormal. Regardless of what the
driver does, device-specific compatibles should be required.

> > > +
> > > +  reg:
> > > +    maxItems: 1
> > > +
> > > +  interrupts:
> > > +    maxItems: 1
> > > +
> > > +  reset-gpios:
> > > +    maxItems: 1
> > > +    description:
> > > +      GPIO specifier for the digitizer's reset pin (active low). The line must
> > > +      be flagged with GPIO_ACTIVE_LOW.
> > > +
> > > +  vdd-supply:
> > > +    description:
> > > +      Regulator for the VDD supply voltage.
> > > +
> > > +  input-report-header-address:
> > > +    $ref: /schemas/types.yaml#/definitions/uint32
> > > +    minimum: 0
> > > +    maximum: 0xffffff
> > > +    description:
> > > +      A value to be included in the Read Approval packet, listing an address of
> > > +      the input report header to be put on the SPI bus. This address has 24
> > > +      bits.
> > > +
> > > +  input-report-body-address:
> > > +    $ref: /schemas/types.yaml#/definitions/uint32
> > > +    minimum: 0
> > > +    maximum: 0xffffff
> > > +    description:
> > > +      A value to be included in the Read Approval packet, listing an address of
> > > +      the input report body to be put on the SPI bus. This address has 24 bits.
> > > +
> > > +  output-report-address:
> > > +    $ref: /schemas/types.yaml#/definitions/uint32
> > > +    minimum: 0
> > > +    maximum: 0xffffff
> > > +    description:
> > > +      A value to be included in the Output Report sent by the host, listing an
> > > +      address where the output report on the SPI bus is to be written to. This
> > > +      address has 24 bits.
> > > +
> > > +  read-opcode:
> > > +    $ref: /schemas/types.yaml#/definitions/uint8
> > > +    description:
> > > +      Value to be used in Read Approval packets. 1 byte.
> > > +
> > > +  write-opcode:
> > > +    $ref: /schemas/types.yaml#/definitions/uint8
> > > +    description:
> > > +      Value to be used in Write Approval packets. 1 byte.
> > 
> > Why can none of these things be determined from the device's compatible?
> > On the surface, they like the kinds of things that could/should be.
> 
> Why would we want to keep tables of these values in the kernel and again
> have to update the driver for each new chip?

That's pretty normal though innit? It's what match data does.
If someone wants to have properties that communicate data that
can be determined from the compatible, they need to provide
justification why it is being done.

> It also probably firmware-dependent.


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 3/4] livepatch: Add "replaceable" attribute to klp_patch
From: Yafang Shao @ 2026-04-12 12:09 UTC (permalink / raw)
  To: Petr Mladek
  Cc: Song Liu, Joe Lawrence, Dylan Hatch, jpoimboe, jikos, mbenes,
	rostedt, mhiramat, mathieu.desnoyers, kpsingh, mattbobrowski,
	jolsa, ast, daniel, andrii, martin.lau, eddyz87, memxor,
	yonghong.song, live-patching, linux-kernel, linux-trace-kernel,
	bpf
In-Reply-To: <adY_WgA54CDtWBq6@pathway.suse.cz>

On Wed, Apr 8, 2026 at 7:43 PM Petr Mladek <pmladek@suse.com> wrote:
>
> On Wed 2026-04-08 10:40:10, Yafang Shao wrote:
> > On Tue, Apr 7, 2026 at 11:08 PM Petr Mladek <pmladek@suse.com> wrote:
> > >
> > > On Tue 2026-04-07 17:45:31, Yafang Shao wrote:
> > > > On Tue, Apr 7, 2026 at 11:16 AM Yafang Shao <laoar.shao@gmail.com> wrote:
> > > > >
> > > > > On Tue, Apr 7, 2026 at 10:54 AM Song Liu <song@kernel.org> wrote:
> > > > > >
> > > > > > On Mon, Apr 6, 2026 at 2:12 PM Joe Lawrence <joe.lawrence@redhat.com> wrote:
> > > > > > [...]
> > > > > > > > > > - The regular livepatches are cumulative, have the replace flag; and
> > > > > > > > > >   are replaceable.
> > > > > > > > > > - The occasional "off-band" livepatches do not have the replace flag,
> > > > > > > > > >   and are not replaceable.
> > > > > > > > > >
> > > > > > > > > > With this setup, for systems with off-band livepatches loaded, we can
> > > > > > > > > > still release a cumulative livepatch to replace the previous cumulative
> > > > > > > > > > livepatch. Is this the expected use case?
> > > > > > > > >
> > > > > > > > > That matches our expected use case.
> > > > > > > >
> > > > > > > > If we really want to serve use cases like this, I think we can introduce
> > > > > > > > some replace tag concept: Each livepatch will have a tag, u32 number.
> > > > > > > > Newly loaded livepatch will only replace existing livepatch with the
> > > > > > > > same tag. We can even reuse the existing "bool replace" in klp_patch,
> > > > > > > > and make it u32: replace=0 means no replace; replace > 0 are the
> > > > > > > > replace tag.
> > > > > > > >
> > > > > > > > For current users of cumulative patches, all the livepatch will have the
> > > > > > > > same tag, say 1. For your use case, you can assign each user a
> > > > > > > > unique tag. Then all these users can do atomic upgrades of their
> > > > > > > > own livepatches.
> > > > > > > >
> > > > > > > > We may also need to check whether two livepatches of different tags
> > > > > > > > touch the same kernel function. When that happens, the later
> > > > > > > > livepatch should fail to load.
> > >
> > > I still think how to make the hybrid mode more secure:
> > >
> > >     + The isolated sets of livepatched functions look like a good rule.
> > >     + What about isolating the shadow variables/states as well?
> >
> > We might consider extending the klp_shadow_* API to support the new
> > livepatch tag.
>
> It would be nice to associate shadow variables with states so that
> we could check which shadow variables are used by each livepatch.
>
> It is partially implemented in my earlier RFC, see
> https://lore.kernel.org/all/20250115082431.5550-3-pmladek@suse.com/

This patch is still pending acceptance, but it offers a nice
improvement. With your modifications, the remaining task would be to
integrate a new replace_set into klp_state and update the API
accordingly.

[...]

-- 
Regards
Yafang

^ permalink raw reply

* Re: [RFC PATCH 3/4] livepatch: Add "replaceable" attribute to klp_patch
From: Yafang Shao @ 2026-04-12 12:18 UTC (permalink / raw)
  To: Petr Mladek
  Cc: Song Liu, Joe Lawrence, Dylan Hatch, jpoimboe, jikos, mbenes,
	rostedt, mhiramat, mathieu.desnoyers, kpsingh, mattbobrowski,
	jolsa, ast, daniel, andrii, martin.lau, eddyz87, memxor,
	yonghong.song, live-patching, linux-kernel, linux-trace-kernel,
	bpf
In-Reply-To: <addW_-whBavyHY-Z@pathway.suse.cz>

On Thu, Apr 9, 2026 at 3:36 PM Petr Mladek <pmladek@suse.com> wrote:
>
> On Wed 2026-04-08 11:19:50, Song Liu wrote:
> > On Wed, Apr 8, 2026 at 4:43 AM Petr Mladek <pmladek@suse.com> wrote:
> > [...]
> > > > >
> > > > > This is weird semantic. Which livepatch tag would be allowed to
> > > > > supersede it, please?
> > > > >
> > > > > Do we still need this category?
> > > >
> > > > It can be superseded by any livepatch that has a non-zero tag set.
> > >
> > > And this exactly the weird thing.
> > >
> > > A patch with the .replace flag set is supposed to obsolete all already
> > > installed livepatches. It means that it should provide all existing
> > > fixes and features.
> > >
> > > Now, we want to introduce a replace flag/set which would allow to
> > > replace/obsolete only the livepatch with the same tag/set number.
> > > And we want to prevent conflicts by making sure that livepatches with
> > > different tag/set number will never livepatch the same function.
> > >
> > > Obviously, livepatches with different tag/set number could not
> > > obsolete the same no-replace livepatch. They would need to livepatch
> > > the same functions touched by the no-replace livepatch and would
> > > conflict.
> > >
> > > So, I suggest to remove the no-replace mode completely. It should
> > > not be needed. A livepatch which should be installed in parallel
> > > will simply use another unique tag/set number.
> >
> > I think I see your point now. Existing code works as:
> > - replace=false doesn't replace anything
> > - replace=true replaces everything
> >
> > If we assume false=0 and true=1, it is technically possible to define:
> > - replace_set=0 doesn't replace anything
> > - replace_set=1 replaces everything
> > - replace_set=2+ only replace the same replace_set
>
> Yes. This well describes my point.
>
> > This is probably a little too complicated.
> >
> > > > This ensures backward compatibility: while a non-atomic-replace
> > > > livepatch can be superseded by an atomic-replace one, the reverse is
> > > > not permitted—an atomic-replace livepatch cannot be superseded by a
> > > > non-atomic one.
> > >
> > > IMHO, the backward compatibility would just create complexity and mess
> > > in this case.
> >
> > Given that livepatch is for expert users, I think we can make this work
> > without backward compatibility. But breaking compatibility is always not
> > preferred.
>
> I believe that it is acceptable because:
>
>   1. It was always hard to combine no-replace and replace livepatches.
>      I wonder if anyone combines them at all.

Because 'replace' patches can supersede 'no-replace' ones, users have
to maintain a strict loading order. I doubt anyone actually combines
them in production.

>
>   2. I believe that nobody tries to load the same livepatch module on
>      different kernel versions. Instead, everyone prepares a custom
>      livepatch module for each livepatched kernel version/release.

Correct. We always build and apply distinct livepatches for each
specific kernel version.

>
>      And the tooling for creating livepatches will need to be updated
>      to use "number" instead of "true/false" anyway.
>
> That said, it is easier to always use "0" for non-replace patches
> instead of assigning a unique "number" to avoid replacing. But
> I do not think that this would justify the complexity of having
> different semantics for 0, 1, and 2+ replace_set numbers.

Fair enough. Let's drop backward compatibility to keep the
implementation simple.

--
Regards
Yafang

^ permalink raw reply

* Re: [RFC PATCH 2/4] trace: Allow kprobes to override livepatched functions
From: Yafang Shao @ 2026-04-12 13:08 UTC (permalink / raw)
  To: Miroslav Benes
  Cc: jpoimboe, jikos, pmladek, joe.lawrence, rostedt, mhiramat,
	mathieu.desnoyers, kpsingh, mattbobrowski, song, jolsa, ast,
	daniel, andrii, martin.lau, eddyz87, memxor, yonghong.song,
	live-patching, linux-kernel, linux-trace-kernel, bpf
In-Reply-To: <alpine.LSU.2.21.2604091145340.31526@pobox.suse.cz>

On Thu, Apr 9, 2026 at 5:47 PM Miroslav Benes <mbenes@suse.cz> wrote:
>
> Hi,
>
> On Thu, 2 Apr 2026, Yafang Shao wrote:
>
> > Introduce the ability for kprobes to override the return values of
> > functions that have been livepatched. This functionality is guarded by the
> > CONFIG_KPROBE_OVERRIDE_KLP_FUNC configuration option.
>
> this is imprecise if I read the code correctly. You want to override live
> patch functions, not the original ones which are live patched.

Correct. The BPF program will override the livepatched functions
rather than the original ones.

>
> I also think that if nothing else, it needs to be more specific than just
> checking mod->klp. It should check if a function itself in klp module is
> overridable to keep it as limited as possible.

That is exactly what I am currently implementing in
trace_kprobe_klp_func_overridable().

> Even with that, the
> concerns expressed by the others still apply.

-- 
Regards
Yafang

^ permalink raw reply

* Re: [RFC PATCH 0/4] trace, livepatch: Allow kprobe return overriding for livepatched functions
From: Yafang Shao @ 2026-04-12 13:30 UTC (permalink / raw)
  To: Miroslav Benes
  Cc: Song Liu, jpoimboe, jikos, pmladek, joe.lawrence, rostedt,
	mhiramat, mathieu.desnoyers, kpsingh, mattbobrowski, jolsa, ast,
	daniel, andrii, martin.lau, eddyz87, memxor, yonghong.song,
	live-patching, linux-kernel, linux-trace-kernel, bpf
In-Reply-To: <alpine.LSU.2.21.2604091205250.31526@pobox.suse.cz>

On Thu, Apr 9, 2026 at 6:08 PM Miroslav Benes <mbenes@suse.cz> wrote:
>
> > Can we add something like ALLOW_LIVEPATCH_ERROR_INJECTION() to allow
> > error injection on functions defined inside a livepatch?
>
> No.
>
> I am sorry but you always seem to find band aids to your set up and how
> you deal with live patches internally. While I can see that something like
> a hybrid mode might be useful to people if done right (and we are not
> there yet), the combination of it with bpf overrides or anything like that
> is not something I would like to see in upstream.

The upstream kernel already allows for combining BPF and livepatch to
override functions. Song’s patch offers a great reference for
implementing this without changing the kernel:

  https://lore.kernel.org/bpf/20260408175217.1011024-1-song@kernel.org/

-- 
Regards
Yafang

^ permalink raw reply

* Re: [RFC PATCH 0/4] trace, livepatch: Allow kprobe return overriding for livepatched functions
From: Yafang Shao @ 2026-04-12 13:50 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: jpoimboe, jikos, mbenes, pmladek, joe.lawrence, rostedt,
	mathieu.desnoyers, kpsingh, mattbobrowski, song, jolsa, ast,
	daniel, andrii, martin.lau, eddyz87, memxor, yonghong.song,
	live-patching, linux-kernel, linux-trace-kernel, bpf
In-Reply-To: <20260410133844.56ab7964da7628d1c3482acb@kernel.org>

On Fri, Apr 10, 2026 at 12:38 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> Hi Yafang,
>
> On Thu,  2 Apr 2026 17:26:03 +0800
> Yafang Shao <laoar.shao@gmail.com> wrote:
>
> > Livepatching allows for rapid experimentation with new kernel features
> > without interrupting production workloads. However, static livepatches lack
> > the flexibility required to tune features based on task-specific attributes,
> > such as cgroup membership, which is critical in multi-tenant k8s
> > environments. Furthermore, hardcoding logic into a livepatch prevents
> > dynamic adjustments based on the runtime environment.
> >
> > To address this, we propose a hybrid approach using BPF. Our production use
> > case involves:
> >
> > 1. Deploying a Livepatch function to serve as a stable BPF hook.
> >
> > 2. Utilizing bpf_override_return() to dynamically modify the return value
> >    of that hook based on the current task's context.
>
> First of all, I don't like this approach to test a new feature in the
> kernel, because it sounds like allowing multiple different generations
> of implementations to coexist simultaneously. The standard kernel code
> is not designed to withstand such implementations.

However, this approach is invaluable for rapidly deploying new kernel
features to production servers without downtime. Upgrading kernels
across a large fleet remains a significant challenge.

>
> For example, if you implement a well-designed framework in a specific
> subsystem, like Schedext, which allows multiple implementations extended
> with BPF to coexist, there's no problem (at least it's debatable).
>
> But if it is for any function, it is a dangerous feature. Bugs that occur
> in kernels that use this functionality cannot be addressed here. They
> need to be treated the same way as out-of-tree drivers or forked kernels.
> I mean, add a taint flag for this feature. And we don't care about it.

Agreed. This should be handled as an OOT module rather than part of
the core kernel.

>
> >
> > A significant challenge arises when atomic-replace is enabled. In this
> > mode, deploying a new livepatch changes the target function's address,
> > forcing a re-attachment of the BPF program. This re-attachment latency is
> > unacceptable in critical paths, such as those handling networking policies.
> >
> > To solve this, we introduce a hybrid livepatch mode that allows specific
> > patches to remain non-replaceable, ensuring the function address remains
> > stable and the BPF program stays attached.
>
> Can you share your actual problem to be solved?

Here is an example we recently deployed on our production servers:

  https://lore.kernel.org/bpf/CALOAHbDnNba_w_nWH3-S9GAXw0+VKuLTh1gy5hy9Yqgeo4C0iA@mail.gmail.com/

In one of our specific clusters, we needed to send BGP traffic out
through specific NICs based on the destination IP. To achieve this
without interrupting service, we live-patched
bond_xmit_3ad_xor_slave_get(), added a new hook called
bond_get_slave_hook(), and then ran a BPF program attached to that
hook to select the outgoing NIC from the SKB. This allowed us to
rapidly deploy the feature with zero downtime.

[...]

-- 
Regards
Yafang

^ permalink raw reply

* Re: [PATCH] kernel/trace/ftrace: introduce ftrace module notifier
From: Song Chen @ 2026-04-12 14:10 UTC (permalink / raw)
  To: Petr Mladek
  Cc: Steven Rostedt, Miroslav Benes, mcgrof, petr.pavlu, da.gomez,
	samitolvanen, atomlin, mhiramat, mark.rutland, mathieu.desnoyers,
	linux-modules, linux-kernel, linux-trace-kernel, live-patching
In-Reply-To: <aaqk-GrpCTqO36xj@pathway.suse.cz>

Hi,


在 2026/3/6 17:57, Petr Mladek 写道:
> On Fri 2026-02-27 09:34:59, Song Chen wrote:
>> Hi,
>>
>> 在 2026/2/27 01:30, Steven Rostedt 写道:
>>> On Thu, 26 Feb 2026 11:51:53 +0100 (CET)
>>> Miroslav Benes <mbenes@suse.cz> wrote:
>>>
>>>>> Let me see if there is any way to use notifier and remain below calling
>>>>> sequence:
>>>>>
>>>>> ftrace_module_enable
>>>>> klp_module_coming
>>>>> blocking_notifier_call_chain_robust(MODULE_STATE_COMING)
>>>>>
>>>>> blocking_notifier_call_chain(MODULE_STATE_GOING)
>>>>> klp_module_going
>>>>> ftrace_release_mod
>>>>
>>>> Both klp and ftrace used module notifiers in the past. We abandoned that
>>>> and opted for direct calls due to issues with ordering at the time. I do
>>>> not have the list of problems at hand but I remember it was very fragile.
>>>>
>>>> See commits 7dcd182bec27 ("ftrace/module: remove ftrace module
>>>> notifier"), 7e545d6eca20 ("livepatch/module: remove livepatch module
>>>> notifier") and their surroundings.
>>>>
>>>> So unless there is a reason for the change (which should be then carefully
>>>> reviewed and properly tested), I would prefer to keep it as is. What is
>>>> the motivation? I am failing to find it in the commit log.
>>
>> There is no special motivation. I was just reading the BTF initialization in
>> module loading and found the direct calls to ftrace and klp. I thought they
>> had simply forgotten to use notifiers, and I didn't even search the git log
>> to verify; sorry about that.
>>
>>>
>>> Honestly, I do think just decoupling ftrace and live kernel patching from
>>> modules is rationale enough, as it makes the code a bit cleaner. But to do
>>> so, we really need to make sure there is absolutely no regressions.
>>>
>>> Thus, to allow such a change, I would ask those that are proposing it, show
>>> a full work flow of how ftrace, live kernel patching, and modules work with
>>> each other and why those functions are currently injected in the module code.
>>>
>>> As Miroslav stated, we tried to do it via notifiers in the past and it
>>> failed. I don't want to find out why they failed by just adding them back
>>> to notifiers again. Instead, the reasons must be fully understood and
>>> updates made to make sure they will not fail in the future.
>>
>> Yes, you are right, i read commit msg of 7dcd182bec27, this patch just
>> reverses it simply and will introduce order issue back. I will try to find
>> out the problem in the past at first.
> 
> AFAIK, the root of the problem is that livepatch uses the ftrace
> framework. It means that:
> 
>     + ftrace must be initialized before livepatch gets enabled
>     + livepatch must be disabled before ftrace support gets removed
> 
> My understanding is that this can't be achieved by notifiers easily
> because they are always processed in the same order.
> 
> An elegant solution would be to introduce  notifier_reverse_call_chain()
> which would process the callbacks in the reverse order. But it might
> be non-trivial:
> 
>    + We would need to make sure that it does not break some
>      existing "hidden" dependencies.
> 
Thanks so much, this is the solution I'm working on. I replaced next 
with a list_head in notifier_block and implemented 
notifier_call_chain_reverse to address the ordering issues, as you 
suggested, along with a new robust variant for rolling back.

+/*
+ * notifier_call_chain_reverse - call notifier functions from tail to head
+ * @nl:         list head of the notifier chain to walk
+ * @start:      if non-NULL, entries are skipped until @start has been
+ *              passed; @start itself is NOT called. If NULL, the walk
+ *              begins at the last entry of @nl.
+ * @val:        value passed unmodified to each notifier function
+ * @v:          pointer passed unmodified to each notifier function
+ * @nr_to_call: maximum number of notifier functions to call. Zero calls
+ *              none; a negative value effectively places no limit.
+ * @nr_calls:   if non-NULL, incremented once for every function called
+ *
+ * Return: the value returned by the last notifier function called, or
+ * NOTIFY_DONE if no function was called.
+ *
+ * NOTE(review): the list is walked with plain list_for_each_entry_reverse();
+ * what protection callers must hold against concurrent (un)registration is
+ * not visible here and needs to be confirmed.
+ */
+static int notifier_call_chain_reverse(struct list_head *nl,
+                    struct notifier_block *start,
+                    unsigned long val, void *v,
+                    int nr_to_call, int *nr_calls)
+{
+    int ret = NOTIFY_DONE;
+    struct notifier_block *nb;
+    bool do_call = (start == NULL);
+
+    if (!nr_to_call)
+        return ret;
+
+    list_for_each_entry_reverse(nb, nl, entry) {
+        /* Skip everything up to and including @start. */
+        if (!do_call) {
+            if (nb == start)
+                do_call = true;
+            continue;
+        }
+#ifdef CONFIG_DEBUG_NOTIFIERS
+        /* An invalid callback is skipped and does not consume nr_to_call. */
+        if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
+            WARN(1, "Invalid notifier called!");
+            continue;
+        }
+#endif
+        trace_notifier_run((void *)nb->notifier_call);
+        ret = nb->notifier_call(nb, val, v);
+
+        if (nr_calls)
+            (*nr_calls)++;
+
+        if (ret & NOTIFY_STOP_MASK)
+            break;
+
+        /*
+         * Decrement before testing so that exactly nr_to_call callbacks
+         * run. A post-decrement test would allow one extra call.
+         */
+        if (--nr_to_call == 0)
+            break;
+    }
+    return ret;
+}
+NOKPROBE_SYMBOL(notifier_call_chain_reverse);

I'll send the patches for review soon.

Best regards

Song
>    + notifier_call_chain() uses RCU to process the list of registered
>      callbacks. I am not sure how complicated would be to make it safe
>      in both directions.
> 
> Best Regards,
> Petr
> 
> 

^ permalink raw reply

* Re: [PATCH v2 03/17] landlock: Split struct landlock_domain from struct landlock_ruleset
From: Tingmao Wang @ 2026-04-12 16:27 UTC (permalink / raw)
  To: Mickaël Salaün, Günther Noack
  Cc: Christian Brauner, Steven Rostedt, Jann Horn, Jeff Xu,
	Justin Suess, Kees Cook, Masami Hiramatsu, Mathieu Desnoyers,
	Matthieu Buffet, Mikhail Ivanov, kernel-team, linux-fsdevel,
	linux-security-module, linux-trace-kernel
In-Reply-To: <20260406143717.1815792-4-mic@digikod.net>

On 4/6/26 15:37, Mickaël Salaün wrote:
> [...]
> @@ -197,10 +179,10 @@ static void build_check_ruleset(void)
>   *
>   * Return: 0 on success, -errno on failure.
>   */
> -static int insert_rule(struct landlock_rules *const rules,
> -		       const struct landlock_id id,
> -		       const struct landlock_layer (*layers)[],
> -		       const size_t num_layers)
> +int landlock_rule_insert(struct landlock_rules *const rules,
> +			 const struct landlock_id id,
> +			 const struct landlock_layer (*layers)[],
> +			 const size_t num_layers)

Maybe this is slightly off topic, but previously I've found this function,
along with create_rule and merge_tree, to be quite confusing, because the
logic for three different use cases (creating a copy of an old domain,
merging a new layer into this domain, and inserting a new rule into an
unmerged ruleset) are mixed in the code, and personally now that I'm
reading it again I still find these functions to be hard to reason about.
Therefore, given that we're refactoring these areas, I think this might be
a good opportunity to rewrite them (while getting the necessary testing
for this rewrite "for free" as part of this whole domain refactor).

For example, for this snippet:

		/* Only a single-level layer should match an existing rule. */
		if (WARN_ON_ONCE(num_layers != 1))
			return -EINVAL;

Someone unfamiliar with how this function is being used by its caller may
not realize that the reason the comment is true is because the only use
case where we have multiple layers in @layers being passed into this
function is when we're copying an existing domain into a new domain, and
so we should never match something that already exists.

Also, further along in this function, there is this snippet:

		/*
		 * Intersects access rights when it is a merge between a
		 * ruleset and a domain.
		 */
		new_rule = create_rule(id, &this->layers, this->num_layers,
				       &(*layers)[0]);

I also found the comment to be confusing because it's not "intersect"ing
anything (it's adding a new layer to an existing rule in the domain when
the object pointed to by "id" exists already in the domain, and the
intersection is only a consequence of how Landlock works when there are
multiple layers).  This realization is made harder by the fact that a few
lines above we just OR'd the access rights (for modification of an
unmerged ruleset), and so it makes it sound like it's doing a similar
thing except with AND instead of OR.

I think it might make sense to have separate functions, even if they result in some slight code duplication, for use by:

1. Copying a domain: inherit_ruleset() -> inherit_tree() -> _____()
    RB tree search to find the insertion point + create_rule().  Maybe
    this logic could just be in inherit_tree() without creating a separate
    function?
2. Merging a rule into a domain: merge_ruleset() -> merge_tree() -> _____()
    insert_or_append_rule()?  RB tree search, call create_rule() to create
    a new rule with the new layer added, then either
    rb_link_node()+rb_insert_color() or rb_replace_node().

Neither function above will contain any actual logical AND/OR.

3. Inserting a new rule into an unmerged ruleset: landlock_add_rule() -> ... -> _____()
    insert_or_update_rule()?  RB tree search, and either update the access
    rights of an existing rule in-place (as we currently do), or create a
    new rule if the search fails.

We could create a utility static function or macro for the shared RB tree
search code.

How does this sound?

^ permalink raw reply

* Re: [PATCH v2 01/17] landlock: Prepare ruleset and domain type split
From: Tingmao Wang @ 2026-04-12 16:29 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: Christian Brauner, Günther Noack, Steven Rostedt, Jann Horn,
	Jeff Xu, Justin Suess, Kees Cook, Masami Hiramatsu,
	Mathieu Desnoyers, Matthieu Buffet, Mikhail Ivanov, kernel-team,
	linux-fsdevel, linux-security-module, linux-trace-kernel
In-Reply-To: <20260406143717.1815792-2-mic@digikod.net>

On 4/6/26 15:36, Mickaël Salaün wrote:
> [...]

Hi Mickaël,

I like this approach, as I basically ended up doing similar refactoring
previously for the hashtable / array-based domain changes, and having this
done first should make it easier to adopt the domain data structure
changes in the future.

I assume it's fine for me to add:
Reviewed-by: Tingmao Wang <m@maowtm.org>

> @@ -175,19 +163,24 @@ static void free_rule(struct landlock_rule *const rule,
>  
>  static void build_check_ruleset(void)
>  {
> -	const struct landlock_ruleset ruleset = {
> +	const struct landlock_rules rules = {
>  		.num_rules = ~0,
> +	};
> +	const struct landlock_ruleset ruleset = {
>  		.num_layers = ~0,
>  	};
>  
> -	BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES);
> +	BUILD_BUG_ON(rules.num_rules < LANDLOCK_MAX_NUM_RULES);
>  	BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS);
>  }
>  
>  /**
> - * insert_rule - Create and insert a rule in a ruleset
> + * insert_rule - Create and insert a rule in a rule set
                                                  ^^^^^^^^

Should this be rule storage to be consistent with the next 2 lines?

Alternatively maybe we can just say "struct landlock_rules" to avoid
inventing new names?

>   *
> - * @ruleset: The ruleset to be updated.
> + * @rules: The rule storage to be updated.  The caller is responsible for
> + *         any required locking.  For rulesets, this means holding
> + *         landlock_ruleset.lock.  For domains under construction, no lock is
> + *         needed because the domain is not yet visible to other tasks.
>   * @id: The ID to build the new rule with.  The underlying kernel object, if
>   *      any, must be held by the caller.
>   * @layers: One or multiple layers to be copied into the new rule.

^ permalink raw reply

* [RFC PATCH 0/4] rv/tlob: Add task latency over budget RV monitor
From: wen.yang @ 2026-04-12 19:27 UTC (permalink / raw)
  To: Steven Rostedt, Gabriele Monaco, Masami Hiramatsu,
	Mathieu Desnoyers
  Cc: linux-trace-kernel, linux-kernel, Wen Yang

From: Wen Yang <wen.yang@linux.dev>

This series introduces tlob (task latency over budget), a new per-task
Runtime Verification monitor.

Background
----------
The RV framework formalises kernel behavioural properties as
deterministic automata. Existing monitors (wwnr, sssw, opid, etc.) cover
scheduling and locking invariants; none tracks wall-clock latency of
a per-task code path, including off-CPU time. This property is needed
in ADAS perception/planning pipelines, industrial real-time
controllers, and similar mixed-criticality deployments.

tlob adds this capability. A caller demarcates a code path via a
start/stop pair; the kernel arms a per-task hrtimer for the requested
budget. If the task has not called TRACE_STOP before the timer fires,
a violation is recorded, the stop call returns -EOVERFLOW, and an
event is pushed to the caller's mmap ring.

The tracefs interface requires only tracefs write permissions, avoiding
the CAP_BPF privilege needed for equivalent eBPF-based approaches. The
DA model (patch 1) can be independently verified with standard model-
checking tools.

Design
------
The monitor is a three-state deterministic automaton (DA):

  unmonitored --trace_start--> on_cpu
  on_cpu       --switch_out--> off_cpu
  off_cpu      --switch_in---> on_cpu
  {on_cpu, off_cpu} --{trace_stop, budget_expired}--> unmonitored

Per-task state lives in a fixed-size hash table (TLOB_MAX_MONITORED
slots) with RCU-deferred free. Timing is based on CLOCK_MONOTONIC
(ktime_get()), so budgets account for off-CPU time.

Two userspace interfaces are provided:

  tracefs: uprobe pair registration via the monitor/enable files; no
           new UAPI required.

  /dev/rv ioctls (CONFIG_RV_CHARDEV):
    TLOB_IOCTL_TRACE_START  — arm the budget for a target task
    TLOB_IOCTL_TRACE_STOP   — disarm; returns -EOVERFLOW on violation

  Each /dev/rv file descriptor has a per-fd mmap ring (a physically
  contiguous control page struct tlob_mmap_page followed by an array of
  struct tlob_event records). Head/tail/dropped are userspace-readable
  without locking; overflow uses a drop-new policy.

New UAPI (include/uapi/linux/rv.h): tlob_start_args, tlob_event,
tlob_mmap_page, ioctl numbers (RV_IOC_MAGIC=0xB9, registered in
Documentation/userspace-api/ioctl/ioctl-number.rst).

Testing
-------
KUnit (patch 3): six suites (38 cases) gated on CONFIG_TLOB_KUNIT_TEST.

  ./tools/testing/kunit/kunit.py run \
    --kunitconfig kernel/trace/rv/monitors/tlob/.kunitconfig

  Coverage: automaton state transitions, start/stop API error paths,
  scheduler context-switch accounting, tracepoint payload fields,
  ring-buffer push/overflow/wakeup, and the uprobe line parser.

kselftest (patch 4): 19 TAP test points under
tools/testing/selftests/rv/. Requires CONFIG_RV_MON_TLOB=y,
CONFIG_RV_CHARDEV=y, and root.

  make -C tools/testing/selftests/rv
  sudo ./test_tlob.sh

Patch overview
--------------
Patch 1 — DOT model: formal automaton specification for verification.
Patch 2 — monitor implementation, UAPI, and documentation.
Patch 3 — KUnit in-kernel unit tests.
Patch 4 — kselftest user-space integration tests.

Wen Yang (4):
  rv/tlob: Add tlob model DOT file
  rv/tlob: Add tlob deterministic automaton monitor
  rv/tlob: Add KUnit tests for the tlob monitor
  selftests/rv: Add selftest for the tlob monitor

 Documentation/trace/rv/index.rst              |    1 +
 Documentation/trace/rv/monitor_tlob.rst       |  381 ++++++
 .../userspace-api/ioctl/ioctl-number.rst      |    1 +
 MAINTAINERS                                   |    3 +
 include/uapi/linux/rv.h                       |  181 +++
 kernel/trace/rv/Kconfig                       |   17 +
 kernel/trace/rv/Makefile                      |    3 +
 kernel/trace/rv/monitors/tlob/.kunitconfig    |    5 +
 kernel/trace/rv/monitors/tlob/Kconfig         |   63 +
 kernel/trace/rv/monitors/tlob/tlob.c          |  987 ++++++++++++++
 kernel/trace/rv/monitors/tlob/tlob.h          |  145 ++
 kernel/trace/rv/monitors/tlob/tlob_kunit.c    | 1194 +++++++++++++++++
 kernel/trace/rv/monitors/tlob/tlob_trace.h    |   42 +
 kernel/trace/rv/rv.c                          |    4 +
 kernel/trace/rv/rv_dev.c                      |  602 +++++++++
 kernel/trace/rv/rv_trace.h                    |   50 +
 tools/include/uapi/linux/rv.h                 |   54 +
 tools/testing/selftests/rv/Makefile           |   18 +
 tools/testing/selftests/rv/test_tlob.sh       |  563 ++++++++
 tools/testing/selftests/rv/tlob_helper.c      |  994 ++++++++++++++
 .../testing/selftests/rv/tlob_uprobe_target.c |  108 ++
 tools/verification/models/tlob.dot            |   25 +
 22 files changed, 5441 insertions(+)
 create mode 100644 Documentation/trace/rv/monitor_tlob.rst
 create mode 100644 include/uapi/linux/rv.h
 create mode 100644 kernel/trace/rv/monitors/tlob/.kunitconfig
 create mode 100644 kernel/trace/rv/monitors/tlob/Kconfig
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob.c
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob.h
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob_kunit.c
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob_trace.h
 create mode 100644 kernel/trace/rv/rv_dev.c
 create mode 100644 tools/include/uapi/linux/rv.h
 create mode 100644 tools/testing/selftests/rv/Makefile
 create mode 100755 tools/testing/selftests/rv/test_tlob.sh
 create mode 100644 tools/testing/selftests/rv/tlob_helper.c
 create mode 100644 tools/testing/selftests/rv/tlob_uprobe_target.c
 create mode 100644 tools/verification/models/tlob.dot

-- 
2.43.0


^ permalink raw reply

* [RFC PATCH 1/4] rv/tlob: Add tlob model DOT file
From: wen.yang @ 2026-04-12 19:27 UTC (permalink / raw)
  To: Steven Rostedt, Gabriele Monaco, Masami Hiramatsu,
	Mathieu Desnoyers
  Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1776020428.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Add the Graphviz DOT specification for the tlob (task latency over
budget) deterministic automaton.

The model has three states: unmonitored, on_cpu, and off_cpu.
trace_start transitions from unmonitored to on_cpu; switch_out and
switch_in cycle between on_cpu and off_cpu; trace_stop and
budget_expired return to unmonitored from either active state.
unmonitored is the sole accepting state.

switch_in, switch_out, and sched_wakeup self-loop in unmonitored;
sched_wakeup self-loops in on_cpu; switch_out and sched_wakeup
self-loop in off_cpu.

Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 MAINTAINERS                        |  3 +++
 tools/verification/models/tlob.dot | 25 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 tools/verification/models/tlob.dot

diff --git a/MAINTAINERS b/MAINTAINERS
index 9fbb619c6..c2c56236c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23242,7 +23242,10 @@ S:	Maintained
 F:	Documentation/trace/rv/
 F:	include/linux/rv.h
 F:	include/rv/
+F:	include/uapi/linux/rv.h
 F:	kernel/trace/rv/
+F:	samples/rv/
+F:	tools/testing/selftests/rv/
 F:	tools/testing/selftests/verification/
 F:	tools/verification/
 
diff --git a/tools/verification/models/tlob.dot b/tools/verification/models/tlob.dot
new file mode 100644
index 000000000..df34a14b8
--- /dev/null
+++ b/tools/verification/models/tlob.dot
@@ -0,0 +1,25 @@
+digraph state_automaton {
+	center = true;
+	size = "7,11";
+	{node [shape = plaintext, style=invis, label=""] "__init_unmonitored"};
+	{node [shape = ellipse] "unmonitored"};
+	{node [shape = plaintext] "unmonitored"};
+	{node [shape = plaintext] "on_cpu"};
+	{node [shape = plaintext] "off_cpu"};
+	"__init_unmonitored" -> "unmonitored";
+	"unmonitored" [label = "unmonitored", color = green3];
+	"unmonitored" -> "on_cpu" [ label = "trace_start" ];
+	"unmonitored" -> "unmonitored" [ label = "switch_in\nswitch_out\nsched_wakeup" ];
+	"on_cpu" [label = "on_cpu"];
+	"on_cpu" -> "off_cpu" [ label = "switch_out" ];
+	"on_cpu" -> "unmonitored" [ label = "trace_stop\nbudget_expired" ];
+	"on_cpu" -> "on_cpu" [ label = "sched_wakeup" ];
+	"off_cpu" [label = "off_cpu"];
+	"off_cpu" -> "on_cpu" [ label = "switch_in" ];
+	"off_cpu" -> "unmonitored" [ label = "trace_stop\nbudget_expired" ];
+	"off_cpu" -> "off_cpu" [ label = "switch_out\nsched_wakeup" ];
+	{ rank = min ;
+		"__init_unmonitored";
+		"unmonitored";
+	}
+}
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 2/4] rv/tlob: Add tlob deterministic automaton monitor
From: wen.yang @ 2026-04-12 19:27 UTC (permalink / raw)
  To: Steven Rostedt, Gabriele Monaco, Masami Hiramatsu,
	Mathieu Desnoyers
  Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1776020428.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Add the tlob (task latency over budget) RV monitor. tlob tracks the
monotonic elapsed time (CLOCK_MONOTONIC) of a marked per-task code
path, including time off-CPU, and fires a per-task hrtimer when the
elapsed time exceeds a configurable budget.

Three-state DA (unmonitored/on_cpu/off_cpu) driven by trace_start,
switch_in/out, and budget_expired events. Per-task state lives in a
fixed-size hash table (TLOB_MAX_MONITORED slots) with RCU-deferred
free.

Two userspace interfaces:
 - tracefs: uprobe pair registration via the monitor file using the
   format "pid:threshold_us:offset_start:offset_stop:binary_path"
 - /dev/rv ioctls (CONFIG_RV_CHARDEV): TLOB_IOCTL_TRACE_START /
   TRACE_STOP; TRACE_STOP returns -EOVERFLOW on violation

Each /dev/rv fd has a per-fd mmap ring buffer (physically contiguous
pages). A control page (struct tlob_mmap_page) at offset 0 exposes
head/tail/dropped for lockless userspace reads; struct tlob_event
records follow at data_offset. Drop-new policy on overflow.

UAPI: include/uapi/linux/rv.h (tlob_start_args, tlob_event,
      tlob_mmap_page, ioctl numbers), monitor_tlob.rst,
      ioctl-number.rst (RV_IOC_MAGIC=0xB9).

Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 Documentation/trace/rv/index.rst              |   1 +
 Documentation/trace/rv/monitor_tlob.rst       | 381 +++++++
 .../userspace-api/ioctl/ioctl-number.rst      |   1 +
 include/uapi/linux/rv.h                       | 181 ++++
 kernel/trace/rv/Kconfig                       |  17 +
 kernel/trace/rv/Makefile                      |   2 +
 kernel/trace/rv/monitors/tlob/Kconfig         |  51 +
 kernel/trace/rv/monitors/tlob/tlob.c          | 986 ++++++++++++++++++
 kernel/trace/rv/monitors/tlob/tlob.h          | 145 +++
 kernel/trace/rv/monitors/tlob/tlob_trace.h    |  42 +
 kernel/trace/rv/rv.c                          |   4 +
 kernel/trace/rv/rv_dev.c                      | 602 +++++++++++
 kernel/trace/rv/rv_trace.h                    |  50 +
 13 files changed, 2463 insertions(+)
 create mode 100644 Documentation/trace/rv/monitor_tlob.rst
 create mode 100644 include/uapi/linux/rv.h
 create mode 100644 kernel/trace/rv/monitors/tlob/Kconfig
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob.c
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob.h
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob_trace.h
 create mode 100644 kernel/trace/rv/rv_dev.c

diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst
index a2812ac5c..4f2bfaf38 100644
--- a/Documentation/trace/rv/index.rst
+++ b/Documentation/trace/rv/index.rst
@@ -15,3 +15,4 @@ Runtime Verification
    monitor_wwnr.rst
    monitor_sched.rst
    monitor_rtapp.rst
+   monitor_tlob.rst
diff --git a/Documentation/trace/rv/monitor_tlob.rst b/Documentation/trace/rv/monitor_tlob.rst
new file mode 100644
index 000000000..d498e9894
--- /dev/null
+++ b/Documentation/trace/rv/monitor_tlob.rst
@@ -0,0 +1,381 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Monitor tlob
+============
+
+- Name: tlob - task latency over budget
+- Type: per-task deterministic automaton
+- Author: Wen Yang <wen.yang@linux.dev>
+
+Description
+-----------
+
+The tlob monitor tracks per-task elapsed time (CLOCK_MONOTONIC, including
+both on-CPU and off-CPU time) and reports a violation when the monitored
+task exceeds a configurable latency budget threshold.
+
+The monitor implements a three-state deterministic automaton::
+
+                              |
+                              | (initial)
+                              v
+                    +--------------+
+          +-------> | unmonitored  |
+          |         +--------------+
+          |                |
+          |          trace_start
+          |                v
+          |         +--------------+
+          |         |   on_cpu     |
+          |         +--------------+
+          |           |         |
+          |  switch_out|         | trace_stop / budget_expired
+          |            v         v
+          |  +--------------+  (unmonitored)
+          |  |   off_cpu    |
+          |  +--------------+
+          |     |         |
+          |     | switch_in| trace_stop / budget_expired
+          |     v         v
+          |  (on_cpu)  (unmonitored)
+          |
+          +-- trace_stop (from on_cpu or off_cpu)
+
+  Key transitions:
+    unmonitored   --(trace_start)-->   on_cpu
+    on_cpu        --(switch_out)-->    off_cpu
+    off_cpu       --(switch_in)-->     on_cpu
+    on_cpu        --(trace_stop)-->    unmonitored
+    off_cpu       --(trace_stop)-->    unmonitored
+    on_cpu        --(budget_expired)-> unmonitored   [violation]
+    off_cpu       --(budget_expired)-> unmonitored   [violation]
+
+  sched_wakeup self-loops in on_cpu and unmonitored; switch_out and
+  sched_wakeup self-loop in off_cpu.  budget_expired is fired by the
+  one-shot hrtimer; it always transitions to unmonitored regardless of
+  whether the task is on-CPU or off-CPU when the timer fires.
+
+State Descriptions
+------------------
+
+- **unmonitored**: Task is not being traced.  Scheduling events
+  (``switch_in``, ``switch_out``, ``sched_wakeup``) are silently
+  ignored (self-loop).  The monitor waits for a ``trace_start`` event
+  to begin a new observation window.
+
+- **on_cpu**: Task is running on the CPU with the deadline timer armed.
+  A one-shot hrtimer was set for ``threshold_us`` microseconds at
+  ``trace_start`` time.  A ``switch_out`` event transitions to
+  ``off_cpu``; the hrtimer keeps running (off-CPU time counts toward
+  the budget).  A ``trace_stop`` cancels the timer and returns to
+  ``unmonitored`` (normal completion).  If the hrtimer fires
+  (``budget_expired``) the violation is recorded and the automaton
+  transitions to ``unmonitored``.
+
+- **off_cpu**: Task was preempted or blocked.  The one-shot hrtimer
+  continues to run.  A ``switch_in`` event returns to ``on_cpu``.
+  A ``trace_stop`` cancels the timer and returns to ``unmonitored``.
+  If the hrtimer fires (``budget_expired``) while the task is off-CPU,
+  the violation is recorded and the automaton transitions to
+  ``unmonitored``.
+
+Rationale
+---------
+
+The per-task latency budget threshold allows operators to express timing
+requirements in microseconds and receive an immediate ftrace event when a
+task exceeds its budget.  This is useful for real-time tasks
+(``SCHED_FIFO`` / ``SCHED_DEADLINE``) where total elapsed time must
+remain within a known bound.
+
+Each task has an independent threshold, so up to ``TLOB_MAX_MONITORED``
+(64) tasks with different timing requirements can be monitored
+simultaneously.
+
+On threshold violation the automaton records a ``tlob_budget_exceeded``
+ftrace event carrying the final on-CPU / off-CPU time breakdown, but does
+not kill or throttle the task.  Monitoring can be restarted by issuing a
+new ``trace_start`` event (or a new ``TLOB_IOCTL_TRACE_START`` ioctl).
+
+A per-task one-shot hrtimer is armed at ``trace_start`` for exactly
+``threshold_us`` microseconds.  It fires at most once per monitoring
+window, performs an O(1) hash lookup, records the violation, and injects
+the ``budget_expired`` event into the DA.  When ``CONFIG_RV_MON_TLOB``
+is not set there is zero runtime cost.
+
+Usage
+-----
+
+tracefs interface (uprobe-based external monitoring)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``monitor`` tracefs file allows any privileged user to instrument an
+unmodified binary via uprobes, without changing its source code.  Write a
+four-field record to attach two plain entry uprobes: one at
+``offset_start`` fires ``tlob_start_task()`` and one at ``offset_stop``
+fires ``tlob_stop_task()``, so the latency budget covers exactly the code
+region between the two offsets::
+
+  threshold_us:offset_start:offset_stop:binary_path
+
+``binary_path`` comes last so it may freely contain ``:`` (e.g. paths
+inside a container namespace).
+
+The uprobes fire for every task that executes the probed instruction in
+the binary, consistent with the native uprobe semantics.  All tasks that
+execute the code region get independent per-task monitoring slots.
+
+Using two plain entry uprobes (rather than a uretprobe for the stop) means
+that a mistyped offset can never corrupt the call stack; the worst outcome
+of a bad ``offset_stop`` is a missed stop that causes the hrtimer to fire
+and report a budget violation.
+
+Example  --  monitor a code region in ``/usr/bin/myapp`` with a 5 ms
+budget, where the region starts at offset 0x12a0 and ends at 0x12f0::
+
+  echo 1 > /sys/kernel/tracing/rv/monitors/tlob/enable
+
+  # Bind uprobes: start probe starts the clock, stop probe stops it
+  echo "5000:0x12a0:0x12f0:/usr/bin/myapp" \
+      > /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+  # Remove the uprobe binding for this code region
+  echo "-0x12a0:/usr/bin/myapp" > /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+  # List registered uprobe bindings (mirrors the write format)
+  cat /sys/kernel/tracing/rv/monitors/tlob/monitor
+  # -> 5000:0x12a0:0x12f0:/usr/bin/myapp
+
+  # Read violations from the trace buffer
+  cat /sys/kernel/tracing/trace
+
+Up to ``TLOB_MAX_MONITORED`` tasks may be monitored simultaneously.
+
+The offsets can be obtained with ``nm`` or ``readelf``::
+
+  nm -n /usr/bin/myapp | grep my_function
+  # -> 00000000000012a0 T my_function
+
+  readelf -s /usr/bin/myapp | grep my_function
+  # -> 42: 00000000000012a0  336 FUNC GLOBAL DEFAULT  13 my_function
+
+  # offset_start = 0x12a0 (function entry)
+  # offset_stop  = 0x12a0 + 0x50 = 0x12f0 (or any instruction before return)
+
+Notes:
+
+- The uprobes fire for every task that executes the probed instruction,
+  so concurrent calls from different threads each get independent
+  monitoring slots.
+- ``offset_stop`` need not be a function return; it can be any instruction
+  within the region.  If the stop probe is never reached (e.g. early exit
+  path bypasses it), the hrtimer fires and a budget violation is reported.
+- Each ``(binary_path, offset_start)`` pair may only be registered once.
+  A second write with the same ``offset_start`` for the same binary is
+  rejected with ``-EEXIST``.  Two entry uprobes at the same address would
+  both fire for every task, causing ``tlob_start_task()`` to be called
+  twice; the second call would silently fail with ``-EEXIST`` and the
+  second binding's threshold would never take effect.  Different code
+  regions that share the same ``offset_stop`` (common exit point) are
+  explicitly allowed.
+- The uprobe binding is removed when ``-offset_start:binary_path`` is
+  written to ``monitor``, or when the monitor is disabled.
+- The ``tag`` field in every ``tlob_budget_exceeded`` event is
+  automatically set to ``offset_start`` for the tracefs path, so
+  violation events for different code regions are immediately
+  distinguishable even when ``threshold_us`` values are identical.
+
+ftrace ring buffer (budget violation events)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When a monitored task exceeds its latency budget the hrtimer fires,
+records the violation, and emits a single ``tlob_budget_exceeded`` event
+into the ftrace ring buffer.  **Nothing is written to the ftrace ring
+buffer while the task is within budget.**
+
+The event carries the on-CPU / off-CPU time breakdown so that root-cause
+analysis (CPU-bound vs. scheduling / I/O overrun) is immediate::
+
+  cat /sys/kernel/tracing/trace
+
+Example output::
+
+  myapp-1234 [003] .... 12345.678: tlob_budget_exceeded: \
+    myapp[1234]: budget exceeded threshold=5000 \
+    on_cpu=820 off_cpu=4500 switches=3 state=off_cpu tag=0x00000000000012a0
+
+Field descriptions:
+
+``threshold``
+  Configured latency budget in microseconds.
+
+``on_cpu``
+  Cumulative on-CPU time since ``trace_start``, in microseconds.
+
+``off_cpu``
+  Cumulative off-CPU (scheduling + I/O wait) time since ``trace_start``,
+  in microseconds.
+
+``switches``
+  Number of times the task was scheduled out during this window.
+
+``state``
+  DA state when the hrtimer fired: ``on_cpu`` means the task was executing
+  when the budget expired (CPU-bound overrun); ``off_cpu`` means the task
+  was preempted or blocked (scheduling / I/O overrun).
+
+``tag``
+  Opaque 64-bit cookie supplied by the caller via ``tlob_start_args.tag``
+  (ioctl path) or automatically set to ``offset_start`` (tracefs uprobe
+  path).  Use it to distinguish violations from different code regions
+  monitored by the same thread.  Zero when not set.
+
+To capture violations in a file::
+
+  trace-cmd record -e tlob_budget_exceeded &
+  # ... run workload ...
+  trace-cmd report
+
+/dev/rv ioctl interface (self-instrumentation)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Tasks can self-instrument their own code paths via the ``/dev/rv`` misc
+device (requires ``CONFIG_RV_CHARDEV``).  The kernel key is
+``task_struct``; multiple threads sharing a single fd each get their own
+independent monitoring slot.
+
+**Synchronous mode**  --  the calling thread checks its own result::
+
+  int fd = open("/dev/rv", O_RDWR);
+
+  struct tlob_start_args args = {
+      .threshold_us = 50000,   /* 50 ms */
+      .tag          = 0,       /* optional; 0 = don't care */
+      .notify_fd    = -1,      /* no fd notification */
+  };
+  ioctl(fd, TLOB_IOCTL_TRACE_START, &args);
+
+  /* ... code path under observation ... */
+
+  int ret = ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL);
+  /* ret == 0:                        within budget   */
+  /* ret == -1 && errno == EOVERFLOW: budget exceeded */
+
+  close(fd);
+
+**Asynchronous mode**  --  a dedicated monitor thread receives violation
+records via ``read()`` on a shared fd, decoupling the observation from
+the critical path::
+
+  /* Monitor thread: open a dedicated fd. */
+  int monitor_fd = open("/dev/rv", O_RDWR);
+
+  /* Worker thread: set notify_fd = monitor_fd in TRACE_START args. */
+  int work_fd = open("/dev/rv", O_RDWR);
+  struct tlob_start_args args = {
+      .threshold_us = 10000,   /* 10 ms */
+      .tag          = REGION_A,
+      .notify_fd    = monitor_fd,
+  };
+  ioctl(work_fd, TLOB_IOCTL_TRACE_START, &args);
+  /* ... critical section ... */
+  ioctl(work_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+
+  /* Monitor thread: blocking read() returns one or more tlob_event records. */
+  struct tlob_event ntfs[8];
+  ssize_t n = read(monitor_fd, ntfs, sizeof(ntfs));
+  for (size_t i = 0; n > 0 && i < (size_t)n / sizeof(struct tlob_event); i++) {
+      struct tlob_event *ntf = &ntfs[i];
+      printf("tid=%u tag=0x%llx exceeded budget=%llu us "
+             "(on_cpu=%llu off_cpu=%llu switches=%u state=%s)\n",
+             ntf->tid, ntf->tag, ntf->threshold_us,
+             ntf->on_cpu_us, ntf->off_cpu_us, ntf->switches,
+             ntf->state ? "on_cpu" : "off_cpu");
+  }
+
+**mmap ring buffer**  --  zero-copy consumption of violation events::
+
+  int fd = open("/dev/rv", O_RDWR);
+  struct tlob_start_args args = {
+      .threshold_us = 1000,   /* 1 ms */
+      .notify_fd    = fd,     /* push violations to own ring buffer */
+  };
+  ioctl(fd, TLOB_IOCTL_TRACE_START, &args);
+
+  /* Map one control page + 64 records; use page->capacity afterwards. */
+  size_t pagesize = sysconf(_SC_PAGESIZE);
+  size_t len = pagesize + 64 * sizeof(struct tlob_event);
+  len = (len + pagesize - 1) & ~(pagesize - 1);   /* page-align length */
+  void *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+  struct tlob_mmap_page *page = map;
+  struct tlob_event *data =
+      (struct tlob_event *)((char *)map + page->data_offset);
+
+  /* Consumer loop: poll for events, read without copying. */
+  while (1) {
+      poll(&(struct pollfd){fd, POLLIN, 0}, 1, -1);
+
+      uint32_t head = __atomic_load_n(&page->data_head, __ATOMIC_ACQUIRE);
+      uint32_t tail = page->data_tail;
+      while (tail != head) {
+          handle(&data[tail & (page->capacity - 1)]);
+          tail++;
+      }
+      __atomic_store_n(&page->data_tail, tail, __ATOMIC_RELEASE);
+  }
+
+Note: ``read()`` and ``mmap()`` share the same ring and ``data_tail``
+cursor.  Do not use both simultaneously on the same fd.
+
+``tlob_event`` fields:
+
+``tid``
+  Thread ID (``task_pid_vnr``) of the violating task.
+
+``threshold_us``
+  Budget that was exceeded, in microseconds.
+
+``on_cpu_us``
+  Cumulative on-CPU time at violation time, in microseconds.
+
+``off_cpu_us``
+  Cumulative off-CPU time at violation time, in microseconds.
+
+``switches``
+  Number of context switches since ``TRACE_START``.
+
+``state``
+  1 = timer fired while task was on-CPU; 0 = timer fired while off-CPU.
+
+``tag``
+  Cookie from ``tlob_start_args.tag``; for the tracefs uprobe path this
+  equals ``offset_start``.  Zero when not set.
+
+tracefs files
+-------------
+
+The following files are created under
+``/sys/kernel/tracing/rv/monitors/tlob/``:
+
+``enable`` (rw)
+  Write ``1`` to enable the monitor; write ``0`` to disable it and
+  stop all currently monitored tasks.
+
+``desc`` (ro)
+  Human-readable description of the monitor.
+
+``monitor`` (rw)
+  Write ``threshold_us:offset_start:offset_stop:binary_path`` to bind two
+  plain entry uprobes in *binary_path*.  The uprobe at *offset_start* fires
+  ``tlob_start_task()``; the uprobe at *offset_stop* fires
+  ``tlob_stop_task()``.  Returns ``-EEXIST`` if a binding with the same
+  *offset_start* already exists for *binary_path*.  Write
+  ``-offset_start:binary_path`` to remove the binding.  Read to list
+  registered bindings, one
+  ``threshold_us:0xoffset_start:0xoffset_stop:binary_path`` entry per line.
+
+Specification
+-------------
+
+Graphviz DOT file in tools/verification/models/tlob.dot
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 331223761..8d3af68db 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -385,6 +385,7 @@ Code  Seq#    Include File                                             Comments
 0xB8  01-02  uapi/misc/mrvl_cn10k_dpi.h                                Marvell CN10K DPI driver
 0xB8  all    uapi/linux/mshv.h                                         Microsoft Hyper-V /dev/mshv driver
                                                                        <mailto:linux-hyperv@vger.kernel.org>
+0xB9  00-3F  linux/rv.h                                                Runtime Verification (RV) monitors
 0xBA  00-0F  uapi/linux/liveupdate.h                                   Pasha Tatashin
                                                                        <mailto:pasha.tatashin@soleen.com>
 0xC0  00-0F  linux/usb/iowarrior.h
diff --git a/include/uapi/linux/rv.h b/include/uapi/linux/rv.h
new file mode 100644
index 000000000..d1b96d8cd
--- /dev/null
+++ b/include/uapi/linux/rv.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * UAPI definitions for Runtime Verification (RV) monitors.
+ *
+ * All RV monitors that expose an ioctl self-instrumentation interface
+ * share the magic byte RV_IOC_MAGIC (0xB9), registered in
+ * Documentation/userspace-api/ioctl/ioctl-number.rst.
+ *
+ * A single /dev/rv misc device serves as the entry point.  ioctl numbers
+ * encode both the monitor identity and the operation:
+ *
+ *   0x01 - 0x1F  tlob (task latency over budget)
+ *   0x20 - 0x3F  reserved for future RV monitors
+ *
+ * Usage examples and design rationale are in:
+ *   Documentation/trace/rv/monitor_tlob.rst
+ */
+
+#ifndef _UAPI_LINUX_RV_H
+#define _UAPI_LINUX_RV_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/* Magic byte shared by all RV monitor ioctls. */
+#define RV_IOC_MAGIC	0xB9
+
+/* -----------------------------------------------------------------------
+ * tlob: task latency over budget monitor  (nr 0x01 - 0x1F)
+ * -----------------------------------------------------------------------
+ */
+
+/**
+ * struct tlob_start_args - arguments for TLOB_IOCTL_TRACE_START
+ * @threshold_us: Latency budget for this critical section, in microseconds.
+ *               Must be greater than zero.
+ * @tag:         Opaque 64-bit cookie supplied by the caller.  Echoed back
+ *               verbatim in the tlob_budget_exceeded ftrace event and in any
+ *               tlob_event record delivered via @notify_fd.  Use it to identify
+ *               which code region triggered a violation when the same thread
+ *               monitors multiple regions sequentially.  Set to 0 if not
+ *               needed.
+ * @notify_fd:   File descriptor that will receive a tlob_event record on
+ *               violation.  Must refer to an open /dev/rv fd.  May equal
+ *               the calling fd (self-notification, useful for retrieving the
+ *               on_cpu_us / off_cpu_us breakdown after TRACE_STOP returns
+ *               -EOVERFLOW).  Set to -1 to disable fd notification; in that
+ *               case violations are only signalled via the TRACE_STOP return
+ *               value and the tlob_budget_exceeded ftrace event.
+ * @flags:       Must be 0.  Reserved for future extensions.
+ */
+struct tlob_start_args {
+	__u64 threshold_us;
+	__u64 tag;
+	__s32 notify_fd;
+	__u32 flags;
+};
+
+/**
+ * struct tlob_event - one budget-exceeded event
+ *
+ * Consumed by read() on the notify_fd registered at TLOB_IOCTL_TRACE_START.
+ * Each record describes a single budget exceedance for one task.
+ *
+ * @tid:          Thread ID (task_pid_vnr) of the violating task.
+ * @threshold_us: Budget that was exceeded, in microseconds.
+ * @on_cpu_us:    Cumulative on-CPU time at violation time, in microseconds.
+ * @off_cpu_us:   Cumulative off-CPU (scheduling + I/O wait) time at
+ *               violation time, in microseconds.
+ * @switches:     Number of context switches since TRACE_START.
+ * @state:        DA state at violation: 1 = on_cpu, 0 = off_cpu.
+ * @tag:          Cookie from tlob_start_args.tag; for the tracefs uprobe path
+ *               this is the offset_start value.  Zero when not set.
+ */
+struct tlob_event {
+	__u32 tid;
+	__u32 pad;
+	__u64 threshold_us;
+	__u64 on_cpu_us;
+	__u64 off_cpu_us;
+	__u32 switches;
+	__u32 state;   /* 1 = on_cpu, 0 = off_cpu */
+	__u64 tag;
+};
+
+/**
+ * struct tlob_mmap_page - control page for the mmap'd violation ring buffer
+ *
+ * Mapped at offset 0 of the mmap region returned by mmap(2) on a /dev/rv fd.
+ * The data array of struct tlob_event records begins at offset @data_offset
+ * (always one page from the mmap base; use this field rather than hard-coding
+ * PAGE_SIZE so the code remains correct across architectures).
+ *
+ * Ring layout:
+ *
+ *   mmap base + 0             : struct tlob_mmap_page  (one page)
+ *   mmap base + data_offset   : struct tlob_event[capacity]
+ *
+ * The mmap length determines the ring capacity.  Compute it as:
+ *
+ *   raw    = sysconf(_SC_PAGESIZE) + capacity * sizeof(struct tlob_event)
+ *   length = (raw + sysconf(_SC_PAGESIZE) - 1) & ~(sysconf(_SC_PAGESIZE) - 1)
+ *
+ * i.e. round the raw byte count up to the next page boundary before
+ * passing it to mmap(2).  The kernel requires a page-aligned length.
+ * capacity must be a power of 2.  Read @capacity after a successful
+ * mmap(2) for the actual value.
+ *
+ * Producer/consumer ordering contract:
+ *
+ *   Kernel (producer):
+ *     data[data_head & (capacity - 1)] = event;
+ *     // pairs with load-acquire in userspace:
+ *     smp_store_release(&page->data_head, data_head + 1);
+ *
+ *   Userspace (consumer):
+ *     // pairs with store-release in kernel:
+ *     head = __atomic_load_n(&page->data_head, __ATOMIC_ACQUIRE);
+ *     for (tail = page->data_tail; tail != head; tail++)
+ *         handle(&data[tail & (capacity - 1)]);
+ *     __atomic_store_n(&page->data_tail, tail, __ATOMIC_RELEASE);
+ *
+ * @data_head and @data_tail are monotonically increasing __u32 counters
+ * in units of records.  Unsigned 32-bit wrap-around is handled correctly
+ * by modular arithmetic; the ring is full when
+ * (data_head - data_tail) == capacity.
+ *
+ * When the ring is full the kernel drops the incoming record and increments
+ * @dropped.  The consumer should check @dropped periodically to detect loss.
+ *
+ * read() and mmap() share the same ring buffer.  Do not use both
+ * simultaneously on the same fd.
+ *
+ * @data_head:   Next write slot index.  Updated by the kernel with
+ *               store-release ordering.  Read by userspace with load-acquire.
+ * @data_tail:   Next read slot index.  Updated by userspace.  Read by the
+ *               kernel to detect overflow.
+ * @capacity:    Actual ring capacity in records (power of 2).  Written once
+ *               by the kernel at mmap time; read-only for userspace thereafter.
+ * @version:     Ring buffer ABI version; currently 1.
+ * @data_offset: Byte offset from the mmap base to the data array.
+ *               Always equal to sysconf(_SC_PAGESIZE) on the running kernel.
+ * @record_size: sizeof(struct tlob_event) as seen by the kernel.  Verify
+ *               this matches userspace's sizeof before indexing the array.
+ * @dropped:     Number of events dropped because the ring was full.
+ *               Monotonically increasing; read with __ATOMIC_RELAXED.
+ */
+struct tlob_mmap_page {
+	__u32  data_head;
+	__u32  data_tail;
+	__u32  capacity;
+	__u32  version;
+	__u32  data_offset;
+	__u32  record_size;
+	__u64  dropped;
+};
+
+/*
+ * TLOB_IOCTL_TRACE_START - begin monitoring the calling task.
+ *
+ * Arms a per-task hrtimer for threshold_us microseconds.  If args.notify_fd
+ * is >= 0, a tlob_event record is pushed into that fd's ring buffer on
+ * violation in addition to the tlob_budget_exceeded ftrace event.
+ * args.notify_fd == -1 disables fd notification.
+ *
+ * Violation records are consumed by read() on the notify_fd (blocking or
+ * non-blocking depending on O_NONBLOCK).  On violation, TLOB_IOCTL_TRACE_STOP
+ * also returns -EOVERFLOW regardless of whether notify_fd is set.
+ *
+ * args.flags must be 0.
+ */
+#define TLOB_IOCTL_TRACE_START		_IOW(RV_IOC_MAGIC, 0x01, struct tlob_start_args)
+
+/*
+ * TLOB_IOCTL_TRACE_STOP - end monitoring the calling task.
+ *
+ * Returns 0 if within budget, -EOVERFLOW if the budget was exceeded.
+ */
+#define TLOB_IOCTL_TRACE_STOP		_IO(RV_IOC_MAGIC,  0x02)
+
+#endif /* _UAPI_LINUX_RV_H */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 5b4be87ba..227573cda 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -65,6 +65,7 @@ source "kernel/trace/rv/monitors/pagefault/Kconfig"
 source "kernel/trace/rv/monitors/sleep/Kconfig"
 # Add new rtapp monitors here
 
+source "kernel/trace/rv/monitors/tlob/Kconfig"
 # Add new monitors here
 
 config RV_REACTORS
@@ -93,3 +94,19 @@ config RV_REACT_PANIC
 	help
 	  Enables the panic reactor. The panic reactor emits a printk()
 	  message if an exception is found and panic()s the system.
+
+config RV_CHARDEV
+	bool "RV ioctl interface via /dev/rv"
+	depends on RV
+	default n
+	help
+	  Register a /dev/rv misc device that exposes an ioctl interface
+	  for RV monitor self-instrumentation.  All RV monitors share the
+	  single device node; ioctl numbers encode the monitor identity.
+
+	  When enabled, user-space programs can open /dev/rv and use
+	  monitor-specific ioctl commands to bracket code regions they
+	  want the kernel RV subsystem to observe.
+
+	  Say Y here if you want to use the tlob self-instrumentation
+	  ioctl interface; otherwise say N.
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 750e4ad6f..cc3781a3b 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -3,6 +3,7 @@
 ccflags-y += -I $(src)		# needed for trace events
 
 obj-$(CONFIG_RV) += rv.o
+obj-$(CONFIG_RV_CHARDEV) += rv_dev.o
 obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
 obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
 obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o
@@ -17,6 +18,7 @@ obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o
 obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o
 obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o
 obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
+obj-$(CONFIG_RV_MON_TLOB) += monitors/tlob/tlob.o
 # Add new monitors here
 obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
 obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
diff --git a/kernel/trace/rv/monitors/tlob/Kconfig b/kernel/trace/rv/monitors/tlob/Kconfig
new file mode 100644
index 000000000..010237480
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/Kconfig
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_TLOB
+	depends on RV
+	depends on UPROBES
+	select DA_MON_EVENTS_ID
+	bool "tlob monitor"
+	help
+	  Enable the tlob (task latency over budget) monitor. This monitor
+	  tracks the elapsed time (CLOCK_MONOTONIC) of a marked code path within a
+	  task (including both on-CPU and off-CPU time) and reports a
+	  violation when the elapsed time exceeds a configurable budget
+	  threshold.
+
+	  The monitor implements a three-state deterministic automaton.
+	  States: unmonitored, on_cpu, off_cpu.
+	  Key transitions:
+	    unmonitored    --(trace_start)-->    on_cpu
+	    on_cpu   --(switch_out)-->     off_cpu
+	    off_cpu  --(switch_in)-->      on_cpu
+	    on_cpu   --(trace_stop)-->    unmonitored
+	    off_cpu  --(trace_stop)-->    unmonitored
+	    on_cpu   --(budget_expired)--> unmonitored
+	    off_cpu  --(budget_expired)--> unmonitored
+
+	  External configuration is done via the tracefs "monitor" file:
+	    echo threshold_us:offset_start:offset_stop:binary_path > .../rv/monitors/tlob/monitor
+	    echo -offset_start:binary_path > .../rv/monitors/tlob/monitor  (remove binding)
+	    cat                              .../rv/monitors/tlob/monitor  (list bindings)
+
+	  The uprobe binding places two plain entry uprobes at offset_start and
+	  offset_stop in the binary; these trigger tlob_start_task() and
+	  tlob_stop_task() respectively.  Using two entry uprobes (rather than a
+	  uretprobe) means that a mistyped offset can never corrupt the call
+	  stack; the worst outcome is a missed stop, which causes the hrtimer to
+	  fire and report a budget violation.
+
+	  Violation events are delivered via a lock-free mmap ring buffer on
+	  /dev/rv (enabled by CONFIG_RV_CHARDEV).  The consumer mmap()s the
+	  device, reads records from the data array using the head/tail indices
+	  in the control page, and advances data_tail when done.
+
+	  For self-instrumentation, use TLOB_IOCTL_TRACE_START /
+	  TLOB_IOCTL_TRACE_STOP via the /dev/rv misc device (enabled by
+	  CONFIG_RV_CHARDEV).
+
+	  Up to TLOB_MAX_MONITORED tasks may be monitored simultaneously.
+
+	  For further information, see:
+	    Documentation/trace/rv/monitor_tlob.rst
+
diff --git a/kernel/trace/rv/monitors/tlob/tlob.c b/kernel/trace/rv/monitors/tlob/tlob.c
new file mode 100644
index 000000000..a6e474025
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob.c
@@ -0,0 +1,986 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob: task latency over budget monitor
+ *
+ * Track the elapsed wall-clock time of a marked code path and detect when
+ * a monitored task exceeds its per-task latency budget.  CLOCK_MONOTONIC
+ * is used so both on-CPU and off-CPU time count toward the budget.
+ *
+ * Per-task state is maintained in a spinlock-protected hash table.  A
+ * one-shot hrtimer fires at the deadline; if the task has not called
+ * trace_stop by then, a violation is recorded.
+ *
+ * Up to TLOB_MAX_MONITORED tasks may be tracked simultaneously.
+ *
+ * Copyright (C) 2026 Wen Yang <wen.yang@linux.dev>
+ */
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/hrtimer.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/rv.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/atomic.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/uprobes.h>
+#include <kunit/visibility.h>
+#include <rv/instrumentation.h>
+
+/* rv_interface_lock is defined in kernel/trace/rv/rv.c */
+extern struct mutex rv_interface_lock;
+
+#define MODULE_NAME "tlob"
+
+#include <rv_trace.h>
+#include <trace/events/sched.h>
+
+#define RV_MON_TYPE RV_MON_PER_TASK
+#include "tlob.h"
+#include <rv/da_monitor.h>
+
+/* Hash table size; must be a power of two. */
+#define TLOB_HTABLE_BITS		6
+#define TLOB_HTABLE_SIZE		(1 << TLOB_HTABLE_BITS)
+
+/* Maximum binary path length for uprobe binding. */
+#define TLOB_MAX_PATH			256
+
+/* Per-task latency monitoring state; one entry per monitored task. */
+struct tlob_task_state {
+	struct hlist_node	hlist;		/* bucket link in tlob_htable */
+	struct task_struct	*task;		/* monitored task (reference held) */
+	u64			threshold_us;	/* latency budget in microseconds */
+	u64			tag;		/* caller-supplied value copied into reports */
+	struct hrtimer		deadline_timer;	/* one-shot budget timer */
+	int			canceled;	/* written under tlob_table_lock + entry_lock */
+	struct file		*notify_file;	/* NULL or held reference */
+
+	/*
+	 * entry_lock serialises the mutable accounting fields below.
+	 * Lock order: tlob_table_lock -> entry_lock (never reverse).
+	 */
+	raw_spinlock_t		entry_lock;
+	u64			on_cpu_us;	/* accumulated on-CPU time */
+	u64			off_cpu_us;	/* accumulated off-CPU time */
+	ktime_t			last_ts;	/* time of the last accounting update */
+	u32			switches;	/* switch-out events seen */
+	u8			da_state;	/* on_cpu_tlob or off_cpu_tlob */
+
+	struct rcu_head		rcu;	/* for call_rcu() teardown */
+};
+
+/* Per-uprobe-binding state: a start + stop probe pair for one binary region. */
+struct tlob_uprobe_binding {
+	struct list_head	list;		/* link in tlob_uprobe_list */
+	u64			threshold_us;	/* budget handed to tlob_start_task() */
+	struct path		path;		/* resolved binary; dropped with path_put() */
+	char			binpath[TLOB_MAX_PATH];	/* canonical path reported on read */
+	loff_t			offset_start;	/* file offset of the start probe */
+	loff_t			offset_stop;	/* file offset of the stop probe */
+	struct uprobe_consumer	entry_uc;	/* runs tlob_uprobe_entry_handler() */
+	struct uprobe_consumer	stop_uc;	/* runs tlob_uprobe_stop_handler() */
+	struct uprobe		*entry_uprobe;	/* from uprobe_register() */
+	struct uprobe		*stop_uprobe;	/* from uprobe_register() */
+};
+
+/* Object pool for tlob_task_state. */
+static struct kmem_cache *tlob_state_cache;
+
+/* Hash table and lock protecting table structure (insert/delete/canceled). */
+static struct hlist_head tlob_htable[TLOB_HTABLE_SIZE];
+static DEFINE_RAW_SPINLOCK(tlob_table_lock);
+static atomic_t tlob_num_monitored = ATOMIC_INIT(0);
+
+/* Uprobe binding list; protected by tlob_uprobe_mutex. */
+static LIST_HEAD(tlob_uprobe_list);
+static DEFINE_MUTEX(tlob_uprobe_mutex);
+
+/* Forward declaration */
+static enum hrtimer_restart tlob_deadline_timer_fn(struct hrtimer *timer);
+
+/* Hash table helpers */
+
+/* Map a task pointer to its bucket index in tlob_htable. */
+static unsigned int tlob_hash_task(const struct task_struct *task)
+{
+	unsigned int bucket;
+
+	bucket = hash_ptr((void *)task, TLOB_HTABLE_BITS);
+	return bucket;
+}
+
+/*
+ * tlob_find_rcu - return the state entry tracking @task, or NULL if none.
+ * Callers must be inside rcu_read_lock() or hold tlob_table_lock.
+ */
+static struct tlob_task_state *tlob_find_rcu(struct task_struct *task)
+{
+	struct hlist_head *bucket = &tlob_htable[tlob_hash_task(task)];
+	struct tlob_task_state *entry;
+
+	hlist_for_each_entry_rcu(entry, bucket, hlist,
+				 lockdep_is_held(&tlob_table_lock)) {
+		if (entry->task == task)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Allocate a state entry for @task and initialise it as freshly started:
+ * on-CPU, timestamped now, deadline timer set up but not yet armed.
+ * Takes a reference on @task.  Returns NULL if the atomic allocation fails.
+ */
+static struct tlob_task_state *tlob_alloc(struct task_struct *task,
+					  u64 threshold_us, u64 tag)
+{
+	struct tlob_task_state *entry;
+
+	entry = kmem_cache_zalloc(tlob_state_cache, GFP_ATOMIC);
+	if (entry) {
+		entry->task = task;
+		get_task_struct(task);
+		entry->threshold_us = threshold_us;
+		entry->tag = tag;
+		entry->last_ts = ktime_get();
+		entry->da_state = on_cpu_tlob;
+		raw_spin_lock_init(&entry->entry_lock);
+		hrtimer_setup(&entry->deadline_timer, tlob_deadline_timer_fn,
+			      CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	}
+
+	return entry;
+}
+
+/* RCU callback: hand the entry back to tlob_state_cache. */
+static void tlob_free_rcu_slab(struct rcu_head *head)
+{
+	kmem_cache_free(tlob_state_cache,
+			container_of(head, struct tlob_task_state, rcu));
+}
+
+/* Start the one-shot deadline timer, ws->threshold_us microseconds from now. */
+static void tlob_arm_deadline(struct tlob_task_state *ws)
+{
+	ktime_t budget = ns_to_ktime(ws->threshold_us * NSEC_PER_USEC);
+
+	hrtimer_start(&ws->deadline_timer, budget, HRTIMER_MODE_REL);
+}
+
+/*
+ * Push a violation record into a monitor fd's ring buffer (softirq context).
+ * Called from tlob_deadline_timer_fn() and from the KUnit wrapper
+ * tlob_event_push_kunit().
+ *
+ * Drop-new policy: discard incoming record when full and count it in
+ * ring->page->dropped.  smp_store_release on data_head pairs with
+ * smp_load_acquire in the consumer.  Waiters on priv->waitq are woken only
+ * when a record was actually stored.
+ */
+static void tlob_event_push(struct rv_file_priv *priv,
+			    const struct tlob_event *info)
+{
+	struct tlob_ring *ring = &priv->ring;
+	unsigned long flags;
+	u32 head, tail;
+
+	spin_lock_irqsave(&ring->lock, flags);
+
+	head = ring->page->data_head;
+	tail = READ_ONCE(ring->page->data_tail);	/* advanced by the consumer */
+
+	if (head - tail > ring->mask) {	/* more than mask records outstanding */
+		/* Ring full: drop incoming record. */
+		ring->page->dropped++;
+		spin_unlock_irqrestore(&ring->lock, flags);
+		return;
+	}
+
+	ring->data[head & ring->mask] = *info;
+	/* pairs with smp_load_acquire() in the consumer */
+	smp_store_release(&ring->page->data_head, head + 1);
+
+	spin_unlock_irqrestore(&ring->lock, flags);
+
+	wake_up_interruptible_poll(&priv->waitq, EPOLLIN | EPOLLRDNORM);
+}
+
+#if IS_ENABLED(CONFIG_KUNIT)
+void tlob_event_push_kunit(struct rv_file_priv *priv,
+			  const struct tlob_event *info)
+{
+	tlob_event_push(priv, info);	/* expose the static helper to KUnit */
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_event_push_kunit);
+#endif /* CONFIG_KUNIT */
+
+/*
+ * Budget exceeded: remove the entry, record the violation, and inject
+ * budget_expired into the DA.
+ *
+ * Lock order: tlob_table_lock -> entry_lock.  tlob_stop_task() and
+ * tlob_stop_all() set ws->canceled under both locks; if we see it here the
+ * stop path owns cleanup and this callback does nothing.
+ * fput/put_task_struct are done before call_rcu(); the RCU callback only
+ * reclaims the slab.
+ *
+ * A tlob_event is pushed only when the entry has a notify_file; the
+ * tlob_budget_exceeded tracepoint is emitted in either case.
+ *
+ * Always returns HRTIMER_NORESTART (the timer is one-shot).
+ */
+static enum hrtimer_restart tlob_deadline_timer_fn(struct hrtimer *timer)
+{
+	struct tlob_task_state *ws =
+		container_of(timer, struct tlob_task_state, deadline_timer);
+	struct tlob_event info = {};
+	struct file *notify_file;
+	struct task_struct *task;
+	unsigned long flags;
+	/* snapshots taken under entry_lock */
+	u64 on_cpu_us, off_cpu_us, threshold_us, tag;
+	u32 switches;
+	bool on_cpu;
+	bool push_event = false;
+
+	raw_spin_lock_irqsave(&tlob_table_lock, flags);
+	/* stop path sets canceled under both locks; if set it owns cleanup */
+	if (ws->canceled) {
+		raw_spin_unlock_irqrestore(&tlob_table_lock, flags);
+		return HRTIMER_NORESTART;
+	}
+
+	/* Finalize accounting and snapshot all fields under entry_lock. */
+	raw_spin_lock(&ws->entry_lock);
+
+	{
+		ktime_t now = ktime_get();
+		u64 delta_us = ktime_to_us(ktime_sub(now, ws->last_ts));
+
+		if (ws->da_state == on_cpu_tlob)
+			ws->on_cpu_us += delta_us;
+		else
+			ws->off_cpu_us += delta_us;
+	}
+
+	ws->canceled  = 1;	/* stops the sched handlers from touching this entry */
+	on_cpu_us     = ws->on_cpu_us;
+	off_cpu_us    = ws->off_cpu_us;
+	threshold_us  = ws->threshold_us;
+	tag           = ws->tag;
+	switches      = ws->switches;
+	on_cpu        = (ws->da_state == on_cpu_tlob);
+	notify_file   = ws->notify_file;
+	if (notify_file) {
+		info.tid          = task_pid_vnr(ws->task);
+		info.threshold_us = threshold_us;
+		info.on_cpu_us    = on_cpu_us;
+		info.off_cpu_us   = off_cpu_us;
+		info.switches     = switches;
+		info.state        = on_cpu ? 1 : 0;	/* 1 = on-CPU at expiry, 0 = off-CPU */
+		info.tag          = tag;
+		push_event        = true;
+	}
+
+	raw_spin_unlock(&ws->entry_lock);
+
+	hlist_del_rcu(&ws->hlist);
+	atomic_dec(&tlob_num_monitored);
+	/*
+	 * Hold a reference so task remains valid across da_handle_event()
+	 * after we drop tlob_table_lock.
+	 */
+	task = ws->task;
+	get_task_struct(task);
+	raw_spin_unlock_irqrestore(&tlob_table_lock, flags);
+
+	/*
+	 * Both locks are now released; ws is exclusively owned (removed from
+	 * the hash table with canceled=1).  Emit the tracepoint and push the
+	 * violation record.
+	 */
+	trace_tlob_budget_exceeded(ws->task, threshold_us, on_cpu_us,
+				   off_cpu_us, switches, on_cpu, tag);
+
+	if (push_event) {
+		struct rv_file_priv *priv = notify_file->private_data;
+
+		if (priv)
+			tlob_event_push(priv, &info);
+	}
+
+	da_handle_event(task, budget_expired_tlob);
+
+	if (notify_file)
+		fput(notify_file);		/* ref from fget() at TRACE_START */
+	put_task_struct(ws->task);		/* ref from tlob_alloc() */
+	put_task_struct(task);			/* extra ref from get_task_struct() above */
+	call_rcu(&ws->rcu, tlob_free_rcu_slab);
+	return HRTIMER_NORESTART;
+}
+
+/* Tracepoint handlers */
+
+/*
+ * handle_sched_switch - advance the DA and accumulate on/off-CPU time.
+ *
+ * For @prev (being switched out) the time since last_ts is charged to
+ * on_cpu_us and the switch counter is bumped; for @next (being switched in)
+ * it is charged to off_cpu_us.  Entries already marked canceled are left
+ * untouched and generate no DA event.
+ *
+ * RCU read-side for lock-free lookup; entry_lock for per-task accounting.
+ * da_handle_event() is called after rcu_read_unlock() to avoid holding the
+ * read-side critical section across the RV framework.
+ */
+static void handle_sched_switch(void *data, bool preempt,
+				struct task_struct *prev,
+				struct task_struct *next,
+				unsigned int prev_state)
+{
+	struct tlob_task_state *ws;
+	unsigned long flags;
+	bool do_prev = false, do_next = false;
+	ktime_t now;
+
+	rcu_read_lock();
+
+	ws = tlob_find_rcu(prev);
+	if (ws) {
+		raw_spin_lock_irqsave(&ws->entry_lock, flags);
+		if (!ws->canceled) {
+			now = ktime_get();
+			ws->on_cpu_us += ktime_to_us(ktime_sub(now, ws->last_ts));
+			ws->last_ts = now;
+			ws->switches++;
+			ws->da_state = off_cpu_tlob;
+			do_prev = true;
+		}
+		raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+	}
+
+	ws = tlob_find_rcu(next);
+	if (ws) {
+		raw_spin_lock_irqsave(&ws->entry_lock, flags);
+		if (!ws->canceled) {
+			now = ktime_get();
+			ws->off_cpu_us += ktime_to_us(ktime_sub(now, ws->last_ts));
+			ws->last_ts = now;
+			ws->da_state = on_cpu_tlob;
+			do_next = true;
+		}
+		raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+	}
+
+	rcu_read_unlock();
+
+	if (do_prev)
+		da_handle_event(prev, switch_out_tlob);
+	if (do_next)
+		da_handle_event(next, switch_in_tlob);
+}
+
+/*
+ * handle_sched_wakeup - forward a wakeup of a monitored task to the DA.
+ *
+ * The entry is looked up under RCU and its canceled flag sampled under
+ * entry_lock; da_handle_event() runs only after both have been released.
+ */
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+	struct tlob_task_state *entry;
+	unsigned long irqflags;
+	bool monitored = false;
+
+	rcu_read_lock();
+	entry = tlob_find_rcu(p);
+	if (entry) {
+		raw_spin_lock_irqsave(&entry->entry_lock, irqflags);
+		if (!entry->canceled)
+			monitored = true;
+		raw_spin_unlock_irqrestore(&entry->entry_lock, irqflags);
+	}
+	rcu_read_unlock();
+
+	if (!monitored)
+		return;
+
+	da_handle_event(p, sched_wakeup_tlob);
+}
+
+/* -----------------------------------------------------------------------
+ * Core start/stop helpers (also called from rv_dev.c)
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * __tlob_insert - insert @ws into the hash table and arm its deadline timer.
+ *
+ * Re-checks for duplicates and capacity under tlob_table_lock; the caller
+ * may have done a pre-check before allocating @ws.
+ *
+ * On failure @ws is freed directly (never in table, so no call_rcu needed)
+ * and the task reference taken by tlob_alloc() is dropped.
+ * ws->notify_file is deliberately NOT released here: tlob_start_task()
+ * documents that the caller keeps responsibility for the notify_file
+ * reference whenever an error is returned, so dropping it here as well
+ * would result in a double fput().
+ *
+ * Returns 0 on success, -EEXIST if @task is already monitored, or -ENOSPC
+ * if TLOB_MAX_MONITORED entries are already in the table.
+ */
+static int __tlob_insert(struct task_struct *task, struct tlob_task_state *ws)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	raw_spin_lock_irqsave(&tlob_table_lock, flags);
+	if (tlob_find_rcu(task)) {
+		ret = -EEXIST;
+	} else if (atomic_read(&tlob_num_monitored) >= TLOB_MAX_MONITORED) {
+		ret = -ENOSPC;
+	} else {
+		hlist_add_head_rcu(&ws->hlist,
+				   &tlob_htable[tlob_hash_task(task)]);
+		atomic_inc(&tlob_num_monitored);
+	}
+	raw_spin_unlock_irqrestore(&tlob_table_lock, flags);
+
+	if (ret) {
+		/* Never published: no RCU reader can hold a pointer to ws. */
+		put_task_struct(ws->task);
+		kmem_cache_free(tlob_state_cache, ws);
+		return ret;
+	}
+
+	/* Start the DA before arming so trace_start precedes budget_expired. */
+	da_handle_start_run_event(task, trace_start_tlob);
+	tlob_arm_deadline(ws);
+	return 0;
+}
+
+/**
+ * tlob_start_task - begin monitoring @task with latency budget @threshold_us.
+ *
+ * @task:         task to monitor; a reference is taken for the lifetime of
+ *                the monitoring entry.
+ * @threshold_us: budget in microseconds; values that would overflow ktime_t
+ *                when converted to nanoseconds are rejected.
+ * @notify_file: /dev/rv fd whose ring buffer receives a tlob_event on
+ *               violation; caller transfers the fget() reference to tlob.c.
+ *               Pass NULL for synchronous mode (violations only via
+ *               TRACE_STOP return value and the tlob_budget_exceeded event).
+ * @tag:          opaque value copied into violation reports.
+ *
+ * Returns 0, -ENODEV (monitor not initialised), -ERANGE (@threshold_us too
+ * large), -EEXIST (@task already monitored), -ENOSPC (TLOB_MAX_MONITORED
+ * reached), or -ENOMEM.  On failure the caller
+ * retains responsibility for any @notify_file reference, so no error path
+ * reached from here may release it.
+ */
+int tlob_start_task(struct task_struct *task, u64 threshold_us,
+		    struct file *notify_file, u64 tag)
+{
+	struct tlob_task_state *ws;
+	unsigned long flags;
+
+	if (!tlob_state_cache)
+		return -ENODEV;
+
+	if (threshold_us > (u64)KTIME_MAX / NSEC_PER_USEC)
+		return -ERANGE;
+
+	/* Quick pre-check before allocation. */
+	raw_spin_lock_irqsave(&tlob_table_lock, flags);
+	if (tlob_find_rcu(task)) {
+		raw_spin_unlock_irqrestore(&tlob_table_lock, flags);
+		return -EEXIST;
+	}
+	if (atomic_read(&tlob_num_monitored) >= TLOB_MAX_MONITORED) {
+		raw_spin_unlock_irqrestore(&tlob_table_lock, flags);
+		return -ENOSPC;
+	}
+	raw_spin_unlock_irqrestore(&tlob_table_lock, flags);
+
+	ws = tlob_alloc(task, threshold_us, tag);
+	if (!ws)
+		return -ENOMEM;
+
+	ws->notify_file = notify_file;
+	return __tlob_insert(task, ws);	/* re-checks under the lock and frees ws on failure */
+}
+EXPORT_SYMBOL_GPL(tlob_start_task);
+
+/**
+ * tlob_stop_task - stop monitoring @task before the deadline fires.
+ *
+ * Sets canceled under entry_lock (inside tlob_table_lock) before calling
+ * hrtimer_cancel(), racing safely with the timer callback: whichever side
+ * unlinks the entry first owns its cleanup.
+ *
+ * On success the entry's notify_file and task references are dropped and
+ * the entry is freed through call_rcu().
+ *
+ * Returns 0 if within budget, -ESRCH if the entry is gone (deadline already
+ * fired, or TRACE_START was never called).
+ */
+int tlob_stop_task(struct task_struct *task)
+{
+	struct tlob_task_state *ws;
+	struct file *notify_file;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&tlob_table_lock, flags);
+	ws = tlob_find_rcu(task);
+	if (!ws) {
+		raw_spin_unlock_irqrestore(&tlob_table_lock, flags);
+		return -ESRCH;
+	}
+
+	/* Prevent handle_sched_switch from updating accounting after removal. */
+	raw_spin_lock(&ws->entry_lock);
+	ws->canceled = 1;
+	raw_spin_unlock(&ws->entry_lock);
+
+	hlist_del_rcu(&ws->hlist);
+	atomic_dec(&tlob_num_monitored);
+	raw_spin_unlock_irqrestore(&tlob_table_lock, flags);
+
+	hrtimer_cancel(&ws->deadline_timer);	/* called with no tlob lock held */
+
+	da_handle_event(task, trace_stop_tlob);
+
+	notify_file = ws->notify_file;
+	if (notify_file)
+		fput(notify_file);
+	put_task_struct(ws->task);
+	call_rcu(&ws->rcu, tlob_free_rcu_slab);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tlob_stop_task);
+
+/*
+ * Stop monitoring all tracked tasks; called on monitor disable.
+ *
+ * Phase 1 (under tlob_table_lock): mark every entry canceled, unlink it and
+ * collect it in a local batch.  Phase 2 (no locks held): cancel each timer,
+ * send trace_stop to the DA, drop the file/task references and queue the
+ * RCU free.  The batch holds TLOB_MAX_MONITORED entries, which is also the
+ * cap enforced at insertion time.
+ */
+static void tlob_stop_all(void)
+{
+	struct tlob_task_state *batch[TLOB_MAX_MONITORED];
+	struct tlob_task_state *ws;
+	struct hlist_node *tmp;
+	unsigned long flags;
+	int n = 0, i;
+
+	raw_spin_lock_irqsave(&tlob_table_lock, flags);
+	for (i = 0; i < TLOB_HTABLE_SIZE; i++) {
+		hlist_for_each_entry_safe(ws, tmp, &tlob_htable[i], hlist) {
+			raw_spin_lock(&ws->entry_lock);
+			ws->canceled = 1;
+			raw_spin_unlock(&ws->entry_lock);
+			hlist_del_rcu(&ws->hlist);
+			atomic_dec(&tlob_num_monitored);
+			if (n < TLOB_MAX_MONITORED)	/* an entry beyond the cap would be unlinked but not cleaned up */
+				batch[n++] = ws;
+		}
+	}
+	raw_spin_unlock_irqrestore(&tlob_table_lock, flags);
+
+	for (i = 0; i < n; i++) {
+		ws = batch[i];
+		hrtimer_cancel(&ws->deadline_timer);
+		da_handle_event(ws->task, trace_stop_tlob);
+		if (ws->notify_file)
+			fput(ws->notify_file);
+		put_task_struct(ws->task);
+		call_rcu(&ws->rcu, tlob_free_rcu_slab);
+	}
+}
+
+/* uprobe binding helpers */
+
+/*
+ * Start-probe handler: begin monitoring the current task with the binding's
+ * budget, using the start offset as the tag.  The result of
+ * tlob_start_task() is ignored and 0 is always returned.
+ */
+static int tlob_uprobe_entry_handler(struct uprobe_consumer *uc,
+				     struct pt_regs *regs, __u64 *data)
+{
+	struct tlob_uprobe_binding *binding;
+	u64 tag;
+
+	binding = container_of(uc, struct tlob_uprobe_binding, entry_uc);
+	tag = (u64)binding->offset_start;
+
+	tlob_start_task(current, binding->threshold_us, NULL, tag);
+	return 0;
+}
+
+/*
+ * Stop-probe handler: end monitoring of the current task.  The result of
+ * tlob_stop_task() is ignored and 0 is always returned.
+ */
+static int tlob_uprobe_stop_handler(struct uprobe_consumer *uc,
+				    struct pt_regs *regs, __u64 *data)
+{
+	(void)tlob_stop_task(current);
+
+	return 0;
+}
+
+/*
+ * Register start + stop entry uprobes for a binding.
+ * Both are plain entry uprobes (no uretprobe), so a wrong offset never
+ * corrupts the call stack; the worst outcome is a missed stop (hrtimer
+ * fires and reports a budget violation).
+ * Called with tlob_uprobe_mutex held.
+ *
+ * @binpath must be absolute and resolve to a regular file.  A second
+ * binding with the same start offset on the same dentry is rejected.
+ *
+ * Returns 0 on success, -EINVAL for a relative path or non-regular file,
+ * -EEXIST for a duplicate, -ENOMEM, or the error reported by kern_path(),
+ * d_path() or uprobe_register().
+ */
+static int tlob_add_uprobe(u64 threshold_us, const char *binpath,
+			   loff_t offset_start, loff_t offset_stop)
+{
+	struct tlob_uprobe_binding *b, *tmp_b;
+	char pathbuf[TLOB_MAX_PATH];
+	struct inode *inode;
+	char *canon;
+	int ret;
+
+	/* Validate the cheap argument first so no allocation is wasted. */
+	if (binpath[0] != '/')
+		return -EINVAL;
+
+	b = kzalloc(sizeof(*b), GFP_KERNEL);
+	if (!b)
+		return -ENOMEM;
+
+	b->threshold_us = threshold_us;
+	b->offset_start = offset_start;
+	b->offset_stop  = offset_stop;
+
+	ret = kern_path(binpath, LOOKUP_FOLLOW, &b->path);
+	if (ret)
+		goto err_free;
+
+	if (!d_is_reg(b->path.dentry)) {
+		ret = -EINVAL;
+		goto err_path;
+	}
+
+	/* Reject duplicate start offset for the same binary. */
+	list_for_each_entry(tmp_b, &tlob_uprobe_list, list) {
+		if (tmp_b->offset_start == offset_start &&
+		    tmp_b->path.dentry == b->path.dentry) {
+			ret = -EEXIST;
+			goto err_path;
+		}
+	}
+
+	/* Store canonical path for read-back. */
+	canon = d_path(&b->path, pathbuf, sizeof(pathbuf));
+	if (IS_ERR(canon)) {
+		ret = PTR_ERR(canon);
+		goto err_path;
+	}
+	strscpy(b->binpath, canon, sizeof(b->binpath));
+
+	b->entry_uc.handler = tlob_uprobe_entry_handler;
+	b->stop_uc.handler  = tlob_uprobe_stop_handler;
+
+	inode = d_real_inode(b->path.dentry);
+
+	b->entry_uprobe = uprobe_register(inode, offset_start, 0, &b->entry_uc);
+	if (IS_ERR(b->entry_uprobe)) {
+		ret = PTR_ERR(b->entry_uprobe);
+		b->entry_uprobe = NULL;
+		goto err_path;
+	}
+
+	b->stop_uprobe = uprobe_register(inode, offset_stop, 0, &b->stop_uc);
+	if (IS_ERR(b->stop_uprobe)) {
+		ret = PTR_ERR(b->stop_uprobe);
+		b->stop_uprobe = NULL;
+		goto err_entry;
+	}
+
+	list_add_tail(&b->list, &tlob_uprobe_list);
+	return 0;
+
+err_entry:
+	/* The start probe may already be firing: sync before freeing b. */
+	uprobe_unregister_nosync(b->entry_uprobe, &b->entry_uc);
+	uprobe_unregister_sync();
+err_path:
+	path_put(&b->path);
+err_free:
+	kfree(b);
+	return ret;
+}
+
+/*
+ * Remove the uprobe binding for (offset_start, binpath).
+ * binpath is resolved to a dentry for comparison so symlinks are handled
+ * correctly.  Called with tlob_uprobe_mutex held.
+ *
+ * At most one binding is removed.  Nothing is reported to the caller: the
+ * function returns silently if binpath cannot be resolved or if no binding
+ * matches.
+ */
+static void tlob_remove_uprobe_by_key(loff_t offset_start, const char *binpath)
+{
+	struct tlob_uprobe_binding *b, *tmp;
+	struct path remove_path;
+
+	if (kern_path(binpath, LOOKUP_FOLLOW, &remove_path))
+		return;
+
+	list_for_each_entry_safe(b, tmp, &tlob_uprobe_list, list) {
+		if (b->offset_start != offset_start)
+			continue;
+		if (b->path.dentry != remove_path.dentry)
+			continue;
+		uprobe_unregister_nosync(b->entry_uprobe, &b->entry_uc);
+		uprobe_unregister_nosync(b->stop_uprobe,  &b->stop_uc);
+		list_del(&b->list);
+		uprobe_unregister_sync();	/* handlers must be done before b is freed */
+		path_put(&b->path);
+		kfree(b);
+		break;
+	}
+
+	path_put(&remove_path);
+}
+
+/*
+ * Unregister all uprobe bindings; called from disable_tlob().
+ *
+ * The bindings embed the uprobe_consumer structures that running handlers
+ * dereference, so they must stay allocated until uprobe_unregister_sync()
+ * has returned.  Teardown therefore happens in three steps: unregister and
+ * move every binding to a private list under the mutex, wait for in-flight
+ * handlers, and only then release the paths and free the bindings.
+ */
+static void tlob_remove_all_uprobes(void)
+{
+	struct tlob_uprobe_binding *b, *tmp;
+	LIST_HEAD(doomed);
+
+	mutex_lock(&tlob_uprobe_mutex);
+	list_for_each_entry_safe(b, tmp, &tlob_uprobe_list, list) {
+		uprobe_unregister_nosync(b->entry_uprobe, &b->entry_uc);
+		uprobe_unregister_nosync(b->stop_uprobe,  &b->stop_uc);
+		list_move_tail(&b->list, &doomed);
+	}
+	mutex_unlock(&tlob_uprobe_mutex);
+
+	/* Wait until no handler can still be using a consumer from the list. */
+	uprobe_unregister_sync();
+
+	list_for_each_entry_safe(b, tmp, &doomed, list) {
+		list_del(&b->list);
+		path_put(&b->path);
+		kfree(b);
+	}
+}
+
+/*
+ * tracefs "monitor" file
+ *
+ * Read:  one "threshold_us:0xoffset_start:0xoffset_stop:binary_path\n"
+ *        line per registered uprobe binding.
+ * Write: "threshold_us:offset_start:offset_stop:binary_path" - add uprobe binding
+ *        "-offset_start:binary_path"                         - remove uprobe binding
+ */
+
+/*
+ * tlob_monitor_read - list the registered uprobe bindings.
+ *
+ * Emits one "threshold_us:0xoffset_start:0xoffset_stop:binary_path\n" line
+ * per binding.  Counting, allocating and formatting all happen inside one
+ * tlob_uprobe_mutex critical section, so a binding added concurrently
+ * cannot make the output outgrow the buffer and be cut short.
+ *
+ * Returns the result of simple_read_from_buffer(), or -ENOMEM.
+ */
+static ssize_t tlob_monitor_read(struct file *file,
+				 char __user *ubuf,
+				 size_t count, loff_t *ppos)
+{
+	/* threshold(20) + 2 offsets(2*18) + path(256) + delimiters */
+	const int line_sz = TLOB_MAX_PATH + 72;
+	struct tlob_uprobe_binding *b;
+	int n = 0, buf_sz, pos = 0;
+	ssize_t ret;
+	char *buf;
+
+	mutex_lock(&tlob_uprobe_mutex);
+
+	list_for_each_entry(b, &tlob_uprobe_list, list)
+		n++;
+
+	/* Always allocate at least one line so an empty list still works. */
+	buf_sz = (n ? n : 1) * line_sz + 1;
+	buf = kmalloc(buf_sz, GFP_KERNEL);
+	if (!buf) {
+		mutex_unlock(&tlob_uprobe_mutex);
+		return -ENOMEM;
+	}
+
+	list_for_each_entry(b, &tlob_uprobe_list, list) {
+		pos += scnprintf(buf + pos, buf_sz - pos,
+				 "%llu:0x%llx:0x%llx:%s\n",
+				 b->threshold_us,
+				 (unsigned long long)b->offset_start,
+				 (unsigned long long)b->offset_stop,
+				 b->binpath);
+	}
+
+	mutex_unlock(&tlob_uprobe_mutex);
+
+	ret = simple_read_from_buffer(ubuf, count, ppos, buf, pos);
+	kfree(buf);
+	return ret;
+}
+
+/*
+ * Parse "threshold_us:offset_start:offset_stop:binary_path".
+ *
+ * The path is the last field, so it may itself contain ':'.  On success
+ * *path_out points into @buf at the first character of the path.
+ *
+ * Returns 0 on success, or -EINVAL if the numeric fields do not parse, the
+ * threshold is zero, the path is empty, or either offset is negative.
+ */
+VISIBLE_IF_KUNIT int tlob_parse_uprobe_line(char *buf, u64 *thr_out,
+					    char **path_out,
+					    loff_t *start_out, loff_t *stop_out)
+{
+	unsigned long long budget_us;
+	long long off_start, off_stop;
+	int path_pos = 0;
+	int matched;
+
+	/*
+	 * %llu parses the decimal threshold, %lli the auto-base offsets
+	 * (so 0x-prefixed hex is accepted) and %n stores how many bytes
+	 * were consumed, i.e. where the path begins.
+	 */
+	matched = sscanf(buf, "%llu:%lli:%lli:%n",
+			 &budget_us, &off_start, &off_stop, &path_pos);
+	if (matched != 3)
+		return -EINVAL;
+
+	if (!budget_us || !path_pos || !buf[path_pos])
+		return -EINVAL;
+
+	if (off_start < 0 || off_stop < 0)
+		return -EINVAL;
+
+	*path_out  = buf + path_pos;
+	*thr_out   = budget_us;
+	*start_out = off_start;
+	*stop_out  = off_stop;
+	return 0;
+}
+
+/*
+ * tlob_monitor_write - add or remove a uprobe binding.
+ *
+ * Accepts "threshold_us:offset_start:offset_stop:binary_path" to add a
+ * binding, or "-offset_start:binary_path" to remove one.  One trailing
+ * newline is stripped.  A well-formed remove request returns @count even
+ * when no binding matched.
+ *
+ * Returns @count on success, -EINVAL for oversized or malformed input,
+ * -EFAULT if the user buffer cannot be read, or the error from
+ * tlob_add_uprobe().
+ */
+static ssize_t tlob_monitor_write(struct file *file,
+				  const char __user *ubuf,
+				  size_t count, loff_t *ppos)
+{
+	char line[TLOB_MAX_PATH + 64];
+	loff_t off_start, off_stop;
+	long long remove_off;
+	int consumed = 0;
+	u64 budget_us;
+	char *path;
+	int err;
+
+	if (count >= sizeof(line))
+		return -EINVAL;
+	if (copy_from_user(line, ubuf, count))
+		return -EFAULT;
+	line[count] = '\0';
+
+	if (count > 0 && line[count - 1] == '\n')
+		line[count - 1] = '\0';
+
+	if (line[0] != '-') {
+		/* Add request; path ends up pointing into line. */
+		err = tlob_parse_uprobe_line(line, &budget_us, &path,
+					     &off_start, &off_stop);
+		if (err)
+			return err;
+
+		mutex_lock(&tlob_uprobe_mutex);
+		err = tlob_add_uprobe(budget_us, path, off_start, off_stop);
+		mutex_unlock(&tlob_uprobe_mutex);
+
+		if (err)
+			return err;
+		return (ssize_t)count;
+	}
+
+	/* Remove request: "-offset_start:binary_path" */
+	if (sscanf(line + 1, "%lli:%n", &remove_off, &consumed) != 1 ||
+	    consumed == 0)
+		return -EINVAL;
+
+	path = line + 1 + consumed;
+	if (path[0] != '/')
+		return -EINVAL;
+
+	mutex_lock(&tlob_uprobe_mutex);
+	tlob_remove_uprobe_by_key((loff_t)remove_off, path);
+	mutex_unlock(&tlob_uprobe_mutex);
+
+	return (ssize_t)count;
+}
+
+static const struct file_operations tlob_monitor_fops = {
+	.open	= simple_open,
+	.read	= tlob_monitor_read,	/* list uprobe bindings */
+	.write	= tlob_monitor_write,	/* add or remove a uprobe binding */
+	.llseek	= noop_llseek,
+};
+
+/*
+ * __tlob_init_monitor / __tlob_destroy_monitor - called with rv_interface_lock
+ * held (required by da_monitor_init/destroy via rv_get/put_task_monitor_slot).
+ *
+ * Init creates the state cache, resets the hash table and counter, then
+ * initialises the DA monitor.  If that last step fails the cache is
+ * destroyed again and the error is returned; otherwise the monitor is
+ * marked enabled and 0 is returned.
+ */
+static int __tlob_init_monitor(void)
+{
+	int bucket, err;
+
+	tlob_state_cache = kmem_cache_create("tlob_task_state",
+					     sizeof(struct tlob_task_state),
+					     0, 0, NULL);
+	if (!tlob_state_cache)
+		return -ENOMEM;
+
+	for (bucket = 0; bucket < TLOB_HTABLE_SIZE; bucket++)
+		INIT_HLIST_HEAD(&tlob_htable[bucket]);
+	atomic_set(&tlob_num_monitored, 0);
+
+	err = da_monitor_init();
+	if (!err) {
+		rv_this.enabled = 1;
+		return 0;
+	}
+
+	kmem_cache_destroy(tlob_state_cache);
+	tlob_state_cache = NULL;
+	return err;
+}
+
+/*
+ * Tear the monitor down: mark it disabled, stop every monitored task,
+ * remove all uprobe bindings, wait for outstanding RCU work, then destroy
+ * the DA monitor and the state cache.  Called with rv_interface_lock held.
+ */
+static void __tlob_destroy_monitor(void)
+{
+	rv_this.enabled = 0;
+	tlob_stop_all();
+	tlob_remove_all_uprobes();
+	/*
+	 * synchronize_rcu() only waits for current readers; it does not
+	 * guarantee that callbacks queued with call_rcu() have run.  Those
+	 * callbacks free objects into tlob_state_cache, so rcu_barrier()
+	 * must complete before the cache is destroyed.
+	 */
+	synchronize_rcu();
+	rcu_barrier();
+	da_monitor_destroy();
+	kmem_cache_destroy(tlob_state_cache);
+	tlob_state_cache = NULL;
+}
+
+/*
+ * tlob_init_monitor / tlob_destroy_monitor - KUnit wrappers that acquire
+ * rv_interface_lock, satisfying the lockdep_assert_held() inside
+ * rv_get/put_task_monitor_slot().
+ *
+ * Returns whatever __tlob_init_monitor() returns.
+ */
+VISIBLE_IF_KUNIT int tlob_init_monitor(void)
+{
+	int err;
+
+	mutex_lock(&rv_interface_lock);
+	err = __tlob_init_monitor();
+	mutex_unlock(&rv_interface_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_init_monitor);
+
+/* KUnit wrapper: run __tlob_destroy_monitor() under rv_interface_lock. */
+VISIBLE_IF_KUNIT void tlob_destroy_monitor(void)
+{
+	mutex_lock(&rv_interface_lock);
+
+	__tlob_destroy_monitor();
+
+	mutex_unlock(&rv_interface_lock);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_destroy_monitor);
+
+VISIBLE_IF_KUNIT int tlob_enable_hooks(void)
+{
+	rv_attach_trace_probe("tlob", sched_switch, handle_sched_switch);	/* on/off-CPU accounting */
+	rv_attach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup);	/* wakeup DA event */
+	return 0;	/* unconditionally reports success */
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_enable_hooks);
+
+VISIBLE_IF_KUNIT void tlob_disable_hooks(void)
+{
+	rv_detach_trace_probe("tlob", sched_switch, handle_sched_switch);	/* undoes tlob_enable_hooks() */
+	rv_detach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_disable_hooks);
+
+/*
+ * enable_tlob / disable_tlob - called by rv_enable/disable_monitor() which
+ * already holds rv_interface_lock; call the __ variants directly.
+ *
+ * Enable initialises the monitor and, only if that succeeded, attaches the
+ * tracepoint hooks.
+ */
+static int enable_tlob(void)
+{
+	int err = __tlob_init_monitor();
+
+	return err ? err : tlob_enable_hooks();
+}
+
+static void disable_tlob(void)
+{
+	tlob_disable_hooks();		/* detach the tracepoint probes first */
+	__tlob_destroy_monitor();	/* then stop tasks, drop uprobes, free state */
+}
+
+static struct rv_monitor rv_this = {
+	.name		= "tlob",
+	.description	= "Per-task latency-over-budget monitor.",
+	.enable		= enable_tlob,		/* init state, attach probes */
+	.disable	= disable_tlob,		/* detach probes, tear down state */
+	.reset		= da_monitor_reset_all,
+	.enabled	= 0,			/* set to 1 by __tlob_init_monitor() */
+};
+
+/*
+ * Module init: register the monitor with RV and, when RV created a tracefs
+ * directory for it, add the "monitor" control file there.  The result of
+ * tracefs_create_file() is not checked.
+ */
+static int __init register_tlob(void)
+{
+	int err = rv_register_monitor(&rv_this, NULL);
+
+	if (err)
+		return err;
+
+	if (!rv_this.root_d)
+		return 0;
+
+	tracefs_create_file("monitor", 0644, rv_this.root_d, NULL,
+			    &tlob_monitor_fops);
+	return 0;
+}
+
+static void __exit unregister_tlob(void)
+{
+	rv_unregister_monitor(&rv_this);	/* undoes rv_register_monitor() from register_tlob() */
+}
+
+module_init(register_tlob);
+module_exit(unregister_tlob);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wen Yang <wen.yang@linux.dev>");
+MODULE_DESCRIPTION("tlob: task latency over budget per-task monitor.");
diff --git a/kernel/trace/rv/monitors/tlob/tlob.h b/kernel/trace/rv/monitors/tlob/tlob.h
new file mode 100644
index 000000000..3438a6175
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RV_TLOB_H
+#define _RV_TLOB_H
+
+/*
+ * C representation of the tlob automaton, generated from tlob.dot via rvgen
+ * and extended with tlob_start_task()/tlob_stop_task() declarations.
+ * For the format description see Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#include <linux/rv.h>
+#include <uapi/linux/rv.h>
+
+#define MONITOR_NAME tlob
+
+enum states_tlob {
+	unmonitored_tlob,
+	on_cpu_tlob,
+	off_cpu_tlob,
+	state_max_tlob,
+};
+
+#define INVALID_STATE state_max_tlob
+
+enum events_tlob {
+	trace_start_tlob,
+	switch_in_tlob,
+	switch_out_tlob,
+	sched_wakeup_tlob,
+	trace_stop_tlob,
+	budget_expired_tlob,
+	event_max_tlob,
+};
+
+struct automaton_tlob {
+	char *state_names[state_max_tlob];
+	char *event_names[event_max_tlob];
+	unsigned char function[state_max_tlob][event_max_tlob];
+	unsigned char initial_state;
+	bool final_states[state_max_tlob];
+};
+
+static const struct automaton_tlob automaton_tlob = {
+	.state_names = {
+		"unmonitored",
+		"on_cpu",
+		"off_cpu",
+	},
+	.event_names = {
+		"trace_start",
+		"switch_in",
+		"switch_out",
+		"sched_wakeup",
+		"trace_stop",
+		"budget_expired",
+	},
+	.function = {	/* next state, indexed [current state][event] */
+		/* unmonitored */
+		{
+			on_cpu_tlob,		/* trace_start    */
+			unmonitored_tlob,	/* switch_in      */
+			unmonitored_tlob,	/* switch_out     */
+			unmonitored_tlob,	/* sched_wakeup   */
+			INVALID_STATE,		/* trace_stop     */
+			INVALID_STATE,		/* budget_expired */
+		},
+		/* on_cpu */
+		{
+			INVALID_STATE,		/* trace_start    */
+			INVALID_STATE,		/* switch_in      */
+			off_cpu_tlob,		/* switch_out     */
+			on_cpu_tlob,		/* sched_wakeup   */
+			unmonitored_tlob,	/* trace_stop     */
+			unmonitored_tlob,	/* budget_expired */
+		},
+		/* off_cpu */
+		{
+			INVALID_STATE,		/* trace_start    */
+			on_cpu_tlob,		/* switch_in      */
+			off_cpu_tlob,		/* switch_out     */
+			off_cpu_tlob,		/* sched_wakeup   */
+			unmonitored_tlob,	/* trace_stop     */
+			unmonitored_tlob,	/* budget_expired */
+		},
+	},
+	/*
+	 * final_states: unmonitored is the sole accepting state.
+	 * Budget violations are not modelled as DA errors; they are reported
+	 * through the /dev/rv ring buffer (tlob_event_push() in tlob.c) and
+	 * the tlob_budget_exceeded tracepoint.
+	 */
+	.initial_state = unmonitored_tlob,
+	.final_states = { 1, 0, 0 },
+};
+
+/* Exported for use by the RV ioctl layer (rv_dev.c) */
+int tlob_start_task(struct task_struct *task, u64 threshold_us,
+		    struct file *notify_file, u64 tag);
+int tlob_stop_task(struct task_struct *task);
+
+/* Maximum number of concurrently monitored tasks (also used by KUnit). */
+#define TLOB_MAX_MONITORED	64U
+
+/*
+ * Ring buffer constants (also published in UAPI for mmap size calculation).
+ */
+#define TLOB_RING_DEFAULT_CAP	64U	/* records allocated at open()  */
+#define TLOB_RING_MIN_CAP	 8U	/* minimum accepted by mmap()   */
+#define TLOB_RING_MAX_CAP	4096U	/* maximum accepted by mmap()   */
+
+/**
+ * struct tlob_ring - per-fd mmap-capable violation ring buffer.
+ *
+ * Allocated as a contiguous page range at rv_open() time:
+ *   page 0:    struct tlob_mmap_page  (shared with userspace)
+ *   pages 1-N: struct tlob_event[capacity]
+ *
+ * @page:  control page holding data_head, data_tail and dropped.
+ * @data:  array of event records, indexed by (head & @mask).
+ * @mask:  index mask; the ring is treated as full once more than @mask
+ *         records are outstanding.
+ * @lock:  serialises producers in tlob_event_push().
+ * @base:  presumably the address of the backing allocation - confirm
+ *         against rv_dev.c.
+ * @order: presumably the page order of the backing allocation - confirm
+ *         against rv_dev.c.
+ */
+struct tlob_ring {
+	struct tlob_mmap_page	*page;
+	struct tlob_event	*data;
+	u32			 mask;
+	spinlock_t		 lock;
+	unsigned long		 base;
+	unsigned int		 order;
+};
+
+/**
+ * struct rv_file_priv - per-fd private data for /dev/rv.
+ *
+ * @ring:  violation ring buffer filled by tlob_event_push().
+ * @waitq: wait queue woken with EPOLLIN | EPOLLRDNORM after a record has
+ *         been stored in @ring.
+ */
+struct rv_file_priv {
+	struct tlob_ring	ring;
+	wait_queue_head_t	waitq;
+};
+
+#if IS_ENABLED(CONFIG_KUNIT)
+int tlob_init_monitor(void);
+void tlob_destroy_monitor(void);
+int tlob_enable_hooks(void);
+void tlob_disable_hooks(void);
+void tlob_event_push_kunit(struct rv_file_priv *priv,
+			  const struct tlob_event *info);
+int tlob_parse_uprobe_line(char *buf, u64 *thr_out,
+			   char **path_out,
+			   loff_t *start_out, loff_t *stop_out);
+#endif /* CONFIG_KUNIT */
+
+#endif /* _RV_TLOB_H */
diff --git a/kernel/trace/rv/monitors/tlob/tlob_trace.h b/kernel/trace/rv/monitors/tlob/tlob_trace.h
new file mode 100644
index 000000000..b08d67776
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob_trace.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_TLOB
+/*
+ * tlob uses the generic event_da_monitor_id and error_da_monitor_id event
+ * classes so that both event classes are instantiated.  This avoids a
+ * -Werror=unused-variable warning that the compiler emits when a
+ * DECLARE_EVENT_CLASS has no corresponding DEFINE_EVENT instance.
+ *
+ * The event_tlob tracepoint is defined here but the call-site in
+ * da_handle_event() is overridden with a no-op macro below so that no
+ * trace record is emitted on every scheduler context switch.  Budget
+ * violations are reported via the dedicated tlob_budget_exceeded event.
+ *
+ * error_tlob IS kept active so that invalid DA transitions (programming
+ * errors) are still visible in the ftrace ring buffer for debugging.
+ */
+DEFINE_EVENT(event_da_monitor_id, event_tlob,
+	     TP_PROTO(int id, char *state, char *event, char *next_state,
+		      bool final_state),
+	     TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_tlob,
+	     TP_PROTO(int id, char *state, char *event),
+	     TP_ARGS(id, state, event));
+
+/*
+ * Override the trace_event_tlob() call-site with a no-op after the
+ * DEFINE_EVENT above has satisfied the event class instantiation
+ * requirement.  The tracepoint symbol itself exists (and can be enabled
+ * via tracefs) but the automatic call from da_handle_event() is silenced
+ * to avoid per-context-switch ftrace noise during normal operation.
+ */
+#undef trace_event_tlob
+#define trace_event_tlob(id, state, event, next_state, final_state)	\
+	do { (void)(id); (void)(state); (void)(event);			\
+	     (void)(next_state); (void)(final_state); } while (0)
+#endif /* CONFIG_RV_MON_TLOB */
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index ee4e68102..e754e76d5 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -148,6 +148,10 @@
 #include <rv_trace.h>
 #endif
 
+#ifdef CONFIG_RV_MON_TLOB
+EXPORT_TRACEPOINT_SYMBOL_GPL(tlob_budget_exceeded);
+#endif
+
 #include "rv.h"
 
 DEFINE_MUTEX(rv_interface_lock);
diff --git a/kernel/trace/rv/rv_dev.c b/kernel/trace/rv/rv_dev.c
new file mode 100644
index 000000000..a052f3203
--- /dev/null
+++ b/kernel/trace/rv/rv_dev.c
@@ -0,0 +1,602 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * rv_dev.c - /dev/rv misc device for RV monitor self-instrumentation
+ *
+ * A single misc device (MISC_DYNAMIC_MINOR) serves all RV monitors.
+ * ioctl numbers encode the monitor identity:
+ *
+ *   0x01 - 0x1F  tlob (task latency over budget)
+ *   0x20 - 0x3F  reserved
+ *
+ * Each monitor exports tlob_start_task() / tlob_stop_task() which are
+ * called here.  The calling task is identified by current.
+ *
+ * Magic: RV_IOC_MAGIC (0xB9), defined in include/uapi/linux/rv.h
+ *
+ * Per-fd private data (rv_file_priv)
+ * ------------------------------------
+ * Every open() of /dev/rv allocates an rv_file_priv (defined in tlob.h).
+ * When TLOB_IOCTL_TRACE_START is called with args.notify_fd >= 0, violations
+ * are pushed as tlob_event records into that fd's per-fd ring buffer (tlob_ring)
+ * and its poll/epoll waitqueue is woken.
+ *
+ * Consumers drain records with read() on the notify_fd; read() blocks until
+ * at least one record is available (unless O_NONBLOCK is set).
+ *
+ * Per-thread "started" tracking (tlob_task_handle)
+ * -------------------------------------------------
+ * tlob_stop_task() returns -ESRCH in two distinct situations:
+ *
+ *   (a) The deadline timer already fired and removed the tlob hash-table
+ *       entry before TRACE_STOP arrived -> budget was exceeded -> -EOVERFLOW
+ *
+ *   (b) TRACE_START was never called for this thread -> programming error
+ *       -> -ESRCH
+ *
+ * To distinguish them, rv_dev.c maintains a lightweight hash table
+ * (tlob_handles) that records a tlob_task_handle for every task_struct *
+ * for which a successful TLOB_IOCTL_TRACE_START has been
+ * issued but the corresponding TLOB_IOCTL_TRACE_STOP has not yet arrived.
+ *
+ * tlob_task_handle is a thin "session ticket"  --  it carries only the
+ * task pointer and the owning file descriptor.  The heavy per-task state
+ * (hrtimer, DA state, threshold) lives in tlob_task_state inside tlob.c.
+ *
+ * The table is keyed on task_struct * (same key as tlob.c), protected
+ * by tlob_handles_lock (spinlock, irq-safe).  No get_task_struct()
+ * refcount is needed here because tlob.c already holds a reference for
+ * each live entry.
+ *
+ * Multiple threads may share the same fd.  Each thread has its own
+ * tlob_task_handle in the table, so concurrent TRACE_START / TRACE_STOP
+ * calls from different threads do not interfere.
+ *
+ * The fd release path (rv_release) calls tlob_stop_task() for every
+ * handle in tlob_handles that belongs to the closing fd, ensuring cleanup
+ * even if the user forgets to call TRACE_STOP.
+ */
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/hash.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/rv.h>
+
+#ifdef CONFIG_RV_MON_TLOB
+#include "monitors/tlob/tlob.h"
+#endif
+
+/* -----------------------------------------------------------------------
+ * tlob_task_handle - per-thread session ticket for the ioctl interface
+ *
+ * One handle is allocated by TLOB_IOCTL_TRACE_START and freed by
+ * TLOB_IOCTL_TRACE_STOP (or by rv_release if the fd is closed).
+ *
+ * @hlist:  Hash-table linkage in tlob_handles (keyed on task pointer).
+ * @task:   The monitored thread.  Plain pointer; no refcount held here
+ *          because tlob.c holds one for the lifetime of the monitoring
+ *          window, which encompasses the lifetime of this handle.
+ * @file:   The open /dev/rv file (struct file) that issued TRACE_START.
+ *          Used by rv_release() to sweep orphaned handles on close().
+ * -----------------------------------------------------------------------
+ */
+#define TLOB_HANDLES_BITS	5
+#define TLOB_HANDLES_SIZE	(1 << TLOB_HANDLES_BITS)
+
+struct tlob_task_handle {
+	struct hlist_node	hlist;
+	struct task_struct	*task;
+	struct file		*file;
+};
+
+static struct hlist_head tlob_handles[TLOB_HANDLES_SIZE];
+static DEFINE_SPINLOCK(tlob_handles_lock);
+
+static unsigned int tlob_handle_hash(const struct task_struct *task)
+{
+	return hash_ptr(task, TLOB_HANDLES_BITS);
+}
+
+/* Caller must hold tlob_handles_lock.  Returns NULL if @task has no handle. */
+static struct tlob_task_handle *
+tlob_handle_find_locked(struct task_struct *task)
+{
+	struct hlist_head *bucket = &tlob_handles[tlob_handle_hash(task)];
+	struct tlob_task_handle *entry;
+
+	hlist_for_each_entry(entry, bucket, hlist) {
+		if (entry->task == task)
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * tlob_handle_alloc - register an active monitoring session for @task that
+ *                     was started through @file.
+ *
+ * Returns 0 on success, -EEXIST if @task already has a handle (double
+ * TRACE_START without TRACE_STOP), -ENOMEM on allocation failure.
+ */
+static int tlob_handle_alloc(struct task_struct *task, struct file *file)
+{
+	struct tlob_task_handle *entry;
+	unsigned long irqflags;
+	int ret = 0;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+	entry->task = task;
+	entry->file = file;
+
+	spin_lock_irqsave(&tlob_handles_lock, irqflags);
+	if (tlob_handle_find_locked(task))
+		ret = -EEXIST;
+	else
+		hlist_add_head(&entry->hlist,
+			       &tlob_handles[tlob_handle_hash(task)]);
+	spin_unlock_irqrestore(&tlob_handles_lock, irqflags);
+	if (ret)
+		kfree(entry);
+	return ret;
+}
+
+/*
+ * tlob_handle_free - drop the session handle recorded for @task, if any.
+ *
+ * Returns 1 when a handle was found and freed (TRACE_START had been
+ * called), 0 when @task had no handle.
+ */
+static int tlob_handle_free(struct task_struct *task)
+{
+	struct tlob_task_handle *entry;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&tlob_handles_lock, irqflags);
+	entry = tlob_handle_find_locked(task);
+	if (entry)
+		hlist_del_init(&entry->hlist);
+	spin_unlock_irqrestore(&tlob_handles_lock, irqflags);
+
+	if (!entry)
+		return 0;
+	kfree(entry);
+	return 1;
+}
+
+/*
+ * tlob_handle_sweep_file - release all handles owned by @file.
+ *
+ * Called from rv_release() when the fd is closed without TRACE_STOP.
+ * Calls tlob_stop_task() for each orphaned handle to drain the tlob
+ * monitoring entries and prevent resource leaks in tlob.c.
+ *
+ * Handles are collected under the lock (short critical section), then
+ * processed outside it (tlob_stop_task() may sleep/spin internally).
+ */
+#ifdef CONFIG_RV_MON_TLOB
+static void tlob_handle_sweep_file(struct file *file)
+{
+	struct tlob_task_handle *h;
+	struct hlist_node *tmp;
+	unsigned long flags;
+	HLIST_HEAD(orphans);
+	int i;
+
+	/* Private list: the number of matches has no fixed upper bound. */
+	spin_lock_irqsave(&tlob_handles_lock, flags);
+	for (i = 0; i < TLOB_HANDLES_SIZE; i++) {
+		hlist_for_each_entry_safe(h, tmp, &tlob_handles[i], hlist) {
+			if (h->file == file) {
+				hlist_del(&h->hlist);
+				hlist_add_head(&h->hlist, &orphans);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&tlob_handles_lock, flags);
+
+	hlist_for_each_entry_safe(h, tmp, &orphans, hlist) {
+		/*
+		 * -ESRCH just means the deadline timer already cleaned up.
+		 */
+		tlob_stop_task(h->task);
+		kfree(h);
+	}
+}
+#else
+static inline void tlob_handle_sweep_file(struct file *file) {}
+#endif /* CONFIG_RV_MON_TLOB */
+
+/* -----------------------------------------------------------------------
+ * Ring buffer lifecycle
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * tlob_ring_alloc - allocate a ring of @cap records (must be a power of 2).
+ *
+ * Allocates a physically contiguous block of pages:
+ *   page 0     : struct tlob_mmap_page  (control page, shared with userspace)
+ *   pages 1..N : struct tlob_event[cap] (data pages)
+ *
+ * Each page is marked reserved so it can be mapped to userspace via mmap().
+ */
+static int tlob_ring_alloc(struct tlob_ring *ring, u32 cap)
+{
+	unsigned int total = PAGE_SIZE + cap * sizeof(struct tlob_event);
+	unsigned int order = get_order(total);
+	struct tlob_mmap_page *ctrl;
+	unsigned long base, addr;
+
+	base = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!base)
+		return -ENOMEM;
+
+	for (addr = base; addr < base + (PAGE_SIZE << order); addr += PAGE_SIZE)
+		SetPageReserved(virt_to_page((void *)addr));
+
+	ctrl = (struct tlob_mmap_page *)base;
+	ctrl->capacity    = cap;
+	ctrl->version     = 1;
+	ctrl->data_offset = PAGE_SIZE;
+	ctrl->record_size = sizeof(struct tlob_event);
+	ring->base  = base;
+	ring->order = order;
+	ring->page  = ctrl;
+	ring->data  = (struct tlob_event *)(base + PAGE_SIZE);
+	ring->mask  = cap - 1;
+	spin_lock_init(&ring->lock);
+	return 0;
+}
+
+static void tlob_ring_free(struct tlob_ring *ring)
+{
+	unsigned long addr, end;
+
+	if (!ring->base)
+		return;
+
+	end = ring->base + (PAGE_SIZE << ring->order);
+	for (addr = ring->base; addr < end; addr += PAGE_SIZE)
+		ClearPageReserved(virt_to_page((void *)addr));
+	free_pages(ring->base, ring->order);
+	ring->data = NULL;
+	ring->page = NULL;
+	ring->base = 0;
+}
+
+/* -----------------------------------------------------------------------
+ * File operations
+ * -----------------------------------------------------------------------
+ */
+
+static int rv_open(struct inode *inode, struct file *file)
+{
+	struct rv_file_priv *priv;
+	int err;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+	err = tlob_ring_alloc(&priv->ring, TLOB_RING_DEFAULT_CAP);
+	if (err)
+		goto free_priv;
+	init_waitqueue_head(&priv->waitq);
+	file->private_data = priv;
+	return 0;
+
+free_priv:
+	kfree(priv);
+	return err;
+}
+
+static int rv_release(struct inode *inode, struct file *file)
+{
+	struct rv_file_priv *priv = file->private_data;
+
+	tlob_handle_sweep_file(file);
+	tlob_ring_free(&priv->ring);
+	kfree(priv);
+	file->private_data = NULL;
+	return 0;
+}
+
+static __poll_t rv_poll(struct file *file, poll_table *wait)
+{
+	struct rv_file_priv *priv = file->private_data;
+
+	if (!priv)
+		return EPOLLERR;
+
+	poll_wait(file, &priv->waitq, wait);
+
+	/*
+	 * Pairs with smp_store_release(&ring->page->data_head, ...) in
+	 * tlob_event_push().  No lock needed: head is written by the kernel
+	 * producer and read here; tail is written by the consumer and we only
+	 * need an approximate check for the poll fast path.
+	 */
+	if (smp_load_acquire(&priv->ring.page->data_head) !=
+	    READ_ONCE(priv->ring.page->data_tail))
+		return EPOLLIN | EPOLLRDNORM;
+
+	return 0;
+}
+
+/*
+ * rv_read - consume tlob_event violation records from this fd's ring buffer.
+ *
+ * Each read() returns a whole number of struct tlob_event records.  @count must
+ * be at least sizeof(struct tlob_event); partial-record sizes are rejected with
+ * -EINVAL.
+ *
+ * Blocking behaviour follows O_NONBLOCK on the fd:
+ *   O_NONBLOCK clear: blocks until at least one record is available.
+ *   O_NONBLOCK set:   returns -EAGAIN immediately if the ring is empty.
+ *
+ * Returns the number of bytes copied (always a multiple of sizeof tlob_event),
+ * -EAGAIN if non-blocking and empty, or a negative error code.
+ *
+ * read() and mmap() share the same ring and data_tail cursor; do not use
+ * both simultaneously on the same fd.
+ */
+static ssize_t rv_read(struct file *file, char __user *buf, size_t count,
+		       loff_t *ppos)
+{
+	struct rv_file_priv *priv = file->private_data;
+	struct tlob_ring *ring;
+	size_t rec = sizeof(struct tlob_event);
+	unsigned long irqflags;
+	ssize_t done = 0;
+	int ret;
+
+	if (!priv)
+		return -ENODEV;
+	if (count < rec)
+		return -EINVAL;
+	ring = &priv->ring;
+
+retry:
+	/* Blocking path: sleep until the producer advances data_head. */
+	if (!(file->f_flags & O_NONBLOCK)) {
+		ret = wait_event_interruptible(priv->waitq,
+			/* pairs with smp_store_release() in the producer */
+			smp_load_acquire(&ring->page->data_head) !=
+			READ_ONCE(ring->page->data_tail));
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Drain records into the caller's buffer.  ring->lock serialises
+	 * concurrent read() callers and the softirq producer.
+	 */
+	while (done + rec <= count) {
+		struct tlob_event record;
+		u32 head, tail;
+
+		spin_lock_irqsave(&ring->lock, irqflags);
+		head = smp_load_acquire(&ring->page->data_head);
+		tail = ring->page->data_tail;
+		if (head == tail) {
+			spin_unlock_irqrestore(&ring->lock, irqflags);
+			break;
+		}
+		record = ring->data[tail & ring->mask];
+		WRITE_ONCE(ring->page->data_tail, tail + 1);
+		spin_unlock_irqrestore(&ring->lock, irqflags);
+
+		if (copy_to_user(buf + done, &record, rec))
+			return done ? done : -EFAULT;
+		done += rec;
+	}
+	/* Lost the race to another consumer: a blocking read waits again. */
+	if (!done && !(file->f_flags & O_NONBLOCK))
+		goto retry;
+	return done ? done : -EAGAIN;
+}
+
+/*
+ * rv_mmap - map the per-fd violation ring buffer into userspace.
+ *
+ * The mmap region covers the full ring allocation:
+ *
+ *   offset 0          : struct tlob_mmap_page  (control page)
+ *   offset PAGE_SIZE  : struct tlob_event[capacity]  (data pages)
+ *
+ * The caller must map exactly PAGE_SIZE + capacity * sizeof(struct tlob_event)
+ * bytes starting at offset 0 (vm_pgoff must be 0).  The actual capacity is
+ * read from tlob_mmap_page.capacity after a successful mmap(2).
+ *
+ * Private mappings (MAP_PRIVATE) are rejected: the shared data_tail field
+ * written by userspace must be visible to the kernel producer.
+ */
+static int rv_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct rv_file_priv *priv = file->private_data;
+	unsigned long want, pfn, len = vma->vm_end - vma->vm_start;
+	struct tlob_ring *ring;
+
+	if (!priv)
+		return -ENODEV;
+	ring = &priv->ring;
+
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	want = PAGE_SIZE +
+	       (unsigned long)(ring->mask + 1) * sizeof(struct tlob_event);
+	want = PAGE_ALIGN(want);
+	if (len != want)
+		return -EINVAL;
+
+	/* data_tail written by userspace must be visible to the kernel. */
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	pfn = page_to_pfn(virt_to_page((void *)ring->base));
+	return remap_pfn_range(vma, vma->vm_start, pfn, want,
+			       vma->vm_page_prot);
+}
+
+/* -----------------------------------------------------------------------
+ * ioctl dispatcher
+ * -----------------------------------------------------------------------
+ */
+
+/* Dispatch /dev/rv ioctls; all commands act on the calling task (current). */
+static long rv_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	/*
+	 * Verify the magic byte so we don't accidentally handle ioctls
+	 * intended for a different device.
+	 */
+	if (_IOC_TYPE(cmd) != RV_IOC_MAGIC)
+		return -ENOTTY;
+
+#ifdef CONFIG_RV_MON_TLOB
+	/* tlob: ioctl numbers 0x01 - 0x1F */
+	switch (cmd) {
+	case TLOB_IOCTL_TRACE_START: {
+		struct tlob_start_args args;
+		struct file *notify_file = NULL;
+		int ret, hret;
+
+		if (copy_from_user(&args,
+				   (struct tlob_start_args __user *)arg,
+				   sizeof(args)))
+			return -EFAULT;
+		if (args.threshold_us == 0)
+			return -EINVAL;
+		if (args.flags != 0)
+			return -EINVAL;
+
+		/*
+		 * If notify_fd >= 0, resolve it to a file pointer.
+		 * fget() bumps the reference count; tlob.c drops it
+		 * via fput() when the monitoring window ends.
+		 * Reject non-/dev/rv fds to prevent type confusion.
+		 */
+		if (args.notify_fd >= 0) {
+			notify_file = fget(args.notify_fd);
+			if (!notify_file)
+				return -EBADF;
+			if (notify_file->f_op != file->f_op) {
+				fput(notify_file);
+				return -EINVAL;
+			}
+		}
+
+		ret = tlob_start_task(current, args.threshold_us,
+				      notify_file, args.tag);
+		if (ret != 0) {
+			/* tlob.c did not take ownership; drop ref. */
+			if (notify_file)
+				fput(notify_file);
+			return ret;
+		}
+
+		/*
+		 * Record session handle.  Free any stale handle left by
+		 * a previous window whose deadline timer fired (timer
+		 * removes tlob_task_state but cannot touch tlob_handles).
+		 */
+		tlob_handle_free(current);
+		hret = tlob_handle_alloc(current, file);
+		if (hret < 0) {
+			tlob_stop_task(current);
+			return hret;
+		}
+		return 0;
+	}
+	case TLOB_IOCTL_TRACE_STOP: {
+		int had_handle;
+		int ret;
+
+		/*
+		 * Atomically remove the session handle for current.
+		 *
+		 *   had_handle == 0: TRACE_START was never called for
+		 *                    this thread -> caller bug -> -ESRCH
+		 *
+		 *   had_handle == 1: TRACE_START was called.  If
+		 *                    tlob_stop_task() now returns
+		 *                    -ESRCH, the deadline timer already
+		 *                    fired -> budget exceeded -> -EOVERFLOW
+		 */
+		had_handle = tlob_handle_free(current);
+		if (!had_handle)
+			return -ESRCH;
+
+		ret = tlob_stop_task(current);
+		return (ret == -ESRCH) ? -EOVERFLOW : ret;
+	}
+	default:
+		break;
+	}
+#endif /* CONFIG_RV_MON_TLOB */
+
+	/* Unknown command, or the owning monitor is not built in. */
+	return -ENOTTY;
+}
+
+/* -----------------------------------------------------------------------
+ * Module init / exit
+ * -----------------------------------------------------------------------
+ */
+
+static const struct file_operations rv_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rv_open,
+	.release	= rv_release,
+	.read		= rv_read,
+	.poll		= rv_poll,
+	.mmap		= rv_mmap,
+	.unlocked_ioctl	= rv_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= rv_ioctl,
+#endif
+	.llseek		= noop_llseek,
+};
+
+/*
+ * 0666: /dev/rv is a self-instrumentation device.  All ioctls operate
+ * exclusively on the calling task (current); no task can monitor another
+ * via this interface.  Opening the device does not grant any privilege
+ * beyond observing one's own latency, so world-read/write is appropriate.
+ */
+static struct miscdevice rv_miscdev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "rv",
+	.fops	= &rv_fops,
+	.mode	= 0666,
+};
+
+static int __init rv_ioctl_init(void)
+{
+	int i;
+
+	for (i = 0; i < TLOB_HANDLES_SIZE; i++)
+		INIT_HLIST_HEAD(&tlob_handles[i]);
+
+	return misc_register(&rv_miscdev);
+}
+
+static void __exit rv_ioctl_exit(void)
+{
+	misc_deregister(&rv_miscdev);
+}
+
+module_init(rv_ioctl_init);
+module_exit(rv_ioctl_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RV ioctl interface via /dev/rv");
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 4a6faddac..65d6c6485 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -126,6 +126,7 @@ DECLARE_EVENT_CLASS(error_da_monitor_id,
 #include <monitors/snroc/snroc_trace.h>
 #include <monitors/nrp/nrp_trace.h>
 #include <monitors/sssw/sssw_trace.h>
+#include <monitors/tlob/tlob_trace.h>
 // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
 
 #endif /* CONFIG_DA_MON_EVENTS_ID */
@@ -202,6 +203,55 @@ TRACE_EVENT(rv_retries_error,
 		__get_str(event), __get_str(name))
 );
 #endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */
+
+#ifdef CONFIG_RV_MON_TLOB
+/*
+ * tlob_budget_exceeded - emitted when a monitored task exceeds its latency
+ * budget.  Carries the on-CPU / off-CPU time breakdown so that the cause
+ * of the overrun (CPU-bound vs. scheduling/I/O latency) is immediately
+ * visible in the ftrace ring buffer without post-processing.
+ */
+TRACE_EVENT(tlob_budget_exceeded,
+
+	TP_PROTO(struct task_struct *task, u64 threshold_us,
+		 u64 on_cpu_us, u64 off_cpu_us, u32 switches,
+		 bool state_is_on_cpu, u64 tag),
+
+	TP_ARGS(task, threshold_us, on_cpu_us, off_cpu_us, switches,
+		state_is_on_cpu, tag),
+
+	TP_STRUCT__entry(
+		__string(comm,		task->comm)
+		__field(pid_t,		pid)
+		__field(u64,		threshold_us)
+		__field(u64,		on_cpu_us)
+		__field(u64,		off_cpu_us)
+		__field(u32,		switches)
+		__field(bool,		state_is_on_cpu)
+		__field(u64,		tag)
+	),
+
+	TP_fast_assign(
+		__assign_str(comm);
+		__entry->pid		= task->pid;
+		__entry->threshold_us	= threshold_us;
+		__entry->on_cpu_us	= on_cpu_us;
+		__entry->off_cpu_us	= off_cpu_us;
+		__entry->switches	= switches;
+		__entry->state_is_on_cpu = state_is_on_cpu;
+		__entry->tag		= tag;
+	),
+
+	TP_printk("%s[%d]: budget exceeded threshold=%llu on_cpu=%llu off_cpu=%llu switches=%u state=%s tag=0x%016llx",
+		__get_str(comm), __entry->pid,
+		__entry->threshold_us,
+		__entry->on_cpu_us, __entry->off_cpu_us,
+		__entry->switches,
+		__entry->state_is_on_cpu ? "on_cpu" : "off_cpu",
+		__entry->tag)
+);
+#endif /* CONFIG_RV_MON_TLOB */
+
 #endif /* _TRACE_RV_H */
 
 /* This part must be outside protection */
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 3/4] rv/tlob: Add KUnit tests for the tlob monitor
From: wen.yang @ 2026-04-12 19:27 UTC (permalink / raw)
  To: Steven Rostedt, Gabriele Monaco, Masami Hiramatsu,
	Mathieu Desnoyers
  Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1776020428.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Add six KUnit test suites gated behind CONFIG_TLOB_KUNIT_TEST
(depends on RV_MON_TLOB && KUNIT; default KUNIT_ALL_TESTS).
A .kunitconfig fragment is provided for the kunit.py runner.

Coverage: automaton state transitions and self-loops; start/stop API
error paths (duplicate start, missing start, overflow threshold,
table-full, immediate deadline); scheduler context-switch accounting
for on/off-CPU time; violation tracepoint payload fields; ring buffer
push, drop-new overflow, and wakeup; and the uprobe line parser.

Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 kernel/trace/rv/Makefile                   |    1 +
 kernel/trace/rv/monitors/tlob/.kunitconfig |    5 +
 kernel/trace/rv/monitors/tlob/Kconfig      |   12 +
 kernel/trace/rv/monitors/tlob/tlob.c       |    1 +
 kernel/trace/rv/monitors/tlob/tlob_kunit.c | 1194 ++++++++++++++++++++
 5 files changed, 1213 insertions(+)
 create mode 100644 kernel/trace/rv/monitors/tlob/.kunitconfig
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob_kunit.c

diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index cc3781a3b..6d963207d 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o
 obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o
 obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
 obj-$(CONFIG_RV_MON_TLOB) += monitors/tlob/tlob.o
+obj-$(CONFIG_TLOB_KUNIT_TEST) += monitors/tlob/tlob_kunit.o
 # Add new monitors here
 obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
 obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
diff --git a/kernel/trace/rv/monitors/tlob/.kunitconfig b/kernel/trace/rv/monitors/tlob/.kunitconfig
new file mode 100644
index 000000000..977c58601
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/.kunitconfig
@@ -0,0 +1,5 @@
+CONFIG_FTRACE=y
+CONFIG_KUNIT=y
+CONFIG_RV=y
+CONFIG_RV_MON_TLOB=y
+CONFIG_TLOB_KUNIT_TEST=y
diff --git a/kernel/trace/rv/monitors/tlob/Kconfig b/kernel/trace/rv/monitors/tlob/Kconfig
index 010237480..4ccd2f881 100644
--- a/kernel/trace/rv/monitors/tlob/Kconfig
+++ b/kernel/trace/rv/monitors/tlob/Kconfig
@@ -49,3 +49,15 @@ config RV_MON_TLOB
 	  For further information, see:
 	    Documentation/trace/rv/monitor_tlob.rst
 
+config TLOB_KUNIT_TEST
+	tristate "KUnit tests for tlob monitor" if !KUNIT_ALL_TESTS
+	depends on RV_MON_TLOB && KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  Enable KUnit in-kernel unit tests for the tlob RV monitor.
+
+	  Tests cover automaton state transitions, the hash table helpers,
+	  the start/stop task interface, and the event ring buffer including
+	  overflow handling and wakeup behaviour.
+
+	  Say Y or M here to run the tlob KUnit test suite; otherwise say N.
diff --git a/kernel/trace/rv/monitors/tlob/tlob.c b/kernel/trace/rv/monitors/tlob/tlob.c
index a6e474025..dd959eb9b 100644
--- a/kernel/trace/rv/monitors/tlob/tlob.c
+++ b/kernel/trace/rv/monitors/tlob/tlob.c
@@ -784,6 +784,7 @@ VISIBLE_IF_KUNIT int tlob_parse_uprobe_line(char *buf, u64 *thr_out,
 	*path_out  = buf + n;
 	return 0;
 }
+EXPORT_SYMBOL_IF_KUNIT(tlob_parse_uprobe_line);
 
 static ssize_t tlob_monitor_write(struct file *file,
 				  const char __user *ubuf,
diff --git a/kernel/trace/rv/monitors/tlob/tlob_kunit.c b/kernel/trace/rv/monitors/tlob/tlob_kunit.c
new file mode 100644
index 000000000..64f5abb34
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob_kunit.c
@@ -0,0 +1,1194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for the tlob RV monitor.
+ *
+ * tlob_automaton:         DA transition table coverage.
+ * tlob_task_api:          tlob_start_task()/tlob_stop_task() lifecycle and errors.
+ * tlob_sched_integration: on/off-CPU accounting across real context switches.
+ * tlob_trace_output:      tlob_budget_exceeded tracepoint field verification.
+ * tlob_event_buf:         ring buffer push, overflow, and wakeup.
+ * tlob_parse_uprobe:      uprobe format string parser acceptance and rejection.
+ *
+ * The duplicate-(binary, offset_start) constraint enforced by tlob_add_uprobe()
+ * is not covered here: that function calls kern_path() and requires a real
+ * filesystem, which is outside the scope of unit tests. It is covered by the
+ * uprobe_duplicate_offset case in tools/testing/selftests/rv/test_tlob.sh.
+ */
+#include <kunit/test.h>
+#include <linux/atomic.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/tracepoint.h>
+
+/*
+ * Pull in the rv tracepoint declarations so that
+ * register_trace_tlob_budget_exceeded() is available.
+ * No CREATE_TRACE_POINTS here: the tracepoint implementation lives in rv.c.
+ */
+#include <rv_trace.h>
+
+#include "tlob.h"
+
+/*
+ * da_handle_event_tlob - feed a single event to a bare da_monitor.
+ *
+ * KUnit-only helper for the automaton suite. It looks up the successor of
+ * @da_mon->curr_state for @event in automaton_tlob.function[][] and stores
+ * it in @da_mon. When the table yields INVALID_STATE the current state is
+ * left untouched. Nothing besides @da_mon is read or written.
+ */
+static void da_handle_event_tlob(struct da_monitor *da_mon,
+				 enum events_tlob event)
+{
+	enum states_tlob from = (enum states_tlob)da_mon->curr_state;
+	enum states_tlob to;
+
+	to = (enum states_tlob)automaton_tlob.function[from][event];
+	if (to == INVALID_STATE)
+		return;
+
+	da_mon->curr_state = to;
+}
+
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+
+/*
+ * Suite 1: automaton state-machine transitions
+ */
+
+/*
+ * unmonitored -> trace_start -> on_cpu
+ *
+ * Feeds trace_start to a local monitor initialised in unmonitored and
+ * expects the resulting state to be on_cpu.
+ */
+static void tlob_unmonitored_to_on_cpu(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = unmonitored_tlob };
+
+	da_handle_event_tlob(&mon, trace_start_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)on_cpu_tlob);
+}
+
+/*
+ * on_cpu -> switch_out -> off_cpu
+ *
+ * Feeds switch_out to a local monitor initialised in on_cpu and expects
+ * the resulting state to be off_cpu.
+ */
+static void tlob_on_cpu_switch_out(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = on_cpu_tlob };
+
+	da_handle_event_tlob(&mon, switch_out_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)off_cpu_tlob);
+}
+
+/*
+ * off_cpu -> switch_in -> on_cpu
+ *
+ * Feeds switch_in to a local monitor initialised in off_cpu and expects
+ * the resulting state to be on_cpu.
+ */
+static void tlob_off_cpu_switch_in(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = off_cpu_tlob };
+
+	da_handle_event_tlob(&mon, switch_in_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)on_cpu_tlob);
+}
+
+/*
+ * on_cpu -> budget_expired -> unmonitored
+ *
+ * Feeds budget_expired to a local monitor initialised in on_cpu and
+ * expects the resulting state to be unmonitored.
+ */
+static void tlob_on_cpu_budget_expired(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = on_cpu_tlob };
+
+	da_handle_event_tlob(&mon, budget_expired_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)unmonitored_tlob);
+}
+
+/*
+ * off_cpu -> budget_expired -> unmonitored
+ *
+ * Feeds budget_expired to a local monitor initialised in off_cpu and
+ * expects the resulting state to be unmonitored.
+ */
+static void tlob_off_cpu_budget_expired(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = off_cpu_tlob };
+
+	da_handle_event_tlob(&mon, budget_expired_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)unmonitored_tlob);
+}
+
+/*
+ * on_cpu -> trace_stop -> unmonitored
+ *
+ * Feeds trace_stop to a local monitor initialised in on_cpu and expects
+ * the resulting state to be unmonitored.
+ */
+static void tlob_on_cpu_trace_stop(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = on_cpu_tlob };
+
+	da_handle_event_tlob(&mon, trace_stop_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)unmonitored_tlob);
+}
+
+/*
+ * off_cpu -> trace_stop -> unmonitored
+ *
+ * Feeds trace_stop to a local monitor initialised in off_cpu and expects
+ * the resulting state to be unmonitored.
+ */
+static void tlob_off_cpu_trace_stop(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = off_cpu_tlob };
+
+	da_handle_event_tlob(&mon, trace_stop_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)unmonitored_tlob);
+}
+
+/*
+ * budget_expired -> unmonitored; a single trace_start re-enters on_cpu.
+ *
+ * Drives one monitor through start, budget_expired, start, stop and checks
+ * the state after every event, so a violation must not leave the automaton
+ * in a state that needs more than one trace_start to recover.
+ */
+static void tlob_violation_then_restart(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = unmonitored_tlob };
+
+	da_handle_event_tlob(&mon, trace_start_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)on_cpu_tlob);
+
+	da_handle_event_tlob(&mon, budget_expired_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)unmonitored_tlob);
+
+	/* Single trace_start is sufficient to re-enter on_cpu */
+	da_handle_event_tlob(&mon, trace_start_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)on_cpu_tlob);
+
+	da_handle_event_tlob(&mon, trace_stop_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)unmonitored_tlob);
+}
+
+/*
+ * off_cpu self-loops on switch_out and sched_wakeup.
+ *
+ * Each event is applied to a fresh monitor in off_cpu so the iterations
+ * cannot influence each other; the state must still be off_cpu afterwards.
+ */
+static void tlob_off_cpu_self_loops(struct kunit *test)
+{
+	static const enum events_tlob events[] = {
+		switch_out_tlob, sched_wakeup_tlob,
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(events); i++) {
+		/* Re-initialised every iteration: always starts in off_cpu. */
+		struct da_monitor mon = { .curr_state = off_cpu_tlob };
+
+		da_handle_event_tlob(&mon, events[i]);
+		KUNIT_EXPECT_EQ_MSG(test, (int)mon.curr_state,
+				    (int)off_cpu_tlob,
+				    "event %u should self-loop in off_cpu",
+				    events[i]);
+	}
+}
+
+/*
+ * on_cpu self-loops on sched_wakeup.
+ *
+ * Feeds sched_wakeup to a local monitor initialised in on_cpu and expects
+ * the state to remain on_cpu.
+ */
+static void tlob_on_cpu_self_loops(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = on_cpu_tlob };
+
+	da_handle_event_tlob(&mon, sched_wakeup_tlob);
+	KUNIT_EXPECT_EQ_MSG(test, (int)mon.curr_state, (int)on_cpu_tlob,
+			    "sched_wakeup should self-loop in on_cpu");
+}
+
+/*
+ * Scheduling events in unmonitored self-loop (no state change).
+ *
+ * switch_in, switch_out and sched_wakeup are each applied to a fresh
+ * monitor in unmonitored; the state must still be unmonitored afterwards.
+ */
+static void tlob_unmonitored_ignores_sched(struct kunit *test)
+{
+	static const enum events_tlob events[] = {
+		switch_in_tlob, switch_out_tlob, sched_wakeup_tlob,
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(events); i++) {
+		/* Re-initialised every iteration: always starts unmonitored. */
+		struct da_monitor mon = { .curr_state = unmonitored_tlob };
+
+		da_handle_event_tlob(&mon, events[i]);
+		KUNIT_EXPECT_EQ_MSG(test, (int)mon.curr_state,
+				    (int)unmonitored_tlob,
+				    "event %u should self-loop in unmonitored",
+				    events[i]);
+	}
+}
+
+/*
+ * Full violation-free sequence on one monitor:
+ * unmonitored -> on_cpu -> off_cpu -> on_cpu -> unmonitored,
+ * driven by trace_start, switch_out, switch_in and trace_stop. The state
+ * is checked after every event.
+ */
+static void tlob_full_happy_path(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = unmonitored_tlob };
+
+	da_handle_event_tlob(&mon, trace_start_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)on_cpu_tlob);
+
+	da_handle_event_tlob(&mon, switch_out_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)off_cpu_tlob);
+
+	da_handle_event_tlob(&mon, switch_in_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)on_cpu_tlob);
+
+	da_handle_event_tlob(&mon, trace_stop_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)unmonitored_tlob);
+}
+
+/*
+ * Start monitoring, bounce three times between on_cpu and off_cpu via
+ * switch_out/switch_in, then stop. The state is checked after every event
+ * to show that repeated context switches keep the automaton consistent.
+ */
+static void tlob_multiple_switches(struct kunit *test)
+{
+	struct da_monitor mon = { .curr_state = unmonitored_tlob };
+	int i;
+
+	da_handle_event_tlob(&mon, trace_start_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)on_cpu_tlob);
+
+	/* Three complete switch_out/switch_in round trips. */
+	for (i = 0; i < 3; i++) {
+		da_handle_event_tlob(&mon, switch_out_tlob);
+		KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)off_cpu_tlob);
+		da_handle_event_tlob(&mon, switch_in_tlob);
+		KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)on_cpu_tlob);
+	}
+
+	da_handle_event_tlob(&mon, trace_stop_tlob);
+	KUNIT_EXPECT_EQ(test, (int)mon.curr_state, (int)unmonitored_tlob);
+}
+
+/* Cases of the tlob_automaton suite; the empty entry terminates the list. */
+static struct kunit_case tlob_automaton_cases[] = {
+	KUNIT_CASE(tlob_unmonitored_to_on_cpu),
+	KUNIT_CASE(tlob_on_cpu_switch_out),
+	KUNIT_CASE(tlob_off_cpu_switch_in),
+	KUNIT_CASE(tlob_on_cpu_budget_expired),
+	KUNIT_CASE(tlob_off_cpu_budget_expired),
+	KUNIT_CASE(tlob_on_cpu_trace_stop),
+	KUNIT_CASE(tlob_off_cpu_trace_stop),
+	KUNIT_CASE(tlob_off_cpu_self_loops),
+	KUNIT_CASE(tlob_on_cpu_self_loops),
+	KUNIT_CASE(tlob_unmonitored_ignores_sched),
+	KUNIT_CASE(tlob_full_happy_path),
+	KUNIT_CASE(tlob_violation_then_restart),
+	KUNIT_CASE(tlob_multiple_switches),
+	{}
+};
+
+/*
+ * No suite_init/suite_exit: these cases only operate on local da_monitor
+ * instances through da_handle_event_tlob().
+ */
+static struct kunit_suite tlob_automaton_suite = {
+	.name       = "tlob_automaton",
+	.test_cases = tlob_automaton_cases,
+};
+
+/*
+ * Suite 2: task registration API
+ */
+
+/*
+ * Basic start/stop cycle on current.
+ *
+ * tlob_start_task() must succeed (the test aborts otherwise) and the
+ * matching tlob_stop_task() must return 0.
+ */
+static void tlob_start_stop_ok(struct kunit *test)
+{
+	int ret;
+
+	ret = tlob_start_task(current, 10000000 /* 10 s, won't fire */, NULL, 0);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+	KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0);
+}
+
+/*
+ * Double start must return -EEXIST.
+ *
+ * The first start on current must succeed; a second start on the same
+ * task is expected to fail with -EEXIST. The entry is removed at the end
+ * so later cases start from a clean state.
+ */
+static void tlob_double_start(struct kunit *test)
+{
+	KUNIT_ASSERT_EQ(test, tlob_start_task(current, 10000000, NULL, 0), 0);
+	KUNIT_EXPECT_EQ(test, tlob_start_task(current, 10000000, NULL, 0), -EEXIST);
+	/* Cleanup only; the return value is intentionally not checked. */
+	tlob_stop_task(current);
+}
+
+/*
+ * Stop without start must return -ESRCH.
+ *
+ * The first tlob_stop_task() call only removes an entry a previous case
+ * may have left behind; the second call is the one under test.
+ */
+static void tlob_stop_without_start(struct kunit *test)
+{
+	tlob_stop_task(current);  /* clear any stale entry first */
+	KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+}
+
+/*
+ * A 1 us budget fires before tlob_stop_task() is called. Either the
+ * timer wins (-ESRCH) or we are very fast (0); both are valid.
+ */
+static void tlob_immediate_deadline(struct kunit *test)
+{
+	int ret = tlob_start_task(current, 1 /* 1 us - fires almost immediately */, NULL, 0);
+
+	KUNIT_ASSERT_EQ(test, ret, 0);
+	/* Let the 1 us timer fire: busy-wait 100 us without sleeping. */
+	udelay(100);
+	/*
+	 * By now the hrtimer has almost certainly fired. Either it has
+	 * (returns -ESRCH) or we were very fast (returns 0). Both are
+	 * acceptable; the point is that the stop call does not crash.
+	 * NOTE(review): the state of the table afterwards is not checked
+	 * by this case.
+	 */
+	ret = tlob_stop_task(current);
+	KUNIT_EXPECT_TRUE(test, ret == 0 || ret == -ESRCH);
+}
+
+/*
+ * Fill the table to TLOB_MAX_MONITORED using kthreads (each needs a
+ * distinct task_struct), then verify the next start returns -ENOSPC.
+ *
+ * struct tlob_waiter_ctx is the per-kthread handshake used for that:
+ * @start: completed by the test to release the kthread.
+ * @done:  completed by the kthread just before it returns.
+ */
+struct tlob_waiter_ctx {
+	struct completion start;
+	struct completion done;
+};
+
+/*
+ * Thread body for tlob_enospc(): park on ->start until the test releases
+ * us, signal ->done, then exit with status 0.
+ */
+static int tlob_waiter_fn(void *arg)
+{
+	struct tlob_waiter_ctx *waiter = arg;
+	struct completion *release = &waiter->start;
+	struct completion *finished = &waiter->done;
+
+	wait_for_completion(release);
+	complete(finished);
+
+	return 0;
+}
+
+/*
+ * Monitor TLOB_MAX_MONITORED parked kthreads, then expect one further
+ * tlob_start_task() (on current) to fail with -ENOSPC.
+ *
+ * Every successfully created kthread is recorded in threads[] with one
+ * task reference held. All error paths jump to the cleanup label, which is
+ * the only place that releases kthreads and references, so each reference
+ * is dropped exactly once.
+ */
+static void tlob_enospc(struct kunit *test)
+{
+	struct tlob_waiter_ctx *ctxs;
+	struct task_struct **threads;
+	int i, ret;
+
+	ctxs = kunit_kcalloc(test, TLOB_MAX_MONITORED,
+			     sizeof(*ctxs), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, ctxs);
+
+	/* Zero-initialised: a NULL slot marks the end of the live threads. */
+	threads = kunit_kcalloc(test, TLOB_MAX_MONITORED,
+				sizeof(*threads), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, threads);
+
+	/* Start TLOB_MAX_MONITORED kthreads and monitor each */
+	for (i = 0; i < TLOB_MAX_MONITORED; i++) {
+		init_completion(&ctxs[i].start);
+		init_completion(&ctxs[i].done);
+
+		threads[i] = kthread_run(tlob_waiter_fn, &ctxs[i],
+					 "tlob_waiter_%d", i);
+		if (IS_ERR(threads[i])) {
+			KUNIT_FAIL(test, "kthread_run failed at i=%d", i);
+			threads[i] = NULL;
+			goto cleanup;
+		}
+		get_task_struct(threads[i]);
+
+		ret = tlob_start_task(threads[i], 10000000, NULL, 0);
+		if (ret != 0) {
+			/*
+			 * Leave threads[i] set with its reference held: the
+			 * cleanup loops below unblock, stop and put it once.
+			 * Putting it here as well would drop the reference
+			 * twice.
+			 */
+			KUNIT_FAIL(test, "tlob_start_task failed at i=%d: %d",
+				   i, ret);
+			goto cleanup;
+		}
+	}
+
+	/* The table is now full: one more must fail with -ENOSPC */
+	ret = tlob_start_task(current, 10000000, NULL, 0);
+	KUNIT_EXPECT_EQ(test, ret, -ENOSPC);
+
+cleanup:
+	/*
+	 * Two-pass cleanup: cancel tlob monitoring and unblock kthreads first,
+	 * then kthread_stop() to wait for full exit before releasing refs.
+	 * tlob_stop_task() is also called for a thread whose start failed;
+	 * its return value is ignored here.
+	 */
+	for (i = 0; i < TLOB_MAX_MONITORED; i++) {
+		if (!threads[i])
+			break;
+		tlob_stop_task(threads[i]);
+		complete(&ctxs[i].start);
+	}
+	for (i = 0; i < TLOB_MAX_MONITORED; i++) {
+		if (!threads[i])
+			break;
+		kthread_stop(threads[i]);
+		put_task_struct(threads[i]);
+	}
+}
+
+/*
+ * A kthread holds a mutex for 80 ms; arm a 10 ms budget, burn ~1 ms
+ * on-CPU, then block on the mutex. The timer fires off-CPU; stop
+ * must return -ESRCH.
+ *
+ * struct tlob_holder_ctx is shared between the test and the holder kthread:
+ * @lock:    mutex the holder keeps locked while it sleeps.
+ * @ready:   completed by the holder once it owns @lock.
+ * @hold_ms: how long the holder sleeps with @lock held, in milliseconds.
+ */
+struct tlob_holder_ctx {
+	struct mutex		lock;
+	struct completion	ready;
+	unsigned int		hold_ms;
+};
+
+/*
+ * Thread body: take the context mutex, announce that it is held via
+ * ->ready, sleep for ->hold_ms milliseconds, release the mutex and exit
+ * with status 0.
+ */
+static int tlob_holder_fn(void *arg)
+{
+	struct tlob_holder_ctx *hold = arg;
+	struct mutex *lock = &hold->lock;
+
+	mutex_lock(lock);
+	complete(&hold->ready);
+	msleep(hold->hold_ms);
+	mutex_unlock(lock);
+
+	return 0;
+}
+
+/*
+ * Budget expiry while the monitored task is blocked.
+ *
+ * A holder kthread keeps ctx.lock for 80 ms. current arms a 10 ms budget,
+ * spins for ~1 ms and then blocks on the mutex, so the budget runs out
+ * while current is off-CPU and tlob_stop_task() must return -ESRCH.
+ *
+ * ctx lives on this stack frame and the holder dereferences it until it
+ * exits, so the holder is pinned with a task reference and joined with
+ * kthread_stop() on every path that leaves the function after it has
+ * been created.
+ */
+static void tlob_deadline_fires_off_cpu(struct kunit *test)
+{
+	struct tlob_holder_ctx ctx = { .hold_ms = 80 };
+	struct task_struct *holder;
+	ktime_t t0;
+	int ret;
+
+	mutex_init(&ctx.lock);
+	init_completion(&ctx.ready);
+
+	holder = kthread_run(tlob_holder_fn, &ctx, "tlob_holder_kunit");
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, holder);
+	/* Keep the task_struct valid for kthread_stop() after it exits. */
+	get_task_struct(holder);
+	wait_for_completion(&ctx.ready);
+
+	/* Arm 10 ms budget while kthread holds the mutex. */
+	ret = tlob_start_task(current, 10000, NULL, 0);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+	if (ret)
+		goto reap;
+
+	/* Phase 1: burn ~1 ms on-CPU to exercise on_cpu accounting. */
+	t0 = ktime_get();
+	while (ktime_us_delta(ktime_get(), t0) < 1000)
+		cpu_relax();
+
+	/*
+	 * Phase 2: block on the mutex -> on_cpu->off_cpu transition.
+	 * The 10 ms budget fires while we are off-CPU.
+	 */
+	mutex_lock(&ctx.lock);
+	mutex_unlock(&ctx.lock);
+
+	/* Timer already fired and removed the entry -> -ESRCH */
+	KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+
+reap:
+	/* Wait for the holder to exit before ctx goes out of scope. */
+	kthread_stop(holder);
+	put_task_struct(holder);
+}
+
+/*
+ * Arm a 1 ms budget and busy-spin for 50 ms; timer fires on-CPU.
+ *
+ * current never blocks voluntarily between start and stop, so the budget
+ * is expected to run out during the spin and tlob_stop_task() must report
+ * -ESRCH.
+ */
+static void tlob_deadline_fires_on_cpu(struct kunit *test)
+{
+	ktime_t t0;
+	int ret;
+
+	ret = tlob_start_task(current, 1000 /* 1 ms */, NULL, 0);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	/* Busy-spin 50 ms - 50x the budget */
+	t0 = ktime_get();
+	while (ktime_us_delta(ktime_get(), t0) < 50000)
+		cpu_relax();
+
+	/* Timer fired during the spin; entry is gone */
+	KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+}
+
+/*
+ * Thread body used by tlob_stop_all_cleanup(): @arg is a struct completion
+ * owned by the test. The thread parks on it until the test completes it,
+ * then exits with status 0.
+ */
+static int tlob_dummy_fn(void *arg)
+{
+	struct completion *release = arg;
+
+	wait_for_completion(release);
+
+	return 0;
+}
+
+/*
+ * Monitor current plus two parked kthreads, tear the monitor down with
+ * tlob_destroy_monitor(), bring it back with tlob_init_monitor(), and
+ * expect tlob_stop_task() to return -ESRCH for all three tasks.
+ *
+ * The completions the kthreads wait on live on this stack frame. Once a
+ * kthread exists, failures are therefore reported with KUNIT_EXPECT_EQ()
+ * or KUNIT_FAIL() and routed through the reap label instead of aborting,
+ * so every kthread is released, joined and its reference dropped before
+ * the function returns.
+ */
+static void tlob_stop_all_cleanup(struct kunit *test)
+{
+	struct completion done1, done2;
+	struct task_struct *t1, *t2;
+	int ret;
+
+	init_completion(&done1);
+	init_completion(&done2);
+
+	/* Nothing to release yet, so aborting on failure is fine here. */
+	t1 = kthread_run(tlob_dummy_fn, &done1, "tlob_dummy1");
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t1);
+	get_task_struct(t1);
+
+	t2 = kthread_run(tlob_dummy_fn, &done2, "tlob_dummy2");
+	if (IS_ERR_OR_NULL(t2)) {
+		KUNIT_FAIL(test, "kthread_run failed for tlob_dummy2");
+		t2 = NULL;
+		goto reap;
+	}
+	get_task_struct(t2);
+
+	ret = tlob_start_task(current, 10000000, NULL, 0);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+	if (!ret) {
+		ret = tlob_start_task(t1, 10000000, NULL, 0);
+		KUNIT_EXPECT_EQ(test, ret, 0);
+	}
+	if (!ret) {
+		ret = tlob_start_task(t2, 10000000, NULL, 0);
+		KUNIT_EXPECT_EQ(test, ret, 0);
+	}
+
+	/*
+	 * Destroy clears all entries via tlob_stop_all(). This also runs when
+	 * one of the starts above failed, so partially added entries do not
+	 * leak into later cases.
+	 */
+	tlob_destroy_monitor();
+	ret = tlob_init_monitor();
+	KUNIT_EXPECT_EQ(test, ret, 0);
+	if (ret)
+		goto reap;
+
+	/* Table must be empty now */
+	KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+	KUNIT_EXPECT_EQ(test, tlob_stop_task(t1), -ESRCH);
+	KUNIT_EXPECT_EQ(test, tlob_stop_task(t2), -ESRCH);
+
+reap:
+	/*
+	 * completions live on stack; release the kthreads and wait for them
+	 * to exit before return.
+	 */
+	complete(&done1);
+	kthread_stop(t1);
+	put_task_struct(t1);
+	if (t2) {
+		complete(&done2);
+		kthread_stop(t2);
+		put_task_struct(t2);
+	}
+}
+
+/*
+ * A threshold that overflows ktime_t must be rejected with -ERANGE.
+ *
+ * The threshold is given in microseconds; the value used here is one more
+ * than KTIME_MAX / NSEC_PER_USEC, i.e. the smallest microsecond count that
+ * no longer fits once scaled to nanoseconds.
+ */
+static void tlob_overflow_threshold(struct kunit *test)
+{
+	/* KTIME_MAX / NSEC_PER_USEC + 1 overflows ktime_t */
+	u64 too_large = (u64)(KTIME_MAX / NSEC_PER_USEC) + 1;
+
+	KUNIT_EXPECT_EQ(test,
+		tlob_start_task(current, too_large, NULL, 0),
+		-ERANGE);
+}
+
+/*
+ * Suite setup for tlob_task_api: initialise the tlob monitor and return
+ * the result of tlob_init_monitor() to KUnit.
+ */
+static int tlob_task_api_suite_init(struct kunit_suite *suite)
+{
+	return tlob_init_monitor();
+}
+
+/* Suite teardown for tlob_task_api: destroy the tlob monitor again. */
+static void tlob_task_api_suite_exit(struct kunit_suite *suite)
+{
+	tlob_destroy_monitor();
+}
+
+/* Cases of the tlob_task_api suite; the empty entry terminates the list. */
+static struct kunit_case tlob_task_api_cases[] = {
+	KUNIT_CASE(tlob_start_stop_ok),
+	KUNIT_CASE(tlob_double_start),
+	KUNIT_CASE(tlob_stop_without_start),
+	KUNIT_CASE(tlob_immediate_deadline),
+	KUNIT_CASE(tlob_enospc),
+	KUNIT_CASE(tlob_overflow_threshold),
+	KUNIT_CASE(tlob_deadline_fires_off_cpu),
+	KUNIT_CASE(tlob_deadline_fires_on_cpu),
+	KUNIT_CASE(tlob_stop_all_cleanup),
+	{}
+};
+
+/* The monitor is initialised once per suite and destroyed at suite exit. */
+static struct kunit_suite tlob_task_api_suite = {
+	.name       = "tlob_task_api",
+	.suite_init = tlob_task_api_suite_init,
+	.suite_exit = tlob_task_api_suite_exit,
+	.test_cases = tlob_task_api_cases,
+};
+
+/*
+ * Suite 3: scheduling integration
+ */
+
+/*
+ * Handshake between tlob_sched_switch_accounting() and its peer kthread:
+ * @ping: completed by the test to let the peer run.
+ * @pong: completed by the peer to hand control back to the test.
+ */
+struct tlob_ping_ctx {
+	struct completion ping;
+	struct completion pong;
+};
+
+/*
+ * Peer thread body: block until the test signals ->ping, answer with
+ * ->pong, then exit with status 0.
+ */
+static int tlob_ping_fn(void *arg)
+{
+	struct tlob_ping_ctx *pp = arg;
+	struct completion *request = &pp->ping;
+	struct completion *reply = &pp->pong;
+
+	/* Sleep until the test hands the CPU over to us. */
+	wait_for_completion(request);
+	complete(reply);
+
+	return 0;
+}
+
+/*
+ * Force two context switches and verify stop returns 0 (within budget).
+ *
+ * current arms a 5 s budget, wakes a peer kthread and blocks until the
+ * peer answers, which takes current off the CPU and back on once.
+ *
+ * ctx lives on this stack frame and the peer blocks on it, so the
+ * ping/pong handshake is always carried out, even when tlob_start_task()
+ * failed. Otherwise the peer would be left waiting on a completion in a
+ * dead stack frame.
+ */
+static void tlob_sched_switch_accounting(struct kunit *test)
+{
+	struct tlob_ping_ctx ctx;
+	struct task_struct *peer;
+	int ret;
+
+	init_completion(&ctx.ping);
+	init_completion(&ctx.pong);
+
+	peer = kthread_run(tlob_ping_fn, &ctx, "tlob_ping_kunit");
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, peer);
+
+	/* Arm a generous 5 s budget so the timer never fires */
+	ret = tlob_start_task(current, 5000000, NULL, 0);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+
+	/*
+	 * complete(ping) -> peer runs, forcing a context switch out and back.
+	 */
+	complete(&ctx.ping);
+	wait_for_completion(&ctx.pong);
+
+	/* Nothing to stop if monitoring never started. */
+	if (ret)
+		return;
+
+	/*
+	 * Back on CPU after one off-CPU interval; stop must return 0.
+	 */
+	ret = tlob_stop_task(current);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+}
+
+/*
+ * Thread body for tlob_monitor_other_task(): @arg is a struct completion
+ * owned by the test. The thread sleeps for 20 ms, completes it and exits
+ * with status 0.
+ */
+static int tlob_block_fn(void *arg)
+{
+	struct completion *finished = arg;
+
+	/* Sleep briefly so this task spends time off the CPU. */
+	msleep(20);
+	complete(finished);
+
+	return 0;
+}
+
+/*
+ * Monitoring a task other than current: start monitoring a kthread that
+ * sleeps for a short while, wait for it to finish, then stop it.
+ *
+ * The completion the kthread signals lives on this stack frame and a task
+ * reference is held, so a tlob_start_task() failure first waits for the
+ * kthread and drops the reference before the assertion aborts the test.
+ */
+static void tlob_monitor_other_task(struct kunit *test)
+{
+	struct completion done;
+	struct task_struct *target;
+	int ret;
+
+	init_completion(&done);
+
+	target = kthread_run(tlob_block_fn, &done, "tlob_target_kunit");
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, target);
+	get_task_struct(target);
+
+	/* Arm a 5 s budget for the target task */
+	ret = tlob_start_task(target, 5000000, NULL, 0);
+	if (ret) {
+		/* Do not leave target with a pointer into a dead frame. */
+		wait_for_completion(&done);
+		put_task_struct(target);
+	}
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	wait_for_completion(&done);
+
+	/*
+	 * Target has finished; stop_task may return 0 (still in htable)
+	 * or -ESRCH (kthread exited and timer fired / entry cleaned up).
+	 */
+	ret = tlob_stop_task(target);
+	KUNIT_EXPECT_TRUE(test, ret == 0 || ret == -ESRCH);
+	put_task_struct(target);
+}
+
+/*
+ * Suite setup for tlob_sched_integration: initialise the tlob monitor and
+ * return the result of tlob_init_monitor() to KUnit.
+ */
+static int tlob_sched_suite_init(struct kunit_suite *suite)
+{
+	return tlob_init_monitor();
+}
+
+/* Suite teardown for tlob_sched_integration: destroy the tlob monitor. */
+static void tlob_sched_suite_exit(struct kunit_suite *suite)
+{
+	tlob_destroy_monitor();
+}
+
+/* Cases of tlob_sched_integration; the empty entry terminates the list. */
+static struct kunit_case tlob_sched_integration_cases[] = {
+	KUNIT_CASE(tlob_sched_switch_accounting),
+	KUNIT_CASE(tlob_monitor_other_task),
+	{}
+};
+
+/* The monitor is initialised once per suite and destroyed at suite exit. */
+static struct kunit_suite tlob_sched_integration_suite = {
+	.name       = "tlob_sched_integration",
+	.suite_init = tlob_sched_suite_init,
+	.suite_exit = tlob_sched_suite_exit,
+	.test_cases = tlob_sched_integration_cases,
+};
+
+/*
+ * Suite 4: ftrace tracepoint field verification
+ */
+
+/*
+ * Capture fields from trace_tlob_budget_exceeded for inspection.
+ *
+ * Filled in by probe_tlob_budget_exceeded() for the first event only;
+ * @fired doubles as the "already captured" latch. The remaining members
+ * are copies of the tracepoint arguments of the same name (@pid is taken
+ * from the task argument).
+ */
+struct tlob_exceeded_capture {
+	atomic_t	fired;		/* 1 after first call */
+	pid_t		pid;
+	u64		threshold_us;
+	u64		on_cpu_us;
+	u64		off_cpu_us;
+	u32		switches;
+	bool		state_is_on_cpu;
+	u64		tag;
+};
+
+/*
+ * Tracepoint probe for tlob_budget_exceeded. @data is the
+ * struct tlob_exceeded_capture passed at registration time. Only the call
+ * that flips ->fired from 0 to 1 stores the event; later calls return
+ * without touching the capture.
+ */
+static void
+probe_tlob_budget_exceeded(void *data,
+			   struct task_struct *task, u64 threshold_us,
+			   u64 on_cpu_us, u64 off_cpu_us,
+			   u32 switches, bool state_is_on_cpu, u64 tag)
+{
+	struct tlob_exceeded_capture *rec = data;
+
+	/* First caller wins; everyone else leaves the record alone. */
+	if (atomic_cmpxchg(&rec->fired, 0, 1) == 0) {
+		rec->pid = task->pid;
+		rec->threshold_us = threshold_us;
+		rec->on_cpu_us = on_cpu_us;
+		rec->off_cpu_us = off_cpu_us;
+		rec->switches = switches;
+		rec->state_is_on_cpu = state_is_on_cpu;
+		rec->tag = tag;
+	}
+}
+
+/*
+ * Arm a 2 ms budget and busy-spin for 60 ms. Verify the tracepoint fires
+ * once with matching threshold, correct pid, and total time >= budget.
+ *
+ * state_is_on_cpu is not asserted: preemption during the spin makes it
+ * non-deterministic.
+ *
+ * The probe writes into cap, which lives on this stack frame. The probe
+ * is therefore unregistered, followed by
+ * tracepoint_synchronize_unregister(), on every path that leaves the
+ * function after a successful registration, and before cap is read.
+ */
+static void tlob_trace_budget_exceeded_on_cpu(struct kunit *test)
+{
+	struct tlob_exceeded_capture cap = {};
+	const u64 threshold_us = 2000; /* 2 ms */
+	ktime_t t0;
+	int ret;
+
+	atomic_set(&cap.fired, 0);
+
+	ret = register_trace_tlob_budget_exceeded(probe_tlob_budget_exceeded,
+						  &cap);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	ret = tlob_start_task(current, threshold_us, NULL, 0);
+	if (ret) {
+		/* Detach the probe before the assertion aborts the test. */
+		unregister_trace_tlob_budget_exceeded(probe_tlob_budget_exceeded,
+						      &cap);
+		tracepoint_synchronize_unregister();
+	}
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	/* Busy-spin 60 ms  --  30x the budget */
+	t0 = ktime_get();
+	while (ktime_us_delta(ktime_get(), t0) < 60000)
+		cpu_relax();
+
+	/* Entry removed by timer; stop returns -ESRCH */
+	tlob_stop_task(current);
+
+	/*
+	 * Unregister first, then synchronise: once
+	 * tracepoint_synchronize_unregister() returns no probe call can
+	 * still be running, so the captured fields are stable.
+	 */
+	unregister_trace_tlob_budget_exceeded(probe_tlob_budget_exceeded, &cap);
+	tracepoint_synchronize_unregister();
+
+	KUNIT_EXPECT_EQ(test, atomic_read(&cap.fired), 1);
+	KUNIT_EXPECT_EQ(test, (int)cap.pid, (int)current->pid);
+	KUNIT_EXPECT_EQ(test, cap.threshold_us, threshold_us);
+	/* Total elapsed must cover at least the budget */
+	KUNIT_EXPECT_GE(test, cap.on_cpu_us + cap.off_cpu_us, threshold_us);
+}
+
+/*
+ * Holder kthread grabs a mutex for 80 ms; arm 10 ms budget, burn ~1 ms
+ * on-CPU, then block on the mutex. Timer fires off-CPU. Verify:
+ * state_is_on_cpu == false, switches >= 1, off_cpu_us > 0.
+ *
+ * Both cap (written by the probe) and ctx (used by the holder) live on
+ * this stack frame. Every path after their users exist therefore
+ * unregisters the probe, synchronises, and joins the holder with
+ * kthread_stop() before returning.
+ */
+static void tlob_trace_budget_exceeded_off_cpu(struct kunit *test)
+{
+	struct tlob_exceeded_capture cap = {};
+	struct tlob_holder_ctx ctx = { .hold_ms = 80 };
+	struct task_struct *holder;
+	const u64 threshold_us = 10000; /* 10 ms */
+	ktime_t t0;
+	int ret;
+
+	atomic_set(&cap.fired, 0);
+
+	mutex_init(&ctx.lock);
+	init_completion(&ctx.ready);
+
+	holder = kthread_run(tlob_holder_fn, &ctx, "tlob_holder2_kunit");
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, holder);
+	/* Keep the task_struct valid for kthread_stop() after it exits. */
+	get_task_struct(holder);
+	wait_for_completion(&ctx.ready);
+
+	ret = register_trace_tlob_budget_exceeded(probe_tlob_budget_exceeded,
+						  &cap);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+	if (ret)
+		goto reap;
+
+	ret = tlob_start_task(current, threshold_us, NULL, 0);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+	if (ret)
+		goto unregister;
+
+	/* Phase 1: ~1 ms on-CPU */
+	t0 = ktime_get();
+	while (ktime_us_delta(ktime_get(), t0) < 1000)
+		cpu_relax();
+
+	/* Phase 2: block -> off-CPU; timer fires here */
+	mutex_lock(&ctx.lock);
+	mutex_unlock(&ctx.lock);
+
+	tlob_stop_task(current);
+
+unregister:
+	/* Unregister first, then wait for in-flight probe calls to finish. */
+	unregister_trace_tlob_budget_exceeded(probe_tlob_budget_exceeded, &cap);
+	tracepoint_synchronize_unregister();
+	if (ret)
+		goto reap;
+
+	KUNIT_EXPECT_EQ(test, atomic_read(&cap.fired), 1);
+	KUNIT_EXPECT_EQ(test, cap.threshold_us, threshold_us);
+	/* Violation happened off-CPU */
+	KUNIT_EXPECT_FALSE(test, cap.state_is_on_cpu);
+	/* At least the switch_out event was counted */
+	KUNIT_EXPECT_GE(test, (u64)cap.switches, (u64)1);
+	/* Off-CPU time must be non-zero */
+	KUNIT_EXPECT_GT(test, cap.off_cpu_us, (u64)0);
+
+reap:
+	/* Wait for the holder to exit before ctx goes out of scope. */
+	kthread_stop(holder);
+	put_task_struct(holder);
+}
+
+/*
+ * threshold_us in the tracepoint must exactly match the start argument.
+ *
+ * For each threshold a fresh capture is registered, the budget is armed
+ * and current spins for 20x the threshold. The probe writes into the
+ * on-stack capture, so it is unregistered, followed by
+ * tracepoint_synchronize_unregister(), before the capture is read and on
+ * the start-failure path before the assertion aborts the test.
+ */
+static void tlob_trace_threshold_field_accuracy(struct kunit *test)
+{
+	static const u64 thresholds[] = { 500, 1000, 3000 };
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(thresholds); i++) {
+		struct tlob_exceeded_capture cap = {};
+		ktime_t t0;
+		int ret;
+
+		atomic_set(&cap.fired, 0);
+
+		ret = register_trace_tlob_budget_exceeded(
+			probe_tlob_budget_exceeded, &cap);
+		KUNIT_ASSERT_EQ(test, ret, 0);
+
+		ret = tlob_start_task(current, thresholds[i], NULL, 0);
+		if (ret) {
+			/* Detach the probe before the assertion aborts. */
+			unregister_trace_tlob_budget_exceeded(
+				probe_tlob_budget_exceeded, &cap);
+			tracepoint_synchronize_unregister();
+		}
+		KUNIT_ASSERT_EQ(test, ret, 0);
+
+		/* Spin for 20x the threshold to ensure timer fires */
+		t0 = ktime_get();
+		while (ktime_us_delta(ktime_get(), t0) <
+		       (s64)(thresholds[i] * 20))
+			cpu_relax();
+
+		tlob_stop_task(current);
+
+		/* Unregister first, then wait for in-flight probe calls. */
+		unregister_trace_tlob_budget_exceeded(
+			probe_tlob_budget_exceeded, &cap);
+		tracepoint_synchronize_unregister();
+
+		KUNIT_EXPECT_EQ_MSG(test, cap.threshold_us, thresholds[i],
+				    "threshold mismatch for entry %u", i);
+	}
+}
+
+/*
+ * Suite setup for tlob_trace_output: initialise the monitor, then enable
+ * its hooks. Returns the first non-zero result, or the result of
+ * tlob_enable_hooks().
+ * NOTE(review): when tlob_enable_hooks() fails the monitor stays
+ * initialised; whether KUnit still runs suite_exit in that case is not
+ * visible from here.
+ */
+static int tlob_trace_suite_init(struct kunit_suite *suite)
+{
+	int ret;
+
+	ret = tlob_init_monitor();
+	if (ret)
+		return ret;
+	return tlob_enable_hooks();
+}
+
+/* Suite teardown: undo the setup in reverse order. */
+static void tlob_trace_suite_exit(struct kunit_suite *suite)
+{
+	tlob_disable_hooks();
+	tlob_destroy_monitor();
+}
+
+/* Cases of tlob_trace_output; the empty entry terminates the list. */
+static struct kunit_case tlob_trace_output_cases[] = {
+	KUNIT_CASE(tlob_trace_budget_exceeded_on_cpu),
+	KUNIT_CASE(tlob_trace_budget_exceeded_off_cpu),
+	KUNIT_CASE(tlob_trace_threshold_field_accuracy),
+	{}
+};
+
+/* Monitor and hooks are set up once per suite and torn down at exit. */
+static struct kunit_suite tlob_trace_output_suite = {
+	.name       = "tlob_trace_output",
+	.suite_init = tlob_trace_suite_init,
+	.suite_exit = tlob_trace_suite_exit,
+	.test_cases = tlob_trace_output_cases,
+};
+
+/* Suite 5: ring buffer */
+
+/*
+ * Allocate a synthetic rv_file_priv for ring buffer tests. Uses KUnit
+ * managed allocations instead of __get_free_pages() since the ring is never
+ * mmap'd here; the memory is released by KUnit when the test ends.
+ *
+ * @test: test the allocations are tied to.
+ * @cap:  number of tlob_event slots. The index mask is set to cap - 1, so
+ *        callers are expected to pass a power of two.
+ *
+ * Returns the initialised private data, or NULL if an allocation failed.
+ */
+static struct rv_file_priv *alloc_priv_kunit(struct kunit *test, u32 cap)
+{
+	struct rv_file_priv *priv;
+	struct tlob_ring *ring;
+
+	priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return NULL;
+
+	ring = &priv->ring;
+
+	ring->page = kunit_kzalloc(test, sizeof(*ring->page), GFP_KERNEL);
+	if (!ring->page)
+		return NULL;
+
+	/* Array allocation: kunit_kcalloc() checks cap * size for overflow. */
+	ring->data = kunit_kcalloc(test, cap, sizeof(*ring->data), GFP_KERNEL);
+	if (!ring->data)
+		return NULL;
+
+	ring->mask            = cap - 1;
+	ring->page->capacity  = cap;
+	ring->page->version   = 1;
+	ring->page->data_offset = PAGE_SIZE; /* nominal; not used in tests */
+	ring->page->record_size = sizeof(struct tlob_event);
+	spin_lock_init(&ring->lock);
+	init_waitqueue_head(&priv->waitq);
+	return priv;
+}
+
+/*
+ * Push one record and verify all fields survive the round-trip.
+ *
+ * The record is pushed with tlob_event_push_kunit() and read back by hand
+ * from ring->data[], advancing data_tail the way a consumer would.
+ */
+static void tlob_event_push_one(struct kunit *test)
+{
+	struct rv_file_priv *priv;
+	struct tlob_ring *ring;
+	struct tlob_event in = {
+		.tid		= 1234,
+		.threshold_us	= 5000,
+		.on_cpu_us	= 3000,
+		.off_cpu_us	= 2000,
+		.switches	= 3,
+		.state		= 1,
+	};
+	struct tlob_event out = {};
+	u32 tail;
+
+	priv = alloc_priv_kunit(test, TLOB_RING_DEFAULT_CAP);
+	KUNIT_ASSERT_NOT_NULL(test, priv);
+
+	ring = &priv->ring;
+
+	tlob_event_push_kunit(priv, &in);
+
+	/* One record written, none dropped */
+	KUNIT_EXPECT_EQ(test, ring->page->data_head, 1u);
+	KUNIT_EXPECT_EQ(test, ring->page->data_tail, 0u);
+	KUNIT_EXPECT_EQ(test, ring->page->dropped,   0ull);
+
+	/* Dequeue manually: slot index is the tail masked by ring->mask. */
+	tail = ring->page->data_tail;
+	out  = ring->data[tail & ring->mask];
+	ring->page->data_tail = tail + 1;
+
+	KUNIT_EXPECT_EQ(test, out.tid,          in.tid);
+	KUNIT_EXPECT_EQ(test, out.threshold_us, in.threshold_us);
+	KUNIT_EXPECT_EQ(test, out.on_cpu_us,    in.on_cpu_us);
+	KUNIT_EXPECT_EQ(test, out.off_cpu_us,   in.off_cpu_us);
+	KUNIT_EXPECT_EQ(test, out.switches,     in.switches);
+	KUNIT_EXPECT_EQ(test, out.state,        in.state);
+
+	/* Ring is now empty */
+	KUNIT_EXPECT_EQ(test, ring->page->data_head, ring->page->data_tail);
+}
+
+/*
+ * Fill to capacity, push one more. Drop-new policy: head stays at cap,
+ * dropped == 1, oldest record is preserved.
+ *
+ * Records carry their sequence number in tid, so the surviving records
+ * can be identified when the ring is drained by hand afterwards.
+ */
+static void tlob_event_push_overflow(struct kunit *test)
+{
+	struct rv_file_priv *priv;
+	struct tlob_ring *ring;
+	struct tlob_event ntf = {};
+	struct tlob_event out = {};
+	const u32 cap = TLOB_RING_MIN_CAP;
+	u32 i;
+
+	priv = alloc_priv_kunit(test, cap);
+	KUNIT_ASSERT_NOT_NULL(test, priv);
+
+	ring = &priv->ring;
+
+	/* Push cap + 1 records; tid encodes the sequence */
+	for (i = 0; i <= cap; i++) {
+		ntf.tid          = i;
+		ntf.threshold_us = (u64)i * 1000;
+		tlob_event_push_kunit(priv, &ntf);
+	}
+
+	/* Drop-new: head stopped at cap; one record was silently discarded */
+	KUNIT_EXPECT_EQ(test, ring->page->data_head, cap);
+	KUNIT_EXPECT_EQ(test, ring->page->data_tail, 0u);
+	KUNIT_EXPECT_EQ(test, ring->page->dropped,   1ull);
+
+	/* Oldest surviving record must be the first one pushed (tid == 0) */
+	out = ring->data[ring->page->data_tail & ring->mask];
+	KUNIT_EXPECT_EQ(test, out.tid, 0u);
+
+	/* Drain the ring; the last record must have tid == cap - 1 */
+	for (i = 0; i < cap; i++) {
+		u32 tail = ring->page->data_tail;
+
+		out = ring->data[tail & ring->mask];
+		ring->page->data_tail = tail + 1;
+	}
+	/* out now holds the last record drained. */
+	KUNIT_EXPECT_EQ(test, out.tid, cap - 1);
+	KUNIT_EXPECT_EQ(test, ring->page->data_head, ring->page->data_tail);
+}
+
+/*
+ * A freshly initialised ring is empty.
+ *
+ * Nothing is pushed: head, tail and the dropped counter of a ring obtained
+ * from alloc_priv_kunit() must all read zero.
+ */
+static void tlob_event_empty(struct kunit *test)
+{
+	struct rv_file_priv *priv;
+	struct tlob_ring *ring;
+
+	priv = alloc_priv_kunit(test, TLOB_RING_DEFAULT_CAP);
+	KUNIT_ASSERT_NOT_NULL(test, priv);
+
+	ring = &priv->ring;
+
+	KUNIT_EXPECT_EQ(test, ring->page->data_head, 0u);
+	KUNIT_EXPECT_EQ(test, ring->page->data_tail, 0u);
+	KUNIT_EXPECT_EQ(test, ring->page->dropped,   0ull);
+}
+
+/*
+ * A kthread blocks on wait_event_interruptible(); pushing one record must
+ * wake it within 1 s.
+ *
+ * struct tlob_wakeup_ctx is shared between the test and that kthread:
+ * @priv:  private data whose ring and wait queue are under test.
+ * @ready: completed by the kthread right before it starts waiting.
+ * @done:  completed by the kthread after the wait returned.
+ * @woke:  set to 1 by the kthread if the ring was non-empty after the wait.
+ */
+
+struct tlob_wakeup_ctx {
+	struct rv_file_priv	*priv;
+	struct completion	 ready;
+	struct completion	 done;
+	int			 woke;
+};
+
+/* True when the ring's data_head differs from its data_tail. */
+static bool tlob_wakeup_ring_has_data(struct tlob_ring *ring)
+{
+	return smp_load_acquire(&ring->page->data_head) !=
+	       READ_ONCE(ring->page->data_tail);
+}
+
+/*
+ * Waiter thread for tlob_ring_wakeup(): signal ->ready, sleep on the wait
+ * queue until the ring holds data or the thread is asked to stop, record in
+ * ->woke whether data was present after the wait, then signal ->done.
+ */
+static int tlob_wakeup_thread(void *arg)
+{
+	struct tlob_wakeup_ctx *wctx = arg;
+	struct rv_file_priv *priv = wctx->priv;
+
+	complete(&wctx->ready);
+
+	wait_event_interruptible(priv->waitq,
+				 tlob_wakeup_ring_has_data(&priv->ring) ||
+				 kthread_should_stop());
+
+	/* Tell a real wakeup apart from a kthread_stop() request. */
+	if (tlob_wakeup_ring_has_data(&priv->ring))
+		wctx->woke = 1;
+
+	complete(&wctx->done);
+
+	return 0;
+}
+
+/*
+ * Start a waiter kthread, give it time to block on the wait queue, push
+ * one record and expect the waiter to report completion within 1 s with
+ * the ring non-empty.
+ *
+ * The waiter is always joined with kthread_stop() before the expectations
+ * run, so ctx (on this stack frame) is no longer in use when they do.
+ */
+static void tlob_ring_wakeup(struct kunit *test)
+{
+	struct rv_file_priv *priv;
+	struct tlob_wakeup_ctx ctx;
+	struct task_struct *t;
+	struct tlob_event ev = { .tid = 99 };
+	long timeout;
+
+	priv = alloc_priv_kunit(test, TLOB_RING_DEFAULT_CAP);
+	KUNIT_ASSERT_NOT_NULL(test, priv);
+
+	init_completion(&ctx.ready);
+	init_completion(&ctx.done);
+	ctx.priv = priv;
+	ctx.woke = 0;
+
+	t = kthread_run(tlob_wakeup_thread, &ctx, "tlob_wakeup_kunit");
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t);
+	get_task_struct(t);
+
+	/* Let the kthread reach wait_event_interruptible */
+	wait_for_completion(&ctx.ready);
+	usleep_range(10000, 20000);
+
+	/* Push one record  --  must wake the waiter */
+	tlob_event_push_kunit(priv, &ev);
+
+	/* timeout is 0 if the waiter did not finish within 1 s. */
+	timeout = wait_for_completion_timeout(&ctx.done, msecs_to_jiffies(1000));
+	kthread_stop(t);
+	put_task_struct(t);
+
+	KUNIT_EXPECT_GT(test, timeout, 0L);
+	KUNIT_EXPECT_EQ(test, ctx.woke, 1);
+	KUNIT_EXPECT_EQ(test, priv->ring.page->data_head, 1u);
+}
+
+/* Cases of the tlob_event_buf suite; the empty entry terminates the list. */
+static struct kunit_case tlob_event_buf_cases[] = {
+	KUNIT_CASE(tlob_event_push_one),
+	KUNIT_CASE(tlob_event_push_overflow),
+	KUNIT_CASE(tlob_event_empty),
+	KUNIT_CASE(tlob_ring_wakeup),
+	{}
+};
+
+/*
+ * No suite_init/suite_exit: every case builds its own ring with
+ * alloc_priv_kunit().
+ */
+static struct kunit_suite tlob_event_buf_suite = {
+	.name       = "tlob_event_buf",
+	.test_cases = tlob_event_buf_cases,
+};
+
+/* Suite 6: uprobe format string parser */
+
+/*
+ * Happy path: decimal offsets, plain path.
+ *
+ * The return value is checked with an ASSERT: thr, start, stop and path
+ * are only written by a successful parse, so the field checks below must
+ * not run on uninitialised values.
+ */
+static void tlob_parse_decimal_offsets(struct kunit *test)
+{
+	char buf[] = "5000:4768:4848:/usr/bin/myapp";
+	u64 thr; loff_t start, stop; char *path;
+
+	KUNIT_ASSERT_EQ(test,
+		tlob_parse_uprobe_line(buf, &thr, &path, &start, &stop),
+		0);
+	KUNIT_EXPECT_EQ(test, thr,      (u64)5000);
+	KUNIT_EXPECT_EQ(test, start,    (loff_t)4768);
+	KUNIT_EXPECT_EQ(test, stop,     (loff_t)4848);
+	KUNIT_EXPECT_STREQ(test, path,  "/usr/bin/myapp");
+}
+
+/*
+ * Happy path: 0x-prefixed hex offsets.
+ *
+ * The return value is checked with an ASSERT: start, stop and path are
+ * only written by a successful parse, so the field checks below must not
+ * run on uninitialised values.
+ */
+static void tlob_parse_hex_offsets(struct kunit *test)
+{
+	char buf[] = "10000:0x12a0:0x12f0:/usr/bin/myapp";
+	u64 thr; loff_t start, stop; char *path;
+
+	KUNIT_ASSERT_EQ(test,
+		tlob_parse_uprobe_line(buf, &thr, &path, &start, &stop),
+		0);
+	KUNIT_EXPECT_EQ(test, start,   (loff_t)0x12a0);
+	KUNIT_EXPECT_EQ(test, stop,    (loff_t)0x12f0);
+	KUNIT_EXPECT_STREQ(test, path, "/usr/bin/myapp");
+}
+
+/* Path containing ':' must not be truncated. */
+static void tlob_parse_path_with_colon(struct kunit *test)
+{
+	char buf[] = "1000:0x100:0x200:/opt/my:app/bin";
+	u64 thr; loff_t start, stop; char *path;
+
+	/* ASSERT: path may be an uninitialized pointer if parsing fails. */
+	KUNIT_ASSERT_EQ(test,
+		tlob_parse_uprobe_line(buf, &thr, &path, &start, &stop), 0);
+	KUNIT_EXPECT_STREQ(test, path, "/opt/my:app/bin");
+}
+
+/* Zero threshold must be rejected. */
+static void tlob_parse_zero_threshold(struct kunit *test)
+{
+	char buf[] = "0:0x100:0x200:/usr/bin/myapp";
+	u64 thr; loff_t start, stop; char *path;
+
+	KUNIT_EXPECT_EQ(test,
+		tlob_parse_uprobe_line(buf, &thr, &path, &start, &stop),
+		-EINVAL);
+}
+
+/* Empty path (trailing ':' with nothing after) must be rejected. */
+static void tlob_parse_empty_path(struct kunit *test)
+{
+	char buf[] = "5000:0x100:0x200:";
+	u64 thr; loff_t start, stop; char *path;
+
+	KUNIT_EXPECT_EQ(test,
+		tlob_parse_uprobe_line(buf, &thr, &path, &start, &stop),
+		-EINVAL);
+}
+
+/* Missing field (3 tokens instead of 4) must be rejected. */
+static void tlob_parse_too_few_fields(struct kunit *test)
+{
+	char buf[] = "5000:0x100:/usr/bin/myapp";
+	u64 thr; loff_t start, stop; char *path;
+
+	KUNIT_EXPECT_EQ(test,
+		tlob_parse_uprobe_line(buf, &thr, &path, &start, &stop),
+		-EINVAL);
+}
+
+/* Negative offset must be rejected. */
+static void tlob_parse_negative_offset(struct kunit *test)
+{
+	char buf[] = "5000:-1:0x200:/usr/bin/myapp";
+	u64 thr; loff_t start, stop; char *path;
+
+	KUNIT_EXPECT_EQ(test,
+		tlob_parse_uprobe_line(buf, &thr, &path, &start, &stop),
+		-EINVAL);
+}
+
+static struct kunit_case tlob_parse_uprobe_cases[] = {
+	KUNIT_CASE(tlob_parse_decimal_offsets),
+	KUNIT_CASE(tlob_parse_hex_offsets),
+	KUNIT_CASE(tlob_parse_path_with_colon),
+	KUNIT_CASE(tlob_parse_zero_threshold),
+	KUNIT_CASE(tlob_parse_empty_path),
+	KUNIT_CASE(tlob_parse_too_few_fields),
+	KUNIT_CASE(tlob_parse_negative_offset),
+	{}
+};
+
+static struct kunit_suite tlob_parse_uprobe_suite = {
+	.name       = "tlob_parse_uprobe",
+	.test_cases = tlob_parse_uprobe_cases,
+};
+
+kunit_test_suites(&tlob_automaton_suite,
+		  &tlob_task_api_suite,
+		  &tlob_sched_integration_suite,
+		  &tlob_trace_output_suite,
+		  &tlob_event_buf_suite,
+		  &tlob_parse_uprobe_suite);
+
+MODULE_DESCRIPTION("KUnit tests for the tlob RV monitor");
+MODULE_LICENSE("GPL");
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH 4/4] selftests/rv: Add selftest for the tlob monitor
From: wen.yang @ 2026-04-12 19:27 UTC (permalink / raw)
  To: Steven Rostedt, Gabriele Monaco, Masami Hiramatsu,
	Mathieu Desnoyers
  Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1776020428.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Add a kselftest suite (TAP output, 20 test points) for the tlob RV
monitor under tools/testing/selftests/rv/.

test_tlob.sh drives a compiled C helper (tlob_helper) and, for uprobe
tests, a target binary (tlob_uprobe_target). Coverage spans the
tracefs enable/disable path, uprobe-triggered violations, and the
ioctl interface (within-budget stop, CPU-bound and sleep violations,
duplicate start, ring buffer mmap and consumption).

Requires CONFIG_RV_MON_TLOB=y and CONFIG_RV_CHARDEV=y; must be run
as root.

Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 tools/include/uapi/linux/rv.h                 |  54 +
 tools/testing/selftests/rv/Makefile           |  18 +
 tools/testing/selftests/rv/test_tlob.sh       | 563 ++++++++++
 tools/testing/selftests/rv/tlob_helper.c      | 994 ++++++++++++++++++
 .../testing/selftests/rv/tlob_uprobe_target.c | 108 ++
 5 files changed, 1737 insertions(+)
 create mode 100644 tools/include/uapi/linux/rv.h
 create mode 100644 tools/testing/selftests/rv/Makefile
 create mode 100755 tools/testing/selftests/rv/test_tlob.sh
 create mode 100644 tools/testing/selftests/rv/tlob_helper.c
 create mode 100644 tools/testing/selftests/rv/tlob_uprobe_target.c

diff --git a/tools/include/uapi/linux/rv.h b/tools/include/uapi/linux/rv.h
new file mode 100644
index 000000000..bef07aded
--- /dev/null
+++ b/tools/include/uapi/linux/rv.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * UAPI definitions for Runtime Verification (RV) monitors.
+ *
+ * This is a tools-friendly copy of include/uapi/linux/rv.h.
+ * Keep in sync with the kernel header.
+ */
+
+#ifndef _UAPI_LINUX_RV_H
+#define _UAPI_LINUX_RV_H
+
+#include <linux/types.h>
+#include <sys/ioctl.h>
+
+/* Magic byte shared by all RV monitor ioctls. */
+#define RV_IOC_MAGIC	0xB9
+
+/* -----------------------------------------------------------------------
+ * tlob: task latency over budget monitor  (nr 0x01 - 0x1F)
+ * -----------------------------------------------------------------------
+ */
+
+struct tlob_start_args {
+	__u64 threshold_us;
+	__u64 tag;
+	__s32 notify_fd;
+	__u32 flags;
+};
+
+struct tlob_event {
+	__u32 tid;
+	__u32 pad;
+	__u64 threshold_us;
+	__u64 on_cpu_us;
+	__u64 off_cpu_us;
+	__u32 switches;
+	__u32 state;   /* 1 = on_cpu, 0 = off_cpu */
+	__u64 tag;
+};
+
+struct tlob_mmap_page {
+	__u32  data_head;
+	__u32  data_tail;
+	__u32  capacity;
+	__u32  version;
+	__u32  data_offset;
+	__u32  record_size;
+	__u64  dropped;
+};
+
+#define TLOB_IOCTL_TRACE_START	_IOW(RV_IOC_MAGIC, 0x01, struct tlob_start_args)
+#define TLOB_IOCTL_TRACE_STOP	_IO(RV_IOC_MAGIC,  0x02)
+
+#endif /* _UAPI_LINUX_RV_H */
diff --git a/tools/testing/selftests/rv/Makefile b/tools/testing/selftests/rv/Makefile
new file mode 100644
index 000000000..14e94a1ab
--- /dev/null
+++ b/tools/testing/selftests/rv/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for rv selftests
+
+TEST_GEN_PROGS := tlob_helper tlob_uprobe_target
+
+TEST_PROGS := \
+	test_tlob.sh \
+
+# TOOLS_INCLUDES is defined by ../lib.mk; provides -isystem to
+# tools/include/uapi so that #include <linux/rv.h> resolves to the
+# in-tree UAPI header without requiring make headers_install.
+# Note: both must be added to the global variables, not as target-specific
+# overrides, because lib.mk rewrites TEST_GEN_PROGS to $(OUTPUT)/name
+# before per-target rules would be evaluated.
+CFLAGS += $(TOOLS_INCLUDES)
+LDLIBS += -lpthread
+
+include ../lib.mk
diff --git a/tools/testing/selftests/rv/test_tlob.sh b/tools/testing/selftests/rv/test_tlob.sh
new file mode 100755
index 000000000..3ba2125eb
--- /dev/null
+++ b/tools/testing/selftests/rv/test_tlob.sh
@@ -0,0 +1,563 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Selftest for the tlob (task latency over budget) RV monitor.
+#
+# Two interfaces are tested:
+#
+#   1. tracefs interface:
+#        enable/disable, presence of tracefs files,
+#        uprobe binding (threshold_us:offset_start:offset_stop:binary_path) and
+#        violation detection via the ftrace ring buffer.
+#
+#   2. /dev/rv ioctl self-instrumentation (via tlob_helper):
+#        within-budget, over-budget on-CPU, over-budget off-CPU (sleep),
+#        double-start, stop-without-start.
+#
+# Written to be POSIX sh compatible (no bash-specific extensions).
+
+ksft_skip=4
+t_pass=0; t_fail=0; t_skip=0; t_total=0
+
+tap_header() { echo "TAP version 13"; }
+tap_plan()   { echo "1..$1"; }
+tap_pass()   { t_pass=$((t_pass+1)); echo "ok $t_total - $1"; }
+tap_fail()   { t_fail=$((t_fail+1)); echo "not ok $t_total - $1"
+               [ -n "$2" ] && echo "  # $2"; }
+tap_skip()   { t_skip=$((t_skip+1)); echo "ok $t_total - $1 # SKIP $2"; }
+next_test()  { t_total=$((t_total+1)); }
+
+TRACEFS=$(grep -m1 tracefs /proc/mounts 2>/dev/null | awk '{print $2}')
+[ -z "$TRACEFS" ] && TRACEFS=/sys/kernel/tracing
+
+RV_DIR="${TRACEFS}/rv"
+TLOB_DIR="${RV_DIR}/monitors/tlob"
+TRACE_FILE="${TRACEFS}/trace"
+TRACING_ON="${TRACEFS}/tracing_on"
+TLOB_MONITOR="${TLOB_DIR}/monitor"
+BUDGET_EXCEEDED_ENABLE="${TRACEFS}/events/rv/tlob_budget_exceeded/enable"
+RV_DEV="/dev/rv"
+
+# tlob_helper and tlob_uprobe_target must be in the same directory as
+# this script or on PATH.
+SCRIPT_DIR=$(dirname "$0")
+IOCTL_HELPER="${SCRIPT_DIR}/tlob_helper"
+UPROBE_TARGET="${SCRIPT_DIR}/tlob_uprobe_target"
+
+check_root()     { [ "$(id -u)" = "0" ] || { echo "# Need root" >&2; exit $ksft_skip; }; }
+check_tracefs()  { [ -d "${TRACEFS}" ]   || { echo "# No tracefs" >&2; exit $ksft_skip; }; }
+check_rv_dir()   { [ -d "${RV_DIR}" ]    || { echo "# No RV infra" >&2; exit $ksft_skip; }; }
+check_tlob()     { [ -d "${TLOB_DIR}" ]  || { echo "# No tlob monitor" >&2; exit $ksft_skip; }; }
+
+tlob_enable()         { echo 1 > "${TLOB_DIR}/enable"; }
+tlob_disable()        { echo 0 > "${TLOB_DIR}/enable" 2>/dev/null; }
+tlob_is_enabled()     { [ "$(cat "${TLOB_DIR}/enable" 2>/dev/null)" = "1" ]; }
+trace_event_enable()  { echo 1 > "${BUDGET_EXCEEDED_ENABLE}" 2>/dev/null; }
+trace_event_disable() { echo 0 > "${BUDGET_EXCEEDED_ENABLE}" 2>/dev/null; }
+trace_on()            { echo 1 > "${TRACING_ON}" 2>/dev/null; }
+trace_clear()         { echo > "${TRACE_FILE}"; }
+trace_grep()          { grep -q "$1" "${TRACE_FILE}" 2>/dev/null; }
+
+cleanup() {
+	tlob_disable
+	trace_event_disable
+	trace_clear
+}
+
+# ---------------------------------------------------------------------------
+# Test 1: enable / disable
+# ---------------------------------------------------------------------------
+run_test_enable_disable() {
+	next_test; cleanup
+	tlob_enable
+	if ! tlob_is_enabled; then
+		tap_fail "enable_disable" "not enabled after echo 1"; cleanup; return
+	fi
+	tlob_disable
+	if tlob_is_enabled; then
+		tap_fail "enable_disable" "still enabled after echo 0"; cleanup; return
+	fi
+	tap_pass "enable_disable"; cleanup
+}
+
+# ---------------------------------------------------------------------------
+# Test 2: tracefs files present
+# ---------------------------------------------------------------------------
+run_test_tracefs_files() {
+	next_test; cleanup
+	missing=""
+	for f in enable desc monitor; do
+		[ ! -e "${TLOB_DIR}/${f}" ] && missing="${missing} ${f}"
+	done
+	[ -n "${missing}" ] \
+		&& tap_fail "tracefs_files" "missing:${missing}" \
+		|| tap_pass "tracefs_files"
+	cleanup
+}
+
+# ---------------------------------------------------------------------------
+# Helper: resolve file offset of a function inside a binary.
+#
+# Usage: resolve_offset <binary> <vaddr_hex>
+# Prints the hex file offset, or empty string on failure.
+# ---------------------------------------------------------------------------
+resolve_offset() {
+	bin=$1; vaddr=$2
+	# Parse /proc/self/maps to find the mapping that contains vaddr.
+	# Each line: start-end perms offset dev inode [path]
+	while IFS= read -r line; do
+		set -- $line
+		range=$1; off=$4; path=$7
+		[ -z "$path" ] && continue
+		# Only consider the mapping for our binary
+		[ "$path" != "$bin" ] && continue
+		# Split range into start and end
+		start=$(echo "$range" | cut -d- -f1)
+		end=$(echo "$range" | cut -d- -f2)
+		# Convert hex to decimal for comparison (use printf)
+		s=$(printf "%d" "0x${start}" 2>/dev/null) || continue
+		e=$(printf "%d" "0x${end}"   2>/dev/null) || continue
+		v=$(printf "%d" "${vaddr}"   2>/dev/null) || continue
+		o=$(printf "%d" "0x${off}"   2>/dev/null) || continue
+		if [ "$v" -ge "$s" ] && [ "$v" -lt "$e" ]; then
+			file_off=$(printf "0x%x" $(( (v - s) + o )))
+			echo "$file_off"
+			return
+		fi
+	done < /proc/self/maps
+}
+
+# ---------------------------------------------------------------------------
+# Test 3: uprobe binding - no false positive
+#
+# Enable the monitor without registering any binding and idle for 0.5 s.
+# No budget_exceeded event should appear in the trace.
+# ---------------------------------------------------------------------------
+run_test_uprobe_no_false_positive() {
+	next_test; cleanup
+	if [ ! -e "${TLOB_MONITOR}" ]; then
+		tap_skip "uprobe_no_false_positive" "monitor file not available"
+		cleanup; return
+	fi
+	# We probe the "sleep" command that we will run as a subprocess.
+	# Use /bin/sleep as the binary; find a valid function offset (0x0
+	# resolves to the ELF entry point, which is sufficient for a
+	# no-false-positive test since we just need the binding to exist).
+	sleep_bin=$(command -v sleep 2>/dev/null)
+	if [ -z "$sleep_bin" ]; then
+		tap_skip "uprobe_no_false_positive" "sleep not found"; cleanup; return
+	fi
+	pid=$$
+	# offset 0x0 probes the entry point of /bin/sleep - this is a
+	# deliberate probe that will not fire during a simple 'sleep 10'
+	# invoked in a subshell, but registers the pid in tlob.
+	#
+	# Instead, bind our own pid with a generous 10 s threshold and
+	# verify that 0.5 s of idle time does NOT fire the timer.
+	#
+	# Since we cannot easily get a valid uprobe offset in pure shell,
+	# we skip this sub-test if we cannot form a valid binding.
+	exe=$(readlink /proc/self/exe 2>/dev/null)
+	if [ -z "$exe" ]; then
+		tap_skip "uprobe_no_false_positive" "cannot read /proc/self/exe"
+		cleanup; return
+	fi
+	trace_event_enable
+	trace_on
+	tlob_enable
+	trace_clear
+	# Sleep without any binding - just verify no spurious events
+	sleep 0.5
+	trace_grep "budget_exceeded" \
+		&& tap_fail "uprobe_no_false_positive" \
+			"spurious budget_exceeded without any binding" \
+		|| tap_pass "uprobe_no_false_positive"
+	cleanup
+}
+
+# ---------------------------------------------------------------------------
+# Helper: get_uprobe_offset <binary> <symbol>
+#
+# Use tlob_helper sym_offset to get the ELF file offset of <symbol>
+# in <binary>.  Prints the hex offset (e.g. "0x11d0") or empty string on
+# failure.
+# ---------------------------------------------------------------------------
+get_uprobe_offset() {
+	bin=$1; sym=$2
+	if [ ! -x "${IOCTL_HELPER}" ]; then
+		return
+	fi
+	"${IOCTL_HELPER}" sym_offset "${bin}" "${sym}" 2>/dev/null
+}
+
+# ---------------------------------------------------------------------------
+# Test 4: uprobe binding - violation detected
+#
+# Start tlob_uprobe_target (a busy-spin binary with a well-known symbol),
+# attach a uprobe on tlob_busy_work with a 10 us threshold, and verify
+# that a budget_exceeded event appears.
+# ---------------------------------------------------------------------------
+run_test_uprobe_violation() {
+	next_test; cleanup
+	if [ ! -e "${TLOB_MONITOR}" ]; then
+		tap_skip "uprobe_violation" "monitor file not available"
+		cleanup; return
+	fi
+	if [ ! -x "${UPROBE_TARGET}" ]; then
+		tap_skip "uprobe_violation" \
+			"tlob_uprobe_target not found or not executable"
+		cleanup; return
+	fi
+
+	# Get the file offsets of the start and stop probe symbols
+	busy_offset=$(get_uprobe_offset "${UPROBE_TARGET}" "tlob_busy_work")
+	if [ -z "${busy_offset}" ]; then
+		tap_skip "uprobe_violation" \
+			"cannot resolve tlob_busy_work offset in ${UPROBE_TARGET}"
+		cleanup; return
+	fi
+	stop_offset=$(get_uprobe_offset "${UPROBE_TARGET}" "tlob_busy_work_done")
+	if [ -z "${stop_offset}" ]; then
+		tap_skip "uprobe_violation" \
+			"cannot resolve tlob_busy_work_done offset in ${UPROBE_TARGET}"
+		cleanup; return
+	fi
+
+	# Start the busy-spin target (run for 30 s so the test can observe it)
+	"${UPROBE_TARGET}" 30000 &
+	busy_pid=$!
+	sleep 0.05
+
+	trace_event_enable
+	trace_on
+	tlob_enable
+	trace_clear
+
+	# Bind the target: 10 us budget; start=tlob_busy_work, stop=tlob_busy_work_done
+	binding="10:${busy_offset}:${stop_offset}:${UPROBE_TARGET}"
+	if ! echo "${binding}" > "${TLOB_MONITOR}" 2>/dev/null; then
+		kill "${busy_pid}" 2>/dev/null; wait "${busy_pid}" 2>/dev/null
+		tap_skip "uprobe_violation" \
+			"uprobe binding rejected (CONFIG_UPROBES=y needed)"
+		cleanup; return
+	fi
+
+	# Wait up to 2 s for a budget_exceeded event
+	found=0; i=0
+	while [ "$i" -lt 20 ]; do
+		sleep 0.1
+		trace_grep "budget_exceeded" && { found=1; break; }
+		i=$((i+1))
+	done
+
+	echo "-${busy_offset}:${UPROBE_TARGET}" > "${TLOB_MONITOR}" 2>/dev/null
+	kill "${busy_pid}" 2>/dev/null; wait "${busy_pid}" 2>/dev/null
+
+	if [ "${found}" != "1" ]; then
+		tap_fail "uprobe_violation" "no budget_exceeded within 2 s"
+		cleanup; return
+	fi
+
+	# Validate the event fields: threshold must match, on_cpu must be non-zero
+	# (CPU-bound violation), and state must be on_cpu.
+	ev=$(grep "budget_exceeded" "${TRACE_FILE}" | head -n 1)
+	if ! echo "${ev}" | grep -q "threshold=10 "; then
+		tap_fail "uprobe_violation" "threshold field mismatch: ${ev}"
+		cleanup; return
+	fi
+	on_cpu=$(echo "${ev}" | grep -o "on_cpu=[0-9]*" | cut -d= -f2)
+	if [ "${on_cpu:-0}" -eq 0 ]; then
+		tap_fail "uprobe_violation" "on_cpu=0 for a CPU-bound spin: ${ev}"
+		cleanup; return
+	fi
+	if ! echo "${ev}" | grep -q "state=on_cpu"; then
+		tap_fail "uprobe_violation" "state is not on_cpu: ${ev}"
+		cleanup; return
+	fi
+	tap_pass "uprobe_violation"
+	cleanup
+}
+
+# ---------------------------------------------------------------------------
+# Test 5: uprobe binding - remove binding stops monitoring
+#
+# Register a binding on tlob_uprobe_target, then immediately remove it.
+# Verify that after removal the monitor file no longer lists the binding.
+# ---------------------------------------------------------------------------
+run_test_uprobe_unbind() {
+	next_test; cleanup
+	if [ ! -e "${TLOB_MONITOR}" ]; then
+		tap_skip "uprobe_unbind" "monitor file not available"
+		cleanup; return
+	fi
+	if [ ! -x "${UPROBE_TARGET}" ]; then
+		tap_skip "uprobe_unbind" \
+			"tlob_uprobe_target not found or not executable"
+		cleanup; return
+	fi
+
+	busy_offset=$(get_uprobe_offset "${UPROBE_TARGET}" "tlob_busy_work")
+	stop_offset=$(get_uprobe_offset "${UPROBE_TARGET}" "tlob_busy_work_done")
+	if [ -z "${busy_offset}" ] || [ -z "${stop_offset}" ]; then
+		tap_skip "uprobe_unbind" \
+			"cannot resolve tlob_busy_work/tlob_busy_work_done offset"
+		cleanup; return
+	fi
+
+	"${UPROBE_TARGET}" 30000 &
+	busy_pid=$!
+	sleep 0.05
+
+	tlob_enable
+	# 5 s budget - should not fire during this quick test
+	binding="5000000:${busy_offset}:${stop_offset}:${UPROBE_TARGET}"
+	if ! echo "${binding}" > "${TLOB_MONITOR}" 2>/dev/null; then
+		kill "${busy_pid}" 2>/dev/null; wait "${busy_pid}" 2>/dev/null
+		tap_skip "uprobe_unbind" \
+			"uprobe binding rejected (CONFIG_UPROBES=y needed)"
+		cleanup; return
+	fi
+
+	# Remove the binding
+	echo "-${busy_offset}:${UPROBE_TARGET}" > "${TLOB_MONITOR}" 2>/dev/null
+
+	# The monitor file should no longer list the binding for this offset
+	if grep -q "^[0-9]*:0x${busy_offset#0x}:" "${TLOB_MONITOR}" 2>/dev/null; then
+		kill "${busy_pid}" 2>/dev/null; wait "${busy_pid}" 2>/dev/null
+		tap_fail "uprobe_unbind" "pid still listed after removal"
+		cleanup; return
+	fi
+
+	kill "${busy_pid}" 2>/dev/null; wait "${busy_pid}" 2>/dev/null
+	tap_pass "uprobe_unbind"
+	cleanup
+}
+
+# ---------------------------------------------------------------------------
+# Test 6: uprobe - duplicate offset_start rejected
+#
+# Registering a second binding with the same offset_start in the same binary
+# must be rejected with an error, since two entry uprobes at the same address
+# would cause double tlob_start_task() calls and undefined behaviour.
+# ---------------------------------------------------------------------------
+run_test_uprobe_duplicate_offset() {
+	next_test; cleanup
+	if [ ! -e "${TLOB_MONITOR}" ]; then
+		tap_skip "uprobe_duplicate_offset" "monitor file not available"
+		cleanup; return
+	fi
+	if [ ! -x "${UPROBE_TARGET}" ]; then
+		tap_skip "uprobe_duplicate_offset" \
+			"tlob_uprobe_target not found or not executable"
+		cleanup; return
+	fi
+
+	busy_offset=$(get_uprobe_offset "${UPROBE_TARGET}" "tlob_busy_work")
+	stop_offset=$(get_uprobe_offset "${UPROBE_TARGET}" "tlob_busy_work_done")
+	if [ -z "${busy_offset}" ] || [ -z "${stop_offset}" ]; then
+		tap_skip "uprobe_duplicate_offset" \
+			"cannot resolve tlob_busy_work/tlob_busy_work_done offset"
+		cleanup; return
+	fi
+
+	tlob_enable
+
+	# First binding: should succeed
+	if ! echo "5000000:${busy_offset}:${stop_offset}:${UPROBE_TARGET}" \
+	        > "${TLOB_MONITOR}" 2>/dev/null; then
+		tap_skip "uprobe_duplicate_offset" \
+			"uprobe binding rejected (CONFIG_UPROBES=y needed)"
+		cleanup; return
+	fi
+
+	# Second binding with same offset_start: must be rejected
+	if echo "9999:${busy_offset}:${stop_offset}:${UPROBE_TARGET}" \
+	        > "${TLOB_MONITOR}" 2>/dev/null; then
+		echo "-${busy_offset}:${UPROBE_TARGET}" > "${TLOB_MONITOR}" 2>/dev/null
+		tap_fail "uprobe_duplicate_offset" \
+			"duplicate offset_start was accepted (expected error)"
+		cleanup; return
+	fi
+
+	echo "-${busy_offset}:${UPROBE_TARGET}" > "${TLOB_MONITOR}" 2>/dev/null
+	tap_pass "uprobe_duplicate_offset"
+	cleanup
+}
+
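+# ---------------------------------------------------------------------------
+# Test 7: uprobe - independent thresholds for two regions of one binary: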
+# Region A: tlob_busy_work with a 5 s budget - should NOT fire during the test.
+# Region B: tlob_busy_work_done with a 10 us budget - SHOULD fire quickly since
+#           tlob_uprobe_target calls tlob_busy_work_done after a busy spin.
+#
+# Verifies that independent bindings for different offsets in the same binary
+# are tracked separately and that only the tight-budget binding triggers a
+# budget_exceeded event.
+# ---------------------------------------------------------------------------
+run_test_uprobe_independent_thresholds() {
+	next_test; cleanup
+	if [ ! -e "${TLOB_MONITOR}" ]; then
+		tap_skip "uprobe_independent_thresholds" \
+			"monitor file not available"; cleanup; return
+	fi
+	if [ ! -x "${UPROBE_TARGET}" ]; then
+		tap_skip "uprobe_independent_thresholds" \
+			"tlob_uprobe_target not found or not executable"
+		cleanup; return
+	fi
+
+	busy_offset=$(get_uprobe_offset "${UPROBE_TARGET}" "tlob_busy_work")
+	busy_stop_offset=$(get_uprobe_offset "${UPROBE_TARGET}" "tlob_busy_work_done")
+	if [ -z "${busy_offset}" ] || [ -z "${busy_stop_offset}" ]; then
+		tap_skip "uprobe_independent_thresholds" \
+			"cannot resolve tlob_busy_work/tlob_busy_work_done offset"
+		cleanup; return
+	fi
+
+	"${UPROBE_TARGET}" 30000 &
+	busy_pid=$!
+	sleep 0.05
+
+	trace_event_enable
+	trace_on
+	tlob_enable
+	trace_clear
+
+	# Region A: generous 5 s budget on tlob_busy_work entry (should not fire)
+	if ! echo "5000000:${busy_offset}:${busy_stop_offset}:${UPROBE_TARGET}" \
+	        > "${TLOB_MONITOR}" 2>/dev/null; then
+		kill "${busy_pid}" 2>/dev/null; wait "${busy_pid}" 2>/dev/null
+		tap_skip "uprobe_independent_thresholds" \
+			"uprobe binding rejected (CONFIG_UPROBES=y needed)"
+		cleanup; return
+	fi
+	# Region B: tight 10 us budget on tlob_busy_work_done (fires quickly)
+	echo "10:${busy_stop_offset}:${busy_stop_offset}:${UPROBE_TARGET}" \
+		> "${TLOB_MONITOR}" 2>/dev/null
+
+	found=0; i=0
+	while [ "$i" -lt 20 ]; do
+		sleep 0.1
+		trace_grep "budget_exceeded" && { found=1; break; }
+		i=$((i+1))
+	done
+
+	echo "-${busy_offset}:${UPROBE_TARGET}" > "${TLOB_MONITOR}" 2>/dev/null
+	echo "-${busy_stop_offset}:${UPROBE_TARGET}" > "${TLOB_MONITOR}" 2>/dev/null
+	kill "${busy_pid}" 2>/dev/null; wait "${busy_pid}" 2>/dev/null
+
+	if [ "${found}" != "1" ]; then
+		tap_fail "uprobe_independent_thresholds" \
+			"budget_exceeded not raised for tight-budget region within 2 s"
+		cleanup; return
+	fi
+
+	# The violation must carry threshold=10 (Region B's budget).
+	ev=$(grep "budget_exceeded" "${TRACE_FILE}" | head -n 1)
+	if ! echo "${ev}" | grep -q "threshold=10 "; then
+		tap_fail "uprobe_independent_thresholds" \
+			"violation threshold is not Region B's 10 us: ${ev}"
+		cleanup; return
+	fi
+	tap_pass "uprobe_independent_thresholds"
+	cleanup
+}
+
+# ---------------------------------------------------------------------------
+# ioctl tests via tlob_helper
+#
+# Each test invokes the helper with a sub-test name.
+# Exit code: 0=pass, 1=fail, 2=skip.
+# ---------------------------------------------------------------------------
+run_ioctl_test() {
+	testname=$1
+	next_test
+
+	if [ ! -x "${IOCTL_HELPER}" ]; then
+		tap_skip "ioctl_${testname}" \
+			"tlob_helper not found or not executable"
+		return
+	fi
+	if [ ! -c "${RV_DEV}" ]; then
+		tap_skip "ioctl_${testname}" \
+			"${RV_DEV} not present (CONFIG_RV_CHARDEV=y needed)"
+		return
+	fi
+
+	tlob_enable
+	"${IOCTL_HELPER}" "${testname}"
+	rc=$?
+	tlob_disable
+
+	case "${rc}" in
+	0) tap_pass "ioctl_${testname}" ;;
+	2) tap_skip "ioctl_${testname}" "helper returned skip" ;;
+	*) tap_fail "ioctl_${testname}" "helper exited with code ${rc}" ;;
+	esac
+}
+
+# run_ioctl_test_not_enabled - like run_ioctl_test but deliberately does NOT
+# enable the tlob monitor before invoking the helper.  Used to verify that
+# ioctls issued against a disabled monitor return ENODEV rather than crashing
+# the kernel with a NULL pointer dereference.
+run_ioctl_test_not_enabled()
+{
+	next_test
+
+	if [ ! -x "${IOCTL_HELPER}" ]; then
+		tap_skip "ioctl_not_enabled" \
+			"tlob_helper not found or not executable"
+		return
+	fi
+	if [ ! -c "${RV_DEV}" ]; then
+		tap_skip "ioctl_not_enabled" \
+			"${RV_DEV} not present (CONFIG_RV_CHARDEV=y needed)"
+		return
+	fi
+
+	# Monitor intentionally left disabled.
+	tlob_disable
+	"${IOCTL_HELPER}" not_enabled
+	rc=$?
+
+	case "${rc}" in
+	0) tap_pass "ioctl_not_enabled" ;;
+	2) tap_skip "ioctl_not_enabled" "helper returned skip" ;;
+	*) tap_fail "ioctl_not_enabled" "helper exited with code ${rc}" ;;
+	esac
+}
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+check_root; check_tracefs; check_rv_dir; check_tlob
+tap_header; tap_plan 20
+
+# tracefs interface tests
+run_test_enable_disable
+run_test_tracefs_files
+
+# uprobe external monitoring tests
+run_test_uprobe_no_false_positive
+run_test_uprobe_violation
+run_test_uprobe_unbind
+run_test_uprobe_duplicate_offset
+run_test_uprobe_independent_thresholds
+
+# /dev/rv ioctl self-instrumentation tests
+run_ioctl_test_not_enabled
+run_ioctl_test within_budget
+run_ioctl_test over_budget_cpu
+run_ioctl_test over_budget_sleep
+run_ioctl_test double_start
+run_ioctl_test stop_no_start
+run_ioctl_test multi_thread
+run_ioctl_test self_watch
+run_ioctl_test invalid_flags
+run_ioctl_test notify_fd_bad
+run_ioctl_test mmap_basic
+run_ioctl_test mmap_errors
+run_ioctl_test mmap_consume
+
+echo "# Passed: ${t_pass} Failed: ${t_fail} Skipped: ${t_skip}"
+[ "${t_fail}" -gt 0 ] && exit 1 || exit 0
diff --git a/tools/testing/selftests/rv/tlob_helper.c b/tools/testing/selftests/rv/tlob_helper.c
new file mode 100644
index 000000000..cd76b56d1
--- /dev/null
+++ b/tools/testing/selftests/rv/tlob_helper.c
@@ -0,0 +1,994 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob_helper.c - test helper and ELF utility for tlob selftests
+ *
+ * Called by test_tlob.sh to exercise the /dev/rv ioctl interface and to
+ * resolve ELF symbol offsets for uprobe bindings.  One subcommand per
+ * invocation so the shell script can report each as an independent TAP
+ * test case.
+ *
+ * Usage: tlob_helper <subcommand> [args...]
+ *
+ * Synchronous TRACE_START / TRACE_STOP tests:
+ *   not_enabled        - TRACE_START without tlob enabled -> ENODEV (no kernel crash)
+ *   within_budget      - start(50000 us), sleep 10 ms, stop -> expect 0
+ *   over_budget_cpu    - start(5000 us), busyspin 100 ms, stop -> EOVERFLOW
+ *   over_budget_sleep  - start(3000 us), sleep 50 ms, stop -> EOVERFLOW
+ *
+ * Error-handling tests:
+ *   double_start       - two starts without stop -> EEXIST on second
+ *   stop_no_start      - stop without start -> ESRCH
+ *
+ * Per-thread isolation test:
+ *   multi_thread       - two threads share one fd; one within budget, one over
+ *
+ * Asynchronous notification test (notify_fd + read()):
+ *   self_watch         - one worker exceeds budget; monitor fd receives one ntf via read()
+ *
+ * Input-validation tests (TRACE_START error paths):
+ *   invalid_flags      - TRACE_START with flags != 0 -> EINVAL
+ *   notify_fd_bad      - TRACE_START with notify_fd = stdout (non-rv fd) -> EINVAL
+ *
+ * mmap ring buffer tests (Scenario D):
+ *   mmap_basic         - mmap succeeds; verify tlob_mmap_page fields
+ *                        (version, capacity, data_offset, record_size)
+ *   mmap_errors        - MAP_PRIVATE, wrong size, and non-zero pgoff all
+ *                        return EINVAL
+ *   mmap_consume       - trigger a real violation via self-notification and
+ *                        consume the event through the mmap'd ring
+ *
+ * ELF utility (does not require /dev/rv):
+ *   sym_offset <binary> <symbol>
+ *                      - print the ELF file offset of <symbol> in <binary>
+ *                        (used by the shell script to build uprobe bindings)
+ *
+ * Exit code: 0 = pass, 1 = fail, 2 = skip (device not available).
+ */
+#define _GNU_SOURCE
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/rv.h>
+
+/* Default ring capacity allocated at open(); matches TLOB_RING_DEFAULT_CAP. */
+#define TLOB_RING_DEFAULT_CAP	64U
+
+static int rv_fd = -1;
+
+static int open_rv(void)
+{
+	rv_fd = open("/dev/rv", O_RDWR);
+	if (rv_fd < 0) {
+		fprintf(stderr, "open /dev/rv: %s\n", strerror(errno));
+		return -1;
+	}
+	return 0;
+}
+
+static void busy_spin_us(unsigned long us)
+{
+	struct timespec start, now;
+	unsigned long elapsed;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	do {
+		clock_gettime(CLOCK_MONOTONIC, &now);
+		elapsed = (unsigned long)(now.tv_sec - start.tv_sec)
+			  * 1000000000UL
+			+ (unsigned long)(now.tv_nsec - start.tv_nsec);
+	} while (elapsed < us * 1000UL);
+}
+
+static int do_start(uint64_t threshold_us)
+{
+	struct tlob_start_args args = {
+		.threshold_us = threshold_us,
+		.notify_fd    = -1,
+	};
+
+	return ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+}
+
+static int do_stop(void)
+{
+	return ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+}
+
+/* -----------------------------------------------------------------------
+ * Synchronous TRACE_START / TRACE_STOP tests
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * test_not_enabled - TRACE_START must return ENODEV when the tlob monitor
+ * has not been enabled (tlob_state_cache is NULL).
+ *
+ * The shell wrapper deliberately does NOT call tlob_enable before invoking
+ * this subcommand, so the ioctl is expected to fail with ENODEV rather than
+ * crashing the kernel with a NULL pointer dereference in kmem_cache_alloc.
+ */
+static int test_not_enabled(void)
+{
+	int ret;
+
+	ret = do_start(1000);
+	if (ret == 0) {
+		fprintf(stderr, "TRACE_START: expected ENODEV, got success\n");
+		do_stop();
+		return 1;
+	}
+	if (errno != ENODEV) {
+		fprintf(stderr, "TRACE_START: expected ENODEV, got %s\n",
+			strerror(errno));
+		return 1;
+	}
+	return 0;
+}
+
+static int test_within_budget(void)
+{
+	int ret;
+
+	if (do_start(50000) < 0) {
+		fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+		return 1;
+	}
+	usleep(10000); /* 10 ms < 50 ms budget */
+	ret = do_stop();
+	if (ret != 0) {
+		fprintf(stderr, "TRACE_STOP: expected 0, got %d errno=%s\n",
+			ret, strerror(errno));
+		return 1;
+	}
+	return 0;
+}
+
+static int test_over_budget_cpu(void)
+{
+	int ret;
+
+	if (do_start(5000) < 0) {
+		fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+		return 1;
+	}
+	busy_spin_us(100000); /* 100 ms >> 5 ms budget */
+	ret = do_stop();
+	if (ret == 0) {
+		fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n");
+		return 1;
+	}
+	if (errno != EOVERFLOW) {
+		fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n",
+			strerror(errno));
+		return 1;
+	}
+	return 0;
+}
+
+static int test_over_budget_sleep(void)
+{
+	int ret;
+
+	if (do_start(3000) < 0) {
+		fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+		return 1;
+	}
+	usleep(50000); /* 50 ms >> 3 ms budget, off-CPU time counts */
+	ret = do_stop();
+	if (ret == 0) {
+		fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n");
+		return 1;
+	}
+	if (errno != EOVERFLOW) {
+		fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n",
+			strerror(errno));
+		return 1;
+	}
+	return 0;
+}
+
+/* -----------------------------------------------------------------------
+ * Error-handling tests
+ * -----------------------------------------------------------------------
+ */
+
+static int test_double_start(void)
+{
+	int ret;
+
+	if (do_start(10000000) < 0) {
+		fprintf(stderr, "first TRACE_START: %s\n", strerror(errno));
+		return 1;
+	}
+	ret = do_start(10000000);
+	if (ret == 0) {
+		fprintf(stderr, "second TRACE_START: expected EEXIST, got 0\n");
+		do_stop();
+		return 1;
+	}
+	if (errno != EEXIST) {
+		fprintf(stderr, "second TRACE_START: expected EEXIST, got %s\n",
+			strerror(errno));
+		do_stop();
+		return 1;
+	}
+	do_stop(); /* clean up */
+	return 0;
+}
+
+static int test_stop_no_start(void)
+{
+	int ret;
+
+	/* Ensure clean state: ignore error from a stale entry */
+	do_stop();
+
+	ret = do_stop();
+	if (ret == 0) {
+		fprintf(stderr, "TRACE_STOP: expected ESRCH, got 0\n");
+		return 1;
+	}
+	if (errno != ESRCH) {
+		fprintf(stderr, "TRACE_STOP: expected ESRCH, got %s\n",
+			strerror(errno));
+		return 1;
+	}
+	return 0;
+}
+
+/* -----------------------------------------------------------------------
+ * Per-thread isolation test
+ *
+ * Two threads share a single /dev/rv fd.  The monitor uses task_struct *
+ * as the key, so each thread gets an independent slot regardless of the
+ * shared fd.
+ * -----------------------------------------------------------------------
+ */
+
+struct mt_thread_args {
+	uint64_t      threshold_us;
+	unsigned long workload_us;
+	int           busy;
+	int           expect_eoverflow;
+	int           result;
+};
+
+static void *mt_thread_fn(void *arg)
+{
+	struct mt_thread_args *a = arg;
+	int ret;
+
+	if (do_start(a->threshold_us) < 0) {
+		fprintf(stderr, "thread TRACE_START: %s\n", strerror(errno));
+		a->result = 1;
+		return NULL;
+	}
+
+	if (a->busy)
+		busy_spin_us(a->workload_us);
+	else
+		usleep(a->workload_us);
+
+	ret = do_stop();
+	if (a->expect_eoverflow) {
+		if (ret == 0 || errno != EOVERFLOW) {
+			fprintf(stderr, "thread: expected EOVERFLOW, got ret=%d errno=%s\n",
+				ret, strerror(errno));
+			a->result = 1;
+			return NULL;
+		}
+	} else {
+		if (ret != 0) {
+			fprintf(stderr, "thread: expected 0, got ret=%d errno=%s\n",
+				ret, strerror(errno));
+			a->result = 1;
+			return NULL;
+		}
+	}
+	a->result = 0;
+	return NULL;
+}
+
+static int test_multi_thread(void)
+{
+	pthread_t ta, tb;
+	struct mt_thread_args a = {
+		.threshold_us     = 20000,  /* 20 ms */
+		.workload_us      = 5000,   /* 5 ms sleep -> within budget */
+		.busy             = 0,
+		.expect_eoverflow = 0,
+	};
+	struct mt_thread_args b = {
+		.threshold_us     = 3000,   /* 3 ms */
+		.workload_us      = 30000,  /* 30 ms spin -> over budget */
+		.busy             = 1,
+		.expect_eoverflow = 1,
+	};
+
+	pthread_create(&ta, NULL, mt_thread_fn, &a);
+	pthread_create(&tb, NULL, mt_thread_fn, &b);
+	pthread_join(ta, NULL);
+	pthread_join(tb, NULL);
+
+	return (a.result || b.result) ? 1 : 0;
+}
+
+/* -----------------------------------------------------------------------
+ * Asynchronous notification test (notify_fd + read())
+ *
+ * A dedicated monitor_fd is opened by the main thread.  Two worker threads
+ * each open their own work_fd and call TLOB_IOCTL_TRACE_START with
+ * notify_fd = monitor_fd, nominating it as the violation target.  Worker A
+ * stays within budget; worker B exceeds it.  The main thread reads from
+ * monitor_fd and expects exactly one tlob_event record.
+ * -----------------------------------------------------------------------
+ */
+
+struct sw_worker_args {
+	int           monitor_fd;
+	uint64_t      threshold_us;
+	unsigned long workload_us;
+	int           busy;
+	int           result;
+};
+
+static void *sw_worker_fn(void *arg)
+{
+	struct sw_worker_args *a = arg;
+	struct tlob_start_args args = {
+		.threshold_us = a->threshold_us,
+		.notify_fd    = a->monitor_fd,
+	};
+	int work_fd;
+	int ret;
+
+	work_fd = open("/dev/rv", O_RDWR);
+	if (work_fd < 0) {
+		fprintf(stderr, "worker open /dev/rv: %s\n", strerror(errno));
+		a->result = 1;
+		return NULL;
+	}
+
+	ret = ioctl(work_fd, TLOB_IOCTL_TRACE_START, &args);
+	if (ret < 0) {
+		fprintf(stderr, "TRACE_START (notify): %s\n", strerror(errno));
+		close(work_fd);
+		a->result = 1;
+		return NULL;
+	}
+
+	if (a->busy)
+		busy_spin_us(a->workload_us);
+	else
+		usleep(a->workload_us);
+
+	ioctl(work_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+	close(work_fd);
+	a->result = 0;
+	return NULL;
+}
+
+static int test_self_watch(void)
+{
+	int monitor_fd;
+	pthread_t ta, tb;
+	struct sw_worker_args a = {
+		.threshold_us = 50000,  /* 50 ms */
+		.workload_us  = 5000,   /* 5 ms sleep -> no violation */
+		.busy         = 0,
+	};
+	struct sw_worker_args b = {
+		.threshold_us = 3000,   /* 3 ms */
+		.workload_us  = 30000,  /* 30 ms spin -> violation */
+		.busy         = 1,
+	};
+	struct tlob_event ntfs[8];
+	int violations = 0;
+	ssize_t n;
+
+	/*
+	 * Open monitor_fd with O_NONBLOCK so read() after the workers finish
+	 * returns immediately rather than blocking forever.
+	 */
+	monitor_fd = open("/dev/rv", O_RDWR | O_NONBLOCK);
+	if (monitor_fd < 0) {
+		fprintf(stderr, "open /dev/rv (monitor_fd): %s\n", strerror(errno));
+		return 1;
+	}
+	a.monitor_fd = monitor_fd;
+	b.monitor_fd = monitor_fd;
+
+	pthread_create(&ta, NULL, sw_worker_fn, &a);
+	pthread_create(&tb, NULL, sw_worker_fn, &b);
+	pthread_join(ta, NULL);
+	pthread_join(tb, NULL);
+
+	if (a.result || b.result) {
+		close(monitor_fd);
+		return 1;
+	}
+
+	/*
+	 * Drain all available tlob_event records.  With O_NONBLOCK the final
+	 * read() returns -1 with errno set to EAGAIN when the buffer is empty.
+	 */
+	while ((n = read(monitor_fd, ntfs, sizeof(ntfs))) > 0)
+		violations += (int)(n / sizeof(struct tlob_event));
+
+	close(monitor_fd);
+
+	if (violations != 1) {
+		fprintf(stderr, "self_watch: expected 1 violation, got %d\n",
+			violations);
+		return 1;
+	}
+	return 0;
+}
+
+/* -----------------------------------------------------------------------
+ * Input-validation tests (TRACE_START error paths)
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * test_invalid_flags - TRACE_START with flags != 0 must return EINVAL.
+ *
+ * The flags field is reserved for future extensions and must be zero.
+ * Callers that set it to a non-zero value are rejected early so that a
+ * future kernel can assign meaning to those bits without silently
+ * ignoring them.
+ */
+static int test_invalid_flags(void)
+{
+	struct tlob_start_args args = {
+		.threshold_us = 1000,
+		.notify_fd    = -1,
+		.flags        = 1,   /* non-zero: must be rejected */
+	};
+	int ret;
+
+	ret = ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+	if (ret == 0) {
+		fprintf(stderr, "TRACE_START(flags=1): expected EINVAL, got success\n");
+		do_stop();
+		return 1;
+	}
+	if (errno != EINVAL) {
+		fprintf(stderr, "TRACE_START(flags=1): expected EINVAL, got %s\n",
+			strerror(errno));
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * test_notify_fd_bad - TRACE_START with a non-/dev/rv notify_fd must return
+ * EINVAL.
+ *
+ * When notify_fd >= 0, the kernel resolves it to a struct file and checks
+ * that its private_data is non-NULL (i.e. it is a /dev/rv file descriptor).
+ * Passing stdout (fd 1) supplies a real, open fd whose private_data is NULL,
+ * so the kernel must reject it with EINVAL.
+ */
+static int test_notify_fd_bad(void)
+{
+	struct tlob_start_args args = {
+		.threshold_us = 1000,
+		.notify_fd    = STDOUT_FILENO,   /* open but not a /dev/rv fd */
+		.flags        = 0,
+	};
+	int ret;
+
+	ret = ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+	if (ret == 0) {
+		fprintf(stderr,
+			"TRACE_START(notify_fd=stdout): expected EINVAL, got success\n");
+		do_stop();
+		return 1;
+	}
+	if (errno != EINVAL) {
+		fprintf(stderr,
+			"TRACE_START(notify_fd=stdout): expected EINVAL, got %s\n",
+			strerror(errno));
+		return 1;
+	}
+	return 0;
+}
+
+/* -----------------------------------------------------------------------
+ * mmap ring buffer tests (Scenario D)
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * test_mmap_basic - mmap the ring buffer and verify the control page fields.
+ *
+ * The kernel allocates TLOB_RING_DEFAULT_CAP records at open().  A shared
+ * mmap of PAGE_SIZE + cap * record_size must succeed and the tlob_mmap_page
+ * header must contain consistent values.
+ */
+static int test_mmap_basic(void)
+{
+	long pagesize = sysconf(_SC_PAGESIZE);
+	size_t mmap_len = (size_t)pagesize +
+			  TLOB_RING_DEFAULT_CAP * sizeof(struct tlob_event);
+	/* rv_mmap requires a page-aligned length */
+	mmap_len = (mmap_len + (size_t)(pagesize - 1)) & ~(size_t)(pagesize - 1);
+	struct tlob_mmap_page *page;
+	struct tlob_event *data;
+	void *map;
+	int ret = 0;
+
+	map = mmap(NULL, mmap_len, PROT_READ | PROT_WRITE, MAP_SHARED, rv_fd, 0);
+	if (map == MAP_FAILED) {
+		fprintf(stderr, "mmap_basic: mmap: %s\n", strerror(errno));
+		return 1;
+	}
+
+	page = (struct tlob_mmap_page *)map;
+	data = (struct tlob_event *)((char *)map + page->data_offset);
+
+	if (page->version != 1) {
+		fprintf(stderr, "mmap_basic: expected version=1, got %u\n",
+			page->version);
+		ret = 1;
+		goto out;
+	}
+	if (page->capacity != TLOB_RING_DEFAULT_CAP) {
+		fprintf(stderr, "mmap_basic: expected capacity=%u, got %u\n",
+			TLOB_RING_DEFAULT_CAP, page->capacity);
+		ret = 1;
+		goto out;
+	}
+	if (page->data_offset != (uint32_t)pagesize) {
+		fprintf(stderr, "mmap_basic: expected data_offset=%ld, got %u\n",
+			pagesize, page->data_offset);
+		ret = 1;
+		goto out;
+	}
+	if (page->record_size != sizeof(struct tlob_event)) {
+		fprintf(stderr, "mmap_basic: expected record_size=%zu, got %u\n",
+			sizeof(struct tlob_event), page->record_size);
+		ret = 1;
+		goto out;
+	}
+	if (page->data_head != 0 || page->data_tail != 0) {
+		fprintf(stderr, "mmap_basic: ring not empty at open: head=%u tail=%u\n",
+			page->data_head, page->data_tail);
+		ret = 1;
+		goto out;
+	}
+	/* Touch the data array to confirm it is accessible. */
+	(void)data[0].tid;
+out:
+	munmap(map, mmap_len);
+	return ret;
+}
+
+/*
+ * test_mmap_errors - verify that rv_mmap() rejects invalid mmap parameters.
+ *
+ * Four cases are tested, each must return MAP_FAILED with errno == EINVAL:
+ *   1. size one page short of the correct ring length
+ *   2. size one page larger than the correct ring length
+ *   3. MAP_PRIVATE (only MAP_SHARED is permitted)
+ *   4. non-zero vm_pgoff (offset must be 0)
+ */
+static int test_mmap_errors(void)
+{
+	long pagesize = sysconf(_SC_PAGESIZE);
+	size_t correct_len = (size_t)pagesize +
+			     TLOB_RING_DEFAULT_CAP * sizeof(struct tlob_event);
+	/* rv_mmap requires a page-aligned length */
+	correct_len = (correct_len + (size_t)(pagesize - 1)) & ~(size_t)(pagesize - 1);
+	void *map;
+	int ret = 0;
+
+	/* Case 1: one full page short; correct_len - 1 would be rounded back up to correct_len */
+	map = mmap(NULL, correct_len - (size_t)pagesize, PROT_READ | PROT_WRITE,
+		   MAP_SHARED, rv_fd, 0);
+	if (map != MAP_FAILED) {
+		fprintf(stderr, "mmap_errors: short-size mmap succeeded (expected EINVAL)\n");
+		munmap(map, correct_len - (size_t)pagesize);
+		ret = 1;
+	} else if (errno != EINVAL) {
+		fprintf(stderr, "mmap_errors: short-size: expected EINVAL, got %s\n",
+			strerror(errno));
+		ret = 1;
+	}
+
+	/* Case 2: size one page too large */
+	map = mmap(NULL, correct_len + (size_t)pagesize, PROT_READ | PROT_WRITE,
+		   MAP_SHARED, rv_fd, 0);
+	if (map != MAP_FAILED) {
+		fprintf(stderr, "mmap_errors: oversized mmap succeeded (expected EINVAL)\n");
+		munmap(map, correct_len + (size_t)pagesize);
+		ret = 1;
+	} else if (errno != EINVAL) {
+		fprintf(stderr, "mmap_errors: oversized: expected EINVAL, got %s\n",
+			strerror(errno));
+		ret = 1;
+	}
+
+	/* Case 3: MAP_PRIVATE instead of MAP_SHARED */
+	map = mmap(NULL, correct_len, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE, rv_fd, 0);
+	if (map != MAP_FAILED) {
+		fprintf(stderr, "mmap_errors: MAP_PRIVATE succeeded (expected EINVAL)\n");
+		munmap(map, correct_len);
+		ret = 1;
+	} else if (errno != EINVAL) {
+		fprintf(stderr, "mmap_errors: MAP_PRIVATE: expected EINVAL, got %s\n",
+			strerror(errno));
+		ret = 1;
+	}
+
+	/* Case 4: non-zero file offset (pgoff = 1) */
+	map = mmap(NULL, correct_len, PROT_READ | PROT_WRITE,
+		   MAP_SHARED, rv_fd, (off_t)pagesize);
+	if (map != MAP_FAILED) {
+		fprintf(stderr, "mmap_errors: non-zero pgoff mmap succeeded (expected EINVAL)\n");
+		munmap(map, correct_len);
+		ret = 1;
+	} else if (errno != EINVAL) {
+		fprintf(stderr, "mmap_errors: non-zero pgoff: expected EINVAL, got %s\n",
+			strerror(errno));
+		ret = 1;
+	}
+
+	return ret;
+}
+
+/*
+ * test_mmap_consume - zero-copy consumption of a real violation event.
+ *
+ * Arms a 5 ms budget with self-notification (notify_fd = rv_fd), sleeps
+ * 50 ms (off-CPU violation), then reads the pushed event through the mmap'd
+ * ring without calling read().  Verifies:
+ *   - TRACE_STOP returns EOVERFLOW (budget was exceeded)
+ *   - data_head == 1 after the violation
+ *   - the event fields (threshold_us, tag, tid) are correct
+ *   - data_tail can be advanced to consume the record (ring empties)
+ */
+static int test_mmap_consume(void)
+{
+	long pagesize = sysconf(_SC_PAGESIZE);
+	size_t mmap_len = (size_t)pagesize +
+			  TLOB_RING_DEFAULT_CAP * sizeof(struct tlob_event);
+	/* rv_mmap requires a page-aligned length */
+	mmap_len = (mmap_len + (size_t)(pagesize - 1)) & ~(size_t)(pagesize - 1);
+	struct tlob_start_args args = {
+		.threshold_us = 5000,		/* 5 ms */
+		.notify_fd    = rv_fd,		/* self-notification */
+		.tag          = 0xdeadbeefULL,
+		.flags        = 0,
+	};
+	struct tlob_mmap_page *page;
+	struct tlob_event *data;
+	void *map;
+	int stop_ret;
+	int ret = 0;
+
+	map = mmap(NULL, mmap_len, PROT_READ | PROT_WRITE, MAP_SHARED, rv_fd, 0);
+	if (map == MAP_FAILED) {
+		fprintf(stderr, "mmap_consume: mmap: %s\n", strerror(errno));
+		return 1;
+	}
+
+	page = (struct tlob_mmap_page *)map;
+	data = (struct tlob_event *)((char *)map + page->data_offset);
+
+	if (ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args) < 0) {
+		fprintf(stderr, "mmap_consume: TRACE_START: %s\n", strerror(errno));
+		ret = 1;
+		goto out;
+	}
+
+	usleep(50000); /* 50 ms >> 5 ms budget -> off-CPU violation */
+
+	stop_ret = ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+	if (stop_ret == 0) {
+		fprintf(stderr, "mmap_consume: TRACE_STOP returned 0, expected EOVERFLOW\n");
+		ret = 1;
+		goto out;
+	}
+	if (errno != EOVERFLOW) {
+		fprintf(stderr, "mmap_consume: TRACE_STOP: expected EOVERFLOW, got %s\n",
+			strerror(errno));
+		ret = 1;
+		goto out;
+	}
+
+	/* Pairs with smp_store_release in tlob_event_push. */
+	if (__atomic_load_n(&page->data_head, __ATOMIC_ACQUIRE) != 1) {
+		fprintf(stderr, "mmap_consume: expected data_head=1, got %u\n",
+			page->data_head);
+		ret = 1;
+		goto out;
+	}
+	if (page->data_tail != 0) {
+		fprintf(stderr, "mmap_consume: expected data_tail=0, got %u\n",
+			page->data_tail);
+		ret = 1;
+		goto out;
+	}
+
+	/* Verify record content */
+	if (data[0].threshold_us != 5000) {
+		fprintf(stderr, "mmap_consume: expected threshold_us=5000, got %llu\n",
+			(unsigned long long)data[0].threshold_us);
+		ret = 1;
+		goto out;
+	}
+	if (data[0].tag != 0xdeadbeefULL) {
+		fprintf(stderr, "mmap_consume: expected tag=0xdeadbeef, got %llx\n",
+			(unsigned long long)data[0].tag);
+		ret = 1;
+		goto out;
+	}
+	if (data[0].tid == 0) {
+		fprintf(stderr, "mmap_consume: tid is 0\n");
+		ret = 1;
+		goto out;
+	}
+
+	/* Consume: advance data_tail and confirm ring is empty */
+	__atomic_store_n(&page->data_tail, 1U, __ATOMIC_RELEASE);
+	if (__atomic_load_n(&page->data_head, __ATOMIC_ACQUIRE) !=
+	    __atomic_load_n(&page->data_tail, __ATOMIC_ACQUIRE)) {
+		fprintf(stderr, "mmap_consume: ring not empty after consume\n");
+		ret = 1;
+	}
+
+out:
+	munmap(map, mmap_len);
+	return ret;
+}
+
+/* -----------------------------------------------------------------------
+ * ELF utility: sym_offset
+ *
+ * Print the ELF file offset of a symbol in a binary.  Supports 32- and
+ * 64-bit ELF.  Walks the section headers to find .symtab (falling back to
+ * .dynsym), then converts the symbol's virtual address to a file offset
+ * via the PT_LOAD program headers.
+ *
+ * Does not require /dev/rv; used by the shell script to build uprobe
+ * bindings of the form pid:threshold_us:offset_start:offset_stop:binary_path.
+ *
+ * Returns 0 on success (offset printed to stdout), 1 on failure.
+ * -----------------------------------------------------------------------
+ */
+static int sym_offset(const char *binary, const char *symname)
+{
+	int fd;
+	struct stat st;
+	void *map;
+	Elf64_Ehdr *ehdr;
+	Elf32_Ehdr *ehdr32;
+	int is64;
+	uint64_t sym_vaddr = 0;
+	int found = 0;
+	uint64_t file_offset = 0;
+
+	fd = open(binary, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "open %s: %s\n", binary, strerror(errno));
+		return 1;
+	}
+	if (fstat(fd, &st) < 0) {
+		close(fd);
+		return 1;
+	}
+	map = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
+	if (map == MAP_FAILED) {
+		fprintf(stderr, "mmap: %s\n", strerror(errno));
+		return 1;
+	}
+
+	/* Identify ELF class */
+	ehdr = (Elf64_Ehdr *)map;
+	ehdr32 = (Elf32_Ehdr *)map;
+	if (st.st_size < 4 ||
+	    ehdr->e_ident[EI_MAG0] != ELFMAG0 ||
+	    ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
+	    ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
+	    ehdr->e_ident[EI_MAG3] != ELFMAG3) {
+		fprintf(stderr, "%s: not an ELF file\n", binary);
+		munmap(map, (size_t)st.st_size);
+		return 1;
+	}
+	is64 = (ehdr->e_ident[EI_CLASS] == ELFCLASS64);
+
+	if (is64) {
+		/* Walk section headers to find .symtab or .dynsym */
+		Elf64_Shdr *shdrs = (Elf64_Shdr *)((char *)map + ehdr->e_shoff);
+		Elf64_Shdr *shstrtab_hdr = &shdrs[ehdr->e_shstrndx];
+		const char *shstrtab = (char *)map + shstrtab_hdr->sh_offset;
+		int si;
+
+		/* Prefer .symtab; fall back to .dynsym */
+		for (int pass = 0; pass < 2 && !found; pass++) {
+			const char *target = pass ? ".dynsym" : ".symtab";
+
+			for (si = 0; si < ehdr->e_shnum && !found; si++) {
+				Elf64_Shdr *sh = &shdrs[si];
+				const char *name = shstrtab + sh->sh_name;
+
+				if (strcmp(name, target) != 0)
+					continue;
+
+				Elf64_Shdr *strtab_sh = &shdrs[sh->sh_link];
+				const char *strtab = (char *)map + strtab_sh->sh_offset;
+				Elf64_Sym *syms = (Elf64_Sym *)((char *)map + sh->sh_offset);
+				uint64_t nsyms = sh->sh_size / sizeof(Elf64_Sym);
+				uint64_t j;
+
+				for (j = 0; j < nsyms; j++) {
+					if (strcmp(strtab + syms[j].st_name, symname) == 0) {
+						sym_vaddr = syms[j].st_value;
+						found = 1;
+						break;
+					}
+				}
+			}
+		}
+
+		if (!found) {
+			fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary);
+			munmap(map, (size_t)st.st_size);
+			return 1;
+		}
+
+		/* Convert vaddr to file offset via PT_LOAD segments */
+		Elf64_Phdr *phdrs = (Elf64_Phdr *)((char *)map + ehdr->e_phoff);
+		int pi;
+
+		for (pi = 0; pi < ehdr->e_phnum; pi++) {
+			Elf64_Phdr *ph = &phdrs[pi];
+
+			if (ph->p_type != PT_LOAD)
+				continue;
+			if (sym_vaddr >= ph->p_vaddr &&
+			    sym_vaddr < ph->p_vaddr + ph->p_filesz) {
+				file_offset = sym_vaddr - ph->p_vaddr + ph->p_offset;
+				break;
+			}
+		}
+	} else {
+		/* 32-bit ELF */
+		Elf32_Shdr *shdrs = (Elf32_Shdr *)((char *)map + ehdr32->e_shoff);
+		Elf32_Shdr *shstrtab_hdr = &shdrs[ehdr32->e_shstrndx];
+		const char *shstrtab = (char *)map + shstrtab_hdr->sh_offset;
+		int si;
+		uint32_t sym_vaddr32 = 0;
+
+		for (int pass = 0; pass < 2 && !found; pass++) {
+			const char *target = pass ? ".dynsym" : ".symtab";
+
+			for (si = 0; si < ehdr32->e_shnum && !found; si++) {
+				Elf32_Shdr *sh = &shdrs[si];
+				const char *name = shstrtab + sh->sh_name;
+
+				if (strcmp(name, target) != 0)
+					continue;
+
+				Elf32_Shdr *strtab_sh = &shdrs[sh->sh_link];
+				const char *strtab = (char *)map + strtab_sh->sh_offset;
+				Elf32_Sym *syms = (Elf32_Sym *)((char *)map + sh->sh_offset);
+				uint32_t nsyms = sh->sh_size / sizeof(Elf32_Sym);
+				uint32_t j;
+
+				for (j = 0; j < nsyms; j++) {
+					if (strcmp(strtab + syms[j].st_name, symname) == 0) {
+						sym_vaddr32 = syms[j].st_value;
+						found = 1;
+						break;
+					}
+				}
+			}
+		}
+
+		if (!found) {
+			fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary);
+			munmap(map, (size_t)st.st_size);
+			return 1;
+		}
+
+		Elf32_Phdr *phdrs = (Elf32_Phdr *)((char *)map + ehdr32->e_phoff);
+		int pi;
+
+		for (pi = 0; pi < ehdr32->e_phnum; pi++) {
+			Elf32_Phdr *ph = &phdrs[pi];
+
+			if (ph->p_type != PT_LOAD)
+				continue;
+			if (sym_vaddr32 >= ph->p_vaddr &&
+			    sym_vaddr32 < ph->p_vaddr + ph->p_filesz) {
+				file_offset = sym_vaddr32 - ph->p_vaddr + ph->p_offset;
+				break;
+			}
+		}
+		sym_vaddr = sym_vaddr32;
+	}
+
+	munmap(map, (size_t)st.st_size);
+
+	if (!file_offset && sym_vaddr) {
+		fprintf(stderr, "could not map vaddr 0x%lx to file offset\n",
+			(unsigned long)sym_vaddr);
+		return 1;
+	}
+
+	printf("0x%lx\n", (unsigned long)file_offset);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	int rc;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s <subcommand> [args...]\n", argv[0]);
+		return 1;
+	}
+
+	/* sym_offset does not need /dev/rv */
+	if (strcmp(argv[1], "sym_offset") == 0) {
+		if (argc < 4) {
+			fprintf(stderr, "Usage: %s sym_offset <binary> <symbol>\n",
+				argv[0]);
+			return 1;
+		}
+		return sym_offset(argv[2], argv[3]);
+	}
+
+	if (open_rv() < 0)
+		return 2; /* skip */
+
+	if (strcmp(argv[1], "not_enabled") == 0)
+		rc = test_not_enabled();
+	else if (strcmp(argv[1], "within_budget") == 0)
+		rc = test_within_budget();
+	else if (strcmp(argv[1], "over_budget_cpu") == 0)
+		rc = test_over_budget_cpu();
+	else if (strcmp(argv[1], "over_budget_sleep") == 0)
+		rc = test_over_budget_sleep();
+	else if (strcmp(argv[1], "double_start") == 0)
+		rc = test_double_start();
+	else if (strcmp(argv[1], "stop_no_start") == 0)
+		rc = test_stop_no_start();
+	else if (strcmp(argv[1], "multi_thread") == 0)
+		rc = test_multi_thread();
+	else if (strcmp(argv[1], "self_watch") == 0)
+		rc = test_self_watch();
+	else if (strcmp(argv[1], "invalid_flags") == 0)
+		rc = test_invalid_flags();
+	else if (strcmp(argv[1], "notify_fd_bad") == 0)
+		rc = test_notify_fd_bad();
+	else if (strcmp(argv[1], "mmap_basic") == 0)
+		rc = test_mmap_basic();
+	else if (strcmp(argv[1], "mmap_errors") == 0)
+		rc = test_mmap_errors();
+	else if (strcmp(argv[1], "mmap_consume") == 0)
+		rc = test_mmap_consume();
+	else {
+		fprintf(stderr, "Unknown test: %s\n", argv[1]);
+		rc = 1;
+	}
+
+	close(rv_fd);
+	return rc;
+}
diff --git a/tools/testing/selftests/rv/tlob_uprobe_target.c b/tools/testing/selftests/rv/tlob_uprobe_target.c
new file mode 100644
index 000000000..6c895cb40
--- /dev/null
+++ b/tools/testing/selftests/rv/tlob_uprobe_target.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob_uprobe_target.c - uprobe target binary for tlob selftests.
+ *
+ * Provides two well-known probe points:
+ *   tlob_busy_work()      - start probe: arms the tlob budget timer
+ *   tlob_busy_work_done() - stop  probe: cancels the timer on completion
+ *
+ * The tlob selftest writes a five-field uprobe binding:
+ *   pid:threshold_us:binary:offset_start:offset_stop
+ * where offset_start is the file offset of tlob_busy_work and offset_stop
+ * is the file offset of tlob_busy_work_done (resolved via tlob_helper
+ * sym_offset).
+ *
+ * Both probe points are plain entry uprobes (no uretprobe).  The busy loop
+ * keeps the task on-CPU so that either the stop probe fires cleanly (within
+ * budget) or the hrtimer fires first and emits tlob_budget_exceeded (over
+ * budget).
+ *
+ * Usage: tlob_uprobe_target <duration_ms>
+ *
+ * Loops calling tlob_busy_work() in 200 ms iterations until <duration_ms>
+ * has elapsed (0 = run for ~24 hours).  Short iterations ensure the uprobe
+ * entry fires on every call even if the uprobe is installed after the
+ * program has started.
+ */
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+
+static inline int timespec_before(const struct timespec *a,
+				   const struct timespec *b)
+{
+	return a->tv_sec < b->tv_sec ||
+	       (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
+}
+
+static void timespec_add_ms(struct timespec *ts, unsigned long ms)
+{
+	ts->tv_sec  += ms / 1000;
+	ts->tv_nsec += (long)(ms % 1000) * 1000000L;
+	if (ts->tv_nsec >= 1000000000L) {
+		ts->tv_sec++;
+		ts->tv_nsec -= 1000000000L;
+	}
+}
+
+/*
+ * tlob_busy_work_done - stop-probe target.
+ *
+ * Called by tlob_busy_work() after the busy loop.  The uprobe on this
+ * function's entry fires tlob_stop_task(), cancelling the budget timer.
+ * noinline ensures the compiler never merges this function with its caller,
+ * guaranteeing the entry uprobe always fires.
+ */
+noinline void tlob_busy_work_done(void)
+{
+	/* empty: the uprobe fires on entry */
+}
+
+/*
+ * tlob_busy_work - start-probe target.
+ *
+ * The uprobe on this function's entry fires tlob_start_task(), arming the
+ * budget timer.  noinline prevents the compiler and linker (including LTO)
+ * from inlining this function into its callers, ensuring the entry uprobe
+ * fires on every call.
+ */
+noinline void tlob_busy_work(unsigned long duration_ns)
+{
+	struct timespec start, now;
+	unsigned long elapsed;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	do {
+		clock_gettime(CLOCK_MONOTONIC, &now);
+		elapsed = (unsigned long)(now.tv_sec - start.tv_sec)
+			  * 1000000000UL
+			+ (unsigned long)(now.tv_nsec - start.tv_nsec);
+	} while (elapsed < duration_ns);
+
+	tlob_busy_work_done();
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long duration_ms = 0;
+	struct timespec deadline, now;
+
+	if (argc >= 2)
+		duration_ms = strtoul(argv[1], NULL, 10);
+
+	clock_gettime(CLOCK_MONOTONIC, &deadline);
+	timespec_add_ms(&deadline, duration_ms ? duration_ms : 86400000UL);
+
+	do {
+		tlob_busy_work(200 * 1000000UL); /* 200 ms per iteration */
+		clock_gettime(CLOCK_MONOTONIC, &now);
+	} while (timespec_before(&now, &deadline));
+
+	return 0;
+}
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH mm-unstable v15 06/13] mm/khugepaged: skip collapsing mTHP to smaller orders
From: Nico Pache @ 2026-04-13  1:38 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <94290ad5-f63d-4fa7-a898-dcfe6cd9998b@kernel.org>

On Thu, Mar 12, 2026 at 3:00 PM David Hildenbrand (Arm)
<david@kernel.org> wrote:
>
> On 2/26/26 04:24, Nico Pache wrote:
> > khugepaged may try to collapse a mTHP to a smaller mTHP, resulting in
> > some pages being unmapped. Skip these cases until we have a way to check
> > if it's ok to collapse to a smaller mTHP size (like in the case of a
> > partially mapped folio).
> >
> > This patch is inspired by Dev Jain's work on khugepaged mTHP support [1].
> >
> > [1] https://lore.kernel.org/lkml/20241216165105.56185-11-dev.jain@arm.com/
> >
> > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Co-developed-by: Dev Jain <dev.jain@arm.com>
> > Signed-off-by: Dev Jain <dev.jain@arm.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >  mm/khugepaged.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index fb3ba8fe5a6c..c739f26dd61e 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -638,6 +638,14 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                               goto out;
> >                       }
> >               }
> > +             /*
> > +              * TODO: In some cases of partially-mapped folios, we'd actually
> > +              * want to collapse.
> > +              */
> > +             if (!is_pmd_order(order) && folio_order(folio) >= order) {
> > +                     result = SCAN_PTE_MAPPED_HUGEPAGE;
> > +                     goto out;
> > +             }
> >
> >               if (folio_test_large(folio)) {
> >                       struct folio *f;
>
> Why aren't we doing the same in hpage_collapse_scan_pmd() ?

We can't do this in the scan phase because we are not yet aware of the
order we want to collapse to.

The scan phase builds the bitmap (if mthp sizes are enabled). I tried
to think if there were any similar checks we could perform; the only
one that came to mind is whether the current folio size exceeds the
highest enabled folio size. Is that worth checking?

Cheers,
-- Nico

>
> --
> Cheers,
>
> David
>


^ permalink raw reply

* Re: [PATCH mm-unstable v15 07/13] mm/khugepaged: add per-order mTHP collapse failure statistics
From: Nico Pache @ 2026-04-13  2:48 UTC (permalink / raw)
  To: Lorenzo Stoakes (Oracle)
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <c832d503-8b8c-487a-b61a-df74a3057308@lucifer.local>

On Tue, Mar 17, 2026 at 11:05 AM Lorenzo Stoakes (Oracle)
<ljs@kernel.org> wrote:
>
> On Wed, Feb 25, 2026 at 08:25:04PM -0700, Nico Pache wrote:
> > Add three new mTHP statistics to track collapse failures for different
> > orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
> >
> > - collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
> >       PTEs
> >
> > - collapse_exceed_none_pte: Counts when mTHP collapse fails due to
> >       exceeding the none PTE threshold for the given order
> >
> > - collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
> >       PTEs
> >
> > These statistics complement the existing THP_SCAN_EXCEED_* events by
> > providing per-order granularity for mTHP collapse attempts. The stats are
> > exposed via sysfs under
> > `/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
> > supported hugepage size.
> >
> > As we currently don't support collapsing mTHPs that contain a swap or
> > shared entry, those statistics keep track of how often we are
> > encountering failed mTHP collapses due to these restrictions.
> >
> > Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >  Documentation/admin-guide/mm/transhuge.rst | 24 ++++++++++++++++++++++
> >  include/linux/huge_mm.h                    |  3 +++
> >  mm/huge_memory.c                           |  7 +++++++
> >  mm/khugepaged.c                            | 16 ++++++++++++---
> >  4 files changed, 47 insertions(+), 3 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> > index c51932e6275d..eebb1f6bbc6c 100644
> > --- a/Documentation/admin-guide/mm/transhuge.rst
> > +++ b/Documentation/admin-guide/mm/transhuge.rst
> > @@ -714,6 +714,30 @@ nr_anon_partially_mapped
> >         an anonymous THP as "partially mapped" and count it here, even though it
> >         is not actually partially mapped anymore.
> >
> > +collapse_exceed_none_pte
> > +       The number of collapse attempts that failed due to exceeding the
> > +       max_ptes_none threshold. For mTHP collapse, Currently only max_ptes_none
> > +       values of 0 and (HPAGE_PMD_NR - 1) are supported. Any other value will
> > +       emit a warning and no mTHP collapse will be attempted. khugepaged will
>
> It's weird to document this here but not elsewhere in the document? I mean I
> made this comment on the documentation patch also.

I can add some more documentation, but TBH I don't really know where or
what else to put. I checked a few of these other per-mTHP stats, and
none are referenced elsewhere. If anything, these 3 additions are the
best documented ones.

>
> Not sure if I missed you adding it to another bit of the docs? :)
>
> > +       try to collapse to the largest enabled (m)THP size; if it fails, it will
> > +       try the next lower enabled mTHP size. This counter records the number of
> > +       times a collapse attempt was skipped for exceeding the max_ptes_none
> > +       threshold, and khugepaged will move on to the next available mTHP size.
> > +
> > +collapse_exceed_swap_pte
> > +       The number of anonymous mTHP PTE ranges which were unable to collapse due
> > +       to containing at least one swap PTE. Currently khugepaged does not
> > +       support collapsing mTHP regions that contain a swap PTE. This counter can
> > +       be used to monitor the number of khugepaged mTHP collapses that failed
> > +       due to the presence of a swap PTE.
> > +
> > +collapse_exceed_shared_pte
> > +       The number of anonymous mTHP PTE ranges which were unable to collapse due
> > +       to containing at least one shared PTE. Currently khugepaged does not
> > +       support collapsing mTHP PTE ranges that contain a shared PTE. This
> > +       counter can be used to monitor the number of khugepaged mTHP collapses
> > +       that failed due to the presence of a shared PTE.
>
> All of these talk about 'ranges' that could be of any size. Are these useful
> metrics? Counting a bunch of failures and not knowing if they are 256 KB
> failures or 16 KB failures or whatever is maybe not so useful information?

These are per-mTHP size statistics. If you look at the surrounding
examples and docs this all makes more sense.

>
> Also, from the code, aren't you treating PMD events the same as mTHP ones from
> the point of view of these counters? Maybe worth documenting that?

IIUC, yes, but that is true of all these per-size counters:

```
In /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats, There are
also individual counters for each huge page size, which can be utilized to
monitor the system's effectiveness in providing huge pages for usage. Each
counter has its own corresponding file.
```

>
> > +
> >  As the system ages, allocating huge pages may be expensive as the
> >  system uses memory compaction to copy data around memory to free a
> >  huge page for use. There are some counters in ``/proc/vmstat`` to help
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 9941fc6d7bd8..e8777bb2347d 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -144,6 +144,9 @@ enum mthp_stat_item {
> >       MTHP_STAT_SPLIT_DEFERRED,
> >       MTHP_STAT_NR_ANON,
> >       MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> > +     MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> > +     MTHP_STAT_COLLAPSE_EXCEED_NONE,
> > +     MTHP_STAT_COLLAPSE_EXCEED_SHARED,
> >       __MTHP_STAT_COUNT
> >  };
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 228f35e962b9..1049a207a257 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -642,6 +642,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> >  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> >  DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
> >  DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
>
> Is there a reason there's such a difference between the names and the actual
> enum names?

Good point, I didn't think about that. I can update those as long as
they don't conflict with something else (I forget why I named them
like this).

>
> > +
> >
> >  static struct attribute *anon_stats_attrs[] = {
> >       &anon_fault_alloc_attr.attr,
> > @@ -658,6 +662,9 @@ static struct attribute *anon_stats_attrs[] = {
> >       &split_deferred_attr.attr,
> >       &nr_anon_attr.attr,
> >       &nr_anon_partially_mapped_attr.attr,
> > +     &collapse_exceed_swap_pte_attr.attr,
> > +     &collapse_exceed_none_pte_attr.attr,
> > +     &collapse_exceed_shared_pte_attr.attr,
> >       NULL,
> >  };
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index c739f26dd61e..a6cf90e09e4a 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -595,7 +595,9 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                               continue;
> >                       } else {
> >                               result = SCAN_EXCEED_NONE_PTE;
> > -                             count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> > +                             if (is_pmd_order(order))
> > +                                     count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> > +                             count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
>
> It's a bit gross to have separate stats for both thp and mthp but maybe
> unavoidable from a legacy stand point.

I agree but that's how it currently is. Perhaps we can add this to the
TODO list for THP work.

>
> Why are we dropping the _PTE suffix?

I follow the convention that the other mTHP stats follow for example
(MTHP_STAT_SPLIT_DEFERRED)

>
> >                               goto out;
> >                       }
> >               }
> > @@ -631,10 +633,17 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                        * shared may cause a future higher order collapse on a
> >                        * rescan of the same range.
> >                        */
> > -                     if (!is_pmd_order(order) || (cc->is_khugepaged &&
> > -                         shared > khugepaged_max_ptes_shared)) {
>
> OK losing track here :) as the series sadly doesn't currently apply so can't
> browser file as is.
>
> In the code I'm looking at, there's also a ++shared here that I guess another
> patch removed?
>
> Is this in the folio_maybe_mapped_shared() branch?

Yes, the counting is now done at the top of that branch.

>
> > +                     if (!is_pmd_order(order)) {
> > +                             result = SCAN_EXCEED_SHARED_PTE;
> > +                             count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > +                             goto out;
> > +                     }
> > +
> > +                     if (cc->is_khugepaged &&
> > +                         shared > khugepaged_max_ptes_shared) {
> >                               result = SCAN_EXCEED_SHARED_PTE;
> >                               count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> > +                             count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> >                               goto out;
>
> Anyway I'm a bit lost on this logic until a respin but this looks like a LOT of
> code duplication. I see David alluded to a refactoring so maybe what he suggests
> will help (not had a chance to check what it is specifically :P)

Yep :) It should look cleaner in the next one, although it's quite a bit
of refactoring. I'll be praying that I got it right on the first go,
and that I put all the other pieces in the desired spot.

>
> >                       }
> >               }
> > @@ -1081,6 +1090,7 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
> >                * range.
> >                */
> >               if (!is_pmd_order(order)) {
> > +                     count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
>
> Hmm I thought we were incrementing mthp stats for pmd sized also?

Yes, we are supposed to. I've already refactored and it looks fine
there... perhaps I missed this one in this version!

Cheers,

-- Nico

>
> >                       pte_unmap(pte);
> >                       mmap_read_unlock(mm);
> >                       result = SCAN_EXCEED_SWAP_PTE;
> > --
> > 2.53.0
> >
>
> Cheers, Lorenzo
>


^ permalink raw reply

* [PATCH 2/3] mm: use get_i_mmap_root to access the file's i_mmap
From: Huang Shijie @ 2026-04-13  6:20 UTC (permalink / raw)
  To: akpm, viro, brauner
  Cc: linux-mm, linux-kernel, linux-arm-kernel, linux-fsdevel,
	muchun.song, osalvador, linux-trace-kernel, linux-perf-users,
	linux-parisc, nvdimm, zhongyuan, fangbaoshun, yingzhiwei,
	Huang Shijie
In-Reply-To: <20260413062042.804-1-huangsj@hygon.cn>

Do not access the file's i_mmap directly; use get_i_mmap_root()
to access it instead. This patch prepares for later patches.

Signed-off-by: Huang Shijie <huangsj@hygon.cn>
---
 arch/arm/mm/fault-armv.c   |  3 ++-
 arch/arm/mm/flush.c        |  3 ++-
 arch/nios2/mm/cacheflush.c |  3 ++-
 arch/parisc/kernel/cache.c |  4 +++-
 fs/dax.c                   |  3 ++-
 fs/hugetlbfs/inode.c       |  6 +++---
 include/linux/fs.h         |  5 +++++
 include/linux/mm.h         |  1 +
 kernel/events/uprobes.c    |  3 ++-
 mm/hugetlb.c               |  7 +++++--
 mm/khugepaged.c            |  6 ++++--
 mm/memory-failure.c        |  8 +++++---
 mm/memory.c                |  4 ++--
 mm/mmap.c                  |  2 +-
 mm/nommu.c                 |  9 +++++----
 mm/pagewalk.c              |  2 +-
 mm/rmap.c                  |  2 +-
 mm/vma.c                   | 14 ++++++++------
 18 files changed, 54 insertions(+), 31 deletions(-)

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 91e488767783..1b5fe151e805 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -126,6 +126,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
 {
 	const unsigned long pmd_start_addr = ALIGN_DOWN(addr, PMD_SIZE);
 	const unsigned long pmd_end_addr = pmd_start_addr + PMD_SIZE;
+	struct rb_root_cached *root = get_i_mmap_root(mapping);
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *mpnt;
 	unsigned long offset;
@@ -140,7 +141,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
 	 * cache coherency.
 	 */
 	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(mpnt, root, pgoff, pgoff) {
 		/*
 		 * If we are using split PTE locks, then we need to take the pte
 		 * lock. Otherwise we are using shared mm->page_table_lock which
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 19470d938b23..b9641901f206 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -238,6 +238,7 @@ void __flush_dcache_folio(struct address_space *mapping, struct folio *folio)
 static void __flush_dcache_aliases(struct address_space *mapping, struct folio *folio)
 {
 	struct mm_struct *mm = current->active_mm;
+	struct rb_root_cached *root = get_i_mmap_root(mapping);
 	struct vm_area_struct *vma;
 	pgoff_t pgoff, pgoff_end;
 
@@ -251,7 +252,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct folio *
 	pgoff_end = pgoff + folio_nr_pages(folio) - 1;
 
 	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff_end) {
+	vma_interval_tree_foreach(vma, root, pgoff, pgoff_end) {
 		unsigned long start, offset, pfn;
 		unsigned int nr;
 
diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c
index 8321182eb927..ab6e064fabe2 100644
--- a/arch/nios2/mm/cacheflush.c
+++ b/arch/nios2/mm/cacheflush.c
@@ -78,11 +78,12 @@ static void flush_aliases(struct address_space *mapping, struct folio *folio)
 	unsigned long flags;
 	pgoff_t pgoff;
 	unsigned long nr = folio_nr_pages(folio);
+	struct rb_root_cached *root = get_i_mmap_root(mapping);
 
 	pgoff = folio->index;
 
 	flush_dcache_mmap_lock_irqsave(mapping, flags);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + nr - 1) {
+	vma_interval_tree_foreach(vma, root, pgoff, pgoff + nr - 1) {
 		unsigned long start;
 
 		if (vma->vm_mm != mm)
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 4c5240d3a3c7..920adacaaac2 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -473,6 +473,7 @@ static inline unsigned long get_upa(struct mm_struct *mm, unsigned long addr)
 void flush_dcache_folio(struct folio *folio)
 {
 	struct address_space *mapping = folio_flush_mapping(folio);
+	struct rb_root_cached *root;
 	struct vm_area_struct *vma;
 	unsigned long addr, old_addr = 0;
 	void *kaddr;
@@ -494,6 +495,7 @@ void flush_dcache_folio(struct folio *folio)
 		return;
 
 	pgoff = folio->index;
+	root = get_i_mmap_root(mapping);
 
 	/*
 	 * We have carefully arranged in arch_get_unmapped_area() that
@@ -503,7 +505,7 @@ void flush_dcache_folio(struct folio *folio)
 	 * on machines that support equivalent aliasing
 	 */
 	flush_dcache_mmap_lock_irqsave(mapping, flags);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + nr - 1) {
+	vma_interval_tree_foreach(vma, root, pgoff, pgoff + nr - 1) {
 		unsigned long offset = pgoff - vma->vm_pgoff;
 		unsigned long pfn = folio_pfn(folio);
 
diff --git a/fs/dax.c b/fs/dax.c
index 289e6254aa30..00fe5481accc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1101,6 +1101,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 		struct address_space *mapping, void *entry)
 {
 	unsigned long pfn, index, count, end;
+	struct rb_root_cached *root = get_i_mmap_root(mapping);
 	long ret = 0;
 	struct vm_area_struct *vma;
 
@@ -1164,7 +1165,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 
 	/* Walk all mappings of a given index of a file and writeprotect them */
 	i_mmap_lock_read(mapping);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
+	vma_interval_tree_foreach(vma, root, index, end) {
 		pfn_mkclean_range(pfn, count, index, vma);
 		cond_resched();
 	}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ab5ac092d8a6..9cf82fba6eb6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -400,7 +400,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
 					struct address_space *mapping,
 					struct folio *folio, pgoff_t index)
 {
-	struct rb_root_cached *root = &mapping->i_mmap;
+	struct rb_root_cached *root = get_i_mmap_root(mapping);
 	struct hugetlb_vma_lock *vma_lock;
 	unsigned long pfn = folio_pfn(folio);
 	struct vm_area_struct *vma;
@@ -647,7 +647,7 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	i_size_write(inode, offset);
 	i_mmap_lock_write(mapping);
 	if (mapping_mapped(mapping))
-		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
+		hugetlb_vmdelete_list(get_i_mmap_root(mapping), pgoff, 0,
 				      ZAP_FLAG_DROP_MARKER);
 	i_mmap_unlock_write(mapping);
 	remove_inode_hugepages(inode, offset, LLONG_MAX);
@@ -708,7 +708,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	/* Unmap users of full pages in the hole. */
 	if (hole_end > hole_start) {
 		if (mapping_mapped(mapping))
-			hugetlb_vmdelete_list(&mapping->i_mmap,
+			hugetlb_vmdelete_list(get_i_mmap_root(mapping),
 					      hole_start >> PAGE_SHIFT,
 					      hole_end >> PAGE_SHIFT, 0);
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8b3dd145b25e..a6a99e044265 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -555,6 +555,11 @@ static inline int mapping_mapped(const struct address_space *mapping)
 	return	!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
 }
 
+static inline struct rb_root_cached *get_i_mmap_root(struct address_space *mapping)
+{
+	return &mapping->i_mmap;
+}
+
 /*
  * Might pages of this file have been modified in userspace?
  * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..15cb1da43eb2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3797,6 +3797,7 @@ struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
 struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
 				unsigned long start, unsigned long last);
 
+/* Please use get_i_mmap_root() to get the @root */
 #define vma_interval_tree_foreach(vma, root, start, last)		\
 	for (vma = vma_interval_tree_iter_first(root, start, last);	\
 	     vma; vma = vma_interval_tree_iter_next(vma, start, last))
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 923b24b321cc..420035b0cc7b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1201,6 +1201,7 @@ static inline struct map_info *free_map_info(struct map_info *info)
 static struct map_info *
 build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 {
+	struct rb_root_cached *root = get_i_mmap_root(mapping);
 	unsigned long pgoff = offset >> PAGE_SHIFT;
 	struct vm_area_struct *vma;
 	struct map_info *curr = NULL;
@@ -1210,7 +1211,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 
  again:
 	i_mmap_lock_read(mapping);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, root, pgoff, pgoff) {
 		if (!valid_vma(vma, is_register))
 			continue;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 327eaa4074d3..8d27f1b8abb5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5396,6 +5396,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	struct vm_area_struct *iter_vma;
 	struct address_space *mapping;
+	struct rb_root_cached *root;
 	pgoff_t pgoff;
 
 	/*
@@ -5406,6 +5407,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
 			vma->vm_pgoff;
 	mapping = vma->vm_file->f_mapping;
+	root = get_i_mmap_root(mapping);
 
 	/*
 	 * Take the mapping lock for the duration of the table walk. As
@@ -5413,7 +5415,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * __unmap_hugepage_range() is called as the lock is already held
 	 */
 	i_mmap_lock_write(mapping);
-	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(iter_vma, root, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
 			continue;
@@ -6879,6 +6881,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct rb_root_cached *root = get_i_mmap_root(mapping);
 	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
 			vma->vm_pgoff;
 	struct vm_area_struct *svma;
@@ -6887,7 +6890,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *pte;
 
 	i_mmap_lock_read(mapping);
-	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+	vma_interval_tree_foreach(svma, root, idx, idx) {
 		if (svma == vma)
 			continue;
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1dd3cfca610d..3a4e81474fe3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1740,10 +1740,11 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
 
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
+	struct rb_root_cached *root = get_i_mmap_root(mapping);
 	struct vm_area_struct *vma;
 
 	i_mmap_lock_read(mapping);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, root, pgoff, pgoff) {
 		struct mmu_notifier_range range;
 		struct mm_struct *mm;
 		unsigned long addr;
@@ -2163,7 +2164,8 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
 		 * not be able to observe any missing pages due to the
 		 * previously inserted retry entries.
 		 */
-		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+		vma_interval_tree_foreach(vma, get_i_mmap_root(mapping),
+					start, end) {
 			if (userfaultfd_missing(vma)) {
 				result = SCAN_EXCEED_NONE_PTE;
 				goto immap_locked;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ee42d4361309..85196d9bb26c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -598,7 +598,7 @@ static void collect_procs_file(const struct folio *folio,
 
 		if (!t)
 			continue;
-		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
+		vma_interval_tree_foreach(vma, get_i_mmap_root(mapping), pgoff,
 				      pgoff) {
 			/*
 			 * Send early kill signal to tasks where a vma covers
@@ -650,7 +650,8 @@ static void collect_procs_fsdax(const struct page *page,
 			t = task_early_kill(tsk, true);
 		if (!t)
 			continue;
-		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+		vma_interval_tree_foreach(vma, get_i_mmap_root(mapping), pgoff,
+					pgoff) {
 			if (vma->vm_mm == t->mm)
 				add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
 		}
@@ -2251,7 +2252,8 @@ static void collect_procs_pfn(struct pfn_address_space *pfn_space,
 		t = task_early_kill(tsk, true);
 		if (!t)
 			continue;
-		vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX) {
+		vma_interval_tree_foreach(vma, get_i_mmap_root(mapping),
+					0, ULONG_MAX) {
 			pgoff_t pgoff;
 
 			if (vma->vm_mm == t->mm &&
diff --git a/mm/memory.c b/mm/memory.c
index 366054435773..1ddd6b55fe7e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4298,7 +4298,7 @@ void unmap_mapping_folio(struct folio *folio)
 
 	i_mmap_lock_read(mapping);
 	if (unlikely(mapping_mapped(mapping)))
-		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+		unmap_mapping_range_tree(get_i_mmap_root(mapping), first_index,
 					 last_index, &details);
 	i_mmap_unlock_read(mapping);
 }
@@ -4328,7 +4328,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
 
 	i_mmap_lock_read(mapping);
 	if (unlikely(mapping_mapped(mapping)))
-		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+		unmap_mapping_range_tree(get_i_mmap_root(mapping), first_index,
 					 last_index, &details);
 	i_mmap_unlock_read(mapping);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 843160946aa5..5b0671dff019 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1832,7 +1832,7 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
 			vma_interval_tree_insert_after(tmp, mpnt,
-					&mapping->i_mmap);
+					get_i_mmap_root(mapping));
 			flush_dcache_mmap_unlock(mapping);
 			i_mmap_unlock_write(mapping);
 		}
diff --git a/mm/nommu.c b/mm/nommu.c
index c3a23b082adb..2e64b6c4c539 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -569,7 +569,7 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
 
 		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
-		vma_interval_tree_insert(vma, &mapping->i_mmap);
+		vma_interval_tree_insert(vma, get_i_mmap_root(mapping));
 		flush_dcache_mmap_unlock(mapping);
 		i_mmap_unlock_write(mapping);
 	}
@@ -585,7 +585,7 @@ static void cleanup_vma_from_mm(struct vm_area_struct *vma)
 
 		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
-		vma_interval_tree_remove(vma, &mapping->i_mmap);
+		vma_interval_tree_remove(vma, get_i_mmap_root(mapping));
 		flush_dcache_mmap_unlock(mapping);
 		i_mmap_unlock_write(mapping);
 	}
@@ -1804,6 +1804,7 @@ EXPORT_SYMBOL_GPL(copy_remote_vm_str);
 int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 				size_t newsize)
 {
+	struct rb_root_cached *root = get_i_mmap_root(&inode->mapping);
 	struct vm_area_struct *vma;
 	struct vm_region *region;
 	pgoff_t low, high;
@@ -1816,7 +1817,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 	i_mmap_lock_read(inode->i_mapping);
 
 	/* search for VMAs that fall within the dead zone */
-	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
+	vma_interval_tree_foreach(vma, root, low, high) {
 		/* found one - only interested if it's shared out of the page
 		 * cache */
 		if (vma->vm_flags & VM_SHARED) {
@@ -1832,7 +1833,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 	 * we don't check for any regions that start beyond the EOF as there
 	 * shouldn't be any
 	 */
-	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
+	vma_interval_tree_foreach(vma, root, 0, ULONG_MAX) {
 		if (!(vma->vm_flags & VM_SHARED))
 			continue;
 
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index a94c401ab2cf..c6c1c45df575 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -792,7 +792,7 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 		return -EINVAL;
 
 	lockdep_assert_held(&mapping->i_mmap_rwsem);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
+	vma_interval_tree_foreach(vma, get_i_mmap_root(mapping), first_index,
 				  first_index + nr - 1) {
 		/* Clip to the vma */
 		vba = vma->vm_pgoff;
diff --git a/mm/rmap.c b/mm/rmap.c
index 391337282e3f..52288d39d8a2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -3036,7 +3036,7 @@ static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
 		i_mmap_lock_read(mapping);
 	}
 lookup:
-	vma_interval_tree_foreach(vma, &mapping->i_mmap,
+	vma_interval_tree_foreach(vma, get_i_mmap_root(mapping),
 			pgoff_start, pgoff_end) {
 		unsigned long address = vma_address(vma, pgoff_start, nr_pages);
 
diff --git a/mm/vma.c b/mm/vma.c
index be64f781a3aa..1768e4355a13 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -231,7 +231,7 @@ static void __vma_link_file(struct vm_area_struct *vma,
 		mapping_allow_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_insert(vma, &mapping->i_mmap);
+	vma_interval_tree_insert(vma, get_i_mmap_root(mapping));
 	flush_dcache_mmap_unlock(mapping);
 }
 
@@ -245,7 +245,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		mapping_unmap_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_remove(vma, &mapping->i_mmap);
+	vma_interval_tree_remove(vma, get_i_mmap_root(mapping));
 	flush_dcache_mmap_unlock(mapping);
 }
 
@@ -316,10 +316,11 @@ static void vma_prepare(struct vma_prepare *vp)
 
 	if (vp->file) {
 		flush_dcache_mmap_lock(vp->mapping);
-		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
+		vma_interval_tree_remove(vp->vma,
+					get_i_mmap_root(vp->mapping));
 		if (vp->adj_next)
 			vma_interval_tree_remove(vp->adj_next,
-						 &vp->mapping->i_mmap);
+					get_i_mmap_root(vp->mapping));
 	}
 
 }
@@ -338,8 +339,9 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
 	if (vp->file) {
 		if (vp->adj_next)
 			vma_interval_tree_insert(vp->adj_next,
-						 &vp->mapping->i_mmap);
-		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
+					get_i_mmap_root(vp->mapping));
+		vma_interval_tree_insert(vp->vma,
+					get_i_mmap_root(vp->mapping));
 		flush_dcache_mmap_unlock(vp->mapping);
 	}
 
-- 
2.43.0



^ permalink raw reply related

* [PATCH 1/3] mm: use mapping_mapped to simplify the code
From: Huang Shijie @ 2026-04-13  6:20 UTC (permalink / raw)
  To: akpm, viro, brauner
  Cc: linux-mm, linux-kernel, linux-arm-kernel, linux-fsdevel,
	muchun.song, osalvador, linux-trace-kernel, linux-perf-users,
	linux-parisc, nvdimm, zhongyuan, fangbaoshun, yingzhiwei,
	Huang Shijie
In-Reply-To: <20260413062042.804-1-huangsj@hygon.cn>

Use mapping_mapped() to simplify the code and make
it tidy and clean.

Signed-off-by: Huang Shijie <huangsj@hygon.cn>
---
 fs/hugetlbfs/inode.c | 4 ++--
 mm/memory.c          | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3f70c47981de..ab5ac092d8a6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -646,7 +646,7 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 
 	i_size_write(inode, offset);
 	i_mmap_lock_write(mapping);
-	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
+	if (mapping_mapped(mapping))
 		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
 				      ZAP_FLAG_DROP_MARKER);
 	i_mmap_unlock_write(mapping);
@@ -707,7 +707,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
 	/* Unmap users of full pages in the hole. */
 	if (hole_end > hole_start) {
-		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
+		if (mapping_mapped(mapping))
 			hugetlb_vmdelete_list(&mapping->i_mmap,
 					      hole_start >> PAGE_SHIFT,
 					      hole_end >> PAGE_SHIFT, 0);
diff --git a/mm/memory.c b/mm/memory.c
index 2f815a34d924..366054435773 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4297,7 +4297,7 @@ void unmap_mapping_folio(struct folio *folio)
 	details.zap_flags = ZAP_FLAG_DROP_MARKER;
 
 	i_mmap_lock_read(mapping);
-	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+	if (unlikely(mapping_mapped(mapping)))
 		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
 					 last_index, &details);
 	i_mmap_unlock_read(mapping);
@@ -4327,7 +4327,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
 		last_index = ULONG_MAX;
 
 	i_mmap_lock_read(mapping);
-	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+	if (unlikely(mapping_mapped(mapping)))
 		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
 					 last_index, &details);
 	i_mmap_unlock_read(mapping);
-- 
2.43.0



^ permalink raw reply related

* [PATCH 0/3] mm: split the file's i_mmap tree for NUMA
From: Huang Shijie @ 2026-04-13  6:20 UTC (permalink / raw)
  To: akpm, viro, brauner
  Cc: linux-mm, linux-kernel, linux-arm-kernel, linux-fsdevel,
	muchun.song, osalvador, linux-trace-kernel, linux-perf-users,
	linux-parisc, nvdimm, zhongyuan, fangbaoshun, yingzhiwei,
	Huang Shijie

  A NUMA system may have many NUMA nodes and many CPUs.
For example, a Hygon server has 12 NUMA nodes and 384 CPUs.
The UnixBench suite includes an "execl" test, which exercises
the execve system call.

  When we run "./Run -c 384 execl" on our server,
the result is not good enough. The i_mmap locks of "libc.so" and "ld.so"
are heavily contended. For example, the i_mmap tree for "libc.so" can hold
over 6000 VMAs, and these VMAs can belong to different NUMA nodes.
The insert/remove operations do not run quickly enough.

Patches 1 and 2 hide direct accesses to i_mmap behind helpers.
Patch 3 splits i_mmap into sibling trees, and with this patch set
we get better performance:
    a 77% performance improvement (average of 10 runs)


Huang Shijie (3):
  mm: use mapping_mapped to simplify the code
  mm: use get_i_mmap_root to access the file's i_mmap
  mm: split the file's i_mmap tree for NUMA

 arch/arm/mm/fault-armv.c   |  3 ++-
 arch/arm/mm/flush.c        |  3 ++-
 arch/nios2/mm/cacheflush.c |  3 ++-
 arch/parisc/kernel/cache.c |  4 ++-
 fs/dax.c                   |  3 ++-
 fs/hugetlbfs/inode.c       | 10 +++----
 fs/inode.c                 | 55 +++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h         | 40 +++++++++++++++++++++++++++
 include/linux/mm.h         | 33 +++++++++++++++++++++++
 include/linux/mm_types.h   |  1 +
 kernel/events/uprobes.c    |  3 ++-
 mm/hugetlb.c               |  7 +++--
 mm/khugepaged.c            |  6 +++--
 mm/memory-failure.c        |  8 +++---
 mm/memory.c                |  8 +++---
 mm/mmap.c                  |  3 ++-
 mm/nommu.c                 | 11 +++++---
 mm/pagewalk.c              |  2 +-
 mm/rmap.c                  |  2 +-
 mm/vma.c                   | 36 +++++++++++++++++++------
 mm/vma_init.c              |  1 +
 21 files changed, 204 insertions(+), 38 deletions(-)

-- 
2.43.0



^ permalink raw reply

* [PATCH 3/3] mm: split the file's i_mmap tree for NUMA
From: Huang Shijie @ 2026-04-13  6:20 UTC (permalink / raw)
  To: akpm, viro, brauner
  Cc: linux-mm, linux-kernel, linux-arm-kernel, linux-fsdevel,
	muchun.song, osalvador, linux-trace-kernel, linux-perf-users,
	linux-parisc, nvdimm, zhongyuan, fangbaoshun, yingzhiwei,
	Huang Shijie
In-Reply-To: <20260413062042.804-1-huangsj@hygon.cn>

  A NUMA system may have many NUMA nodes and many CPUs.
For example, a Hygon server has 12 NUMA nodes and 384 CPUs.
The UnixBench suite includes an "execl" test, which exercises
the execve system call.

  When we run "./Run -c 384 execl" on our server,
the result is not good enough. The i_mmap locks of "libc.so" and "ld.so"
are heavily contended. For example, the i_mmap tree for "libc.so" can hold
over 6000 VMAs, and these VMAs can belong to different NUMA nodes.
The insert/remove operations do not run quickly enough.

 To reduce contention on the i_mmap lock, this patch does the
following:
   1.) Split the single i_mmap tree into several sibling trees:
       each NUMA node has its own tree.
   2.) Introduce a new field "tree_idx" in vm_area_struct to save the
       sibling tree index for this VMA.
   3.) Introduce a new field "vma_count" in address_space.
       The new mapping_mapped() uses it.
   4.) Rewrite vma_interval_tree_foreach() for NUMA.

 After this patch, the VMA insert/remove operations are faster,
and we get a 77% performance improvement (average of 10 runs)
with the above test.

Signed-off-by: Huang Shijie <huangsj@hygon.cn>
---
 fs/inode.c               | 55 +++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h       | 35 +++++++++++++++++++++++++
 include/linux/mm.h       | 32 +++++++++++++++++++++++
 include/linux/mm_types.h |  1 +
 mm/mmap.c                |  3 ++-
 mm/nommu.c               |  6 +++--
 mm/vma.c                 | 34 +++++++++++++++++++------
 mm/vma_init.c            |  1 +
 8 files changed, 155 insertions(+), 12 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index cc12b68e021b..3067cb2558da 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -215,6 +215,56 @@ static int no_open(struct inode *inode, struct file *file)
 	return -ENXIO;
 }
 
+#ifdef CONFIG_NUMA
+static void free_mapping_i_mmap(struct address_space *mapping)
+{
+	int i;
+
+	if (!mapping->i_mmap)
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		kfree(mapping->i_mmap[i]);
+
+	kfree(mapping->i_mmap);
+	mapping->i_mmap = NULL;
+}
+
+static int init_mapping_i_mmap(struct address_space *mapping)
+{
+	struct rb_root_cached *root;
+	int i;
+
+	/* The extra one is used as terminator in vma_interval_tree_foreach() */
+	mapping->i_mmap = kzalloc(sizeof(root) * (nr_node_ids + 1), GFP_KERNEL);
+	if (!mapping->i_mmap)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		root = kzalloc_node(sizeof(*root), GFP_KERNEL, i);
+		if (!root)
+			goto no_mem;
+
+		*root = RB_ROOT_CACHED;
+		mapping->i_mmap[i] = root;
+	}
+	return 0;
+
+no_mem:
+	free_mapping_i_mmap(mapping);
+	return -ENOMEM;
+}
+#else
+static int init_mapping_i_mmap(struct address_space *mapping)
+{
+	mapping->i_mmap = RB_ROOT_CACHED;
+	return 0;
+}
+static void free_mapping_i_mmap(struct address_space *mapping)
+{
+}
+#endif
+
 /**
  * inode_init_always_gfp - perform inode structure initialisation
  * @sb: superblock inode belongs to
@@ -307,6 +357,9 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
 	if (unlikely(security_inode_alloc(inode, gfp)))
 		return -ENOMEM;
 
+	if (init_mapping_i_mmap(mapping))
+		return -ENOMEM;
+
 	this_cpu_inc(nr_inodes);
 
 	return 0;
@@ -383,6 +436,7 @@ void __destroy_inode(struct inode *inode)
 	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
 		posix_acl_release(inode->i_default_acl);
 #endif
+	free_mapping_i_mmap(&inode->i_data);
 	this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
@@ -486,7 +540,6 @@ static void __address_space_init_once(struct address_space *mapping)
 	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->i_private_list);
 	spin_lock_init(&mapping->i_private_lock);
-	mapping->i_mmap = RB_ROOT_CACHED;
 }
 
 void address_space_init_once(struct address_space *mapping)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a6a99e044265..34064c1cbd10 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -477,7 +477,12 @@ struct address_space {
 	/* number of thp, only for non-shmem files */
 	atomic_t		nr_thps;
 #endif
+#ifdef CONFIG_NUMA
+	struct rb_root_cached	**i_mmap;
+	unsigned long		vma_count;
+#else
 	struct rb_root_cached	i_mmap;
+#endif
 	unsigned long		nrpages;
 	pgoff_t			writeback_index;
 	const struct address_space_operations *a_ops;
@@ -547,6 +552,27 @@ static inline void i_mmap_assert_write_locked(struct address_space *mapping)
 	lockdep_assert_held_write(&mapping->i_mmap_rwsem);
 }
 
+#ifdef CONFIG_NUMA
+static inline int mapping_mapped(const struct address_space *mapping)
+{
+	return	READ_ONCE(mapping->vma_count);
+}
+
+static inline void inc_mapping_vma(struct address_space *mapping)
+{
+	mapping->vma_count++;
+}
+
+static inline void dec_mapping_vma(struct address_space *mapping)
+{
+	mapping->vma_count--;
+}
+
+static inline struct rb_root_cached *get_i_mmap_root(struct address_space *mapping)
+{
+	return (struct rb_root_cached *)mapping->i_mmap;
+}
+#else
 /*
  * Might pages of this file be mapped into userspace?
  */
@@ -555,10 +581,19 @@ static inline int mapping_mapped(const struct address_space *mapping)
 	return	!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
 }
 
+static inline void inc_mapping_vma(struct address_space *mapping)
+{
+}
+
+static inline void dec_mapping_vma(struct address_space *mapping)
+{
+}
+
 static inline struct rb_root_cached *get_i_mmap_root(struct address_space *mapping)
 {
 	return &mapping->i_mmap;
 }
+#endif
 
 /*
  * Might pages of this file have been modified in userspace?
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 15cb1da43eb2..c7f26eb34322 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -913,6 +913,9 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_ops = &vma_dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma_lock_init(vma, false);
+#ifdef CONFIG_NUMA
+	vma->tree_idx = numa_node_id();
+#endif
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
@@ -3783,6 +3786,8 @@ extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
 
 /* interval_tree.c */
+struct rb_root_cached *get_rb_root(struct vm_area_struct *vma,
+					struct address_space *mapping);
 void vma_interval_tree_insert(struct vm_area_struct *node,
 			      struct rb_root_cached *root);
 void vma_interval_tree_insert_after(struct vm_area_struct *node,
@@ -3798,9 +3803,36 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
 				unsigned long start, unsigned long last);
 
 /* Please use get_i_mmap_root() to get the @root */
+#ifdef CONFIG_NUMA
+/* Find the first valid VMA in the sibling trees */
+static inline struct vm_area_struct *first_vma(struct rb_root_cached ***__r,
+				unsigned long start, unsigned long last)
+{
+	struct vm_area_struct *vma = NULL;
+	struct rb_root_cached **tree = *__r;
+
+	while (*tree) {
+		vma = vma_interval_tree_iter_first(*tree++, start, last);
+		if (vma)
+			break;
+	}
+
+	/* Save for the next loop */
+	*__r = tree;
+	return vma;
+}
+
+/* @_tmp is referenced to avoid unused variable warning. */
+#define vma_interval_tree_foreach(vma, root, start, last)		\
+	for (struct rb_root_cached **_r = (void *)(root),		\
+		**_tmp = (vma = first_vma(&_r, start, last)) ? _r : NULL;\
+	     ((_tmp && vma) || (vma = first_vma(&_r, start, last)));	\
+		vma = vma_interval_tree_iter_next(vma, start, last))
+#else
 #define vma_interval_tree_foreach(vma, root, start, last)		\
 	for (vma = vma_interval_tree_iter_first(root, start, last);	\
 	     vma; vma = vma_interval_tree_iter_next(vma, start, last))
+#endif
 
 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
 				   struct rb_root_cached *root);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cc8ae722886..4982e20ce27c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -984,6 +984,7 @@ struct vm_area_struct {
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
+	int tree_idx;			/* The sibling tree index for the VMA */
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
diff --git a/mm/mmap.c b/mm/mmap.c
index 5b0671dff019..81a2f4932ca8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1832,8 +1832,9 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
 			vma_interval_tree_insert_after(tmp, mpnt,
-					get_i_mmap_root(mapping));
+					get_rb_root(mpnt, mapping));
 			flush_dcache_mmap_unlock(mapping);
+			inc_mapping_vma(mapping);
 			i_mmap_unlock_write(mapping);
 		}
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 2e64b6c4c539..6553cfcb6683 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -569,8 +569,9 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
 
 		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
-		vma_interval_tree_insert(vma, get_i_mmap_root(mapping));
+		vma_interval_tree_insert(vma, get_rb_root(vma, mapping));
 		flush_dcache_mmap_unlock(mapping);
+		inc_mapping_vma(mapping);
 		i_mmap_unlock_write(mapping);
 	}
 }
@@ -585,8 +586,9 @@ static void cleanup_vma_from_mm(struct vm_area_struct *vma)
 
 		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
-		vma_interval_tree_remove(vma, get_i_mmap_root(mapping));
+		vma_interval_tree_remove(vma, get_rb_root(vma, mapping));
 		flush_dcache_mmap_unlock(mapping);
+		dec_mapping_vma(mapping);
 		i_mmap_unlock_write(mapping);
 	}
 }
diff --git a/mm/vma.c b/mm/vma.c
index 1768e4355a13..5aa3915d183b 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -224,6 +224,16 @@ static bool can_vma_merge_after(struct vma_merge_struct *vmg)
 	return false;
 }
 
+struct rb_root_cached *get_rb_root(struct vm_area_struct *vma,
+					struct address_space *mapping)
+{
+#ifdef CONFIG_NUMA
+	return mapping->i_mmap[vma->tree_idx];
+#else
+	return &mapping->i_mmap;
+#endif
+}
+
 static void __vma_link_file(struct vm_area_struct *vma,
 			    struct address_space *mapping)
 {
@@ -231,8 +241,9 @@ static void __vma_link_file(struct vm_area_struct *vma,
 		mapping_allow_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_insert(vma, get_i_mmap_root(mapping));
+	vma_interval_tree_insert(vma, get_rb_root(vma, mapping));
 	flush_dcache_mmap_unlock(mapping);
+	inc_mapping_vma(mapping);
 }
 
 /*
@@ -245,8 +256,9 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		mapping_unmap_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_remove(vma, get_i_mmap_root(mapping));
+	vma_interval_tree_remove(vma, get_rb_root(vma, mapping));
 	flush_dcache_mmap_unlock(mapping);
+	dec_mapping_vma(mapping);
 }
 
 /*
@@ -317,10 +329,13 @@ static void vma_prepare(struct vma_prepare *vp)
 	if (vp->file) {
 		flush_dcache_mmap_lock(vp->mapping);
 		vma_interval_tree_remove(vp->vma,
-					get_i_mmap_root(vp->mapping));
-		if (vp->adj_next)
+					get_rb_root(vp->vma, vp->mapping));
+		dec_mapping_vma(vp->mapping);
+		if (vp->adj_next) {
 			vma_interval_tree_remove(vp->adj_next,
-					get_i_mmap_root(vp->mapping));
+					get_rb_root(vp->adj_next, vp->mapping));
+			dec_mapping_vma(vp->mapping);
+		}
 	}
 
 }
@@ -337,11 +352,14 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
 			 struct mm_struct *mm)
 {
 	if (vp->file) {
-		if (vp->adj_next)
+		if (vp->adj_next) {
 			vma_interval_tree_insert(vp->adj_next,
-					get_i_mmap_root(vp->mapping));
+					get_rb_root(vp->adj_next, vp->mapping));
+			inc_mapping_vma(vp->mapping);
+		}
 		vma_interval_tree_insert(vp->vma,
-					get_i_mmap_root(vp->mapping));
+					get_rb_root(vp->vma, vp->mapping));
+		inc_mapping_vma(vp->mapping);
 		flush_dcache_mmap_unlock(vp->mapping);
 	}
 
diff --git a/mm/vma_init.c b/mm/vma_init.c
index 3c0b65950510..5735868b1ad4 100644
--- a/mm/vma_init.c
+++ b/mm/vma_init.c
@@ -71,6 +71,7 @@ static void vm_area_init_from(const struct vm_area_struct *src,
 #endif
 #ifdef CONFIG_NUMA
 	dest->vm_policy = src->vm_policy;
+	dest->tree_idx = src->tree_idx;
 #endif
 #ifdef __HAVE_PFNMAP_TRACKING
 	dest->pfnmap_track_ctx = NULL;
-- 
2.43.0



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox