Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v3 9/9] selftests/verification: add tlob selftests
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1780847473.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Add selftest coverage for the tlob uprobe monitoring interface under
tools/testing/selftests/verification/.

test.d/tlob/ contains both the helper sources (tlob_target, tlob_sym)
and the seven test scripts so the test suite is self-contained.
tlob_target provides busy-spin, sleep, and preempt workloads; tlob_sym
resolves ELF symbol offsets for uprobe registration.

Seven test scripts exercise uprobe binding management, budget violation
detection, and per-state time accounting (running_ns, waiting_ns,
sleeping_ns).

Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 .../testing/selftests/verification/.gitignore |   2 +
 tools/testing/selftests/verification/Makefile |  19 +-
 .../verification/test.d/tlob/Makefile         |  20 ++
 .../verification/test.d/tlob/test.d/functions |   1 +
 .../verification/test.d/tlob/tlob_sym.c       | 189 ++++++++++++++++++
 .../verification/test.d/tlob/tlob_target.c    | 138 +++++++++++++
 .../verification/test.d/tlob/uprobe_bind.tc   |  37 ++++
 .../test.d/tlob/uprobe_detail_running.tc      |  51 +++++
 .../test.d/tlob/uprobe_detail_sleeping.tc     |  50 +++++
 .../test.d/tlob/uprobe_detail_waiting.tc      |  66 ++++++
 .../verification/test.d/tlob/uprobe_multi.tc  |  64 ++++++
 .../test.d/tlob/uprobe_no_event.tc            |  19 ++
 .../test.d/tlob/uprobe_violation.tc           |  67 +++++++
 13 files changed, 722 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/Makefile
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/test.d/functions
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/tlob_sym.c
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/tlob_target.c
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_running.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc

diff --git a/tools/testing/selftests/verification/.gitignore b/tools/testing/selftests/verification/.gitignore
index 2659417cb2c7..cbbd03ee16c7 100644
--- a/tools/testing/selftests/verification/.gitignore
+++ b/tools/testing/selftests/verification/.gitignore
@@ -1,2 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 logs
+test.d/tlob/tlob_sym
+test.d/tlob/tlob_target
diff --git a/tools/testing/selftests/verification/Makefile b/tools/testing/selftests/verification/Makefile
index aa8790c22a71..0b32bdfdb8db 100644
--- a/tools/testing/selftests/verification/Makefile
+++ b/tools/testing/selftests/verification/Makefile
@@ -1,8 +1,25 @@
 # SPDX-License-Identifier: GPL-2.0
-all:
 
 TEST_PROGS := verificationtest-ktap
 TEST_FILES := test.d settings
 EXTRA_CLEAN := $(OUTPUT)/logs/*
 
+# Subdirectories that provide binaries used by the test runner.
+# Each entry must contain a Makefile with a default and a "clean" target; it is
+# invoked with OUTDIR=$(OUTPUT) (test.d/tlob ignores OUTDIR and builds in place).
+BUILD_SUBDIRS := test.d/tlob
+
 include ../lib.mk
+
+all: $(patsubst %,_build_%,$(BUILD_SUBDIRS))
+
+clean: $(patsubst %,_clean_%,$(BUILD_SUBDIRS))
+
+.PHONY: $(patsubst %,_build_%,$(BUILD_SUBDIRS)) \
+        $(patsubst %,_clean_%,$(BUILD_SUBDIRS))
+
+$(patsubst %,_build_%,$(BUILD_SUBDIRS)): _build_%:
+	$(MAKE) -C $* OUTDIR="$(OUTPUT)" TOOLS_INCLUDES="$(TOOLS_INCLUDES)"
+
+$(patsubst %,_clean_%,$(BUILD_SUBDIRS)): _clean_%:
+	$(MAKE) -C $* OUTDIR="$(OUTPUT)" clean
diff --git a/tools/testing/selftests/verification/test.d/tlob/Makefile b/tools/testing/selftests/verification/test.d/tlob/Makefile
new file mode 100644
index 000000000000..29b3519b255f
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/Makefile
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+# Builds the tlob selftest helpers (tlob_sym, tlob_target) in this directory,
+# next to the .tc scripts that run them, so the test directory is self-contained.
+# Invoked by ../../Makefile via BUILD_SUBDIRS, which passes OUTDIR= and
+# TOOLS_INCLUDES=; only TOOLS_INCLUDES is referenced here, OUTDIR is not.
+
+CFLAGS += $(TOOLS_INCLUDES)
+
+.PHONY: all
+all: tlob_sym tlob_target
+
+tlob_sym: tlob_sym.c
+	$(CC) $(CFLAGS) -o $@ $<
+
+tlob_target: tlob_target.c
+	$(CC) $(CFLAGS) -o $@ $<
+
+.PHONY: clean
+clean:
+	$(RM) tlob_sym tlob_target
diff --git a/tools/testing/selftests/verification/test.d/tlob/test.d/functions b/tools/testing/selftests/verification/test.d/tlob/test.d/functions
new file mode 100644
index 000000000000..0b4c5e4344d2
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/test.d/functions
@@ -0,0 +1 @@
+. "${TOP_DIR%/*}/functions"
diff --git a/tools/testing/selftests/verification/test.d/tlob/tlob_sym.c b/tools/testing/selftests/verification/test.d/tlob/tlob_sym.c
new file mode 100644
index 000000000000..1b7ba1c6d95b
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/tlob_sym.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob_sym.c - ELF symbol-to-file-offset utility for tlob selftests
+ *
+ * Usage: tlob_sym sym_offset <binary> <symbol>
+ *
+ *   Prints the ELF file offset of <symbol> in <binary> to stdout.
+ *
+ * Exit: 0 = found, 1 = error / not found.
+ */
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/*
+ * Print the file offset of @symname in @binary (.symtab first, then .dynsym),
+ * translated through its PT_LOAD segment. Returns 0 on success, 1 on error.
+ */
+static int sym_offset(const char *binary, const char *symname)
+{
+	int fd, is64, found = 0, mapped = 0;
+	struct stat st;
+	void *map;
+	Elf64_Ehdr *ehdr;
+	Elf32_Ehdr *ehdr32;
+	uint64_t sym_vaddr = 0, file_offset = 0;
+
+	fd = open(binary, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "open %s: %s\n", binary, strerror(errno));
+		return 1;
+	}
+	if (fstat(fd, &st) < 0) {
+		fprintf(stderr, "fstat %s: %s\n", binary, strerror(errno));
+		close(fd);
+		return 1;
+	}
+	map = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	close(fd);
+	if (map == MAP_FAILED) {
+		fprintf(stderr, "mmap: %s\n", strerror(errno));
+		return 1;
+	}
+
+	ehdr = (Elf64_Ehdr *)map;
+	ehdr32 = (Elf32_Ehdr *)map;
+	if (st.st_size < EI_NIDENT ||
+	    ehdr->e_ident[EI_MAG0] != ELFMAG0 ||
+	    ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
+	    ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
+	    ehdr->e_ident[EI_MAG3] != ELFMAG3) {
+		fprintf(stderr, "%s: not an ELF file\n", binary);
+		munmap(map, (size_t)st.st_size);
+		return 1;
+	}
+	is64 = (ehdr->e_ident[EI_CLASS] == ELFCLASS64);
+
+	if (is64) {
+		Elf64_Shdr *shdrs = (Elf64_Shdr *)((char *)map + ehdr->e_shoff);
+		Elf64_Shdr *shstrtab_hdr = &shdrs[ehdr->e_shstrndx];
+		const char *shstrtab = (char *)map + shstrtab_hdr->sh_offset;
+
+		for (int pass = 0; pass < 2 && !found; pass++) {
+			const char *target = pass ? ".dynsym" : ".symtab";
+
+			for (int si = 0; si < ehdr->e_shnum && !found; si++) {
+				Elf64_Shdr *sh = &shdrs[si];
+				const char *name = shstrtab + sh->sh_name;
+
+				if (strcmp(name, target) != 0)
+					continue;
+
+				Elf64_Shdr *strtab_sh = &shdrs[sh->sh_link];
+				const char *strtab = (char *)map + strtab_sh->sh_offset;
+				Elf64_Sym *syms = (Elf64_Sym *)((char *)map + sh->sh_offset);
+				uint64_t nsyms = sh->sh_size / sizeof(Elf64_Sym);
+
+				for (uint64_t j = 0; j < nsyms && !found; j++) {
+					/* Undefined entries are imports: no address here. */
+					if (syms[j].st_shndx == SHN_UNDEF ||
+					    strcmp(strtab + syms[j].st_name, symname) != 0)
+						continue;
+					sym_vaddr = syms[j].st_value;
+					found = 1;
+				}
+			}
+		}
+
+		if (!found) {
+			fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary);
+			munmap(map, (size_t)st.st_size);
+			return 1;
+		}
+
+		Elf64_Phdr *phdrs = (Elf64_Phdr *)((char *)map + ehdr->e_phoff);
+
+		for (int pi = 0; pi < ehdr->e_phnum; pi++) {
+			Elf64_Phdr *ph = &phdrs[pi];
+
+			if (ph->p_type != PT_LOAD)
+				continue;
+			if (sym_vaddr >= ph->p_vaddr &&
+			    sym_vaddr < ph->p_vaddr + ph->p_filesz) {
+				file_offset = sym_vaddr - ph->p_vaddr + ph->p_offset;
+				mapped = 1;
+				break;
+			}
+		}
+	} else {
+		Elf32_Shdr *shdrs = (Elf32_Shdr *)((char *)map + ehdr32->e_shoff);
+		Elf32_Shdr *shstrtab_hdr = &shdrs[ehdr32->e_shstrndx];
+		const char *shstrtab = (char *)map + shstrtab_hdr->sh_offset;
+		uint32_t sym_vaddr32 = 0;
+
+		for (int pass = 0; pass < 2 && !found; pass++) {
+			const char *target = pass ? ".dynsym" : ".symtab";
+
+			for (int si = 0; si < ehdr32->e_shnum && !found; si++) {
+				Elf32_Shdr *sh = &shdrs[si];
+				const char *name = shstrtab + sh->sh_name;
+
+				if (strcmp(name, target) != 0)
+					continue;
+
+				Elf32_Shdr *strtab_sh = &shdrs[sh->sh_link];
+				const char *strtab = (char *)map + strtab_sh->sh_offset;
+				Elf32_Sym *syms = (Elf32_Sym *)((char *)map + sh->sh_offset);
+				uint32_t nsyms = sh->sh_size / sizeof(Elf32_Sym);
+
+				for (uint32_t j = 0; j < nsyms && !found; j++) {
+					/* Undefined entries are imports: no address here. */
+					if (syms[j].st_shndx == SHN_UNDEF ||
+					    strcmp(strtab + syms[j].st_name, symname) != 0)
+						continue;
+					sym_vaddr32 = syms[j].st_value;
+					found = 1;
+				}
+			}
+		}
+
+		if (!found) {
+			fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary);
+			munmap(map, (size_t)st.st_size);
+			return 1;
+		}
+
+		Elf32_Phdr *phdrs = (Elf32_Phdr *)((char *)map + ehdr32->e_phoff);
+
+		for (int pi = 0; pi < ehdr32->e_phnum; pi++) {
+			Elf32_Phdr *ph = &phdrs[pi];
+
+			if (ph->p_type != PT_LOAD)
+				continue;
+			if (sym_vaddr32 >= ph->p_vaddr &&
+			    sym_vaddr32 < ph->p_vaddr + ph->p_filesz) {
+				file_offset = sym_vaddr32 - ph->p_vaddr + ph->p_offset;
+				mapped = 1;
+				break;
+			}
+		}
+		sym_vaddr = sym_vaddr32;
+	}
+
+	munmap(map, (size_t)st.st_size);
+
+	if (!mapped) {
+		fprintf(stderr, "could not map vaddr 0x%llx to file offset\n",
+			(unsigned long long)sym_vaddr);
+		return 1;
+	}
+
+	printf("0x%llx\n", (unsigned long long)file_offset);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	/* The only supported invocation is: sym_offset <binary> <symbol>. */
+	if (argc == 4 && strcmp(argv[1], "sym_offset") == 0)
+		return sym_offset(argv[2], argv[3]);
+	fprintf(stderr, "Usage: %s sym_offset <binary> <symbol>\n", argv[0]);
+	return 1;
+}
diff --git a/tools/testing/selftests/verification/test.d/tlob/tlob_target.c b/tools/testing/selftests/verification/test.d/tlob/tlob_target.c
new file mode 100644
index 000000000000..0fdbc575d71d
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/tlob_target.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob_target.c - uprobe target binary for tlob selftests.
+ *
+ * Provides three start/stop probe pairs, each designed to exercise a
+ * different dominant component of the detail_env_tlob ns breakdown:
+ *
+ *   tlob_busy_work    / tlob_busy_work_done    - busy-spin: running_ns dominates
+ *   tlob_sleep_work   / tlob_sleep_work_done   - nanosleep: sleeping_ns dominates
+ *   tlob_preempt_work / tlob_preempt_work_done - busy-spin: waiting_ns dominates
+ *                                                (needs an RT competitor on the same CPU)
+ *
+ * Usage: tlob_target <duration_ms> [mode]
+ *
+ * mode is one of: busy (default), sleep, preempt.
+ * Loops in 200 ms iterations until <duration_ms> has elapsed
+ * (0 = run for ~24 hours).
+ */
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+
+static inline int timespec_before(const struct timespec *a,
+				   const struct timespec *b)
+{
+	/* Seconds decide; nanoseconds only break a tie. */
+	return a->tv_sec != b->tv_sec ? a->tv_sec < b->tv_sec : a->tv_nsec < b->tv_nsec;
+}
+
+static void timespec_add_ms(struct timespec *ts, unsigned long ms)
+{
+	long nsec = ts->tv_nsec + (long)(ms % 1000) * 1000000L;
+	int carry = nsec >= 1000000000L;
+
+	/* Fold whole seconds, plus at most one carried second, into tv_sec. */
+	ts->tv_sec += ms / 1000 + carry;
+	ts->tv_nsec = carry ? nsec - 1000000000L : nsec;
+}
+
+/* stop probe: the uprobe fires on entry; noinline keeps that entry point */
+noinline void tlob_busy_work_done(void)
+{
+	__asm__ __volatile__("");	/* side effect so optimised callers keep the call */
+}
+
+/* Start probe: spin on the monotonic clock so running_ns dominates. */
+noinline void tlob_busy_work(unsigned long duration_ns)
+{
+	struct timespec t0, t1;
+	unsigned long spent_ns;
+
+	clock_gettime(CLOCK_MONOTONIC, &t0);
+	for (;;) {
+		clock_gettime(CLOCK_MONOTONIC, &t1);
+		spent_ns = (unsigned long)(t1.tv_sec - t0.tv_sec) * 1000000000UL;
+		spent_ns += (unsigned long)(t1.tv_nsec - t0.tv_nsec);
+		if (spent_ns >= duration_ns)
+			break;
+	}
+	tlob_busy_work_done();	/* stop probe */
+}
+
+/* stop probe: the uprobe fires on entry; noinline keeps that entry point */
+noinline void tlob_sleep_work_done(void)
+{
+	__asm__ __volatile__("");	/* side effect so optimised callers keep the call */
+}
+
+/* Start probe: block in nanosleep() so sleeping_ns dominates. */
+noinline void tlob_sleep_work(unsigned long duration_ms)
+{
+	struct timespec req;
+
+	req.tv_sec = duration_ms / 1000;
+	req.tv_nsec = (long)(duration_ms % 1000) * 1000000L;
+	nanosleep(&req, NULL);
+	tlob_sleep_work_done();	/* stop probe */
+}
+
+/* stop probe: the uprobe fires on entry; noinline keeps that entry point */
+noinline void tlob_preempt_work_done(void)
+{
+	__asm__ __volatile__("");	/* side effect so optimised callers keep the call */
+}
+
+/*
+ * Start probe: spin on the monotonic clock so an RT competitor on the same
+ * CPU drives waiting_ns (prev_state==0 -> preempt event, task stays runnable off-CPU).
+ */
+noinline void tlob_preempt_work(unsigned long duration_ms)
+{
+	const unsigned long budget_ns = duration_ms * 1000000UL;
+	struct timespec t0, t1;
+	unsigned long spent_ns;
+
+	clock_gettime(CLOCK_MONOTONIC, &t0);
+	for (;;) {
+		clock_gettime(CLOCK_MONOTONIC, &t1);
+		spent_ns = (unsigned long)(t1.tv_sec - t0.tv_sec) * 1000000000UL;
+		spent_ns += (unsigned long)(t1.tv_nsec - t0.tv_nsec);
+		if (spent_ns >= budget_ns)
+			break;
+	}
+	tlob_preempt_work_done();	/* stop probe */
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long duration_ms = argc >= 2 ? strtoul(argv[1], NULL, 10) : 0;
+	const char *mode = argc >= 3 ? argv[2] : "busy";
+	void (*work)(unsigned long) = tlob_busy_work;
+	unsigned long arg = 200 * 1000000UL;	/* busy mode takes nanoseconds */
+	struct timespec deadline, now;
+
+	/* Any mode other than "sleep" or "preempt" runs the busy workload. */
+	if (strcmp(mode, "sleep") == 0) {
+		work = tlob_sleep_work;
+		arg = 200;	/* milliseconds */
+	} else if (strcmp(mode, "preempt") == 0) {
+		work = tlob_preempt_work;
+		arg = 200;	/* milliseconds */
+	}
+
+	/* A duration of 0 means "run for about 24 hours". */
+	clock_gettime(CLOCK_MONOTONIC, &deadline);
+	timespec_add_ms(&deadline, duration_ms ? duration_ms : 86400000UL);
+	do {
+		work(arg);
+		clock_gettime(CLOCK_MONOTONIC, &now);
+	} while (timespec_before(&now, &deadline));
+	return 0;
+}
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
new file mode 100644
index 000000000000..1ac3db6ca7bb
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
@@ -0,0 +1,37 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob monitor uprobe binding (visible in monitor file, removable, duplicate rejected)
+# requires: tlob:monitor
+
+RV_BINDIR="${RV_BINDIR:-$(realpath "$(dirname "${1:-$0}")")}"
+UPROBE_TARGET="${RV_BINDIR}/tlob_target"
+TLOB_SYM="${RV_BINDIR}/tlob_sym"
+[ -x "$UPROBE_TARGET" ] || exit_unsupported
+[ -x "$TLOB_SYM" ]      || exit_unsupported
+TLOB_MONITOR=monitors/tlob/monitor
+
+busy_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+stop_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+[ -n "$busy_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 30000 &
+busy_pid=$!
+sleep 0.05
+
+echo 1 > monitors/tlob/enable
+echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=5000000000" > "$TLOB_MONITOR"
+
+# Binding must appear in monitor file with canonical hex-offset format.
+grep -qE "^p ${UPROBE_TARGET}:0x[0-9a-f]+ 0x[0-9a-f]+ threshold=[0-9]+$" "$TLOB_MONITOR"
+grep -q "threshold=5000000000" "$TLOB_MONITOR"
+
+# Duplicate offset_start must be rejected ("! cmd" is exempt from errexit, so fail explicitly).
+echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=9999000" > "$TLOB_MONITOR" 2>/dev/null && exit_fail
+
+# Remove the binding; it must no longer appear (again an explicit failure, not "! grep").
+echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR"
+grep -q "^p .*:0x${busy_offset#0x} " "$TLOB_MONITOR" && exit_fail
+
+kill "$busy_pid" 2>/dev/null || true; wait "$busy_pid" 2>/dev/null || true
+echo 0 > monitors/tlob/enable
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_running.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_running.tc
new file mode 100644
index 000000000000..2814caa34902
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_running.tc
@@ -0,0 +1,51 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob monitor detail running (running_ns dominates when task busy-spins between probes)
+# requires: tlob:monitor
+
+RV_BINDIR="${RV_BINDIR:-$(realpath "$(dirname "${1:-$0}")")}"
+UPROBE_TARGET="${RV_BINDIR}/tlob_target"
+TLOB_SYM="${RV_BINDIR}/tlob_sym"
+[ -x "$UPROBE_TARGET" ] || exit_unsupported
+[ -x "$TLOB_SYM" ]      || exit_unsupported
+TLOB_MONITOR=monitors/tlob/monitor
+
+start_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+stop_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+[ -n "$start_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 5000 &
+busy_pid=$!
+sleep 0.05  # short pause after launching the target
+
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# 10 µs budget (threshold=10000, in ns); task busy-spins 200 ms per iteration -> running_ns dominates.
+echo "p ${UPROBE_TARGET}:${start_offset} ${stop_offset} threshold=10000" > "$TLOB_MONITOR"
+
+found=0; i=0  # poll the trace up to 30 times, 0.1 s apart
+while [ "$i" -lt 30 ]; do
+	sleep 0.1
+	grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+	i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${start_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$busy_pid" 2>/dev/null || true; wait "$busy_pid" 2>/dev/null || true
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]  # non-zero status when no detail_env_tlob line was seen
+
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+waiting=$(echo "$line" | sed 's/.*waiting_ns=\([0-9]*\).*/\1/')  # parsed but not used by the check below
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+# Busy-spin keeps the task on-CPU: running_ns must exceed sleeping_ns.
+[ "$running" -gt "$sleeping" ]
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
new file mode 100644
index 000000000000..0a6470b4cadb
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
@@ -0,0 +1,50 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob monitor detail sleeping (sleeping_ns dominates when task blocks between probes)
+# requires: tlob:monitor
+
+RV_BINDIR="${RV_BINDIR:-$(realpath "$(dirname "${1:-$0}")")}"
+UPROBE_TARGET="${RV_BINDIR}/tlob_target"
+TLOB_SYM="${RV_BINDIR}/tlob_sym"
+[ -x "$UPROBE_TARGET" ] || exit_unsupported
+[ -x "$TLOB_SYM" ]      || exit_unsupported
+TLOB_MONITOR=monitors/tlob/monitor
+
+start_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_sleep_work 2>/dev/null)
+stop_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_sleep_work_done 2>/dev/null)
+[ -n "$start_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 5000 sleep &
+busy_pid=$!
+sleep 0.05  # short pause after launching the target
+
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# 50 ms budget (threshold=50000000, in ns); task sleeps 200 ms per iteration -> sleeping_ns dominates.
+echo "p ${UPROBE_TARGET}:${start_offset} ${stop_offset} threshold=50000000" > "$TLOB_MONITOR"
+
+found=0; i=0  # poll the trace up to 30 times, 0.1 s apart
+while [ "$i" -lt 30 ]; do
+	sleep 0.1
+	grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+	i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${start_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$busy_pid" 2>/dev/null || true; wait "$busy_pid" 2>/dev/null || true
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]  # non-zero status when no detail_env_tlob line was seen
+
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+waiting=$(echo "$line" | sed 's/.*waiting_ns=\([0-9]*\).*/\1/')
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+[ "$sleeping" -gt "$((running + waiting))" ]  # sleeping_ns must exceed the other two combined
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
new file mode 100644
index 000000000000..ef22fce700fc
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
@@ -0,0 +1,66 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob monitor detail waiting (waiting_ns dominates when task is preempted between probes)
+# requires: tlob:monitor
+
+RV_BINDIR="${RV_BINDIR:-$(realpath "$(dirname "${1:-$0}")")}"
+UPROBE_TARGET="${RV_BINDIR}/tlob_target"
+TLOB_SYM="${RV_BINDIR}/tlob_sym"
+[ -x "$UPROBE_TARGET" ] || exit_unsupported
+[ -x "$TLOB_SYM" ]      || exit_unsupported
+TLOB_MONITOR=monitors/tlob/monitor
+
+command -v chrt    > /dev/null || exit_unsupported
+command -v taskset > /dev/null || exit_unsupported
+
+start_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_preempt_work 2>/dev/null)
+stop_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_preempt_work_done 2>/dev/null)
+[ -n "$start_offset" ] || exit_unsupported
+[ -n "$stop_offset" ]  || exit_unsupported
+
+cpu=0  # both the target and the RT hog are pinned to this CPU
+
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# Register probe before the target starts so the start uprobe fires on the
+# first entry to tlob_preempt_work. Budget: 500 ms (threshold is in ns).
+echo "p ${UPROBE_TARGET}:${start_offset} ${stop_offset} threshold=500000000" > "$TLOB_MONITOR"
+
+# Target starts; start probe fires on tlob_preempt_work entry.
+taskset -c "$cpu" "$UPROBE_TARGET" 5000 preempt &
+busy_pid=$!
+sleep 0.05
+
+# RT hog on the same CPU preempts the target; target stays in waiting state
+# (runnable, off-CPU) until the budget expires -> waiting_ns dominates.
+chrt -f 99 taskset -c "$cpu" sh -c 'while true; do :; done' 2>/dev/null &
+hog_pid=$!
+
+found=0; i=0  # poll the trace up to 30 times, 0.1 s apart
+while [ "$i" -lt 30 ]; do
+	sleep 0.1
+	grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+	i=$((i+1))
+done
+
+# Kill the RT hog first so tlob_target can release any in-flight SRCU read
+# section from uprobe_notify_resume; otherwise probe removal blocks in
+# synchronize_srcu with the hog monopolising the CPU at FIFO-99.
+kill "$hog_pid" 2>/dev/null || true; wait "$hog_pid" 2>/dev/null || true
+kill "$busy_pid" 2>/dev/null || true; wait "$busy_pid" 2>/dev/null || true
+echo "-${UPROBE_TARGET}:${start_offset}" > "$TLOB_MONITOR" 2>/dev/null
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]  # non-zero status when no detail_env_tlob line was seen
+
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+waiting=$(echo "$line" | sed 's/.*waiting_ns=\([0-9]*\).*/\1/')
+[ "$waiting" -gt "$((running + sleeping))" ]  # waiting_ns must exceed the other two combined
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
new file mode 100644
index 000000000000..f1bd6c955f1d
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
@@ -0,0 +1,64 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob monitor multiple uprobe bindings (different offsets fire independently)
+# requires: tlob:monitor
+
+RV_BINDIR="${RV_BINDIR:-$(realpath "$(dirname "${1:-$0}")")}"
+UPROBE_TARGET="${RV_BINDIR}/tlob_target"
+TLOB_SYM="${RV_BINDIR}/tlob_sym"
+[ -x "$UPROBE_TARGET" ] || exit_unsupported
+[ -x "$TLOB_SYM" ]      || exit_unsupported
+TLOB_MONITOR=monitors/tlob/monitor
+
+busy_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+busy_stop=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+sleep_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_sleep_work 2>/dev/null)
+sleep_stop=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_sleep_work_done 2>/dev/null)
+[ -n "$busy_offset" ]  || exit_unsupported
+[ -n "$busy_stop" ]    || exit_unsupported
+[ -n "$sleep_offset" ] || exit_unsupported
+[ -n "$sleep_stop" ]   || exit_unsupported
+
+"$UPROBE_TARGET" 30000 &       # busy mode: tlob_busy_work fires every 200 ms
+busy_pid=$!
+"$UPROBE_TARGET" 30000 sleep & # sleep mode: tlob_sleep_work fires every 200 ms
+sleep_pid=$!
+sleep 0.05  # short pause after launching both targets
+
+echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# Binding A: 5 s budget on the busy probe - not expected to fire in 200 ms loops (not asserted below).
+echo "p ${UPROBE_TARGET}:${busy_offset} ${busy_stop} threshold=5000000000" > "$TLOB_MONITOR"
+# Binding B: 10 µs budget on the sleep probe - fires on first invocation.
+echo "p ${UPROBE_TARGET}:${sleep_offset} ${sleep_stop} threshold=10000" > "$TLOB_MONITOR"
+
+# Wait up to 2 s for error_env_tlob from binding B.
+found=0; i=0
+while [ "$i" -lt 20 ]; do
+	sleep 0.1
+	grep -q "error_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+	i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR" 2>/dev/null
+echo "-${UPROBE_TARGET}:${sleep_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$sleep_pid" 2>/dev/null || true; wait "$sleep_pid" 2>/dev/null || true
+kill "$busy_pid" 2>/dev/null || true; wait "$busy_pid" 2>/dev/null || true
+
+echo 0 > monitors/tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+
+[ "$found" = "1" ]  # non-zero status when no error_env_tlob line was seen
+# error_env_tlob payload: clock variable must be present.
+# The event field can be "budget_exceeded" (hrtimer path) or the DA event
+# name ("sleep", "preempt") depending on which fires first; don't constrain it.
+grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "clk_elapsed="
+# detail_env_tlob must appear alongside the error.
+grep -q "detail_env_tlob" /sys/kernel/tracing/trace
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
new file mode 100644
index 000000000000..a143635a60ce
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
@@ -0,0 +1,19 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob monitor no spurious events without active uprobe binding
+# requires: tlob:monitor
+
+echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+sleep 0.5
+
+# Record the result: a failing "! grep" is exempt from errexit and would never fail the test.
+if grep -q "error_env_tlob" /sys/kernel/tracing/trace; then spurious=1; else spurious=0; fi
+
+echo 0 > monitors/tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo > /sys/kernel/tracing/trace
+[ "$spurious" = "0" ]  # last command: its status is the test verdict, after cleanup has run
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
new file mode 100644
index 000000000000..d210d9c3a92d
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
@@ -0,0 +1,67 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob monitor budget violation (error_env_tlob and detail_env_tlob fire with correct fields)
+# requires: tlob:monitor
+
+RV_BINDIR="${RV_BINDIR:-$(realpath "$(dirname "${1:-$0}")")}"
+UPROBE_TARGET="${RV_BINDIR}/tlob_target"
+TLOB_SYM="${RV_BINDIR}/tlob_sym"
+[ -x "$UPROBE_TARGET" ] || exit_unsupported
+[ -x "$TLOB_SYM" ]      || exit_unsupported
+TLOB_MONITOR=monitors/tlob/monitor
+
+busy_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+stop_offset=$("$TLOB_SYM" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+[ -n "$busy_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 30000 &
+busy_pid=$!
+sleep 0.05  # short pause after launching the target
+
+echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# 10 µs budget - fires almost immediately; task is busy-spinning on-CPU.
+echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=10000" > "$TLOB_MONITOR"
+
+# Wait up to 2 s (20 polls, 0.1 s apart) for detail_env_tlob.
+found=0; i=0
+while [ "$i" -lt 20 ]; do
+	sleep 0.1
+	grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+	i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$busy_pid" 2>/dev/null || true; wait "$busy_pid" 2>/dev/null || true
+echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]  # non-zero status when no detail_env_tlob line was seen
+
+# error_env_tlob must carry the clk_elapsed environment field.
+# The event label is "budget_exceeded" when detected by the hrtimer callback,
+# or the triggering sched event name when detected by the constraint path on a
+# preemption that races with the timer (common on PREEMPT_RT / VM).  Both are
+# valid detections; check the env field instead of the label.
+grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "clk_elapsed="
+
+# detail_env_tlob must have all five fields with the correct threshold
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+echo "$line" | grep -q "pid="
+echo "$line" | grep -q "threshold_ns=10000"
+echo "$line" | grep -q "running_ns="
+echo "$line" | grep -q "waiting_ns="
+echo "$line" | grep -q "sleeping_ns="
+
+# Busy-spin keeps the task on-CPU: running_ns must exceed sleeping_ns.
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+[ "$running" -gt "$sleeping" ]
+
+echo > /sys/kernel/tracing/trace
-- 
2.43.0


^ permalink raw reply related

* [PATCH v3 8/9] selftests/verification: fix verificationtest-ktap for out-of-tree execution
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1780847473.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

verificationtest-ktap used CWD-relative paths which broke when
invoked outside the verification directory (e.g. via vng).

Resolve paths via realpath "$(dirname "$0")" so the script works
from any working directory.  Accept an optional subdirectory argument
interpreted relative to the script's directory.

Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 tools/testing/selftests/verification/verificationtest-ktap | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/verification/verificationtest-ktap b/tools/testing/selftests/verification/verificationtest-ktap
index 18f7fe324e2f..055747cef38a 100755
--- a/tools/testing/selftests/verification/verificationtest-ktap
+++ b/tools/testing/selftests/verification/verificationtest-ktap
@@ -5,4 +5,6 @@
 #
 # Copyright (C) Arm Ltd., 2023
 
-../ftrace/ftracetest -K -v --rv ../verification
+dir=$(realpath "$(dirname "$0")")
+testdir=$(cd "$dir" && realpath "${1:-.}")
+"$dir/../ftrace/ftracetest" -K -v --rv "$testdir"
-- 
2.43.0


^ permalink raw reply related

* [PATCH v3 7/9] rv/tlob: add KUnit tests for the tlob monitor
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1780847473.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Add CONFIG_TLOB_KUNIT_TEST (tristate, depends on RV_MON_TLOB && KUNIT,
default KUNIT_ALL_TESTS) with a single test suite covering the uprobe
line parser: valid bindings are accepted, malformed ones return -EINVAL,
and out-of-range thresholds return -ERANGE.

Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 kernel/trace/rv/Makefile                   |  1 +
 kernel/trace/rv/monitors/tlob/.kunitconfig |  6 ++
 kernel/trace/rv/monitors/tlob/Kconfig      |  7 ++
 kernel/trace/rv/monitors/tlob/tlob_kunit.c | 92 ++++++++++++++++++++++
 4 files changed, 106 insertions(+)
 create mode 100644 kernel/trace/rv/monitors/tlob/.kunitconfig
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob_kunit.c

diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index ae59e97f8682..316d53398345 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
 obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
 obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
 obj-$(CONFIG_RV_MON_TLOB) += monitors/tlob/tlob.o
+obj-$(CONFIG_TLOB_KUNIT_TEST) += monitors/tlob/tlob_kunit.o
 # Add new monitors here
 obj-$(CONFIG_RV_UPROBE) += rv_uprobe.o
 obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
diff --git a/kernel/trace/rv/monitors/tlob/.kunitconfig b/kernel/trace/rv/monitors/tlob/.kunitconfig
new file mode 100644
index 000000000000..35d313dfc20d
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/.kunitconfig
@@ -0,0 +1,6 @@
+CONFIG_FTRACE=y
+CONFIG_KUNIT=y
+CONFIG_MODULES=y
+CONFIG_RV=y
+CONFIG_RV_MON_TLOB=y
+CONFIG_TLOB_KUNIT_TEST=y
diff --git a/kernel/trace/rv/monitors/tlob/Kconfig b/kernel/trace/rv/monitors/tlob/Kconfig
index b29a375de228..7ec3326640c2 100644
--- a/kernel/trace/rv/monitors/tlob/Kconfig
+++ b/kernel/trace/rv/monitors/tlob/Kconfig
@@ -10,3 +10,10 @@ config RV_MON_TLOB
 	  monitor.  tlob tracks per-task elapsed wall-clock time across a
 	  user-delimited code section and emits error_env_tlob when the
 	  elapsed time exceeds a configurable per-invocation budget.
+
+config TLOB_KUNIT_TEST
+	tristate "KUnit tests for tlob monitor" if !KUNIT_ALL_TESTS
+	depends on RV_MON_TLOB && KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  Enable KUnit unit tests for the tlob RV monitor.
diff --git a/kernel/trace/rv/monitors/tlob/tlob_kunit.c b/kernel/trace/rv/monitors/tlob/tlob_kunit.c
new file mode 100644
index 000000000000..6450d61b26c3
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob_kunit.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for the tlob RV monitor.
+ *
+ */
+#include <kunit/test.h>
+
+#include "tlob.h"
+
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+
+static const char * const tlob_parse_valid[] = {
+	"p /usr/bin/myapp:4768 4848 threshold=5000000",
+	"p /usr/bin/myapp:0x12a0 0x12f0 threshold=10000000",
+	"p /opt/my:app/bin:0x100 0x200 threshold=1000000",
+};
+
+static const char * const tlob_parse_invalid[] = {
+	/* add: malformed */
+	"p :0x100 0x200 threshold=5000",
+	"p /usr/bin/myapp:0x100 threshold=5000",
+	"p /usr/bin/myapp:-1 0x200 threshold=5000",
+	"p /usr/bin/myapp:0x100 0x200",
+	"p /usr/bin/myapp:0x100 0x100 threshold=5000",
+	/* remove: malformed */
+	"-usr/bin/myapp:0x100",
+	"-/usr/bin/myapp",
+	"-/:0x100",
+	"-/usr/bin/myapp:abc",
+};
+
+/* threshold_ns < 1000 or > TLOB_MAX_THRESHOLD_NS return -ERANGE, not -EINVAL. */
+static const char * const tlob_parse_out_of_range[] = {
+	"p /usr/bin/myapp:0x100 0x200 threshold=0",
+	"p /usr/bin/myapp:0x100 0x200 threshold=999",
+	"p /usr/bin/myapp:0x100 0x200 threshold=3600000000001",  /* TLOB_MAX_THRESHOLD_NS + 1 */
+};
+
+/*
+ * Valid add lines return -ENOENT (kern_path() finds no such file in the test
+ * environment) rather than 0; a non-(-EINVAL) return confirms the format was
+ * accepted by the parser.
+ */
+static void tlob_parse_valid_accepted(struct kunit *test)
+{
+	char buf[128];
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tlob_parse_valid); i++) {
+		strscpy(buf, tlob_parse_valid[i], sizeof(buf));
+		KUNIT_EXPECT_NE(test, tlob_create_or_delete_uprobe(buf), -EINVAL);
+	}
+}
+
+static void tlob_parse_invalid_rejected(struct kunit *test)
+{
+	char buf[128];
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tlob_parse_invalid); i++) {
+		strscpy(buf, tlob_parse_invalid[i], sizeof(buf));
+		KUNIT_EXPECT_EQ(test, tlob_create_or_delete_uprobe(buf), -EINVAL);
+	}
+}
+
+static void tlob_parse_out_of_range_rejected(struct kunit *test)
+{
+	char buf[128];
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tlob_parse_out_of_range); i++) {
+		strscpy(buf, tlob_parse_out_of_range[i], sizeof(buf));
+		KUNIT_EXPECT_EQ(test, tlob_create_or_delete_uprobe(buf), -ERANGE);
+	}
+}
+
+static struct kunit_case tlob_parse_cases[] = {
+	KUNIT_CASE(tlob_parse_valid_accepted),
+	KUNIT_CASE(tlob_parse_invalid_rejected),
+	KUNIT_CASE(tlob_parse_out_of_range_rejected),
+	{}
+};
+
+static struct kunit_suite tlob_parse_suite = {
+	.name       = "tlob_parse",
+	.test_cases = tlob_parse_cases,
+};
+
+kunit_test_suite(tlob_parse_suite);
+
+MODULE_DESCRIPTION("KUnit tests for the tlob RV monitor");
+MODULE_LICENSE("GPL");
-- 
2.43.0


^ permalink raw reply related

* [PATCH v3 6/9] rv/tlob: add tlob hybrid automaton monitor
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1780847473.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Add tlob (task latency over budget), a per-task hybrid automaton RV
monitor that tracks elapsed wall-clock time across a user-delimited
code section and emits error_env_tlob when the elapsed time exceeds a
configurable budget.

The monitor uses RV_MON_PER_OBJ with three states (running, waiting,
sleeping) driven by sched_switch and sched_wakeup tracepoints, and a
single clock invariant clk_elapsed < budget enforced by an hrtimer
(HRTIMER_MODE_REL_HARD).  On violation, detail_env_tlob provides a
per-state time breakdown (running_ns, waiting_ns, sleeping_ns).

Per-task state is managed via DA_ALLOC_POOL to avoid allocation on the
scheduler tracepoint path.  Uprobe pairs are registered through the
tracefs monitor file as "p PATH:OFFSET_START OFFSET_STOP threshold=NS".

Also adds ha_cancel_timer_sync() to ha_monitor.h, a blocking cancel
variant needed by tlob's stop_task path to ensure the hrtimer callback
has completed before the per-task monitor state is freed.

Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 Documentation/trace/rv/index.rst           |   1 +
 Documentation/trace/rv/monitor_tlob.rst    | 177 ++++
 kernel/trace/rv/Kconfig                    |   1 +
 kernel/trace/rv/Makefile                   |   1 +
 kernel/trace/rv/monitors/tlob/Kconfig      |  12 +
 kernel/trace/rv/monitors/tlob/tlob.c       | 968 +++++++++++++++++++++
 kernel/trace/rv/monitors/tlob/tlob.h       | 148 ++++
 kernel/trace/rv/monitors/tlob/tlob_trace.h |  49 ++
 kernel/trace/rv/rv_trace.h                 |   1 +
 9 files changed, 1358 insertions(+)
 create mode 100644 Documentation/trace/rv/monitor_tlob.rst
 create mode 100644 kernel/trace/rv/monitors/tlob/Kconfig
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob.c
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob.h
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob_trace.h

diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst
index 29769f06bb0f..1501545b5f08 100644
--- a/Documentation/trace/rv/index.rst
+++ b/Documentation/trace/rv/index.rst
@@ -16,5 +16,6 @@ Runtime Verification
    monitor_wwnr.rst
    monitor_sched.rst
    monitor_rtapp.rst
+   monitor_tlob.rst
    monitor_stall.rst
    monitor_deadline.rst
diff --git a/Documentation/trace/rv/monitor_tlob.rst b/Documentation/trace/rv/monitor_tlob.rst
new file mode 100644
index 000000000000..c651272eab89
--- /dev/null
+++ b/Documentation/trace/rv/monitor_tlob.rst
@@ -0,0 +1,177 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Monitor tlob
+============
+
+- Name: tlob - task latency over budget
+- Type: per-object hybrid automaton (RV_MON_PER_OBJ)
+- Author: Wen Yang <wen.yang@linux.dev>
+
+Description
+-----------
+
+The tlob monitor tracks per-task elapsed wall-clock time (CLOCK_MONOTONIC,
+spanning running, waiting, and sleeping states) and reports a violation when
+the monitored task exceeds a configurable per-invocation budget threshold.
+
+The monitor implements a three-state hybrid automaton with a single clock
+environment variable ``clk_elapsed``.  The clock invariant
+``clk_elapsed < BUDGET_NS()`` is active in all three states; when it is
+violated the HA timer fires and the framework emits ``error_env_tlob``
+then calls ``da_monitor_reset()`` automatically::
+
+                  | (initial, via task_start)
+                  v
+           +--------------+
+           |   running    | <-----------+
+           +--------------+             |
+             |         |                |
+           sleep     preempt        switch_in
+             |         |                |
+             v         v                |
+        +---------+  +---------+        |
+        | sleeping|  | waiting | -------+
+        +---------+  +---------+
+             |            ^
+             +---wakeup---+
+
+  Key transitions:
+    running  --(sleep)------> sleeping   (task blocks waiting for a resource)
+    running  --(preempt)----> waiting    (task preempted, back in runqueue)
+    sleeping --(wakeup)-----> waiting    (resource available, enters runqueue)
+    waiting  --(switch_in)--> running    (scheduler picks task, back on CPU)
+
+  ``tlob_start_task()`` calls ``da_handle_start_run_event(task->pid, ws, start_tlob)``.
+  The ``start_tlob`` self-loop on the ``running`` state triggers
+  ``ha_setup_invariants()``, which resets ``clk_elapsed`` and arms the budget
+  timer automatically.  ``tlob_stop_task()`` cancels the HA timer synchronously
+  via ``ha_cancel_timer_sync()``, then calls ``da_monitor_reset()``.
+
+The non-running condition (monitor not yet started or reset after a
+stop/violation) is handled implicitly by the RV framework
+(``da_mon->monitoring == 0``) — it is not an explicit DA state.
+
+Per-task state lives in ``struct tlob_task_state`` which is stored as
+``monitor_target`` in the framework's ``da_monitor_storage``, indexed by
+pid.  The per-invocation ``threshold_ns`` is read via
+``ha_get_target(ha_mon)->threshold_ns`` inside the HA constraint functions,
+following the same pattern as the ``nomiss`` monitor.
+
+Usage
+-----
+
+tracefs interface (uprobe-based external monitoring)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``monitor`` tracefs file instruments an unmodified binary via uprobes.
+The format follows the ftrace ``uprobe_events`` convention (``PATH:OFFSET``
+for the probe location, ``key=value`` for configuration parameters)::
+
+  p PATH:OFFSET_START OFFSET_STOP threshold=NS
+
+The uprobe at ``OFFSET_START`` fires ``tlob_start_task()``; the uprobe at
+``OFFSET_STOP`` fires ``tlob_stop_task()``.  Both offsets are ELF file
+offsets of entry points in ``PATH``.  ``PATH`` may contain ``:``; the last
+``:`` in the ``PATH:OFFSET_START`` token is the separator.
+
+To remove a binding, use ``-PATH:OFFSET_START``::
+
+  echo 1 > /sys/kernel/tracing/rv/monitors/tlob/enable
+
+  echo "p /usr/bin/myapp:0x12a0 0x12f0 threshold=5000000" \
+      > /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+  # Remove a binding
+  echo "-/usr/bin/myapp:0x12a0" > /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+  # List registered bindings
+  cat /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+  # Read violations from the trace buffer
+  cat /sys/kernel/tracing/trace
+
+Violation tracepoints
+~~~~~~~~~~~~~~~~~~~~~
+
+Two tracepoints are emitted together on a budget violation:
+
+``error_env_tlob``
+  Standard HA clock-invariant tracepoint (emitted by the RV framework).
+  Fields: ``id`` (task pid), ``state``, ``event`` (``"budget_exceeded"``),
+  ``env`` (``"clk_elapsed"``).
+
+``detail_env_tlob``
+  Tlob-specific breakdown of elapsed time per DA state.
+  Fields: ``id`` (task pid), ``threshold_ns``, ``running_ns``,
+  ``waiting_ns``, ``sleeping_ns``.
+
+  Use ``detail_env_tlob`` to diagnose *which phase* consumed the budget:
+  high ``sleeping_ns`` indicates I/O latency; high ``waiting_ns`` indicates
+  scheduler pressure; high ``running_ns`` indicates a compute overrun.
+
+Example: correlate the two tracepoints to see the breakdown::
+
+  trace-cmd record -e error_env_tlob -e detail_env_tlob &
+  # ... run workload ...
+  trace-cmd report
+
+tracefs files
+~~~~~~~~~~~~~
+
+The following files are specific to tlob under
+``/sys/kernel/tracing/rv/monitors/tlob/``:
+
+``monitor`` (rw)
+  Write ``p PATH:OFFSET_START OFFSET_STOP threshold=NS``
+  to bind two entry uprobes.  Write ``-PATH:OFFSET_START`` to remove a
+  binding.  Read to list registered bindings in the same format.
+  See the `tracefs interface (uprobe-based external monitoring)`_ section above.
+
+Kernel API
+----------
+
+``tlob_start_task`` and ``tlob_stop_task`` are the implementation-level
+functions called by the start/stop uprobe handlers; the interface is
+driven from userspace.
+
+.. kernel-doc:: kernel/trace/rv/monitors/tlob/tlob.c
+   :functions: tlob_start_task tlob_stop_task
+
+``tlob_start_task(task, threshold_ns)``
+  Begin monitoring *task* with a total latency budget of *threshold_ns*
+  nanoseconds.  Allocates per-task state, sets initial DA state to
+  ``running``, resets ``clk_elapsed``, and arms the HA budget timer.
+  Returns 0, -ENODEV (monitor disabled), -ERANGE (threshold out of range),
+  -EALREADY (already monitoring), -ENOSPC (at capacity), or -ENOMEM.
+
+``tlob_stop_task(task)``
+  Stop monitoring *task*.  Synchronously cancels the HA timer via
+  ``ha_cancel_timer_sync()``, checks ``da_monitoring()`` to determine outcome.
+  Returns 0 (clean stop, within budget), -EOVERFLOW (budget was exceeded),
+  -ESRCH (not monitored), or -EAGAIN (concurrent stop racing).
+
+Design notes
+------------
+
+Limitations:
+
+- The initial DA state is always ``running``, set by feeding the synthetic
+  event ``start_tlob`` to ``da_handle_start_run_event()``.  Monitoring a non-current
+  task that is already in waiting or sleeping state at call time misclassifies
+  the first interval as ``running_ns``.
+- ``TASK_STOPPED`` and ``TASK_TRACED`` carry ``prev_state != 0`` and are
+  therefore counted as ``sleeping_ns``, indistinguishable from
+  I/O-blocked time.
+- ``sched_wakeup_new`` is not hooked.  In practice this is not an issue
+  because ``tlob_start_task`` is always called from a running context.
+
+Specification
+-------------
+
+Graphviz DOT file in tools/verification/models/tlob.dot.
+
+KUnit tests under ``kernel/trace/rv/monitors/tlob/tlob_kunit.c``
+(CONFIG_TLOB_KUNIT_TEST).
+
+User-space integration tests under ``tools/testing/selftests/verification/``
+(requires CONFIG_RV_MON_TLOB=y and root).
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index e2e0033a00b9..ed2de31d0312 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -85,6 +85,7 @@ source "kernel/trace/rv/monitors/sleep/Kconfig"
 source "kernel/trace/rv/monitors/stall/Kconfig"
 source "kernel/trace/rv/monitors/deadline/Kconfig"
 source "kernel/trace/rv/monitors/nomiss/Kconfig"
+source "kernel/trace/rv/monitors/tlob/Kconfig"
 # Add new deadline monitors here
 
 # Add new monitors here
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index f139b904bea3..ae59e97f8682 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
 obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
 obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
 obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
+obj-$(CONFIG_RV_MON_TLOB) += monitors/tlob/tlob.o
 # Add new monitors here
 obj-$(CONFIG_RV_UPROBE) += rv_uprobe.o
 obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
diff --git a/kernel/trace/rv/monitors/tlob/Kconfig b/kernel/trace/rv/monitors/tlob/Kconfig
new file mode 100644
index 000000000000..b29a375de228
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_TLOB
+	depends on RV && UPROBES && HIGH_RES_TIMERS
+	select HA_MON_EVENTS_ID
+	select RV_UPROBE
+	bool "tlob monitor"
+	help
+	  Enable the tlob (task latency over budget) hybrid-automaton RV
+	  monitor.  tlob tracks per-task elapsed wall-clock time across a
+	  user-delimited code section and emits error_env_tlob when the
+	  elapsed time exceeds a configurable per-invocation budget.
diff --git a/kernel/trace/rv/monitors/tlob/tlob.c b/kernel/trace/rv/monitors/tlob/tlob.c
new file mode 100644
index 000000000000..d8e0c4794720
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob.c
@@ -0,0 +1,968 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob: task latency over budget monitor
+ *
+ * Track the elapsed wall-clock time of a marked code path and detect when
+ * a monitored task exceeds its per-task latency budget.  CLOCK_MONOTONIC
+ * is used so both on-CPU and off-CPU time count toward the budget.
+ *
+ * On a budget violation, two tracepoints are emitted from the hrtimer
+ * callback: error_env_tlob signals the violation, and detail_env_tlob
+ * provides a per-state time breakdown (running_ns, waiting_ns, sleeping_ns)
+ * that pinpoints whether the overrun occurred in running, waiting, or sleeping state.
+ *
+ * The monitor uses RV_MON_PER_OBJ: per-task state (struct tlob_task_state)
+ * is stored as monitor_target in the framework's hash table.
+ *
+ * One HA clock invariant is enforced:
+ *   clk_elapsed < BUDGET_NS()   (active in all states)
+ *
+ * tlob_start_task() uses da_handle_start_run_event(start_tlob) to initialise
+ * the monitor: the DA framework sets the initial state and processes the start
+ * event, which resets clk_elapsed and arms the budget hrtimer via
+ * ha_setup_invariants().  The HA timer is cancelled synchronously by
+ * ha_cancel_timer_sync() in tlob_stop_task().
+ *
+ * Copyright (C) 2026 Wen Yang <wen.yang@linux.dev>
+ */
+#include <linux/hrtimer.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/rv.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/tracefs.h>
+#include <kunit/visibility.h>
+#include <rv/instrumentation.h>
+#include <rv/rv_uprobe.h>
+#include "../../rv.h"
+
+#define MODULE_NAME "tlob"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+
+/*
+ * Per-task latency monitoring state.  One instance per monitoring window.
+ * Stored as monitor_target in da_monitor_storage; freed via call_rcu.
+ */
+struct tlob_task_state {
+	struct task_struct	*task;		/* via get_task_struct */
+	u64			threshold_ns;	/* budget in nanoseconds */
+
+	/* 1 = cleanup claimed; ha_setup_invariants won't restart the timer. */
+	atomic_t		stopping;
+
+	/* Serialises the ns accumulators; held briefly (hardirq-safe). */
+	raw_spinlock_t		entry_lock;
+	u64			running_ns;	/* time in running state  */
+	u64			waiting_ns;	/* time in waiting state  */
+	u64			sleeping_ns;	/* time in sleeping state */
+	ktime_t			last_ts;
+
+	struct rcu_head		rcu;		/* for call_rcu() teardown */
+};
+
+#define RV_MON_TYPE RV_MON_PER_OBJ
+#define HA_TIMER_TYPE HA_TIMER_HRTIMER
+#define DA_MON_ALLOCATION_STRATEGY DA_ALLOC_POOL
+
+/* Type for da_monitor_storage.target; must be defined before the includes. */
+typedef struct tlob_task_state *monitor_target;
+
+/* Forward-declared so da_monitor_reset_hook works before ha_monitor.h. */
+static inline void tlob_reset_notify(struct da_monitor *da_mon);
+#define da_monitor_reset_hook tlob_reset_notify
+
+/* Override EVENT_NONE_LBL so the timer-fired violation shows "budget_exceeded". */
+#define EVENT_NONE_LBL "budget_exceeded"
+
+#include "tlob.h"
+
+/*
+ * DA_MON_POOL_SIZE must be defined HERE: after tlob.h (which defines
+ * TLOB_MAX_MONITORED) and before #include <rv/ha_monitor.h> (which
+ * transitively includes da_monitor.h and expands __da_monitor_init_pool
+ * using this macro).  Placing the define before tlob.h or after
+ * ha_monitor.h both cause a build error.
+ */
+#define DA_MON_POOL_SIZE TLOB_MAX_MONITORED
+
+/*
+ * Forward-declare tlob_extra_cleanup so the #define below is valid when
+ * da_monitor.h (included via ha_monitor.h) expands da_extra_cleanup inside
+ * da_monitor_destroy().  The full definition follows after ha_monitor.h.
+ */
+static inline void tlob_extra_cleanup(struct da_monitor *da_mon);
+#define da_extra_cleanup tlob_extra_cleanup
+
+#include <rv/ha_monitor.h>
+
+/*
+ * Called from da_monitor_reset() on both normal stop and hrtimer expiry.
+ * On violation (stopping==0), emits detail_env_tlob.
+ */
+static inline void tlob_reset_notify(struct da_monitor *da_mon)
+{
+	struct ha_monitor *ha_mon = to_ha_monitor(da_mon);
+	struct tlob_task_state *ws;
+
+	ha_monitor_reset_env(da_mon);
+
+	ws = ha_get_target(ha_mon);
+	if (!ws)
+		return;
+
+	/*
+	 * Emit per-state breakdown on budget violation only.
+	 * stopping==0: timer callback owns this path (genuine overrun).
+	 * stopping==1: normal stop claimed ownership first; skip.
+	 */
+	if (!atomic_read(&ws->stopping)) {
+		unsigned int curr_state = READ_ONCE(da_mon->curr_state);
+		u64 running_ns, waiting_ns, sleeping_ns, partial_ns;
+		unsigned long flags;
+
+		/*
+		 * Snapshot accumulators; partial_ns covers curr_state time
+		 * not yet folded in (transition-out pending).
+		 */
+		raw_spin_lock_irqsave(&ws->entry_lock, flags);
+		partial_ns   = ktime_get_ns() - ktime_to_ns(ws->last_ts);
+		running_ns   = ws->running_ns  +
+			       (curr_state == running_tlob  ? partial_ns : 0);
+		waiting_ns   = ws->waiting_ns  +
+			       (curr_state == waiting_tlob  ? partial_ns : 0);
+		sleeping_ns  = ws->sleeping_ns +
+			       (curr_state == sleeping_tlob ? partial_ns : 0);
+		raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+
+		trace_detail_env_tlob(da_get_id(da_mon), ws->threshold_ns,
+				      running_ns, waiting_ns, sleeping_ns);
+	}
+}
+
+#define BUDGET_NS(ha_mon) (ha_get_target(ha_mon)->threshold_ns)
+
+/* HA constraint functions (called by ha_monitor_handle_constraint) */
+
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_tlob env, u64 time_ns)
+{
+	if (env == clk_elapsed_tlob)
+		return ha_get_clk_ns(ha_mon, env, time_ns);
+	return ENV_INVALID_VALUE;
+}
+
+/*
+ * ha_verify_invariants - clk_elapsed < BUDGET_NS must hold in all states.
+ *
+ * The invariant is uniform across running/waiting/sleeping; check it
+ * unconditionally rather than enumerating each state.
+ */
+static inline bool ha_verify_invariants(struct ha_monitor *ha_mon,
+					enum states curr_state, enum events event,
+					enum states next_state, u64 time_ns)
+{
+	return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns);
+}
+
+/*
+ * Convert invariant (deadline) to guard (reset anchor) on state transitions.
+ *
+ * The conversion is identical for every departing state; skip only self-loops.
+ */
+static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
+					enum states curr_state, enum events event,
+					enum states next_state, u64 time_ns)
+{
+	if (curr_state != next_state)
+		ha_inv_to_guard(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+}
+
+/* No per-event guard conditions for tlob; invariants suffice. */
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+				    enum states curr_state, enum events event,
+				    enum states next_state, u64 time_ns)
+{
+	return true;
+}
+
+/*
+ * Arm or cancel the HA budget timer on state transitions.
+ *
+ * The timer must run in every monitored state (running/waiting/sleeping),
+ * so arm it whenever next_state is any of the three.  On a self-loop caused
+ * by a non-start event the timer is already running; skip the redundant
+ * restart.  On a true state change the old timer is implicitly superseded by
+ * the new ha_start_timer_ns() call.
+ *
+ * Guard on stopping: sched_switch events can arrive after ha_cancel_timer_sync,
+ * restarting the timer and triggering an ODEBUG "activate active" splat.
+ * The _acquire pairs with the cmpxchg_release in tlob_stop_task.
+ */
+static inline void ha_setup_invariants(struct ha_monitor *ha_mon,
+				       enum states curr_state, enum events event,
+				       enum states next_state, u64 time_ns)
+{
+	if (next_state == curr_state && event != start_tlob)
+		return;
+
+	if (next_state < state_max_tlob) {
+		if (!atomic_read_acquire(&ha_get_target(ha_mon)->stopping))
+			ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+	} else {
+		ha_cancel_timer(ha_mon);
+	}
+}
+
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+				 enum states curr_state, enum events event,
+				 enum states next_state, u64 time_ns)
+{
+	if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns))
+		return false;
+
+	ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns);
+
+	if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns))
+		return false;
+
+	ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns);
+
+	return true;
+}
+
+static struct kmem_cache *tlob_state_cache;
+
+/* Uprobe binding list; protected by tlob_uprobe_mutex. */
+static LIST_HEAD(tlob_uprobe_list);
+static DEFINE_MUTEX(tlob_uprobe_mutex);
+
+/* Serialises duplicate-check + da_handle_start_run_event() for the same pid. */
+static DEFINE_MUTEX(tlob_start_mutex);
+
+
+/* Per-uprobe-binding state: a start + stop probe pair for one binary region. */
+struct tlob_uprobe_binding {
+	struct list_head	list;
+	u64			threshold_ns;
+	char			binpath[TLOB_MAX_PATH];
+	loff_t			offset_start;
+	loff_t			offset_stop;
+	struct rv_uprobe	*start_probe;
+	struct rv_uprobe	*stop_probe;
+};
+
+/* RCU callback: free the slab once no readers remain. */
+static void tlob_free_rcu(struct rcu_head *head)
+{
+	struct tlob_task_state *ws =
+		container_of(head, struct tlob_task_state, rcu);
+	kmem_cache_free(tlob_state_cache, ws);
+}
+
+/*
+ * da_extra_cleanup - per-task teardown called by da_monitor_destroy().
+ *
+ * Claims cleanup ownership via CAS; cancels the budget timer; drops the task
+ * reference taken in tlob_start_task(); and schedules the slab free via call_rcu().
+ * Must run before da_monitor_reset() (i.e. before hash_del_rcu()) so that
+ * ha_cancel_timer_sync() can safely access the still-registered ha_monitor.
+ */
+static inline void tlob_extra_cleanup(struct da_monitor *da_mon)
+{
+	struct ha_monitor *ha_mon = to_ha_monitor(da_mon);
+	struct tlob_task_state *ws = ha_get_target(ha_mon);
+
+	if (!ws)
+		return;
+
+	if (atomic_cmpxchg_release(&ws->stopping, 0, 1) != 0)
+		return;
+
+	ha_cancel_timer_sync(ha_mon);
+	put_task_struct(ws->task);
+	call_rcu(&ws->rcu, tlob_free_rcu);
+}
+
+/*
+ * __tlob_acc - accumulate elapsed ns into one per-state counter.
+ *
+ * Looks up the task's tlob_task_state under RCU, adds the interval
+ * [ws->last_ts, now] to the field at @offset within the state struct,
+ * and updates last_ts.  Returns true if the task is monitored.
+ *
+ * entry_lock is a raw spinlock so this is safe from hardirq context.
+ */
+static inline bool __tlob_acc(struct task_struct *task, ktime_t now,
+			       size_t offset)
+{
+	struct tlob_task_state *ws;
+	unsigned long flags;
+
+	scoped_guard(rcu) {
+		ws = da_get_target_by_id(task->pid);
+		if (!ws)
+			return false;
+		raw_spin_lock_irqsave(&ws->entry_lock, flags);
+		*(u64 *)((char *)ws + offset) += ktime_to_ns(ktime_sub(now, ws->last_ts));
+		ws->last_ts = now;
+		raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+	}
+	return true;
+}
+
+/* Accumulate running_ns for prev; returns true if prev is monitored. */
+static inline bool tlob_acc_running(struct task_struct *task, ktime_t now)
+{
+	return __tlob_acc(task, now, offsetof(struct tlob_task_state, running_ns));
+}
+
+/* Accumulate waiting_ns for next; returns true if next is monitored. */
+static inline bool tlob_acc_waiting(struct task_struct *task, ktime_t now)
+{
+	return __tlob_acc(task, now, offsetof(struct tlob_task_state, waiting_ns));
+}
+
+/*
+ * handle_sched_switch - advance the DA on every context switch.
+ *
+ * Generates three DA events:
+ *   prev, prev_state != 0  -> sleep_tlob    (running -> sleeping)
+ *   prev, prev_state == 0  -> preempt_tlob  (running -> waiting)
+ *   next                   -> switch_in_tlob (waiting -> running)
+ *
+ * A single ktime_get() at handler entry is shared by both acc calls so that
+ * prev's running_ns and next's waiting_ns share the same context-switch
+ * timestamp; neither absorbs handler overhead into its accumulator.
+ *
+ * No waiting->sleeping edge exists: a task can only block voluntarily
+ * (call schedule()) while it is executing on CPU, which corresponds to
+ * the running DA state.  A task in the waiting state is TASK_RUNNING in
+ * kernel terms (on the runqueue) and cannot block itself.
+ *
+ * da_handle_event() is called for a task only when tlob_acc_*() returned
+ * true, i.e. per-task tlob state was found for that pid.
+ */
+static void handle_sched_switch(void *data, bool preempt_unused,
+				struct task_struct *prev,
+				struct task_struct *next,
+				unsigned int prev_state)
+{
+	ktime_t now = ktime_get();
+	bool prev_preempted = (prev_state == 0);
+
+	/*
+	 * No guard on tlob_num_monitored here: da_handle_event() internally
+	 * calls da_monitor_handling_event() which checks both rv_monitoring_on()
+	 * and da_monitoring(da_mon).  The hash lookup inside da_get_monitor()
+	 * simply returns NULL for unmonitored tasks, which is equally fast as
+	 * an atomic_read() guard.  By omitting the guard we avoid touching the
+	 * tlob_num_monitored cacheline on every global context-switch.
+	 */
+	if (tlob_acc_running(prev, now))
+		da_handle_event(prev->pid, NULL,
+				prev_preempted ? preempt_tlob : sleep_tlob);
+	if (tlob_acc_waiting(next, now))
+		da_handle_event(next->pid, NULL, switch_in_tlob);
+}
+
+/* Accumulate sleeping_ns on wakeup; returns true if task is monitored. */
+static inline bool tlob_acc_sleeping(struct task_struct *task, ktime_t now)
+{
+	return __tlob_acc(task, now, offsetof(struct tlob_task_state, sleeping_ns));
+}
+
+/*
+ * handle_sched_wakeup - sleeping -> waiting transition.
+ *
+ * try_to_wake_up() skips TASK_RUNNING tasks, so this never fires for a
+ * task already in running or waiting state.
+ */
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+	ktime_t now = ktime_get();
+
+	/* Same reasoning as handle_sched_switch: rely on hash-lookup fast path. */
+	if (tlob_acc_sleeping(p, now))
+		da_handle_event(p->pid, NULL, wakeup_tlob);
+}
+
+/*
+ * handle_sched_process_exit - clean up if a task exits while still monitored.
+ *
+ * Called in do_exit() context; the task still has a valid pid here.
+ * tlob_stop_task() returns -ESRCH if the task is not monitored, which is fine.
+ */
+static void handle_sched_process_exit(void *data, struct task_struct *p,
+				       bool group_dead)
+{
+	tlob_stop_task(p);
+}
+
+
+
+/**
+ * tlob_start_task - begin monitoring @task with budget @threshold_ns ns.
+ * @task:         Task to monitor; may be current or another task.
+ * @threshold_ns: Latency budget in nanoseconds (wall-clock; running + waiting + sleeping).
+ *                Must be in [1000, TLOB_MAX_THRESHOLD_NS].
+ * Return: 0 on success, -ENODEV (monitor disabled), -ERANGE (bad budget),
+ * -EALREADY (pid already tracked), -ENOMEM, or -ENOSPC (no pool slot).
+ */
+int tlob_start_task(struct task_struct *task, u64 threshold_ns)
+{
+	struct tlob_task_state *ws;
+
+	if (!da_monitor_enabled())
+		return -ENODEV;
+
+	if (threshold_ns < 1000 || threshold_ns > TLOB_MAX_THRESHOLD_NS)
+		return -ERANGE;
+
+	/* Serialise duplicate-check + pool-slot claim for the same pid. */
+	guard(mutex)(&tlob_start_mutex);
+
+	if (da_get_target_by_id(task->pid))
+		return -EALREADY;
+
+	ws = kmem_cache_zalloc(tlob_state_cache, GFP_KERNEL);
+	if (!ws)
+		return -ENOMEM;
+
+	ws->task = task;
+	get_task_struct(task);	/* dropped in tlob_stop_task() or on failure below */
+	ws->threshold_ns = threshold_ns;
+	ws->last_ts = ktime_get();
+	raw_spin_lock_init(&ws->entry_lock);
+
+	/*
+	 * da_handle_start_run_event() claims a pool slot via da_prepare_storage(),
+	 * initialises the monitor, and delivers start_tlob in one step: the
+	 * generated ha_setup_invariants() resets clk_elapsed and arms the timer.
+	 * A zero return is treated as pool exhaustion and mapped to -ENOSPC.
+	 */
+	if (!da_handle_start_run_event(task->pid, ws, start_tlob)) {
+		put_task_struct(task);
+		kmem_cache_free(tlob_state_cache, ws);
+		return -ENOSPC;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tlob_start_task);
+
+/**
+ * tlob_stop_task - stop monitoring @task.
+ * @task: Task to stop.
+ *
+ * CAS on ws->stopping (0->1) under RCU claims cleanup ownership; the winner
+ * cancels the timer synchronously, drops the task ref and frees ws via RCU.
+ * NOTE(review): da_mon/ha_mon are used after the first RCU section - confirm.
+ * Return: 0, -EOVERFLOW (budget exceeded), -ESRCH (not monitored),
+ * or -EAGAIN (concurrent caller claimed cleanup).
+ */
+int tlob_stop_task(struct task_struct *task)
+{
+	struct da_monitor *da_mon;
+	struct ha_monitor *ha_mon;
+	struct tlob_task_state *ws;
+	bool budget_exceeded;
+
+	scoped_guard(rcu) {
+		ws = da_get_target_by_id(task->pid);
+		if (!ws)
+			return -ESRCH;
+
+		da_mon = da_get_monitor(task->pid, NULL);
+		if (unlikely(!da_mon)) {
+			/* ws in hash but da_mon gone; internal inconsistency. */
+			WARN_ON_ONCE(1);
+			return -ESRCH;
+		}
+
+		ha_mon = to_ha_monitor(da_mon);
+
+		/*
+		 * CAS (0->1) claims cleanup ownership under RCU (ws guaranteed valid).
+		 * _release pairs with atomic_read_acquire in ha_setup_invariants.
+		 */
+		if (atomic_cmpxchg_release(&ws->stopping, 0, 1) != 0)
+			return -EAGAIN;
+	}
+
+	/* Wait for in-flight timer callback before reading da_monitoring. */
+	ha_cancel_timer_sync(ha_mon);
+
+	/* Timer fired first -> budget exceeded; otherwise reset normally. */
+	scoped_guard(rcu) {
+		budget_exceeded = !da_monitoring(da_mon);
+		if (!budget_exceeded)
+			da_monitor_reset(da_mon);
+	}
+	da_destroy_storage(task->pid);
+
+	put_task_struct(ws->task);	/* pairs with get_task_struct() in tlob_start_task() */
+	call_rcu(&ws->rcu, tlob_free_rcu);
+	return budget_exceeded ? -EOVERFLOW : 0;
+}
+EXPORT_SYMBOL_GPL(tlob_stop_task);
+
+
+static int tlob_uprobe_entry_handler(struct rv_uprobe *p, struct pt_regs *regs,
+				     __u64 *data)
+{
+	const struct tlob_uprobe_binding *binding = p->priv;
+
+	tlob_start_task(current, binding->threshold_ns); /* best effort: result unused */
+	return 0;
+}
+
+static int tlob_uprobe_stop_handler(struct rv_uprobe *p, struct pt_regs *regs,
+				    __u64 *data)
+{
+	tlob_stop_task(current);	/* result (incl. -EOVERFLOW) is not propagated */
+	return 0;
+}
+
+/*
+ * Register the start and stop uprobes for one binding and append it to
+ * tlob_uprobe_list.  Called with tlob_uprobe_mutex held.  Returns 0 or -errno.
+ */
+static int tlob_add_uprobe(u64 threshold_ns, const char *binpath,
+			   loff_t offset_start, loff_t offset_stop)
+{
+	struct tlob_uprobe_binding *b, *tmp_b;
+	char pathbuf[TLOB_MAX_PATH];
+	struct path path;
+	char *canon;
+	int ret;
+
+	if (binpath[0] != '/')
+		return -EINVAL;
+
+	b = kzalloc_obj(*b, GFP_KERNEL);
+	if (!b)
+		return -ENOMEM;
+
+	b->threshold_ns = threshold_ns;
+	b->offset_start = offset_start;
+	b->offset_stop  = offset_stop;
+
+	ret = kern_path(binpath, LOOKUP_FOLLOW, &path);
+	if (ret)
+		goto err_free;
+
+	if (!d_is_reg(path.dentry)) {
+		ret = -EINVAL;
+		goto err_path;
+	}
+
+	/* Reject a second binding with the same (dentry, start offset) key. */
+	list_for_each_entry(tmp_b, &tlob_uprobe_list, list) {
+		if (tmp_b->offset_start == offset_start &&
+		    tmp_b->start_probe->path.dentry == path.dentry) {
+			ret = -EEXIST;
+			goto err_path;
+		}
+	}
+
+	canon = d_path(&path, pathbuf, sizeof(pathbuf));
+	if (IS_ERR(canon)) {
+		ret = PTR_ERR(canon);
+		goto err_path;
+	}
+	strscpy(b->binpath, canon, sizeof(b->binpath));	/* path shown by tlob_monitor_read() */
+
+	/* Both probes share b (priv) and path; attach_path refs path itself. */
+	b->start_probe = rv_uprobe_attach_path(&path, offset_start,
+					       tlob_uprobe_entry_handler, NULL, b);
+	if (IS_ERR(b->start_probe)) {
+		ret = PTR_ERR(b->start_probe);
+		b->start_probe = NULL;
+		goto err_path;
+	}
+
+	b->stop_probe = rv_uprobe_attach_path(&path, offset_stop,
+					      tlob_uprobe_stop_handler, NULL, b);
+	if (IS_ERR(b->stop_probe)) {
+		ret = PTR_ERR(b->stop_probe);
+		b->stop_probe = NULL;
+		goto err_start;
+	}
+
+	path_put(&path);	/* drop the reference taken by kern_path() */
+	list_add_tail(&b->list, &tlob_uprobe_list);
+	return 0;
+
+err_start:
+	rv_uprobe_detach(b->start_probe);
+err_path:
+	path_put(&path);
+err_free:
+	kfree(b);
+	return ret;
+}
+
+static int tlob_remove_uprobe_by_key(loff_t offset_start, const char *binpath)
+{
+	struct tlob_uprobe_binding *iter, *next;
+	struct path target;
+	int err;
+
+	err = kern_path(binpath, LOOKUP_FOLLOW, &target);
+	if (err)
+		return err;
+
+	/* Key is (dentry, start offset); the first match is torn down. */
+	err = -ENOENT;
+	list_for_each_entry_safe(iter, next, &tlob_uprobe_list, list) {
+		if (iter->offset_start == offset_start &&
+		    iter->start_probe->path.dentry == target.dentry) {
+			list_del(&iter->list);
+			rv_uprobe_detach(iter->start_probe);
+			rv_uprobe_detach(iter->stop_probe);
+			kfree(iter);
+			err = 0;
+			break;
+		}
+	}
+
+	path_put(&target);
+	return err;
+}
+
+static void tlob_remove_all_uprobes(void)
+{
+	struct tlob_uprobe_binding *cur, *next;
+	LIST_HEAD(doomed);
+
+	scoped_guard(mutex, &tlob_uprobe_mutex) {
+		list_for_each_entry_safe(cur, next, &tlob_uprobe_list, list) {
+			list_move(&cur->list, &doomed);
+			rv_uprobe_unregister_nosync(cur->start_probe);
+			rv_uprobe_unregister_nosync(cur->stop_probe);
+		}
+	}
+
+	if (list_empty(&doomed))
+		return;
+
+	/*
+	 * A single rv_uprobe_sync() covers every probe unregistered above;
+	 * the probes and their bindings are freed only after it returns.
+	 */
+	rv_uprobe_sync();
+
+	list_for_each_entry_safe(cur, next, &doomed, list) {
+		rv_uprobe_free(cur->start_probe);
+		rv_uprobe_free(cur->stop_probe);
+		kfree(cur);
+	}
+}
+
+static ssize_t tlob_monitor_read(struct file *file,
+				 char __user *ubuf,
+				 size_t count, loff_t *ppos)
+{
+	const int line_sz = TLOB_MAX_PATH + 128;
+	struct tlob_uprobe_binding *b;
+	int nr = 0, cap, len = 0;
+	ssize_t ret;
+	char *out;
+
+	/* Format the list under the mutex; copy to user space after unlock. */
+	mutex_lock(&tlob_uprobe_mutex);
+	list_for_each_entry(b, &tlob_uprobe_list, list)
+		nr++;
+
+	/* One line_sz slot per binding (at least one), plus one extra byte. */
+	cap = (nr ? nr : 1) * line_sz + 1;
+	out = kmalloc(cap, GFP_KERNEL);
+	if (!out) {
+		mutex_unlock(&tlob_uprobe_mutex);
+		return -ENOMEM;
+	}
+
+	list_for_each_entry(b, &tlob_uprobe_list, list)
+		len += scnprintf(out + len, cap - len,
+				 "p %s:0x%llx 0x%llx threshold=%llu\n",
+				 b->binpath,
+				 (unsigned long long)b->offset_start,
+				 (unsigned long long)b->offset_stop,
+				 b->threshold_ns);
+	mutex_unlock(&tlob_uprobe_mutex);
+
+	ret = simple_read_from_buffer(ubuf, count, ppos, out, len);
+	kfree(out);
+	return ret;
+}
+
+/*
+ * Parse "p PATH:OFFSET_START OFFSET_STOP threshold=NS"; @buf is split in
+ * place and *path_out points into it.  PATH may contain ':' (the last one
+ * separates the offset).  Returns 0, -EINVAL, or -ERANGE (bad threshold).
+ */
+static int tlob_parse_uprobe_line(char *buf, u64 *thr_out,
+				  char **path_out,
+				  loff_t *start_out, loff_t *stop_out)
+{
+	unsigned long long thr = 0, stop_val = 0;
+	long long start_val;
+	char *p, *path_token, *token, *colon;
+	bool got_stop = false, got_thr = false;
+	int n;
+
+	/* Must start with "p " */
+	if (buf[0] != 'p' || buf[1] != ' ')
+		return -EINVAL;
+
+	p = buf + 2;
+	while (*p == ' ')
+		p++;
+
+	/* First space-delimited token is PATH:OFFSET_START */
+	path_token = strsep(&p, " \t");
+	if (!path_token || !*path_token)
+		return -EINVAL;
+
+	/* Split at last ':' to handle paths that contain ':'. */
+	colon = strrchr(path_token, ':');
+	if (!colon || colon - path_token < 2)
+		return -EINVAL;
+	*colon = '\0';
+
+	if (path_token[0] != '/')
+		return -EINVAL;
+
+	n = 0;	/* NOTE(review): text after the parsed number is not rejected */
+	if (sscanf(colon + 1, "%lli%n", &start_val, &n) != 1 || n == 0)
+		return -EINVAL;
+	if (start_val < 0)
+		return -EINVAL;
+
+	/* Remaining tokens, in either order: OFFSET_STOP and threshold=NS. */
+	while (p && (token = strsep(&p, " \t")) != NULL) {
+		if (!*token)
+			continue;	/* empty token from repeated separators */
+		if (strncmp(token, "threshold=", 10) == 0) {
+			if (kstrtoull(token + 10, 0, &thr))
+				return -EINVAL;
+			if (thr < 1000 || thr > TLOB_MAX_THRESHOLD_NS)
+				return -ERANGE;
+			got_thr = true;
+		} else if (!got_stop) {
+			long long sv;
+
+			n = 0;
+			if (sscanf(token, "%lli%n", &sv, &n) != 1 || n == 0)
+				return -EINVAL;
+			if (sv < 0)
+				return -EINVAL;
+			stop_val = (unsigned long long)sv;
+			got_stop = true;
+		} else {
+			return -EINVAL;	/* unexpected extra token */
+		}
+	}
+
+	if (!got_stop || !got_thr)
+		return -EINVAL;
+	if (start_val == (long long)stop_val)
+		return -EINVAL;	/* start and stop offsets must differ */
+
+	*thr_out   = thr;
+	*path_out  = path_token;	/* points into @buf */
+	*start_out = (loff_t)start_val;
+	*stop_out  = (loff_t)stop_val;
+	return 0;
+}
+
+/* Parse "-PATH:OFFSET_START" (ftrace uprobe_events removal convention). */
+static int tlob_parse_remove_line(char *buf, char **path_out, loff_t *start_out)
+{
+	char *binpath, *colon;
+	long long off;
+	int n = 0;
+
+	if (buf[0] != '-')
+		return -EINVAL;
+	binpath = buf + 1;
+	if (binpath[0] != '/')		/* absolute paths only */
+		return -EINVAL;
+	colon = strrchr(binpath, ':');	/* last ':' so PATH may contain ':' */
+	if (!colon || colon - binpath < 2)
+		return -EINVAL;
+	*colon = '\0';			/* split @buf in place */
+	if (sscanf(colon + 1, "%lli%n", &off, &n) != 1 || n == 0 || off < 0)
+		return -EINVAL;	/* unparsable or negative, as in tlob_parse_uprobe_line() */
+	*path_out  = binpath;		/* points into @buf */
+	*start_out = (loff_t)off;
+	return 0;
+}
+
+VISIBLE_IF_KUNIT int tlob_create_or_delete_uprobe(char *buf)
+{
+	loff_t start, stop;
+	u64 budget_ns;
+	char *path;
+	int err;
+
+	/* A leading '-' selects removal; anything else is parsed as an add. */
+	if (buf[0] != '-') {
+		err = tlob_parse_uprobe_line(buf, &budget_ns, &path,
+					     &start, &stop);
+		if (err)
+			return err;
+
+		guard(mutex)(&tlob_uprobe_mutex);
+		return tlob_add_uprobe(budget_ns, path, start, stop);
+	}
+
+	err = tlob_parse_remove_line(buf, &path, &start);
+	if (err)
+		return err;
+
+	guard(mutex)(&tlob_uprobe_mutex);
+	return tlob_remove_uprobe_by_key(start, path);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_create_or_delete_uprobe);
+
+static ssize_t tlob_monitor_write(struct file *file,
+				  const char __user *ubuf,
+				  size_t count, loff_t *ppos)
+{
+	char line[TLOB_MAX_PATH + 128];
+
+	if (count >= sizeof(line))	/* keep room for the terminating NUL */
+		return -EINVAL;
+	if (copy_from_user(line, ubuf, count))
+		return -EFAULT;
+	line[count] = '\0';
+	if (count && line[count - 1] == '\n')	/* drop one trailing newline */
+		line[count - 1] = '\0';
+	return tlob_create_or_delete_uprobe(line) ?: (ssize_t)count;
+}
+
+static const struct file_operations tlob_monitor_fops = {
+	.open	= simple_open,
+	.read	= tlob_monitor_read,	/* lists bindings, one "p ..." line each */
+	.write	= tlob_monitor_write,	/* "p ..." adds, "-PATH:OFFSET" removes */
+	.llseek	= noop_llseek,
+};
+
+static int __tlob_init_monitor(void)
+{
+	int err;
+
+	/* Backing cache for the per-task state allocated in tlob_start_task(). */
+	tlob_state_cache = kmem_cache_create("tlob_task_state",
+					     sizeof(struct tlob_task_state),
+					     0, 0, NULL);
+	if (!tlob_state_cache)
+		return -ENOMEM;
+
+	err = ha_monitor_init();
+	if (!err) {
+		rv_this.enabled = 1;
+		return 0;
+	}
+	kmem_cache_destroy(tlob_state_cache);	/* undo the cache on failure */
+	tlob_state_cache = NULL;
+	return err;
+}
+
+static void __tlob_destroy_monitor(void)
+{
+	rv_this.enabled = 0;
+	/*
+	 * Remove uprobes first; rv_uprobe_sync() inside ensures all in-flight
+	 * handlers have finished before we proceed.
+	 */
+	tlob_remove_all_uprobes();
+
+	/*
+	 * ha_monitor_destroy() is expected (per the author; not visible here) to
+	 * run tlob_extra_cleanup on remaining entries, cancel their timers and
+	 * free their state, with an rcu_barrier() draining da_pool_return_cb and
+	 * tlob_free_rcu callbacks before the pool arrays and cache go away.
+	 */
+	ha_monitor_destroy();
+	kmem_cache_destroy(tlob_state_cache);
+	tlob_state_cache = NULL;
+}
+
+static int tlob_enable_hooks(void)
+{
+	rv_attach_trace_probe("tlob", sched_switch, handle_sched_switch);
+	rv_attach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup);
+	rv_attach_trace_probe("tlob", sched_process_exit, handle_sched_process_exit);
+	return 0;	/* always succeeds: no attach result is propagated */
+}
+
+static void tlob_disable_hooks(void)
+{
+	rv_detach_trace_probe("tlob", sched_switch, handle_sched_switch);	/* same three probes as enable */
+	rv_detach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup);
+	rv_detach_trace_probe("tlob", sched_process_exit, handle_sched_process_exit);
+}
+
+static int enable_tlob(void)
+{
+	int err = __tlob_init_monitor();
+
+	if (err)
+		return err;
+
+	/* Tracepoint probes are attached only after monitor state is set up. */
+	return tlob_enable_hooks();
+}
+
+static void disable_tlob(void)
+{
+	tlob_disable_hooks();		/* detach tracepoints before state is freed */
+	__tlob_destroy_monitor();	/* removes uprobes, then monitor and cache */
+}
+
+static struct rv_monitor rv_this = {
+	.name		= "tlob",
+	.description	= "Per-task latency-over-budget monitor.",
+	.enable		= enable_tlob,
+	.disable	= disable_tlob,
+	.reset		= da_monitor_reset_all,
+	.enabled	= 0,	/* set in __tlob_init_monitor(), cleared on destroy */
+};
+
+static int __init register_tlob(void)
+{
+	int err = rv_register_monitor(&rv_this, NULL);
+
+	if (err)
+		return err;
+
+	/* No monitor directory: skip creating the "monitor" control file. */
+	if (!rv_this.root_d)
+		return 0;
+
+	if (tracefs_create_file("monitor", 0644, rv_this.root_d, NULL,
+				&tlob_monitor_fops))
+		return 0;
+
+	rv_unregister_monitor(&rv_this);
+	return -ENOMEM;
+}
+
+static void __exit unregister_tlob(void)
+{
+	rv_unregister_monitor(&rv_this);	/* NOTE(review): bindings are freed only via disable_tlob() - confirm it runs */
+}
+
+module_init(register_tlob);
+module_exit(unregister_tlob);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wen Yang <wen.yang@linux.dev>");
+MODULE_DESCRIPTION("tlob: task latency over budget per-task monitor.");
diff --git a/kernel/trace/rv/monitors/tlob/tlob.h b/kernel/trace/rv/monitors/tlob/tlob.h
new file mode 100644
index 000000000000..b6724e629c69
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RV_TLOB_H
+#define _RV_TLOB_H
+
+/*
+ * C representation of the tlob hybrid automaton.
+ *
+ * Three-state HA following sched_stat / wwnr monitor naming conventions:
+ *
+ *   running  (initial) - task is executing on CPU          [sched_stat: runtime]
+ *   waiting             - task is in runqueue, awaiting CPU [sched_stat: wait   ]
+ *   sleeping            - task is blocked, awaiting resource[sched_stat: sleep  ]
+ *
+ * Events (derived from sched_switch / sched_wakeup tracepoints):
+ *   start     - tlob_start_task()               running  → running  (resets clock, arms timer)
+ *   sleep     - sched_switch, prev_state != 0   running  → sleeping
+ *   preempt   - sched_switch, prev_state == 0   running  → waiting
+ *   wakeup    - sched_wakeup                    sleeping → waiting
+ *   switch_in - sched_switch, next == task      waiting  → running
+ *
+ * One HA clock invariant:
+ *   clk_elapsed < BUDGET_NS()  active in all states  (total latency budget)
+ *
+ * tlob_start_task() uses da_handle_start_run_event(start_tlob) to initialise
+ * the monitor: the DA framework sets the initial state and then processes the
+ * start event, which resets clk_elapsed and arms the budget hrtimer via the
+ * generated ha_setup_invariants().
+ * tlob_stop_task() calls ha_cancel_timer_sync() + da_monitor_reset() directly.
+ *
+ * For the format description see:
+ *   Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#include <linux/rv.h>
+#include <linux/sched.h>
+
+#define MONITOR_NAME tlob
+
+enum states_tlob {
+	running_tlob,		/* executing on CPU (initial state) */
+	waiting_tlob,		/* in runqueue, awaiting CPU */
+	sleeping_tlob,		/* blocked, awaiting a resource */
+	state_max_tlob,		/* number of states; doubles as INVALID_STATE */
+};
+
+#define INVALID_STATE state_max_tlob
+
+enum events_tlob {
+	start_tlob,		/* tlob_start_task(): resets clock, arms timer */
+	sleep_tlob,		/* sched_switch, prev_state != 0 */
+	preempt_tlob,		/* sched_switch, prev_state == 0 */
+	wakeup_tlob,		/* sched_wakeup */
+	switch_in_tlob,		/* sched_switch, next == task */
+	event_max_tlob,		/* number of events */
+};
+
+/*
+ * HA environment variable: clk_elapsed is the only clock.
+ * It measures wall-clock time since task_start and is active in all states.
+ */
+enum envs_tlob {
+	clk_elapsed_tlob,
+	env_max_tlob,				/* number of envs */
+	env_max_stored_tlob = env_max_tlob,	/* all of them are stored */
+};
+
+_Static_assert(env_max_stored_tlob <= MAX_HA_ENV_LEN, "Not enough slots");
+#define HA_CLK_NS
+
+struct automaton_tlob {
+	char *state_names[state_max_tlob];	/* indexed by enum states_tlob */
+	char *event_names[event_max_tlob];	/* indexed by enum events_tlob */
+	char *env_names[env_max_tlob];		/* indexed by enum envs_tlob */
+	unsigned char function[state_max_tlob][event_max_tlob]; /* next state per [state][event] */
+	unsigned char initial_state;
+	bool final_states[state_max_tlob];	/* one flag per state */
+};
+
+static const struct automaton_tlob automaton_tlob = {
+	.state_names = {
+		"running",
+		"waiting",
+		"sleeping",
+	},
+	.event_names = {
+		"start",
+		"sleep",
+		"preempt",
+		"wakeup",
+		"switch_in",
+	},
+	.env_names = {
+		"clk_elapsed",
+	},
+	.function = {	/* rows: current state; columns: event; INVALID_STATE = not allowed */
+		/* running */
+		{
+			running_tlob,	/* start     (tlob_start_task, resets clock)  */
+			sleeping_tlob,	/* sleep     (sched_switch, prev_state != 0) */
+			waiting_tlob,	/* preempt   (sched_switch, prev_state == 0) */
+			INVALID_STATE,	/* wakeup    (TASK_RUNNING can't be woken)   */
+			INVALID_STATE,	/* switch_in (already on CPU)                */
+		},
+		/* waiting */
+		{
+			INVALID_STATE,	/* start     (not in running state)          */
+			INVALID_STATE,	/* sleep     (not on CPU)                    */
+			INVALID_STATE,	/* preempt   (not on CPU)                    */
+			INVALID_STATE,	/* wakeup    (already TASK_RUNNING)          */
+			running_tlob,	/* switch_in                                 */
+		},
+		/* sleeping */
+		{
+			INVALID_STATE,	/* start     (not in running state)          */
+			INVALID_STATE,	/* sleep     (already sleeping)              */
+			INVALID_STATE,	/* preempt   (not on CPU)                    */
+			waiting_tlob,	/* wakeup                                    */
+			INVALID_STATE,	/* switch_in (must go through waiting first) */
+		},
+	},
+	.initial_state = running_tlob,
+	.final_states = { 1, 0, 0 },	/* only running is marked final */
+};
+
+/* Maximum number of concurrently monitored tasks. */
+#define TLOB_MAX_MONITORED	64U
+
+/* Maximum binary path length for uprobe binding. */
+#define TLOB_MAX_PATH		256
+
+/*
+ * Upper bound on the monitoring budget (1 hour = 3 600 000 000 000 ns).
+ * The ns-resolution accumulators (running_ns, waiting_ns, sleeping_ns)
+ * are u64; keeping the window below this limit ensures they stay well
+ * clear of u64 overflow and covers every realistic latency-monitoring
+ * use case.
+ */
+#define TLOB_MAX_THRESHOLD_NS	3600000000000ULL
+
+/* Exported to ioctl/uprobe layers and KUnit */
+int tlob_start_task(struct task_struct *task, u64 threshold_ns);
+int tlob_stop_task(struct task_struct *task);
+
+#if IS_ENABLED(CONFIG_KUNIT)
+int tlob_create_or_delete_uprobe(char *buf);
+#endif /* CONFIG_KUNIT */
+
+#endif /* _RV_TLOB_H */
diff --git a/kernel/trace/rv/monitors/tlob/tlob_trace.h b/kernel/trace/rv/monitors/tlob/tlob_trace.h
new file mode 100644
index 000000000000..1ac4900d38e8
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob_trace.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_TLOB
+DEFINE_EVENT(event_da_monitor_id, event_tlob,
+	     TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+	     TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_tlob,
+	     TP_PROTO(int id, char *state, char *event),
+	     TP_ARGS(id, state, event));
+
+DEFINE_EVENT(error_env_da_monitor_id, error_env_tlob,
+	     TP_PROTO(int id, char *state, char *event, char *env),
+	     TP_ARGS(id, state, event, env));
+
+/*
+ * detail_env_tlob - per-state latency breakdown emitted on budget violation.
+ * The id field is printed as "pid="; all other fields are in nanoseconds.
+ * Fired immediately after error_env_tlob from the hrtimer callback.
+ * Fields show how much time was spent in each DA state since tlob_start_task().
+ * running_ns + waiting_ns + sleeping_ns ≈ total elapsed time (threshold_ns exceeded).
+ */
+TRACE_EVENT(detail_env_tlob,
+	TP_PROTO(int id, u64 threshold_ns,
+		 u64 running_ns, u64 waiting_ns, u64 sleeping_ns),
+	TP_ARGS(id, threshold_ns, running_ns, waiting_ns, sleeping_ns),
+	TP_STRUCT__entry(
+		__field(int,	id)
+		__field(u64,	threshold_ns)
+		__field(u64,	running_ns)
+		__field(u64,	waiting_ns)
+		__field(u64,	sleeping_ns)
+	),
+	TP_fast_assign(
+		__entry->id		= id;
+		__entry->threshold_ns	= threshold_ns;
+		__entry->running_ns	= running_ns;
+		__entry->waiting_ns	= waiting_ns;
+		__entry->sleeping_ns	= sleeping_ns;
+	),
+	TP_printk("pid=%d threshold_ns=%llu running_ns=%llu waiting_ns=%llu sleeping_ns=%llu",
+		__entry->id, __entry->threshold_ns,
+		__entry->running_ns, __entry->waiting_ns, __entry->sleeping_ns)
+);
+#endif /* CONFIG_RV_MON_TLOB */
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 9622c269789c..a4bc215c1f15 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -189,6 +189,7 @@ DECLARE_EVENT_CLASS(error_env_da_monitor_id,
 
 #include <monitors/stall/stall_trace.h>
 #include <monitors/nomiss/nomiss_trace.h>
+#include <monitors/tlob/tlob_trace.h>
 // Add new monitors based on CONFIG_HA_MON_EVENTS_ID here
 
 #endif
-- 
2.43.0


^ permalink raw reply related

* [PATCH v3 5/9] rv/ha: make da_monitor_reset_hook and EVENT_NONE_LBL overridable
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1780847473.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Wrap the two definitions with #ifndef guards so that HA-based monitors
can substitute their own implementations before including this header:

  /* in monitor.c, before #include <rv/ha_monitor.h> */
  #define da_monitor_reset_hook  my_monitor_reset_env
  #define EVENT_NONE_LBL         "idle"

No behaviour change for monitors that do not override either macro.

Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 include/rv/ha_monitor.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
index e5860900a337..610da54c111f 100644
--- a/include/rv/ha_monitor.h
+++ b/include/rv/ha_monitor.h
@@ -36,7 +36,10 @@ static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
 					 da_id_type id);
 #define da_monitor_event_hook ha_monitor_handle_constraint
 #define da_monitor_init_hook ha_monitor_init_env
+/* Allow monitors to override da_monitor_reset_hook before including this header. */
+#ifndef da_monitor_reset_hook
 #define da_monitor_reset_hook ha_monitor_reset_env
+#endif
 #define da_monitor_sync_hook() synchronize_rcu()
 
 #if !defined(HA_SKIP_AUTO_CLEANUP) && RV_MON_TYPE == RV_MON_PER_TASK
@@ -75,7 +78,9 @@ _Static_assert(offsetof(struct ha_monitor, da_mon) == 0,
 #define ENV_INVALID_VALUE U64_MAX
 /* Error with no event occurs only on timeouts */
 #define EVENT_NONE EVENT_MAX
+#ifndef EVENT_NONE_LBL
 #define EVENT_NONE_LBL "none"
+#endif
 #define ENV_BUFFER_SIZE 64
 
 #ifdef CONFIG_RV_REACTORS
-- 
2.43.0


^ permalink raw reply related

* [PATCH v3 4/9] rv/ha: fix ha_invariant_passed_ns silent bypass of invariant check
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1780847473.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

The function is documented as "prepare the invariant and return the time
since reset", but on the first call (env_store == U64_MAX) it exits
early without calling ha_set_invariant_ns():

  if (ha_monitor_env_invalid(ha_mon, env))  /* env_store == U64_MAX */
      return 0;   /* ha_set_invariant_ns skipped, env_store stays U64_MAX */
  ...
  ha_set_invariant_ns(ha_mon, env, expire - passed, time_ns);

This leaves env_store == U64_MAX, so ha_check_invariant_ns() always
passes on the first activation regardless of elapsed time:

  return READ_ONCE(ha_mon->env_store[env]) >= time_ns;  /* U64_MAX >= any */

Fix: establish the guard before converting to the invariant:

  if (ha_monitor_env_invalid(ha_mon, env))
      ha_reset_clk_ns(ha_mon, env, time_ns); /* guard: env_store = time_ns */
  passed = ha_get_env(ha_mon, env, time_ns);
  ha_set_invariant_ns(ha_mon, env, expire - passed, time_ns);
                                 /* invariant: env_store = time_ns + expire */

Apply the same fix to ha_invariant_passed_jiffy().

Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 include/rv/ha_monitor.h | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
index 28d3c74cabfc..e5860900a337 100644
--- a/include/rv/ha_monitor.h
+++ b/include/rv/ha_monitor.h
@@ -365,16 +365,22 @@ static inline bool ha_check_invariant_ns(struct ha_monitor *ha_mon,
 }
 /*
  * ha_invariant_passed_ns - prepare the invariant and return the time since reset
+ *
+ * If the env has not been initialised yet (first entry into a state with an
+ * invariant), anchor the guard clock at the current time so that the full
+ * budget is available from this point.  This preserves the documented
+ * guard→invariant ordering: ha_set_invariant_ns() is always preceded by a
+ * valid guard representation in env_store.
  */
 static inline u64 ha_invariant_passed_ns(struct ha_monitor *ha_mon, enum envs env,
 				   u64 expire, u64 time_ns)
 {
-	u64 passed = 0;
+	u64 passed;
 
 	if (env < 0 || env >= ENV_MAX_STORED)
 		return 0;
 	if (ha_monitor_env_invalid(ha_mon, env))
-		return 0;
+		ha_reset_clk_ns(ha_mon, env, time_ns);
 	passed = ha_get_env(ha_mon, env, time_ns);
 	ha_set_invariant_ns(ha_mon, env, expire - passed, time_ns);
 	return passed;
@@ -404,16 +410,19 @@ static inline bool ha_check_invariant_jiffy(struct ha_monitor *ha_mon,
 }
 /*
  * ha_invariant_passed_jiffy - prepare the invariant and return the time since reset
+ *
+ * Same first-use semantics as ha_invariant_passed_ns(): anchor the guard clock
+ * now if the env has not been initialised.
  */
 static inline u64 ha_invariant_passed_jiffy(struct ha_monitor *ha_mon, enum envs env,
 				      u64 expire, u64 time_ns)
 {
-	u64 passed = 0;
+	u64 passed;
 
 	if (env < 0 || env >= ENV_MAX_STORED)
 		return 0;
 	if (ha_monitor_env_invalid(ha_mon, env))
-		return 0;
+		ha_reset_clk_jiffy(ha_mon, env);
 	passed = ha_get_env(ha_mon, env, time_ns);
 	ha_set_invariant_jiffy(ha_mon, env, expire - passed);
 	return passed;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v3 3/9] rv/tlob: add tlob model DOT file
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1780847473.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Add tools/verification/models/tlob.dot, the Graphviz specification of
the tlob hybrid automaton.  The model has three states (running,
waiting, sleeping) connected by four scheduling transitions (switch_in,
preempt, wakeup, sleep), plus a start self-loop on running that resets
clk_elapsed.  A single clock invariant, clk_elapsed < BUDGET_NS(), is
active in all states.

Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 tools/verification/models/tlob.dot | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 tools/verification/models/tlob.dot

diff --git a/tools/verification/models/tlob.dot b/tools/verification/models/tlob.dot
new file mode 100644
index 000000000000..a1834daff2ed
--- /dev/null
+++ b/tools/verification/models/tlob.dot
@@ -0,0 +1,22 @@
+digraph state_automaton {
+	center = true;
+	size = "7,11";
+	{node [shape = plaintext, style=invis, label=""] "__init_running"};
+	{node [shape = ellipse] "running"};
+	{node [shape = plaintext] "running"};
+	{node [shape = plaintext] "waiting"};
+	{node [shape = plaintext] "sleeping"};
+	"__init_running" -> "running";
+	"running"  -> "running"  [ label = "start;reset(clk_elapsed)" ];
+	"running"  [label = "running\nclk_elapsed < BUDGET_NS()",  color = green3];
+	"waiting"  [label = "waiting\nclk_elapsed < BUDGET_NS()"];
+	"sleeping" [label = "sleeping\nclk_elapsed < BUDGET_NS()"];
+	"running"  -> "sleeping" [ label = "sleep" ];
+	"running"  -> "waiting"  [ label = "preempt" ];
+	"waiting"  -> "running"  [ label = "switch_in" ];
+	"sleeping" -> "waiting"  [ label = "wakeup" ];
+	{ rank = min ;
+		"__init_running";
+		"running";
+	}
+}
-- 
2.43.0


^ permalink raw reply related

* [PATCH v3 0/9] rv/tlob: Add task latency over budget RV monitor
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang

From: Wen Yang <wen.yang@linux.dev>

This series introduces tlob (task latency over budget), a per-task
hybrid automaton RV monitor that measures elapsed wall-clock time across
a user-delimited code section and fires when the time exceeds a
configurable budget.

The series applies cleanly on top of:
  [1] git://git.kernel.org/pub/scm/linux/kernel/git/gmonaco/linux.git rv-fixes-7.1
      "rv fixes for v7.1"

Background
----------
The existing wwnr monitor uses a two-state DA to detect tasks that are
woken but never run.  tlob extends the RV framework to a three-state
hybrid automaton:

  running  (initial) -- on CPU
  waiting             -- in the scheduler runqueue, not yet on CPU
  sleeping            -- blocked on a lock, I/O, or similar resource

A single HA clock invariant, clk_elapsed < BUDGET_NS(), is active in
all states.  The framework enforces it via a per-task hrtimer.  On
expiry, error_env_tlob is emitted, followed by detail_env_tlob which
carries a per-state time breakdown (running_ns, waiting_ns, sleeping_ns)
that pinpoints whether the overrun occurred in the running, waiting, or
sleeping state.

Userspace interface
-------------------
Tasks are registered for monitoring by writing to the tracefs monitor
file:

  # echo "p /path/to/binary:START_OFFSET STOP_OFFSET threshold=NS" \
        > /sys/kernel/tracing/rv/monitors/tlob/monitor

Two uprobes are registered at START_OFFSET (entry) and STOP_OFFSET
(exit) of the delimited section.  When a task executes the entry uprobe,
the monitor starts; when the task reaches the exit uprobe or the budget
expires, monitoring stops and the slot is returned to the pool.

Multiple uprobe pairs can be registered for the same binary or different
binaries.  Each task can have at most one active monitoring session; if
a task hits a start uprobe while already monitored, the prior session is
cancelled and a new one begins.

Series structure
----------------
Patch 1: rv/da: introduce DA_MON_ALLOCATION_STRATEGY
  Consolidates per-object DA storage allocation under a compile-time
  selector with three strategies:
    DA_ALLOC_AUTO   (default) - lock-free kmalloc_nolock; unbounded
    DA_ALLOC_POOL             - pre-allocated fixed-size pool
    DA_ALLOC_MANUAL           - caller pre-inserts storage

  da_handle_start_event() and da_handle_start_run_event() call
  da_prepare_storage() which resolves at compile time to the correct
  allocation function.

  This patch also includes critical correctness fixes for the pool
  implementation:
  - Add tracepoint_synchronize_unregister() in da_monitor_destroy_pool()
    to fix UAF where in-flight handlers access freed pool storage
  - Fix duplicate hash entry race in da_create_or_get_pool() via
    concurrent-insert detection under RCU
  - Add capacity field to fix build error (DA_MON_POOL_SIZE undeclared
    in da_pool_return_cb)

Patch 2: rv: add generic uprobe infrastructure for RV monitors
  Introduces rv_uprobe, a thin wrapper around uprobe_consumer for RV
  monitors.  Provides rv_uprobe_attach(), rv_uprobe_attach_path(), and
  rv_uprobe_detach(), plus rv_uprobe_unregister_nosync(),
  rv_uprobe_sync(), and rv_uprobe_free() for batched, safe teardown.

Patch 3: rv/tlob: add tlob model DOT file
  The formal model used to generate tlob.h.

Patch 4: rv/ha: fix ha_invariant_passed_ns silent bypass of invariant check
  Fixes a bug where ha_invariant_passed_ns() returned 0 early when
  env_store was invalid (U64_MAX), leaving it at U64_MAX and causing
  ha_check_invariant_ns() to always pass.  The fix calls ha_reset_clk_ns()
  then ha_set_invariant_ns() on first use; ha_invariant_passed_jiffy()
  gets the equivalent change using ha_reset_clk_jiffy().

Patch 5: rv/ha: make da_monitor_reset_hook and EVENT_NONE_LBL overridable
  Allows tlob to override EVENT_NONE_LBL for its start_tlob self-loop.

Patch 6: rv/tlob: add tlob hybrid automaton monitor
  The main tlob implementation, including:
  - Three-state HA (running/waiting/sleeping)
  - Per-task hrtimer enforcement (HRTIMER_MODE_REL_HARD)
  - DA_ALLOC_POOL for allocation-free hot path
  - Uprobe registration via tracefs monitor file
  - Per-state time accumulation (running_ns, waiting_ns, sleeping_ns)
  - HIGH_RES_TIMERS dependency in Kconfig

Patches 7-9: Tests
  - KUnit tests for tlob monitor
  - Selftest infrastructure fixes
  - tlob selftests (uprobe binding, state tracking, violation detection)

Changes since v2
----------------
All feedback from Gabriele Monaco has been addressed (patch numbers
below refer to the v2 series):

-- Patch 02 (per-task slot ordering / ha_monitor_reset_env):
   Dropped from v3; rebased on top of Gabriele's series [1].

-- Patch 03 (verificationtest-ktap):
   Changed to use realpath for robustness as suggested.

-- Patch 04 (pre-allocated storage pool):
   Complete redesign as DA_MON_ALLOCATION_STRATEGY:
   - Three strategies (AUTO/POOL/MANUAL) via compile-time macro
   - da_monitor_init_prealloc() removed; da_monitor_init() selects
     internally
   - da_create_or_get_kmalloc() removed (no viable use case)
   - nomiss updated to use DA_ALLOC_MANUAL
   - da_extra_cleanup() hook added for per-entry teardown

   Critical bug fixes included in this patch:
   - tracepoint_synchronize_unregister() added to da_monitor_destroy_pool()
     to prevent UAF from in-flight handlers accessing freed pool storage
   - Duplicate hash entry race fixed in da_create_or_get_pool() via
     concurrent-insert detection and slot return under RCU
   - capacity field added to fix DA_MON_POOL_SIZE undeclared build error

-- Patch 05 (generic uprobe infrastructure):
   Carried unchanged into v3.

-- Patch 06 (rvgen __init arrow reset):
   Carried unchanged into v3.

-- Patch 08 (tlob monitor):
   Split and refactored:
   - ioctl interface deferred to follow-up series (tracefs-only in v3)
   - Handler simplification: three inline helpers (tlob_acc_running/
     waiting/sleeping) with scoped_guard(rcu)
   - do_prev/do_next flags removed (da_handle_event skips unmonitored)
   - scoped_guard(rcu) and guard(mutex) applied throughout
   - tlob_stop_all() removed; da_extra_cleanup() hook used instead
   - start_tlob self-loop added to DOT model as suggested
   - ha_setup_invariants() guards against redundant timer restart
   - HIGH_RES_TIMERS dependency added to Kconfig

Additional improvements in v3
------------------------------
Beyond the v2 feedback, this version includes:

1. Simplified tlob monitor implementation:
   - Removed redundant tlob_num_monitored atomic counter
     (da_handle_event already handles unmonitored tasks via hash lookup)
   - Eliminated extra cacheline touch on every sched_switch/sched_wakeup
   - Several repeated pattern simplifications.

2. Extracted common accumulation logic:
   - __tlob_acc() using offsetof() replaces three nearly-identical functions
   - Reduces code duplication while maintaining type safety

3. Complete test coverage:
   - KUnit tests for core functionality
   - Comprehensive selftests for uprobe integration, state tracking,
     and violation detection

Testing
-------
All patches have been tested on x86_64 with CONFIG_PREEMPT_RT:
- All KUnit tests pass
- All selftests pass with verificationtest-ktap

  
[1] git://git.kernel.org/pub/scm/linux/kernel/git/gmonaco/linux.git rv-fixes-7.1
    "rv fixes for v7.1"


Wen Yang (9):
  rv/da: introduce DA_MON_ALLOCATION_STRATEGY
  rv: add generic uprobe infrastructure for RV monitors
  rv/tlob: add tlob model DOT file
  rv/ha: fix ha_invariant_passed_ns silent bypass of invariant check
  rv/ha: make da_monitor_reset_hook and EVENT_NONE_LBL overridable
  rv/tlob: add tlob hybrid automaton monitor
  rv/tlob: add KUnit tests for the tlob monitor
  selftests/verification: fix verificationtest-ktap for out-of-tree
    execution
  selftests/verification: add tlob selftests

 Documentation/trace/rv/index.rst              |   1 +
 Documentation/trace/rv/monitor_tlob.rst       | 177 ++++
 include/rv/da_monitor.h                       | 276 ++++-
 include/rv/ha_monitor.h                       |  22 +-
 include/rv/rv_uprobe.h                        | 119 +++
 kernel/trace/rv/Kconfig                       |   5 +
 kernel/trace/rv/Makefile                      |   3 +
 kernel/trace/rv/monitors/nomiss/nomiss.c      |   6 +-
 kernel/trace/rv/monitors/tlob/.kunitconfig    |   6 +
 kernel/trace/rv/monitors/tlob/Kconfig         |  19 +
 kernel/trace/rv/monitors/tlob/tlob.c          | 968 ++++++++++++++++++
 kernel/trace/rv/monitors/tlob/tlob.h          | 148 +++
 kernel/trace/rv/monitors/tlob/tlob_kunit.c    |  92 ++
 kernel/trace/rv/monitors/tlob/tlob_trace.h    |  49 +
 kernel/trace/rv/rv_trace.h                    |   1 +
 kernel/trace/rv/rv_uprobe.c                   | 182 ++++
 .../testing/selftests/verification/.gitignore |   2 +
 tools/testing/selftests/verification/Makefile |  19 +-
 .../verification/test.d/tlob/Makefile         |  20 +
 .../verification/test.d/tlob/test.d/functions |   1 +
 .../verification/test.d/tlob/tlob_sym.c       | 189 ++++
 .../verification/test.d/tlob/tlob_target.c    | 138 +++
 .../verification/test.d/tlob/uprobe_bind.tc   |  37 +
 .../test.d/tlob/uprobe_detail_running.tc      |  51 +
 .../test.d/tlob/uprobe_detail_sleeping.tc     |  50 +
 .../test.d/tlob/uprobe_detail_waiting.tc      |  66 ++
 .../verification/test.d/tlob/uprobe_multi.tc  |  64 ++
 .../test.d/tlob/uprobe_no_event.tc            |  19 +
 .../test.d/tlob/uprobe_violation.tc           |  67 ++
 .../verification/verificationtest-ktap        |   4 +-
 tools/verification/models/tlob.dot            |  22 +
 31 files changed, 2789 insertions(+), 34 deletions(-)
 create mode 100644 Documentation/trace/rv/monitor_tlob.rst
 create mode 100644 include/rv/rv_uprobe.h
 create mode 100644 kernel/trace/rv/monitors/tlob/.kunitconfig
 create mode 100644 kernel/trace/rv/monitors/tlob/Kconfig
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob.c
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob.h
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob_kunit.c
 create mode 100644 kernel/trace/rv/monitors/tlob/tlob_trace.h
 create mode 100644 kernel/trace/rv/rv_uprobe.c
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/Makefile
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/test.d/functions
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/tlob_sym.c
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/tlob_target.c
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_running.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
 create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
 create mode 100644 tools/verification/models/tlob.dot

-- 
2.43.0


^ permalink raw reply

* [PATCH v3 2/9] rv: add generic uprobe infrastructure for RV monitors
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1780847473.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Introduce rv_uprobe, a thin wrapper around uprobe_consumer providing
rv_uprobe_attach_path(), rv_uprobe_attach(), and rv_uprobe_detach()
for RV monitors.  An opaque priv pointer is forwarded unchanged to
entry/return handlers so monitors can carry per-binding state (e.g. a
latency threshold) to the hot path without any global lookup.

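rv_uprobe_detach() is fully synchronous (nosync + sync + path_put +
kfree), closing the use-after-free window present in open-coded
patterns where kfree() precedes uprobe_unregister_sync().  The same
three phases are also exported individually as
rv_uprobe_unregister_nosync(), rv_uprobe_sync(), and rv_uprobe_free()
so callers can batch several deregistrations behind a single sync.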

Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 include/rv/rv_uprobe.h      | 119 +++++++++++++++++++++++
 kernel/trace/rv/Kconfig     |   4 +
 kernel/trace/rv/Makefile    |   1 +
 kernel/trace/rv/rv_uprobe.c | 182 ++++++++++++++++++++++++++++++++++++
 4 files changed, 306 insertions(+)
 create mode 100644 include/rv/rv_uprobe.h
 create mode 100644 kernel/trace/rv/rv_uprobe.c

diff --git a/include/rv/rv_uprobe.h b/include/rv/rv_uprobe.h
new file mode 100644
index 000000000000..9106c5c9275e
--- /dev/null
+++ b/include/rv/rv_uprobe.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Generic uprobe infrastructure for RV monitors.
+ *
+ */
+
+#ifndef _RV_UPROBE_H
+#define _RV_UPROBE_H
+
+#include <linux/path.h>
+#include <linux/types.h>
+
+struct pt_regs;
+
+/**
+ * struct rv_uprobe - a single uprobe registered on behalf of an RV monitor
+ *
+ * @offset:   byte offset within the ELF binary where the probe is installed
+ * @priv:     monitor-private pointer; set at attach time, never touched by
+ *            this layer; passed unchanged to entry_fn / ret_fn
+ * @path:     resolved path of the probed binary (read-only after attach);
+ *            callers may use path.dentry for identity comparisons
+ *
+ * The implementation fields (uprobe_consumer, uprobe handle, callbacks) are
+ * private to rv_uprobe.c and are not exposed here; monitors must not access
+ * them directly.
+ */
+struct rv_uprobe {
+	/* public: read-only after rv_uprobe_attach*() */
+	loff_t		 offset;
+	void		*priv;
+	struct path	 path;
+};
+
+/**
+ * rv_uprobe_attach_path - register an uprobe given an already-resolved path
+ * @path:     path of the target binary; rv_uprobe takes its own reference
+ * @offset:   byte offset within the binary
+ * @entry_fn: called on probe hit (entry); may be NULL
+ * @ret_fn:   called on function return (uretprobe); may be NULL
+ * @priv:     opaque pointer forwarded to callbacks unchanged
+ *
+ * Use this variant when the caller has already resolved the path (e.g. to
+ * register multiple probes on the same binary with a single kern_path call).
+ * The inode is derived internally via d_real_inode(), so inode and path are
+ * always consistent.
+ *
+ * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure.
+ */
+struct rv_uprobe *rv_uprobe_attach_path(struct path *path, loff_t offset,
+	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+			struct pt_regs *regs, __u64 *data),
+	void *priv);
+
+/**
+ * rv_uprobe_attach - resolve binpath and register an uprobe
+ * @binpath:  absolute path to the target binary
+ * @offset:   byte offset within the binary
+ * @entry_fn: called on probe hit (entry); may be NULL
+ * @ret_fn:   called on function return (uretprobe); may be NULL
+ * @priv:     opaque pointer forwarded to callbacks unchanged
+ *
+ * Resolves binpath via kern_path(), then delegates to rv_uprobe_attach_path().
+ *
+ * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure.
+ */
+struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
+	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+			struct pt_regs *regs, __u64 *data),
+	void *priv);
+
+/**
+ * rv_uprobe_detach - synchronously unregister an uprobe and free it
+ * @p:  probe to detach; may be NULL (no-op)
+ *
+ * Calls uprobe_unregister_nosync(), then uprobe_unregister_sync() to wait
+ * for any in-progress handler to finish, then releases the path reference
+ * and frees the rv_uprobe struct.  The caller's priv data is NOT freed.
+ *
+ * When removing a single probe, prefer this over the three-phase API.
+ * Safe to call from process context only (uprobe_unregister_sync() may
+ * schedule).
+ */
+void rv_uprobe_detach(struct rv_uprobe *p);
+
+/**
+ * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting
+ * @p:  probe to dequeue; may be NULL (no-op)
+ *
+ * Removes the uprobe from the uprobe subsystem but does NOT wait for
+ * in-flight handlers to complete.  The caller must call rv_uprobe_sync()
+ * before calling rv_uprobe_free() on the same probe.
+ *
+ * Use this to batch multiple deregistrations before a single rv_uprobe_sync().
+ */
+void rv_uprobe_unregister_nosync(struct rv_uprobe *p);
+
+/**
+ * rv_uprobe_sync - wait for all in-flight uprobe handlers to complete
+ *
+ * Global barrier: waits for every in-flight uprobe handler across the system
+ * to finish.  Call once after a batch of rv_uprobe_unregister_nosync() calls
+ * and before any rv_uprobe_free() call.
+ */
+void rv_uprobe_sync(void);
+
+/**
+ * rv_uprobe_free - release resources of a previously deregistered probe
+ * @p:  probe to free; may be NULL (no-op)
+ *
+ * Releases the path reference and frees the rv_uprobe struct.  Must only
+ * be called after rv_uprobe_sync() has returned.  The caller's priv data
+ * is NOT freed.
+ */
+void rv_uprobe_free(struct rv_uprobe *p);
+
+#endif /* _RV_UPROBE_H */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 3884b14df375..e2e0033a00b9 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -59,6 +59,10 @@ config RV_PER_TASK_MONITORS
 	  This option configures the maximum number of per-task RV monitors that can run
 	  simultaneously.
 
+config RV_UPROBE
+	bool
+	depends on RV && UPROBES
+
 source "kernel/trace/rv/monitors/wip/Kconfig"
 source "kernel/trace/rv/monitors/wwnr/Kconfig"
 
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 94498da35b37..f139b904bea3 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
 obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
 obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
 # Add new monitors here
+obj-$(CONFIG_RV_UPROBE) += rv_uprobe.o
 obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
 obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
 obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o
diff --git a/kernel/trace/rv/rv_uprobe.c b/kernel/trace/rv/rv_uprobe.c
new file mode 100644
index 000000000000..3d8b764dded3
--- /dev/null
+++ b/kernel/trace/rv/rv_uprobe.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic uprobe infrastructure for RV monitors.
+ *
+ */
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/uprobes.h>
+#include <rv/rv_uprobe.h>
+
+/*
+ * Private extension of struct rv_uprobe.  Allocated by rv_uprobe_attach*()
+ * and returned to callers as &impl->pub.
+ */
+struct rv_uprobe_impl {
+	struct rv_uprobe	pub;	/* must be first; callers hold &pub */
+	struct uprobe_consumer	uc;
+	struct uprobe		*uprobe;
+	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data);
+	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+			struct pt_regs *regs, __u64 *data);
+};
+
+static int rv_uprobe_handler(struct uprobe_consumer *uc,
+			     struct pt_regs *regs, __u64 *data)
+{
+	struct rv_uprobe_impl *impl = container_of(uc, struct rv_uprobe_impl, uc);
+
+	if (impl->entry_fn)
+		return impl->entry_fn(&impl->pub, regs, data);
+	return 0;
+}
+
+static int rv_uprobe_ret_handler(struct uprobe_consumer *uc,
+				 unsigned long func,
+				 struct pt_regs *regs, __u64 *data)
+{
+	struct rv_uprobe_impl *impl = container_of(uc, struct rv_uprobe_impl, uc);
+
+	if (impl->ret_fn)
+		return impl->ret_fn(&impl->pub, func, regs, data);
+	return 0;
+}
+
+static struct rv_uprobe *
+__rv_uprobe_attach(struct inode *inode, struct path *path, loff_t offset,
+		   int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+		   int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+				   struct pt_regs *regs, __u64 *data),
+		   void *priv)
+{
+	struct rv_uprobe_impl *impl;
+	int ret;
+
+	if (!entry_fn && !ret_fn)
+		return ERR_PTR(-EINVAL);
+
+	impl = kzalloc_obj(*impl, GFP_KERNEL);
+	if (!impl)
+		return ERR_PTR(-ENOMEM);
+
+	impl->pub.offset = offset;
+	impl->pub.priv   = priv;
+	impl->entry_fn   = entry_fn;
+	impl->ret_fn     = ret_fn;
+	path_get(path);
+	impl->pub.path   = *path;
+
+	if (entry_fn)
+		impl->uc.handler     = rv_uprobe_handler;
+	if (ret_fn)
+		impl->uc.ret_handler = rv_uprobe_ret_handler;
+
+	impl->uprobe = uprobe_register(inode, offset, 0, &impl->uc);
+	if (IS_ERR(impl->uprobe)) {
+		ret = PTR_ERR(impl->uprobe);
+		path_put(&impl->pub.path);
+		kfree(impl);
+		return ERR_PTR(ret);
+	}
+
+	return &impl->pub;
+}
+
+/**
+ * rv_uprobe_attach_path - register an uprobe given an already-resolved path
+ */
+struct rv_uprobe *rv_uprobe_attach_path(struct path *path, loff_t offset,
+	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+			struct pt_regs *regs, __u64 *data),
+	void *priv)
+{
+	struct inode *inode = d_real_inode(path->dentry);
+
+	return __rv_uprobe_attach(inode, path, offset, entry_fn, ret_fn, priv);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_attach_path);
+
+/**
+ * rv_uprobe_attach - resolve binpath and register an uprobe
+ */
+struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
+	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+			struct pt_regs *regs, __u64 *data),
+	void *priv)
+{
+	struct rv_uprobe *p;
+	struct path path;
+	int ret;
+
+	ret = kern_path(binpath, LOOKUP_FOLLOW, &path);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!d_is_reg(path.dentry)) {
+		path_put(&path);
+		return ERR_PTR(-EINVAL);
+	}
+
+	p = rv_uprobe_attach_path(&path, offset, entry_fn, ret_fn, priv);
+	path_put(&path);
+	return p;
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_attach);
+
+/**
+ * rv_uprobe_detach - synchronously unregister an uprobe and free it
+ */
+void rv_uprobe_detach(struct rv_uprobe *p)
+{
+	if (!p)
+		return;
+
+	rv_uprobe_unregister_nosync(p);
+	rv_uprobe_sync();
+	rv_uprobe_free(p);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_detach);
+
+/**
+ * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting
+ */
+void rv_uprobe_unregister_nosync(struct rv_uprobe *p)
+{
+	struct rv_uprobe_impl *impl;
+
+	if (!p)
+		return;
+
+	impl = container_of(p, struct rv_uprobe_impl, pub);
+	uprobe_unregister_nosync(impl->uprobe, &impl->uc);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_unregister_nosync);
+
+/**
+ * rv_uprobe_sync - wait for all in-flight uprobe handlers to complete
+ */
+void rv_uprobe_sync(void)
+{
+	uprobe_unregister_sync();
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_sync);
+
+/**
+ * rv_uprobe_free - release resources of a previously deregistered probe
+ */
+void rv_uprobe_free(struct rv_uprobe *p)
+{
+	struct rv_uprobe_impl *impl;
+
+	if (!p)
+		return;
+
+	impl = container_of(p, struct rv_uprobe_impl, pub);
+	path_put(&p->path);
+	kfree(impl);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_free);
-- 
2.43.0


^ permalink raw reply related

* [PATCH v3 1/9] rv/da: introduce DA_MON_ALLOCATION_STRATEGY
From: wen.yang @ 2026-06-07 16:13 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1780847473.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

Consolidate per-object DA monitor storage allocation under a
single compile-time selector, replacing the ad-hoc
da_monitor_init_prealloc() API.

Three strategies are provided:

  DA_ALLOC_AUTO   (default) - lock-free kmalloc_nolock on the hot path;
                              unbounded capacity.  Preserves the existing
                              behaviour for all monitors that do not set
                              DA_MON_ALLOCATION_STRATEGY.

  DA_ALLOC_POOL             - pre-allocated fixed-size pool.  Requires the
                              monitor to define DA_MON_POOL_SIZE; enforced
                              with #error.  da_prepare_storage() acquires
                              spinlock_t (O(1), irqsave); must be called
                              from task context on PREEMPT_RT where
                              spinlock_t is a sleeping lock.

  DA_ALLOC_MANUAL           - caller pre-inserts storage via
                              da_create_empty_storage() before the first
                              da_handle_start_event(); the framework only
                              links the target field.  Useful for monitors
                              that allocate storage from known-safe task
                              context (e.g. a syscall path) and then hand
                              it to a tracepoint handler on the hot path.

da_handle_start_event() and da_handle_start_run_event() both call
da_prepare_storage() which resolves at compile time to the correct
allocation function, so no runtime dispatch is needed.

da_monitor_init_prealloc() is removed; da_monitor_init() selects pool
or kmalloc initialisation internally based on the strategy.

A da_extra_cleanup() hook macro is added: the default is a no-op; a
monitor may define it as a function called by da_monitor_destroy() on
each remaining entry before hash_del_rcu().

nomiss is updated to DA_ALLOC_MANUAL: it calls da_create_empty_storage()
from handle_sys_enter() (the sched_setscheduler syscall path, safe
task context), then da_fill_empty_storage() links the sched_dl_entity
target on the first da_handle_start_run_event() call in
handle_sched_switch().

Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 include/rv/da_monitor.h                  | 276 +++++++++++++++++++++--
 kernel/trace/rv/monitors/nomiss/nomiss.c |   6 +-
 2 files changed, 254 insertions(+), 28 deletions(-)

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 34b8fba9ecd4..eb7fc02ecb8a 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -14,6 +14,26 @@
 #ifndef _RV_DA_MONITOR_H
 #define _RV_DA_MONITOR_H
 
+/*
+ * Allocation strategies for RV_MON_PER_OBJ monitors.
+ *
+ * Define DA_MON_ALLOCATION_STRATEGY before including this header.
+ * DA_ALLOC_AUTO   - lock-free kmalloc on the hot path; unbounded capacity.
+ * DA_ALLOC_POOL   - pre-allocated fixed-size pool; requires DA_MON_POOL_SIZE.
+ *                   da_prepare_storage() acquires spinlock_t (O(1), irqsave);
+ *                   must be called from task context on PREEMPT_RT where
+ *                   spinlock_t is a sleeping lock.
+ * DA_ALLOC_MANUAL - caller inserts storage before da_handle_start_event();
+ *                   the framework only links the target field.
+ */
+#define DA_ALLOC_AUTO   0
+#define DA_ALLOC_POOL   1
+#define DA_ALLOC_MANUAL 2
+
+#ifndef DA_MON_ALLOCATION_STRATEGY
+# define DA_MON_ALLOCATION_STRATEGY DA_ALLOC_AUTO
+#endif
+
 #include <rv/automata.h>
 #include <linux/rv.h>
 #include <linux/stringify.h>
@@ -66,6 +86,19 @@ static struct rv_monitor rv_this;
 #define da_monitor_sync_hook()
 #endif
 
+/*
+ * Hook for per-object teardown during da_monitor_destroy().
+ *
+ * Called for each entry still in the hash table when the monitor is
+ * destroyed.  Invoked before da_monitor_reset() and hash_del_rcu(), so
+ * it is safe to call ha_cancel_timer_sync() here.
+ *
+ * Define before including this header.  Default is a no-op.
+ */
+#ifndef da_extra_cleanup
+#define da_extra_cleanup(da_mon)
+#endif
+
 /*
  * Type for the target id, default to int but can be overridden.
  * A long type can work as hash table key (PER_OBJ) but will be downgraded to
@@ -398,6 +431,16 @@ static inline void da_monitor_destroy(void)
  * Functions to define, init and get a per-object monitor.
  */
 
+/*
+ * DA_MON_POOL_SIZE must be defined before this header is included (directly or
+ * transitively via ha_monitor.h) when DA_ALLOC_POOL is selected.  In practice
+ * this means defining it after the monitor's model header (which supplies the
+ * capacity constant) and before the ha_monitor.h include.
+ */
+#if DA_MON_ALLOCATION_STRATEGY == DA_ALLOC_POOL && !defined(DA_MON_POOL_SIZE)
+# error "DA_ALLOC_POOL requires DA_MON_POOL_SIZE to be defined before including this header"
+#endif
+
 struct da_monitor_storage {
 	da_id_type id;
 	monitor_target target;
@@ -495,18 +538,6 @@ static inline da_id_type da_get_id(struct da_monitor *da_mon)
 	return container_of(da_mon, struct da_monitor_storage, rv.da_mon)->id;
 }
 
-/*
- * da_create_or_get - create the per-object storage if not already there
- *
- * This needs a lookup so should be guarded by RCU, the condition is checked
- * directly in da_create_storage()
- */
-static inline void da_create_or_get(da_id_type id, monitor_target target)
-{
-	guard(rcu)();
-	da_create_storage(id, target, da_get_monitor(id, target));
-}
-
 /*
  * da_fill_empty_storage - store the target in a pre-allocated storage
  *
@@ -537,15 +568,96 @@ static inline monitor_target da_get_target_by_id(da_id_type id)
 	return mon_storage->target;
 }
 
+/*
+ * Per-object pool state.
+ *
+ * Zero-initialised by default (storage == NULL ⟹ kmalloc mode).  A monitor
+ * opts into pool mode by defining DA_MON_ALLOCATION_STRATEGY DA_ALLOC_POOL
+ * and DA_MON_POOL_SIZE before including this header; da_monitor_init() then
+ * pre-allocates the pool internally.
+ *
+ * Because every field is wrapped in this struct and the struct itself is a
+ * per-TU static, each monitor that includes this header gets a completely
+ * independent pool.  A kmalloc monitor (e.g. nomiss) and a pool monitor
+ * (e.g. tlob) therefore coexist without any interference.
+ *
+ * da_pool_return_cb runs from softirq (non-PREEMPT_RT) or rcuc kthread
+ * (PREEMPT_RT); spin_lock_irqsave handles both.
+ */
+struct da_per_obj_pool {
+	struct da_monitor_storage  *storage;  /* non-NULL ⟹ pool mode */
+	struct da_monitor_storage **free;     /* kmalloc'd pointer stack */
+	unsigned int                free_top;
+	unsigned int                capacity; /* total number of slots */
+	spinlock_t                  lock;
+};
+
+static struct da_per_obj_pool da_pool = {
+	.lock = __SPIN_LOCK_UNLOCKED(da_pool.lock),
+};
+
+static void da_pool_return_cb(struct rcu_head *head)
+{
+	struct da_monitor_storage *ms =
+		container_of(head, struct da_monitor_storage, rcu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&da_pool.lock, flags);
+	if (!WARN_ON_ONCE(!da_pool.free || da_pool.free_top >= da_pool.capacity))
+		da_pool.free[da_pool.free_top++] = ms;
+	spin_unlock_irqrestore(&da_pool.lock, flags);
+}
+
+/*
+ * da_create_or_get_pool - pop a slot and insert it into the hash.
+ *
+ * Returns the new da_monitor on success, NULL if the pool is exhausted, or
+ * the existing da_monitor if a concurrent caller already inserted the same id
+ * (in which case the popped slot is returned to the free stack).
+ *
+ * Must be called inside an RCU read-side critical section (guard(rcu)()).
+ */
+static inline struct da_monitor *
+da_create_or_get_pool(da_id_type id, monitor_target target)
+{
+	struct da_monitor_storage *mon_storage, *existing;
+	unsigned long flags;
+
+	spin_lock_irqsave(&da_pool.lock, flags);
+	if (!da_pool.free_top) {
+		spin_unlock_irqrestore(&da_pool.lock, flags);
+		return NULL;
+	}
+	mon_storage = da_pool.free[--da_pool.free_top];
+	spin_unlock_irqrestore(&da_pool.lock, flags);
+
+	mon_storage->id = id;
+	mon_storage->target = target;
+
+	/*
+	 * A concurrent caller may have inserted the same id between our spinlock
+	 * release and here.  Return the slot to the pool and yield to the winner.
+	 */
+	existing = __da_get_mon_storage(id);
+	if (unlikely(existing)) {
+		spin_lock_irqsave(&da_pool.lock, flags);
+		da_pool.free[da_pool.free_top++] = mon_storage;
+		spin_unlock_irqrestore(&da_pool.lock, flags);
+		return &existing->rv.da_mon;
+	}
+	hash_add_rcu(da_monitor_ht, &mon_storage->node, id);
+	return &mon_storage->rv.da_mon;
+}
+
+
 /*
  * da_destroy_storage - destroy the per-object storage
  *
- * The caller is responsible to synchronise writers, either with locks or
- * implicitly. For instance, if da_destroy_storage is called at sched_exit and
- * da_create_storage can never occur after that, it's safe to call this without
- * locks.
- * This function includes an RCU read-side critical section to synchronise
- * against da_monitor_destroy().
+ * Pool mode: removes from hash and returns the slot via call_rcu().
+ * Kmalloc mode: removes from hash and frees via kfree_rcu().
+ *
+ * Includes an RCU read-side critical section to synchronise against
+ * da_monitor_destroy().
  */
 static inline void da_destroy_storage(da_id_type id)
 {
@@ -558,7 +670,11 @@ static inline void da_destroy_storage(da_id_type id)
 		return;
 	da_monitor_reset_hook(&mon_storage->rv.da_mon);
 	hash_del_rcu(&mon_storage->node);
+#if DA_MON_ALLOCATION_STRATEGY == DA_ALLOC_POOL
+	call_rcu(&mon_storage->rcu, da_pool_return_cb);
+#else
 	kfree_rcu(mon_storage, rcu);
+#endif
 }
 
 static void __da_monitor_reset_all(void (*reset)(struct da_monitor *))
@@ -581,13 +697,87 @@ static inline void da_monitor_reset_state_all(void)
 	__da_monitor_reset_all(da_monitor_reset_state);
 }
 
+/* Not part of the public API; called by da_monitor_init() for DA_ALLOC_POOL. */
+static inline int __da_monitor_init_pool(unsigned int prealloc_count)
+{
+	da_pool.storage = kcalloc(prealloc_count, sizeof(*da_pool.storage),
+				  GFP_KERNEL);
+	if (!da_pool.storage)
+		return -ENOMEM;
+
+	da_pool.free = kmalloc_array(prealloc_count, sizeof(*da_pool.free),
+				     GFP_KERNEL);
+	if (!da_pool.free) {
+		kfree(da_pool.storage);
+		da_pool.storage = NULL;
+		return -ENOMEM;
+	}
+
+	da_pool.capacity = prealloc_count;
+	da_pool.free_top = 0;
+	for (unsigned int i = 0; i < prealloc_count; i++)
+		da_pool.free[da_pool.free_top++] = &da_pool.storage[i];
+	return 0;
+}
+
+/*
+ * da_monitor_init - initialise the per-object monitor
+ *
+ * Selects the allocation path at compile time based on DA_MON_ALLOCATION_STRATEGY:
+ *   DA_ALLOC_POOL   - pre-allocates DA_MON_POOL_SIZE storage slots.
+ *   DA_ALLOC_AUTO / DA_ALLOC_MANUAL - initialises the hash table only.
+ */
 static inline int da_monitor_init(void)
 {
 	hash_init(da_monitor_ht);
+#if DA_MON_ALLOCATION_STRATEGY == DA_ALLOC_POOL
+	return __da_monitor_init_pool(DA_MON_POOL_SIZE);
+#else
 	return 0;
+#endif
 }
 
-static inline void da_monitor_destroy(void)
+static inline void da_monitor_destroy_pool(void)
+{
+	struct da_monitor_storage *ms;
+	struct hlist_node *tmp;
+	int bkt;
+
+	/*
+	 * Ensure all in-flight tracepoint handlers that may hold a raw pointer
+	 * to a pool slot (e.g. tlob_stop_task after its RCU guard exits) have
+	 * completed before we begin tearing down the pool.  Mirrors the same
+	 * call in da_monitor_destroy_kmalloc().
+	 */
+	tracepoint_synchronize_unregister();
+
+	/*
+	 * Drain any entries that were not stopped before destroy (e.g.
+	 * uprobe-started sessions whose stop probe never fired).  Call
+	 * da_extra_cleanup() before hash_del_rcu() so the hook may safely
+	 * call ha_cancel_timer_sync() while the monitor is still reachable.
+	 */
+	hash_for_each_safe(da_monitor_ht, bkt, tmp, ms, node) {
+		da_extra_cleanup(&ms->rv.da_mon);
+		hash_del_rcu(&ms->node);
+		call_rcu(&ms->rcu, da_pool_return_cb);
+	}
+
+	/*
+	 * rcu_barrier() drains every pending call_rcu() callback, including
+	 * both da_pool_return_cb() and any monitor-specific free callbacks
+	 * (e.g. tlob_free_rcu) enqueued by da_extra_cleanup().
+	 */
+	rcu_barrier();
+	kfree(da_pool.storage);
+	da_pool.storage = NULL;
+	kfree(da_pool.free);
+	da_pool.free = NULL;
+	da_pool.free_top = 0;
+	da_pool.capacity = 0;
+}
+
+static inline void da_monitor_destroy_kmalloc(void)
 {
 	struct da_monitor_storage *mon_storage;
 	struct hlist_node *tmp;
@@ -607,15 +797,51 @@ static inline void da_monitor_destroy(void)
 }
 
 /*
- * Allow the per-object monitors to run allocation manually, necessary if the
- * start condition is in a context problematic for allocation (e.g. scheduling).
- * In such case, if the storage was pre-allocated without a target, set it now.
+ * da_monitor_destroy - tear down the per-object monitor
+ *
+ * DA_ALLOC_POOL: calls tracepoint_synchronize_unregister() to drain any
+ * in-flight handlers, then iterates the hash draining remaining entries via
+ * da_extra_cleanup() + hash_del_rcu() + call_rcu(), then rcu_barrier() to
+ * wait for all pending da_pool_return_cb() callbacks before freeing the pool.
+ * DA_ALLOC_AUTO / DA_ALLOC_MANUAL: drains remaining entries after
+ * tracepoint_synchronize_unregister() + synchronize_rcu().
  */
-#ifdef DA_SKIP_AUTO_ALLOC
-#define da_prepare_storage da_fill_empty_storage
+static inline void da_monitor_destroy(void)
+{
+#if DA_MON_ALLOCATION_STRATEGY == DA_ALLOC_POOL
+	da_monitor_destroy_pool();
 #else
+	da_monitor_destroy_kmalloc();
+#endif
+}
+
+/*
+ * da_prepare_storage - obtain (or create) the da_monitor for (id, target)
+ *
+ * The implementation is selected at compile time by DA_MON_ALLOCATION_STRATEGY:
+ *
+ * DA_ALLOC_AUTO   - calls da_create_storage() (lock-free kmalloc_nolock).
+ * DA_ALLOC_POOL   - if an entry already exists, returns it; otherwise pops a
+ *                   slot from the pre-allocated pool and re-looks it up.
+ *                   Returns NULL if the pool is exhausted.
+ * DA_ALLOC_MANUAL - caller has already inserted storage via da_create_empty_storage();
+ *                   only fills in the target field if it was left NULL.
+ */
+#if DA_MON_ALLOCATION_STRATEGY == DA_ALLOC_POOL
+static inline struct da_monitor *da_prepare_storage(da_id_type id,
+						     monitor_target target,
+						     struct da_monitor *da_mon)
+{
+	if (da_mon)
+		return da_mon;
+	/* da_create_or_get_pool() returns the da_monitor directly; no re-lookup needed. */
+	return da_create_or_get_pool(id, target);
+}
+#elif DA_MON_ALLOCATION_STRATEGY == DA_ALLOC_MANUAL
+#define da_prepare_storage da_fill_empty_storage
+#else /* DA_ALLOC_AUTO */
 #define da_prepare_storage da_create_storage
-#endif /* DA_SKIP_AUTO_ALLOC */
+#endif
 
 #endif /* RV_MON_TYPE */
 
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c
index 8ead8783c29f..ac4d334e757f 100644
--- a/kernel/trace/rv/monitors/nomiss/nomiss.c
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.c
@@ -17,8 +17,8 @@
 
 #define RV_MON_TYPE RV_MON_PER_OBJ
 #define HA_TIMER_TYPE HA_TIMER_WHEEL
-/* The start condition is on sched_switch, it's dangerous to allocate there */
-#define DA_SKIP_AUTO_ALLOC
+/* Allocate storage in sched_setscheduler; sched_switch is too hot to alloc. */
+#define DA_MON_ALLOCATION_STRATEGY DA_ALLOC_MANUAL
 typedef struct sched_dl_entity *monitor_target;
 #include "nomiss.h"
 #include <rv/ha_monitor.h>
@@ -214,7 +214,7 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
 	if (p->policy == SCHED_DEADLINE)
 		da_reset(EXPAND_ID_TASK(p));
 	else if (new_policy == SCHED_DEADLINE)
-		da_create_or_get(EXPAND_ID_TASK(p));
+		da_create_empty_storage(get_entity_id(&p->dl, task_cpu(p), DL_TASK));
 }
 
 static void handle_sched_wakeup(void *data, struct task_struct *tsk)
-- 
2.43.0


^ permalink raw reply related

* [PATCH 1/2] ring-buffer: Fix event length with forced 8-byte alignment
From: Hui Wang @ 2026-06-07  7:24 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, pjw, linux-trace-kernel,
	shuah, wangfushuai, linux-kselftest
  Cc: hui.wang
In-Reply-To: <20260607072431.125633-1-hui.wang@canonical.com>

When RB_FORCE_8BYTE_ALIGNMENT is true, rb_calculate_event_length()
reserves the space of event->array[0] for placing the data length and
rb_update_event() stores the data length in event->array[0]
accordingly. As a result, the whole event length unconditionally
includes an extra 4 bytes for sizeof(event.array[0]).

But ring_buffer_event_length() only subtracts the
sizeof(event->array[0]) for events larger than RB_MAX_SMALL_DATA +
sizeof(event->array[0]). As a result, small events on architectures
with RB_FORCE_8BYTE_ALIGNMENT=true report a data length that is 4
bytes larger than expected.

To fix it, add RB_FORCE_8BYTE_ALIGNMENT as a condition so that the
size of that length field is subtracted whenever
RB_FORCE_8BYTE_ALIGNMENT is true.

This issue is observed in a riscv64 kernel with
CONFIG_HAVE_64BIT_ALIGNED_ACCESS set to y. When we run the ftrace
selftest trace_marker_raw.tc, we get a weird log: for ids 1..100 the
number of data fields is 8*N, but once the id exceeds 100, the
number of data fields becomes 8*N+4:
 # 1 buf: 58 00 00 00 80 5e d1 63 (number of data field is 8*1)
 ...
 # a buf: 58 ...                  (number of data field is 8*2)
 ...
 # 64 buf: 58 ...                 (number of data field is 8*13)
 # 65 buf: 58 ...                 (number of data field is 8*13+4)

After applying this change, the number of data fields is consistently
8*N+4.

Fixes: 2271048d1b3b ("ring-buffer: Do 8 byte alignment for 64 bit that can not handle 4 byte align")
Signed-off-by: Hui Wang <hui.wang@canonical.com>
---
 kernel/trace/ring_buffer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 56a328e94395..d9af2bbaf9c0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -270,7 +270,8 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 		return length;
 	length -= RB_EVNT_HDR_SIZE;
-	if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
+	if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]) ||
+	    RB_FORCE_8BYTE_ALIGNMENT)
                 length -= sizeof(event->array[0]);
 	return length;
 }
-- 
2.43.0

^ permalink raw reply related

* [PATCH 2/2] selftests/ftrace: Account for 8-byte aligned trace_marker_raw events
From: Hui Wang @ 2026-06-07  7:24 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, pjw, linux-trace-kernel,
	shuah, wangfushuai, linux-kselftest
  Cc: hui.wang
In-Reply-To: <20260607072431.125633-1-hui.wang@canonical.com>

trace_marker_raw.tc assumes that the raw marker payload length
reported in trace_pipe is the result of int((id + 3) / 4) * 4, but
that is not true on kernels with CONFIG_HAVE_64BIT_ALIGNED_ACCESS
enabled.

With forced 8-byte alignment, every ring buffer event is aligned to
8 bytes. The event length is stored in array[0], and the payload data
and id are placed in a struct raw_data_entry which is stored starting
at array[1]. In this case, the printed payload data length is 8*N+4
bytes.

To make the testcase pass in this case, add a kconfig_enabled() helper
and use it to detect CONFIG_HAVE_64BIT_ALIGNED_ACCESS so
trace_marker_raw.tc can calculate the expected length correctly.

Assisted-by: Copilot:gpt-5.5
Signed-off-by: Hui Wang <hui.wang@canonical.com>
---
 .../ftrace/test.d/00basic/trace_marker_raw.tc | 16 +++++++--
 .../testing/selftests/ftrace/test.d/functions | 33 +++++++++++++++++++
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
index 8e905d4fe6dd..beda0f8627b3 100644
--- a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
+++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
@@ -15,6 +15,11 @@ is_little_endian() {
 }
 
 little=`is_little_endian`
+raw_data_align=4
+
+if kconfig_enabled CONFIG_HAVE_64BIT_ALIGNED_ACCESS; then
+       raw_data_align=8
+fi
 
 make_str() {
 	id=$1
@@ -60,7 +65,8 @@ test_multiple_writes() {
 	echo stop > trace_marker
 
 	# Check to make sure the number of entries is the id (rounded up by 4)
-	awk '/.*: # [0-9a-f]* / {
+	# or is (((id + 3) rounded by 8) + 4) if raw_data_align is 8
+	awk -v data_align=$raw_data_align '/.*: # [0-9a-f]* / {
 			print;
 			cnt = -1;
 			for (i = 0; i < NF; i++) {
@@ -69,8 +75,12 @@ test_multiple_writes() {
 					i++;
 					cnt = strtonum("0x" $i);
 					num = NF - (i + 1);
-					# The number of items is always rounded up by 4
-					cnt2 = int((cnt + 3) / 4) * 4;
+					# The number of items is rounded up by 4
+					# or is (8 * N + 4) if data_align is 8
+					if (data_align == 4)
+						cnt2 = int((cnt + 3) / 4) * 4;
+					else
+						cnt2 = int((cnt + 3) / 8) * 8 + 4;
 					if (cnt2 != num) {
 						exit 1;
 					}
diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index 826141e299e5..0f778087d81b 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -177,6 +177,39 @@ check_awk_strtonum() { # strtonum is GNU awk extension
     awk 'BEGIN{strtonum("0x1")}'
 }
 
+# a helper to check if a kconfig is enabled or not
+# return value: 0 (if kconfig is enabled)
+#               1 (if kconfig is not enabled)
+#               2 (if the config files don't exist or are unreadable)
+kconfig_enabled() { # config-name
+    local config="$1"
+    local uname_r=`uname -r`
+    local config_file
+
+    case "$config" in
+    CONFIG_*) ;;
+    *) config="CONFIG_$config" ;;
+    esac
+
+    if [ -f /proc/config.gz ] && zgrep --version >/dev/null 2>&1; then
+        zgrep -Eq "^${config}=(y|m)$" /proc/config.gz 2>/dev/null
+        return $?
+    fi
+
+    for config_file in \
+        /boot/config-$uname_r \
+        /lib/modules/$uname_r/config \
+        /lib/modules/$uname_r/build/.config
+    do
+        if [ -f "$config_file" ]; then
+            grep -Eq "^${config}=(y|m)$" "$config_file"
+            return $?
+        fi
+    done
+
+    return 2
+}
+
 LOCALHOST=127.0.0.1
 
 yield() {
-- 
2.43.0


^ permalink raw reply related

* [PATCH 0/2] ring-buffer: Fix forced 8-byte alignment event length
From: Hui Wang @ 2026-06-07  7:24 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, pjw, linux-trace-kernel,
	shuah, wangfushuai, linux-kselftest
  Cc: hui.wang

This series fixes the event length reported by ring_buffer_event_length()
when RB_FORCE_8BYTE_ALIGNMENT is enabled, and updates the ftrace
trace_marker_raw selftest to account for that layout.

On architectures where CONFIG_HAVE_64BIT_ALIGNED_ACCESS is enabled, the
ring buffer forces 8-byte alignment. In that mode, the event length is
stored in event->array[0] even for small data events, and the payload
starts from event->array[1]. However, ring_buffer_event_length() only
subtracted the extra length field for large events. As a result, small
events reported a payload length 4 bytes larger than expected.

This was observed on riscv64 with CONFIG_HAVE_64BIT_ALIGNED_ACCESS=y
when running the ftrace trace_marker_raw.tc selftest. The first patch
fixes the ring-buffer length calculation. The second patch updates the
selftest expectation when the running kernel uses forced 8-byte
alignment.

Hui Wang (2):
  ring-buffer: Fix event length with forced 8-byte alignment
  selftests/ftrace: Account for 8-byte aligned trace_marker_raw events

 kernel/trace/ring_buffer.c                    |  3 +-
 .../ftrace/test.d/00basic/trace_marker_raw.tc | 16 +++++++--
 .../testing/selftests/ftrace/test.d/functions | 33 +++++++++++++++++++
 3 files changed, 48 insertions(+), 4 deletions(-)

-- 
2.43.0

^ permalink raw reply

* [PATCH next] kernel/trace/trace_printk: Use kstrdup() instead of kmalloc() and strcpy()
From: david.laight.linux @ 2026-06-06 20:26 UTC (permalink / raw)
  To: Kees Cook, linux-hardening, Arnd Bergmann, linux-kernel,
	linux-trace-kernel
  Cc: Masami Hiramatsu, Steven Rostedt, David Laight

From: David Laight <david.laight.linux@gmail.com>

Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
This is one of a group of patches that remove potentially unbounded
strcpy() calls.

They are mostly replaced by strscpy() or, when strlen() has just been
called, with memcpy() (usually including the '\0').

Calls that copy string literals into arrays are left unchanged.
They are safe and easily detected as such.

The changes were made by getting the compiler to detect the calls and
then fixing the code by hand.

Note that all the changes are only compile tested.

Some Makefiles were changed to allow files to contain strcpy().
As well as 'difficult to fix' files, this included 'show' functions
as they really need to use sysfs_emit() or seq_printf().

All the patches are being sent individually to avoid very long cc lists.
Apologies for the terse commit messages and likely unexpected tags.
(There are about 100 patches in total.)

 kernel/trace/trace_printk.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 3ea17af60169..98171a2398e4 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -71,10 +71,9 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
 		fmt = NULL;
 		tb_fmt = kmalloc_obj(*tb_fmt);
 		if (tb_fmt) {
-			fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
+			fmt = kstrdup(*iter, GFP_KERNEL);
 			if (fmt) {
 				list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
-				strcpy(fmt, *iter);
 				tb_fmt->fmt = fmt;
 			} else
 				kfree(tb_fmt);
-- 
2.39.5

^ permalink raw reply related

* [PATCHv8 bpf-next 29/29] selftests/bpf: Add tracing multi attach rollback tests
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding tests for the rollback code when the tracing_multi
link won't get attached, covering 2 reasons:

  - wrong btf id passed by user, where all previously allocated
    trampolines will be released
  - trampoline for requested function is fully attached (already has
    the maximum number of programs attached) and the link fails, the
    rollback code needs to unlink all previously link-ed trampolines
    and release them

We need the bpf_fentry_test* unattached for the tests to pass,
so the rollback tests are serial.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 .../selftests/bpf/prog_tests/tracing_multi.c  | 212 ++++++++++++++++++
 .../bpf/progs/tracing_multi_rollback.c        |  43 ++++
 2 files changed, 255 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_rollback.c

diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
index 909dd0705b15..99d09b93bd23 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -11,6 +11,7 @@
 #include "tracing_multi_fail.skel.h"
 #include "tracing_multi_verifier.skel.h"
 #include "tracing_multi_bench.skel.h"
+#include "tracing_multi_rollback.skel.h"
 #include "trace_helpers.h"
 
 static __u64 bpf_fentry_test_cookies[] = {
@@ -693,6 +694,217 @@ void serial_test_tracing_multi_bench_attach(void)
 	btf__free(btf);
 }
 
+static void tracing_multi_rollback_run(struct tracing_multi_rollback *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	prog_fd = bpf_program__fd(skel->progs.test_fentry);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+
+	/* make sure the rollback code did not leave any program attached */
+	ASSERT_EQ(skel->bss->test_result_fentry, 0, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, 0, "test_result_fexit");
+}
+
+static void test_rollback_put(void)
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi_rollback *skel = NULL;
+	size_t cnt = FUNCS_CNT;
+	__u32 *ids = NULL;
+	int err;
+
+	skel = tracing_multi_rollback__open();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
+		return;
+
+	bpf_program__set_autoload(skel->progs.test_fentry, true);
+	bpf_program__set_autoload(skel->progs.test_fexit, true);
+
+	err = tracing_multi_rollback__load(skel);
+	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
+		goto cleanup;
+
+	ids = get_ids(bpf_fentry_test, cnt, NULL);
+	if (!ASSERT_OK_PTR(ids, "get_ids"))
+		goto cleanup;
+
+	/*
+	 * Mangle the last id (index cnt - 1, not a hard-coded slot) to trigger
+	 * rollback, which needs to do put on the already get-ed trampolines.
+	 */
+	ids[cnt - 1] = 0;
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	skel->bss->pid = getpid();
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+						NULL, &opts);
+	if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	/* We don't really attach any program, but let's make sure. */
+	tracing_multi_rollback_run(skel);
+
+cleanup:
+	tracing_multi_rollback__destroy(skel);
+	free(ids);
+}
+
+static void fillers_cleanup(struct tracing_multi_rollback **skels, int cnt)
+{
+	int i;
+
+	for (i = 0; i < cnt; i++)
+		tracing_multi_rollback__destroy(skels[i]);
+
+	free(skels);
+}
+
+static struct tracing_multi_rollback *extra_load_and_link(void)
+{
+	struct tracing_multi_rollback *skel;
+	int err;
+
+	skel = tracing_multi_rollback__open();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
+		goto cleanup;
+
+	bpf_program__set_autoload(skel->progs.extra, true);
+
+	err = tracing_multi_rollback__load(skel);
+	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
+		goto cleanup;
+
+	skel->links.extra = bpf_program__attach_trace(skel->progs.extra);
+	if (!ASSERT_OK_PTR(skel->links.extra, "bpf_program__attach_trace"))
+		goto cleanup;
+
+	return skel;
+
+cleanup:
+	tracing_multi_rollback__destroy(skel);
+	return NULL;
+}
+
+static struct tracing_multi_rollback **fillers_load_and_link(int max)
+{
+	struct tracing_multi_rollback **skels, *skel;
+	int i, err;
+
+	skels = calloc(max + 1, sizeof(*skels));
+	if (!ASSERT_OK_PTR(skels, "calloc"))
+		return NULL;
+
+	for (i = 0; i < max; i++) {
+		skel = skels[i] = tracing_multi_rollback__open();
+		if (!ASSERT_OK_PTR(skels[i], "tracing_multi_rollback__open"))
+			goto cleanup;
+
+		bpf_program__set_autoload(skel->progs.filler, true);
+
+		err = tracing_multi_rollback__load(skel);
+		if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
+			goto cleanup;
+
+		skel->links.filler = bpf_program__attach_trace(skel->progs.filler);
+		if (!ASSERT_OK_PTR(skels[i]->links.filler, "bpf_program__attach_trace"))
+			goto cleanup;
+	}
+
+	return skels;
+
+cleanup:
+	fillers_cleanup(skels, i + 1);
+	return NULL;
+}
+
+static void test_rollback_unlink(void)
+{
+	struct tracing_multi_rollback *skel = NULL, *extra;
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi_rollback **fillers;
+	size_t cnt = FUNCS_CNT;
+	__u32 *ids = NULL;
+	int err, max;
+
+	max = get_bpf_max_tramp_links();
+	if (!ASSERT_GE(max, 1, "bpf_max_tramp_links"))
+		return;
+
+	/* Attach maximum allowed programs to bpf_fentry_test10 */
+	fillers = fillers_load_and_link(max);
+	if (!ASSERT_OK_PTR(fillers, "fillers_load_and_link"))
+		return;
+
+	extra = extra_load_and_link();
+	if (!ASSERT_OK_PTR(extra, "extra_load_and_link"))
+		goto cleanup;
+
+	skel = tracing_multi_rollback__open();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
+		goto cleanup;
+
+	bpf_program__set_autoload(skel->progs.test_fentry, true);
+	bpf_program__set_autoload(skel->progs.test_fexit, true);
+
+	/*
+	 * Attach tracing_multi link on bpf_fentry_test1-10, which will
+	 * fail on bpf_fentry_test10 function, because it already has
+	 * maximum allowed programs attached.
+	 *
+	 * The rollback needs to unlink already link-ed trampolines and
+	 * put all of them.
+	 */
+	err = tracing_multi_rollback__load(skel);
+	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
+		goto cleanup;
+
+	ids = get_ids(bpf_fentry_test, cnt, NULL);
+	if (!ASSERT_OK_PTR(ids, "get_ids"))
+		goto cleanup;
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	skel->bss->pid = getpid();
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+						NULL, &opts);
+	if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	tracing_multi_rollback_run(skel);
+
+cleanup:
+	fillers_cleanup(fillers, max);
+	tracing_multi_rollback__destroy(extra);
+	tracing_multi_rollback__destroy(skel);
+	free(ids);
+}
+
+void serial_test_tracing_multi_attach_rollback(void)
+{
+	if (test__start_subtest("put"))
+		test_rollback_put();
+	if (test__start_subtest("unlink"))
+		test_rollback_unlink();
+}
+
 void test_tracing_multi_test(void)
 {
 #ifndef __x86_64__
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c b/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c
new file mode 100644
index 000000000000..a49d1d841f3a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int pid = 0;
+
+__u64 test_result_fentry = 0;
+__u64 test_result_fexit = 0;
+
+SEC("?fentry.multi")
+int BPF_PROG(test_fentry)
+{
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	test_result_fentry++;
+	return 0;
+}
+
+SEC("?fexit.multi")
+int BPF_PROG(test_fexit)
+{
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	test_result_fexit++;
+	return 0;
+}
+
+SEC("?fentry/bpf_fentry_test1")
+int BPF_PROG(extra)
+{
+	return 0;
+}
+
+SEC("?fentry/bpf_fentry_test10")
+int BPF_PROG(filler)
+{
+	return 0;
+}
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 28/29] selftests/bpf: Add tracing multi attach benchmark test
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding benchmark test that attaches to (almost) all allowed tracing
functions and displays attach/detach times.

  # ./test_progs -t tracing_multi_bench_attach -v
  bpf_testmod.ko is already unloaded.
  Loading bpf_testmod.ko...
  Successfully loaded bpf_testmod.ko.
  serial_test_tracing_multi_bench_attach:PASS:btf__load_vmlinux_btf 0 nsec
  serial_test_tracing_multi_bench_attach:PASS:tracing_multi_bench__open_and_load 0 nsec
  serial_test_tracing_multi_bench_attach:PASS:get_syms 0 nsec
  serial_test_tracing_multi_bench_attach:PASS:bpf_program__attach_tracing_multi 0 nsec
  serial_test_tracing_multi_bench_attach: found 51186 functions
  serial_test_tracing_multi_bench_attach: attached in   1.295s
  serial_test_tracing_multi_bench_attach: detached in   0.243s
  #507     tracing_multi_bench_attach:OK
  Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED
  Successfully unloaded bpf_testmod.ko.

Exporting skip_entry as is_unsafe_function and using it in the test.

Also updating trace_blacklist with ___migrate_enable to be in sync
with kernel functions deny list.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 .../selftests/bpf/prog_tests/tracing_multi.c  | 100 ++++++++++++++++++
 .../selftests/bpf/progs/tracing_multi_bench.c |  12 +++
 tools/testing/selftests/bpf/trace_helpers.c   |   7 +-
 tools/testing/selftests/bpf/trace_helpers.h   |   1 +
 4 files changed, 117 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_bench.c

diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
index 9e026f2b254d..909dd0705b15 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -10,6 +10,7 @@
 #include "tracing_multi_session.skel.h"
 #include "tracing_multi_fail.skel.h"
 #include "tracing_multi_verifier.skel.h"
+#include "tracing_multi_bench.skel.h"
 #include "trace_helpers.h"
 
 static __u64 bpf_fentry_test_cookies[] = {
@@ -593,6 +594,105 @@ static void test_attach_api_fails(void)
 	free(ids2);
 }
 
+void serial_test_tracing_multi_bench_attach(void)
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi_bench *skel = NULL;
+	long attach_start_ns, attach_end_ns;
+	long detach_start_ns, detach_end_ns;
+	double attach_delta, detach_delta;
+	struct bpf_link *link = NULL;
+	size_t i, cap = 0, cnt = 0;
+	struct ksyms *ksyms = NULL;
+	void *root = NULL;
+	__u32 *ids = NULL;
+	__u32 nr, type_id;
+	struct btf *btf;
+	int err;
+
+#ifndef __x86_64__
+	test__skip();
+	return;
+#endif
+
+	btf = btf__load_vmlinux_btf();
+	if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
+		return;
+
+	skel = tracing_multi_bench__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_bench__open_and_load"))
+		goto cleanup;
+
+	if (!ASSERT_OK(bpf_get_ksyms(&ksyms, true), "get_syms"))
+		goto cleanup;
+
+	/* Get all ftrace 'safe' symbols.. */
+	for (i = 0; i < ksyms->filtered_cnt; i++) {
+		if (!tsearch(&ksyms->filtered_syms[i], &root, compare)) {
+			ASSERT_FAIL("tsearch failed");
+			goto cleanup;
+		}
+	}
+
+	/* ..and filter them through BTF and btf_type_is_traceable_func. */
+	nr = btf__type_cnt(btf);
+	for (type_id = 1; type_id < nr; type_id++) {
+		const struct btf_type *type;
+		const char *str;
+
+		type = btf__type_by_id(btf, type_id);
+		if (!type)
+			break;
+
+		if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC)
+			continue;
+
+		str = btf__name_by_offset(btf, type->name_off);
+		if (!str)
+			break;
+
+		if (!tfind(&str, &root, compare))
+			continue;
+
+		if (!btf_type_is_traceable_func(btf, type))
+			continue;
+
+		err = libbpf_ensure_mem((void **) &ids, &cap, sizeof(*ids), cnt + 1);
+		if (!ASSERT_OK(err, "libbpf_ensure_mem"))
+			goto cleanup;
+
+		ids[cnt++] = type_id;
+	}
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	attach_start_ns = get_time_ns();
+	link = bpf_program__attach_tracing_multi(skel->progs.bench, NULL, &opts);
+	attach_end_ns = get_time_ns();
+
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	detach_start_ns = get_time_ns();
+	bpf_link__destroy(link);
+	detach_end_ns = get_time_ns();
+
+	attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
+	detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
+
+	printf("%s: found %zu functions\n", __func__, cnt);
+	printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
+	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
+
+cleanup:
+	tracing_multi_bench__destroy(skel);
+	tdestroy(root, tdestroy_free_nop);
+	free_kallsyms_local(ksyms);
+	free(ids);
+	btf__free(btf);
+}
+
 void test_tracing_multi_test(void)
 {
 #ifndef __x86_64__
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_bench.c b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c
new file mode 100644
index 000000000000..beae946cb8c4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("fentry.multi")
+int BPF_PROG(bench)
+{
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 0e63daf83ed5..679008b310d9 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -546,9 +546,10 @@ static const char * const trace_blacklist[] = {
 	"__rcu_read_lock",
 	"__rcu_read_unlock",
 	"bpf_get_numa_node_id",
+	"___migrate_enable",
 };
 
-static bool skip_entry(char *name)
+bool is_unsafe_function(const char *name)
 {
 	int i;
 
@@ -651,7 +652,7 @@ int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel)
 		free(name);
 		if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
 			continue;
-		if (skip_entry(name))
+		if (is_unsafe_function(name))
 			continue;
 
 		ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
@@ -728,7 +729,7 @@ int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
 		free(name);
 		if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
 			continue;
-		if (skip_entry(name))
+		if (is_unsafe_function(name))
 			continue;
 
 		if (cnt == max_cnt) {
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index d5bf1433675d..01c8ecc45627 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -63,4 +63,5 @@ int read_build_id(const char *path, char *build_id, size_t size);
 int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel);
 int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel);
 
+bool is_unsafe_function(const char *name);
 #endif
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 27/29] selftests/bpf: Add tracing multi verifier fails test
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding tests for verifier failures on tracing multi programs.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 .../selftests/bpf/prog_tests/tracing_multi.c  |  2 ++
 .../bpf/progs/tracing_multi_verifier.c        | 31 +++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_verifier.c

diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
index 7e1bb071ce2a..9e026f2b254d 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -9,6 +9,7 @@
 #include "tracing_multi_intersect.skel.h"
 #include "tracing_multi_session.skel.h"
 #include "tracing_multi_fail.skel.h"
+#include "tracing_multi_verifier.skel.h"
 #include "trace_helpers.h"
 
 static __u64 bpf_fentry_test_cookies[] = {
@@ -619,4 +620,5 @@ void test_tracing_multi_test(void)
 		test_session();
 	if (test__start_subtest("attach_api_fails"))
 		test_attach_api_fails();
+	RUN_TESTS(tracing_multi_verifier);
 }
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c b/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c
new file mode 100644
index 000000000000..7b6ed41bf452
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("fentry.multi/bpf_fentry_test1")
+__failure
+__msg("func 'bpf_multi_func' doesn't have 1-th argument")
+int BPF_PROG(fentry_direct_access, int a)
+{
+	return a;
+}
+
+SEC("fexit.multi/bpf_fentry_test3")
+__failure
+__msg("invalid bpf_context access off=24 size=8")
+int BPF_PROG(fexit_direct_access, char a, int b, __u64 c, int ret)
+{
+	return ret;
+}
+
+SEC("fsession.multi/bpf_fentry_test4")
+__failure
+__msg("invalid bpf_context access off=16 size=8")
+int BPF_PROG(fsession_direct_access, void *a, char b, int c, __u64 d, int ret)
+{
+	return c;
+}
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 26/29] selftests/bpf: Add tracing multi attach fails test
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding tests for attach failures on the tracing multi link.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 .../selftests/bpf/prog_tests/tracing_multi.c  | 96 +++++++++++++++++++
 .../selftests/bpf/progs/tracing_multi_fail.c  | 18 ++++
 2 files changed, 114 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_fail.c

diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
index 05683b8d0680..7e1bb071ce2a 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -8,6 +8,7 @@
 #include "tracing_multi_module.skel.h"
 #include "tracing_multi_intersect.skel.h"
 #include "tracing_multi_session.skel.h"
+#include "tracing_multi_fail.skel.h"
 #include "trace_helpers.h"
 
 static __u64 bpf_fentry_test_cookies[] = {
@@ -498,6 +499,99 @@ static void test_session(void)
 	tracing_multi_session__destroy(skel);
 }
 
+static void test_attach_api_fails(void)
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	static const char * const func[] = {
+		"bpf_fentry_test2",
+	};
+	struct tracing_multi_fail *skel = NULL;
+	__u32 ids[2] = {}, *ids2 = NULL;
+	__u64 cookies[2];
+
+	skel = tracing_multi_fail__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_fail__open_and_load"))
+		return;
+
+	/* fail#1 (libbpf) pattern and opts NULL */
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, NULL);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_1"))
+		goto cleanup;
+
+	/* fail#2 (libbpf) pattern and ids */
+	LIBBPF_OPTS_RESET(opts,
+		.ids = ids,
+		.cnt = 2,
+	);
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						"bpf_fentry_test*", &opts);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_2"))
+		goto cleanup;
+
+	/* fail#3 (libbpf) pattern and cookies */
+	LIBBPF_OPTS_RESET(opts,
+		.ids = NULL,
+		.cnt = 2,
+		.cookies = cookies,
+	);
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						"bpf_fentry_test*", &opts);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_3"))
+		goto cleanup;
+
+	/* fail#4 (libbpf) bogus pattern */
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						"bpf_not_really_a_function*", NULL);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_4"))
+		goto cleanup;
+
+	/* fail#5 (kernel) abnormal cnt */
+	LIBBPF_OPTS_RESET(opts,
+		.ids = ids,
+		.cnt = INT_MAX,
+	);
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -E2BIG, "fail_5"))
+		goto cleanup;
+
+	/* fail#6 (kernel) attach sleepable program to not-allowed function */
+	ids2 = get_ids(func, 1, NULL);
+	if (!ASSERT_OK_PTR(ids2, "get_ids"))
+		goto cleanup;
+
+	LIBBPF_OPTS_RESET(opts,
+		.ids = ids2,
+		.cnt = 1,
+	);
+
+	skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s,
+						NULL, &opts);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry_s), -EINVAL, "fail_6"))
+		goto cleanup;
+
+	/* fail#7 (kernel) attach with duplicate id */
+	ids[0] = ids2[0];
+	ids[1] = ids2[0];
+
+	LIBBPF_OPTS_RESET(opts,
+		.ids = ids,
+		.cnt = 2,
+	);
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_7");
+
+cleanup:
+	tracing_multi_fail__destroy(skel);
+	free(ids2);
+}
+
 void test_tracing_multi_test(void)
 {
 #ifndef __x86_64__
@@ -523,4 +617,6 @@ void test_tracing_multi_test(void)
 		test_link_api_ids(true);
 	if (test__start_subtest("session"))
 		test_session();
+	if (test__start_subtest("attach_api_fails"))
+		test_attach_api_fails();
 }
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_fail.c b/tools/testing/selftests/bpf/progs/tracing_multi_fail.c
new file mode 100644
index 000000000000..7f0375f4213d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_fail.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("fentry.multi")
+int BPF_PROG(test_fentry)
+{
+	return 0;
+}
+
+SEC("fentry.multi.s")
+int BPF_PROG(test_fentry_s)
+{
+	return 0;
+}
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 25/29] selftests/bpf: Add tracing multi session test
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding tests for tracing multi link session.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/testing/selftests/bpf/Makefile          |  4 +-
 .../selftests/bpf/prog_tests/tracing_multi.c  | 45 +++++++++++++
 .../bpf/progs/tracing_multi_session_attach.c  | 65 +++++++++++++++++++
 3 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 2b5688c97006..d53b7e496ac9 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -514,7 +514,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 		test_subskeleton.skel.h test_subskeleton_lib.skel.h	\
 		test_usdt.skel.h tracing_multi.skel.h			\
 		tracing_multi_module.skel.h				\
-		tracing_multi_intersect.skel.h
+		tracing_multi_intersect.skel.h				\
+		tracing_multi_session.skel.h
 
 LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c 	\
 	core_kern.c core_kern_overflow.c test_ringbuf.c			\
@@ -543,6 +544,7 @@ xdp_features.skel.h-deps := xdp_features.bpf.o
 tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o
 tracing_multi_module.skel.h-deps := tracing_multi_attach_module.bpf.o tracing_multi_check.bpf.o
 tracing_multi_intersect.skel.h-deps := tracing_multi_intersect_attach.bpf.o tracing_multi_check.bpf.o
+tracing_multi_session.skel.h-deps := tracing_multi_session_attach.bpf.o tracing_multi_check.bpf.o
 
 LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps))
 LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS))
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
index 0f066063cb82..05683b8d0680 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -7,6 +7,7 @@
 #include "tracing_multi.skel.h"
 #include "tracing_multi_module.skel.h"
 #include "tracing_multi_intersect.skel.h"
+#include "tracing_multi_session.skel.h"
 #include "trace_helpers.h"
 
 static __u64 bpf_fentry_test_cookies[] = {
@@ -455,6 +456,48 @@ static void test_intersect(void)
 	tracing_multi_intersect__destroy(skel);
 }
 
+static void test_session(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct tracing_multi_session *skel;
+	int err, prog_fd;
+
+	skel = tracing_multi_session__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_session__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	err = tracing_multi_session__attach(skel);
+	if (!ASSERT_OK(err, "tracing_multi_session__attach"))
+		goto cleanup;
+
+	/* execute kernel session */
+	prog_fd = bpf_program__fd(skel->progs.test_session_1);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+
+	/* 10 for test_session_1, 1 for test_fsession_s */
+	ASSERT_EQ(skel->bss->test_result_fentry, 11, "test_result_fentry");
+	/* extra count (+1 for each fexit execution) for test_result_fexit cookie check/inc */
+	ASSERT_EQ(skel->bss->test_result_fexit, 22, "test_result_fexit");
+
+	skel->bss->test_result_fentry = 0;
+	skel->bss->test_result_fexit = 0;
+
+	/* execute bpf_testmod.ko session */
+	ASSERT_OK(trigger_module_test_read(1), "trigger_read");
+
+	/* 5 for test_session_2 */
+	ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry");
+	/* extra count (+1 for each fexit execution) for test_result_fexit cookie */
+	ASSERT_EQ(skel->bss->test_result_fexit, 10, "test_result_fexit");
+
+
+cleanup:
+	tracing_multi_session__destroy(skel);
+}
+
 void test_tracing_multi_test(void)
 {
 #ifndef __x86_64__
@@ -478,4 +521,6 @@ void test_tracing_multi_test(void)
 		test_intersect();
 	if (test__start_subtest("cookies"))
 		test_link_api_ids(true);
+	if (test__start_subtest("session"))
+		test_session();
 }
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c
new file mode 100644
index 000000000000..7c9a46016ccd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return);
+
+__u64 test_result_fentry = 0;
+__u64 test_result_fexit = 0;
+
+SEC("fsession.multi/bpf_fentry_test*")
+int BPF_PROG(test_session_1)
+{
+	volatile __u64 *cookie = bpf_session_cookie(ctx);
+
+	if (bpf_session_is_return(ctx)) {
+		if (tracing_multi_arg_check(ctx, &test_result_fexit, true))
+			return 0;
+		/* extra count for test_result_fexit cookie */
+		test_result_fexit += *cookie == 0xbeafbeafbeafbeaf;
+	} else {
+		if (tracing_multi_arg_check(ctx, &test_result_fentry, false))
+			return 0;
+		*cookie = 0xbeafbeafbeafbeaf;
+	}
+	return 0;
+}
+
+SEC("fsession.multi.s/bpf_fentry_test1")
+int BPF_PROG(test_fsession_s)
+{
+	volatile __u64 *cookie = bpf_session_cookie(ctx);
+
+	if (bpf_session_is_return(ctx)) {
+		if (tracing_multi_arg_check(ctx, &test_result_fexit, true))
+			return 0;
+		/* extra count for test_result_fexit cookie */
+		test_result_fexit += *cookie == 0xbeafbeafbeafbeaf;
+	} else {
+		if (tracing_multi_arg_check(ctx, &test_result_fentry, false))
+			return 0;
+		*cookie = 0xbeafbeafbeafbeaf;
+	}
+	return 0;
+}
+
+SEC("fsession.multi/bpf_testmod:bpf_testmod_fentry_test*")
+int BPF_PROG(test_session_2)
+{
+	volatile __u64 *cookie = bpf_session_cookie(ctx);
+
+	if (bpf_session_is_return(ctx)) {
+		if (tracing_multi_arg_check(ctx, &test_result_fexit, true))
+			return 0;
+		/* extra count for test_result_fexit cookie */
+		test_result_fexit += *cookie == 0xbeafbeafbeafbeaf;
+	} else {
+		if (tracing_multi_arg_check(ctx, &test_result_fentry, false))
+			return 0;
+		*cookie = 0xbeafbeafbeafbeaf;
+	}
+	return 0;
+}
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 24/29] selftests/bpf: Add tracing multi cookies test
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding tests for using cookies on tracing multi link.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 .../selftests/bpf/prog_tests/tracing_multi.c  | 23 +++++++++++++++++--
 .../selftests/bpf/progs/tracing_multi_check.c | 15 +++++++++++-
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
index 4dd610e74f9a..0f066063cb82 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -9,6 +9,19 @@
 #include "tracing_multi_intersect.skel.h"
 #include "trace_helpers.h"
 
+static __u64 bpf_fentry_test_cookies[] = {
+	8,  /* bpf_fentry_test1 */
+	9,  /* bpf_fentry_test2 */
+	7,  /* bpf_fentry_test3 */
+	5,  /* bpf_fentry_test4 */
+	4,  /* bpf_fentry_test5 */
+	2,  /* bpf_fentry_test6 */
+	3,  /* bpf_fentry_test7 */
+	1,  /* bpf_fentry_test8 */
+	10, /* bpf_fentry_test9 */
+	6,  /* bpf_fentry_test10 */
+};
+
 static const char * const bpf_fentry_test[] = {
 	"bpf_fentry_test1",
 	"bpf_fentry_test2",
@@ -217,7 +230,7 @@ static void test_link_api_pattern(void)
 	tracing_multi__destroy(skel);
 }
 
-static void test_link_api_ids(void)
+static void test_link_api_ids(bool test_cookies)
 {
 	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
 	struct tracing_multi *skel;
@@ -229,6 +242,7 @@ static void test_link_api_ids(void)
 		return;
 
 	skel->bss->pid = getpid();
+	skel->bss->test_cookies = test_cookies;
 
 	ids = get_ids(bpf_fentry_test, cnt, NULL);
 	if (!ASSERT_OK_PTR(ids, "get_ids"))
@@ -237,6 +251,9 @@ static void test_link_api_ids(void)
 	opts.ids = ids;
 	opts.cnt = cnt;
 
+	if (test_cookies)
+		opts.cookies = bpf_fentry_test_cookies;
+
 	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
 						NULL, &opts);
 	if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
@@ -450,7 +467,7 @@ void test_tracing_multi_test(void)
 	if (test__start_subtest("link_api_pattern"))
 		test_link_api_pattern();
 	if (test__start_subtest("link_api_ids"))
-		test_link_api_ids();
+		test_link_api_ids(false);
 	if (test__start_subtest("module_skel_api"))
 		test_module_skel_api();
 	if (test__start_subtest("module_link_api_pattern"))
@@ -459,4 +476,6 @@ void test_tracing_multi_test(void)
 		test_module_link_api_ids();
 	if (test__start_subtest("intersect"))
 		test_intersect();
+	if (test__start_subtest("cookies"))
+		test_link_api_ids(true);
 }
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_check.c b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
index 7ede84c50cb6..b2959ba71179 100644
--- a/tools/testing/selftests/bpf/progs/tracing_multi_check.c
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
@@ -6,6 +6,7 @@
 char _license[] SEC("license") = "GPL";
 
 int pid = 0;
+bool test_cookies = false;
 
 /* bpf_fentry_test1 is exported as kfunc via vmlinux.h */
 extern const void bpf_fentry_test2 __ksym;
@@ -27,7 +28,7 @@ extern const void bpf_testmod_fentry_test11 __ksym;
 int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 {
 	void *ip = (void *) bpf_get_func_ip(ctx);
-	__u64 value = 0, ret = 0;
+	__u64 value = 0, ret = 0, cookie = 0;
 	long err = 0;
 
 	if (bpf_get_current_pid_tgid() >> 32 != pid)
@@ -35,6 +36,8 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 
 	if (is_return)
 		err |= bpf_get_func_ret(ctx, &ret);
+	if (test_cookies)
+		cookie = bpf_get_attach_cookie(ctx);
 
 	if (ip == &bpf_fentry_test1) {
 		int a;
@@ -43,6 +46,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 		a = (int) value;
 
 		err |= is_return ? ret != 2 : 0;
+		err |= test_cookies ? cookie != 8 : 0;
 
 		*test_result += err == 0 && a == 1;
 	} else if (ip == &bpf_fentry_test2) {
@@ -55,6 +59,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 		b = value;
 
 		err |= is_return ? ret != 5 : 0;
+		err |= test_cookies ? cookie != 9 : 0;
 
 		*test_result += err == 0 && a == 2 && b == 3;
 	} else if (ip == &bpf_fentry_test3) {
@@ -70,6 +75,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 		c = value;
 
 		err |= is_return ? ret != 15 : 0;
+		err |= test_cookies ? cookie != 7 : 0;
 
 		*test_result += err == 0 && a == 4 && b == 5 && c == 6;
 	} else if (ip == &bpf_fentry_test4) {
@@ -88,6 +94,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 		d = value;
 
 		err |= is_return ? ret != 34 : 0;
+		err |= test_cookies ? cookie != 5 : 0;
 
 		*test_result += err == 0 && a == (void *) 7 && b == 8 && c == 9 && d == 10;
 	} else if (ip == &bpf_fentry_test5) {
@@ -109,6 +116,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 		e = value;
 
 		err |= is_return ? ret != 65 : 0;
+		err |= test_cookies ? cookie != 4 : 0;
 
 		*test_result += err == 0 && a == 11 && b == (void *) 12 && c == 13 && d == 14 && e == 15;
 	} else if (ip == &bpf_fentry_test6) {
@@ -133,22 +141,27 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 		f = value;
 
 		err |= is_return ? ret != 111 : 0;
+		err |= test_cookies ? cookie != 2 : 0;
 
 		*test_result += err == 0 && a == 16 && b == (void *) 17 && c == 18 && d == 19 && e == (void *) 20 && f == 21;
 	} else if (ip == &bpf_fentry_test7) {
 		err |= is_return ? ret != 0 : 0;
+		err |= test_cookies ? cookie != 3 : 0;
 
 		*test_result += err == 0 ? 1 : 0;
 	} else if (ip == &bpf_fentry_test8) {
 		err |= is_return ? ret != 0 : 0;
+		err |= test_cookies ? cookie != 1 : 0;
 
 		*test_result += err == 0 ? 1 : 0;
 	} else if (ip == &bpf_fentry_test9) {
 		err |= is_return ? ret != 0 : 0;
+		err |= test_cookies ? cookie != 10 : 0;
 
 		*test_result += err == 0 ? 1 : 0;
 	} else if (ip == &bpf_fentry_test10) {
 		err |= is_return ? ret != 0 : 0;
+		err |= test_cookies ? cookie != 6 : 0;
 
 		*test_result += err == 0 ? 1 : 0;
 	} else if (ip == &bpf_testmod_fentry_test1) {
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 23/29] selftests/bpf: Add tracing multi intersect tests
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding tracing multi tests for intersecting attached functions.

Using the bits of a mask (values 1 to 15) to select which of the (up to 4)
programs get attached, and randomly choosing the bpf_fentry_test* functions
each of them is attached to.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/testing/selftests/bpf/Makefile          |  4 +-
 .../selftests/bpf/prog_tests/tracing_multi.c  | 99 +++++++++++++++++++
 .../progs/tracing_multi_intersect_attach.c    | 41 ++++++++
 3 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index ed220558d41b..2b5688c97006 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -513,7 +513,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 		linked_vars.skel.h linked_maps.skel.h 			\
 		test_subskeleton.skel.h test_subskeleton_lib.skel.h	\
 		test_usdt.skel.h tracing_multi.skel.h			\
-		tracing_multi_module.skel.h
+		tracing_multi_module.skel.h				\
+		tracing_multi_intersect.skel.h
 
 LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c 	\
 	core_kern.c core_kern_overflow.c test_ringbuf.c			\
@@ -541,6 +542,7 @@ xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o
 xdp_features.skel.h-deps := xdp_features.bpf.o
 tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o
 tracing_multi_module.skel.h-deps := tracing_multi_attach_module.bpf.o tracing_multi_check.bpf.o
+tracing_multi_intersect.skel.h-deps := tracing_multi_intersect_attach.bpf.o tracing_multi_check.bpf.o
 
 LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps))
 LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS))
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
index 77134f1e2dc3..4dd610e74f9a 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -6,6 +6,7 @@
 #include "bpf/libbpf_internal.h"
 #include "tracing_multi.skel.h"
 #include "tracing_multi_module.skel.h"
+#include "tracing_multi_intersect.skel.h"
 #include "trace_helpers.h"
 
 static const char * const bpf_fentry_test[] = {
@@ -31,6 +32,20 @@ static const char * const bpf_testmod_fentry_test[] = {
 
 #define FUNCS_CNT (ARRAY_SIZE(bpf_fentry_test))
 
+static int get_random_funcs(const char **funcs)
+{
+	int i, cnt = 0;
+
+	for (i = 0; i < FUNCS_CNT; i++) {
+		if (rand() % 2)
+			funcs[cnt++] = bpf_fentry_test[i];
+	}
+	/* we always need at least one.. */
+	if (!cnt)
+		funcs[cnt++] = bpf_fentry_test[rand() % FUNCS_CNT];
+	return cnt;
+}
+
 static int compare(const void *ppa, const void *ppb)
 {
 	const char *pa = *(const char **) ppa;
@@ -341,6 +356,88 @@ static void test_module_link_api_ids(void)
 	free(ids);
 }
 
+static bool is_set(__u32 mask, __u32 bit)
+{
+	return (1 << bit) & mask;
+}
+
+static void __test_intersect(__u32 mask, const struct bpf_program *progs[4], __u64 *test_results[4])
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct bpf_link *links[4] = { NULL };
+	const char *funcs[FUNCS_CNT];
+	__u64 expected[4];
+	__u32 *ids, i;
+	int err, cnt;
+
+	/*
+	 * We have 4 programs in progs and the mask bits pick which
+	 * of them gets attached to randomly chosen functions.
+	 */
+	for (i = 0; i < 4; i++) {
+		if (!is_set(mask, i))
+			continue;
+
+		cnt = get_random_funcs(funcs);
+		ids = get_ids(funcs, cnt, NULL);
+		if (!ASSERT_OK_PTR(ids, "get_ids"))
+			goto cleanup;
+
+		opts.ids = ids;
+		opts.cnt = cnt;
+		links[i] = bpf_program__attach_tracing_multi(progs[i], NULL, &opts);
+		free(ids);
+
+		if (!ASSERT_OK_PTR(links[i], "bpf_program__attach_tracing_multi"))
+			goto cleanup;
+
+		expected[i] = *test_results[i] + cnt;
+	}
+
+	err = bpf_prog_test_run_opts(bpf_program__fd(progs[0]), &topts);
+	ASSERT_OK(err, "test_run");
+
+	for (i = 0; i < 4; i++) {
+		if (!is_set(mask, i))
+			continue;
+		ASSERT_EQ(*test_results[i], expected[i], "test_results");
+	}
+
+cleanup:
+	for (i = 0; i < 4; i++)
+		bpf_link__destroy(links[i]);
+}
+
+static void test_intersect(void)
+{
+	struct tracing_multi_intersect *skel;
+	const struct bpf_program *progs[4];
+	__u64 *test_results[4];
+	__u32 i;
+
+	skel = tracing_multi_intersect__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_intersect__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	progs[0] = skel->progs.fentry_1;
+	progs[1] = skel->progs.fexit_1;
+	progs[2] = skel->progs.fentry_2;
+	progs[3] = skel->progs.fexit_2;
+
+	test_results[0] = &skel->bss->test_result_fentry_1;
+	test_results[1] = &skel->bss->test_result_fexit_1;
+	test_results[2] = &skel->bss->test_result_fentry_2;
+	test_results[3] = &skel->bss->test_result_fexit_2;
+
+	for (i = 1; i < 16; i++)
+		__test_intersect(i, progs, test_results);
+
+	tracing_multi_intersect__destroy(skel);
+}
+
 void test_tracing_multi_test(void)
 {
 #ifndef __x86_64__
@@ -360,4 +457,6 @@ void test_tracing_multi_test(void)
 		test_module_link_api_pattern();
 	if (test__start_subtest("module_link_api_ids"))
 		test_module_link_api_ids();
+	if (test__start_subtest("intersect"))
+		test_intersect();
 }
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c
new file mode 100644
index 000000000000..cd5be0bb6ffd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return);
+
+__u64 test_result_fentry_1 = 0;
+__u64 test_result_fentry_2 = 0;
+__u64 test_result_fexit_1 = 0;
+__u64 test_result_fexit_2 = 0;
+
+SEC("fentry.multi")
+int BPF_PROG(fentry_1)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry_1, false);
+	return 0;
+}
+
+SEC("fentry.multi")
+int BPF_PROG(fentry_2)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry_2, false);
+	return 0;
+}
+
+SEC("fexit.multi")
+int BPF_PROG(fexit_1)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit_1, true);
+	return 0;
+}
+
+SEC("fexit.multi")
+int BPF_PROG(fexit_2)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit_2, true);
+	return 0;
+}
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 22/29] selftests/bpf: Add tracing multi skel/pattern/ids module attach tests
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding tests for tracing_multi link attachment via all possible
libbpf APIs - skeleton, function pattern and BTF ids - on top of
the bpf_testmod kernel module.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/testing/selftests/bpf/Makefile          |   4 +-
 .../selftests/bpf/prog_tests/tracing_multi.c  | 105 ++++++++++++++++++
 .../bpf/progs/tracing_multi_attach_module.c   |  25 +++++
 .../selftests/bpf/progs/tracing_multi_check.c |  50 +++++++++
 4 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index fd885beee0fd..ed220558d41b 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -512,7 +512,8 @@ SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c
 LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 		linked_vars.skel.h linked_maps.skel.h 			\
 		test_subskeleton.skel.h test_subskeleton_lib.skel.h	\
-		test_usdt.skel.h tracing_multi.skel.h
+		test_usdt.skel.h tracing_multi.skel.h			\
+		tracing_multi_module.skel.h
 
 LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c 	\
 	core_kern.c core_kern_overflow.c test_ringbuf.c			\
@@ -539,6 +540,7 @@ xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o
 xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o
 xdp_features.skel.h-deps := xdp_features.bpf.o
 tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o
+tracing_multi_module.skel.h-deps := tracing_multi_attach_module.bpf.o tracing_multi_check.bpf.o
 
 LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps))
 LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS))
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
index f333b2514b34..77134f1e2dc3 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -5,6 +5,7 @@
 #include <search.h>
 #include "bpf/libbpf_internal.h"
 #include "tracing_multi.skel.h"
+#include "tracing_multi_module.skel.h"
 #include "trace_helpers.h"
 
 static const char * const bpf_fentry_test[] = {
@@ -20,6 +21,14 @@ static const char * const bpf_fentry_test[] = {
 	"bpf_fentry_test10",
 };
 
+static const char * const bpf_testmod_fentry_test[] = {
+	"bpf_testmod_fentry_test1",
+	"bpf_testmod_fentry_test2",
+	"bpf_testmod_fentry_test3",
+	"bpf_testmod_fentry_test7",
+	"bpf_testmod_fentry_test11",
+};
+
 #define FUNCS_CNT (ARRAY_SIZE(bpf_fentry_test))
 
 static int compare(const void *ppa, const void *ppb)
@@ -242,6 +251,96 @@ static void test_link_api_ids(void)
 	free(ids);
 }
 
+static void test_module_skel_api(void)
+{
+	struct tracing_multi_module *skel = NULL;
+	int err;
+
+	skel = tracing_multi_module__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	err = tracing_multi_module__attach(skel);
+	if (!ASSERT_OK(err, "tracing_multi__attach"))
+		goto cleanup;
+
+	ASSERT_OK(trigger_module_test_read(1), "trigger_read");
+	ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit");
+
+cleanup:
+	tracing_multi_module__destroy(skel);
+}
+
+static void test_module_link_api_pattern(void)
+{
+	struct tracing_multi_module *skel = NULL;
+
+	skel = tracing_multi_module__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_module__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+					"bpf_testmod:bpf_testmod_fentry_test*", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+					"bpf_testmod:bpf_testmod_fentry_test*", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	ASSERT_OK(trigger_module_test_read(1), "trigger_read");
+	ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit");
+
+cleanup:
+	tracing_multi_module__destroy(skel);
+}
+
+static void test_module_link_api_ids(void)
+{
+	size_t cnt = ARRAY_SIZE(bpf_testmod_fentry_test);
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi_module *skel = NULL;
+	__u32 *ids;
+
+	skel = tracing_multi_module__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_module__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	ids = get_ids(bpf_testmod_fentry_test, cnt, "bpf_testmod");
+	if (!ASSERT_OK_PTR(ids, "get_ids"))
+		goto cleanup;
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	ASSERT_OK(trigger_module_test_read(1), "trigger_read");
+	ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit");
+
+cleanup:
+	tracing_multi_module__destroy(skel);
+	free(ids);
+}
+
 void test_tracing_multi_test(void)
 {
 #ifndef __x86_64__
@@ -255,4 +354,10 @@ void test_tracing_multi_test(void)
 		test_link_api_pattern();
 	if (test__start_subtest("link_api_ids"))
 		test_link_api_ids();
+	if (test__start_subtest("module_skel_api"))
+		test_module_skel_api();
+	if (test__start_subtest("module_link_api_pattern"))
+		test_module_link_api_pattern();
+	if (test__start_subtest("module_link_api_ids"))
+		test_module_link_api_ids();
 }
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c b/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c
new file mode 100644
index 000000000000..b3374f2db450
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return);
+
+__u64 test_result_fentry = 0;
+__u64 test_result_fexit = 0;
+
+SEC("fentry.multi/bpf_testmod:bpf_testmod_fentry_test*")
+int BPF_PROG(test_fentry)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry, false);
+	return 0;
+}
+
+SEC("fexit.multi/bpf_testmod:bpf_testmod_fentry_test*")
+int BPF_PROG(test_fexit)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit, true);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_check.c b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
index 333a3a7bae8a..7ede84c50cb6 100644
--- a/tools/testing/selftests/bpf/progs/tracing_multi_check.c
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
@@ -18,6 +18,12 @@ extern const void bpf_fentry_test8 __ksym;
 extern const void bpf_fentry_test9 __ksym;
 extern const void bpf_fentry_test10 __ksym;
 
+extern const void bpf_testmod_fentry_test1 __ksym;
+extern const void bpf_testmod_fentry_test2 __ksym;
+extern const void bpf_testmod_fentry_test3 __ksym;
+extern const void bpf_testmod_fentry_test7 __ksym;
+extern const void bpf_testmod_fentry_test11 __ksym;
+
 int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 {
 	void *ip = (void *) bpf_get_func_ip(ctx);
@@ -145,6 +151,50 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
 		err |= is_return ? ret != 0 : 0;
 
 		*test_result += err == 0 ? 1 : 0;
+	} else if (ip == &bpf_testmod_fentry_test1) {
+		int a;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (int) value;
+
+		err |= is_return ? ret != 2 : 0;
+
+		*test_result += err == 0 && a == 1;
+	} else if (ip == &bpf_testmod_fentry_test2) {
+		int a;
+		__u64 b;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (int) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (__u64) value;
+
+		err |= is_return ? ret != 5 : 0;
+
+		*test_result += err == 0 && a == 2 && b == 3;
+	} else if (ip == &bpf_testmod_fentry_test3) {
+		char a;
+		int b;
+		__u64 c;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (char) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (int) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = (__u64) value;
+
+		err |= is_return ? ret != 15 : 0;
+
+		*test_result += err == 0 && a == 4 && b == 5 && c == 6;
+	} else if (ip == &bpf_testmod_fentry_test7) {
+		err |= is_return ? ret != 133 : 0;
+
+		*test_result += err == 0;
+	} else if (ip == &bpf_testmod_fentry_test11) {
+		err |= is_return ? ret != 231 : 0;
+
+		*test_result += err == 0;
 	}
 
 	return 0;
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 21/29] selftests/bpf: Add tracing multi skel/pattern/ids attach tests
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding tests for tracing_multi link attachment via all possible
libbpf APIs - skeleton, function pattern and BTF ids.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/testing/selftests/bpf/Makefile          |   3 +-
 .../selftests/bpf/prog_tests/tracing_multi.c  | 258 ++++++++++++++++++
 .../bpf/progs/tracing_multi_attach.c          |  39 +++
 .../selftests/bpf/progs/tracing_multi_check.c | 151 ++++++++++
 4 files changed, 450 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_multi.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_check.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 42d9cf848b25..fd885beee0fd 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -512,7 +512,7 @@ SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c
 LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 		linked_vars.skel.h linked_maps.skel.h 			\
 		test_subskeleton.skel.h test_subskeleton_lib.skel.h	\
-		test_usdt.skel.h
+		test_usdt.skel.h tracing_multi.skel.h
 
 LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c 	\
 	core_kern.c core_kern_overflow.c test_ringbuf.c			\
@@ -538,6 +538,7 @@ test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o
 xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o
 xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o
 xdp_features.skel.h-deps := xdp_features.bpf.o
+tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o
 
 LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps))
 LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS))
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
new file mode 100644
index 000000000000..f333b2514b34
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include <search.h>
+#include "bpf/libbpf_internal.h"
+#include "tracing_multi.skel.h"
+#include "trace_helpers.h"
+
+static const char * const bpf_fentry_test[] = {
+	"bpf_fentry_test1",
+	"bpf_fentry_test2",
+	"bpf_fentry_test3",
+	"bpf_fentry_test4",
+	"bpf_fentry_test5",
+	"bpf_fentry_test6",
+	"bpf_fentry_test7",
+	"bpf_fentry_test8",
+	"bpf_fentry_test9",
+	"bpf_fentry_test10",
+};
+
+#define FUNCS_CNT (ARRAY_SIZE(bpf_fentry_test))
+
+static int compare(const void *ppa, const void *ppb)
+{
+	const char *pa = *(const char **) ppa;
+	const char *pb = *(const char **) ppb;
+
+	return strcmp(pa, pb);
+}
+
+static void tdestroy_free_nop(void *ptr)
+{
+}
+
+static __u32 *get_ids(const char * const funcs[], int funcs_cnt, const char *mod)
+{
+	struct btf *btf, *vmlinux_btf = NULL;
+	__u32 nr, type_id, cnt = 0;
+	void *root = NULL;
+	__u32 *ids = NULL;
+	int i, err = 0;
+
+	btf = btf__load_vmlinux_btf();
+	if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
+		return NULL;
+
+	if (mod) {
+		vmlinux_btf = btf;
+		btf = btf__load_module_btf(mod, vmlinux_btf);
+		if (!ASSERT_OK_PTR(btf, "btf__load_module_btf")) {
+			btf__free(vmlinux_btf);
+			return NULL;
+		}
+	}
+
+	ids = calloc(funcs_cnt, sizeof(ids[0]));
+	if (!ids)
+		goto out;
+
+	/*
+	 * We sort function names by name and search them
+	 * below for each function.
+	 */
+	for (i = 0; i < funcs_cnt; i++) {
+		if (!tsearch(&funcs[i], &root, compare)) {
+			ASSERT_FAIL("tsearch failed");
+			err = -1;
+			goto error;
+		}
+	}
+
+	nr = btf__type_cnt(btf);
+	for (type_id = 1; type_id < nr && cnt < funcs_cnt; type_id++) {
+		const struct btf_type *type;
+		const char *str, ***val;
+		unsigned int idx;
+
+		type = btf__type_by_id(btf, type_id);
+		if (!type) {
+			err = -1;
+			break;
+		}
+
+		if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC)
+			continue;
+
+		str = btf__name_by_offset(btf, type->name_off);
+		if (!str) {
+			err = -1;
+			break;
+		}
+
+		val = tfind(&str, &root, compare);
+		if (!val)
+			continue;
+
+		/*
+		 * We keep pointer for each function name so we can get the original
+		 * array index and have the resulting ids array matching the original
+		 * function array.
+		 *
+		 * Doing it this way allow us to easily test the cookies support,
+		 * because each cookie is attached to particular function/id.
+		 */
+		idx = *val - funcs;
+		ids[idx] = type_id;
+		cnt++;
+	}
+
+error:
+	if (err) {
+		free(ids);
+		ids = NULL;
+	}
+
+out:
+	tdestroy(root, tdestroy_free_nop);
+	btf__free(vmlinux_btf);
+	btf__free(btf);
+	return ids;
+}
+
+static void tracing_multi_test_run(struct tracing_multi *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	prog_fd = bpf_program__fd(skel->progs.test_fentry);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+
+	/* extra +1 count for sleepable programs */
+	ASSERT_EQ(skel->bss->test_result_fentry, FUNCS_CNT + 1, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, FUNCS_CNT + 1, "test_result_fexit");
+}
+
+static void test_skel_api(void)
+{
+	struct tracing_multi *skel;
+	int err;
+
+	skel = tracing_multi__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	err = tracing_multi__attach(skel);
+	if (!ASSERT_OK(err, "tracing_multi__attach"))
+		goto cleanup;
+
+	tracing_multi_test_run(skel);
+
+cleanup:
+	tracing_multi__destroy(skel);
+}
+
+static void test_link_api_pattern(void)
+{
+	struct tracing_multi *skel;
+
+	skel = tracing_multi__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+					"bpf_fentry_test*", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+					"bpf_fentry_test*", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s,
+					"bpf_fentry_test1", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry_s, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit_s = bpf_program__attach_tracing_multi(skel->progs.test_fexit_s,
+					"bpf_fentry_test1", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit_s, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	tracing_multi_test_run(skel);
+
+cleanup:
+	tracing_multi__destroy(skel);
+}
+
+static void test_link_api_ids(void)
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi *skel;
+	size_t cnt = FUNCS_CNT;
+	__u32 *ids;
+
+	skel = tracing_multi__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	ids = get_ids(bpf_fentry_test, cnt, NULL);
+	if (!ASSERT_OK_PTR(ids, "get_ids"))
+		goto cleanup;
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	/* Only bpf_fentry_test1 is allowed for sleepable programs. */
+	opts.cnt = 1;
+	skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry_s, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit_s = bpf_program__attach_tracing_multi(skel->progs.test_fexit_s,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit_s, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	tracing_multi_test_run(skel);
+
+cleanup:
+	tracing_multi__destroy(skel);
+	free(ids);
+}
+
+void test_tracing_multi_test(void)
+{
+#ifndef __x86_64__
+	test__skip();
+	return;
+#endif
+
+	if (test__start_subtest("skel_api"))
+		test_skel_api();
+	if (test__start_subtest("link_api_pattern"))
+		test_link_api_pattern();
+	if (test__start_subtest("link_api_ids"))
+		test_link_api_ids();
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_attach.c
new file mode 100644
index 000000000000..332d0a423a43
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_attach.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return);
+
+__u64 test_result_fentry = 0;
+__u64 test_result_fexit = 0;
+
+SEC("fentry.multi/bpf_fentry_test*")
+int BPF_PROG(test_fentry)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry, false);
+	return 0;
+}
+
+SEC("fexit.multi/bpf_fentry_test*")
+int BPF_PROG(test_fexit)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit, true);
+	return 0;
+}
+
+SEC("fentry.multi.s/bpf_fentry_test1")
+int BPF_PROG(test_fentry_s)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry, false);
+	return 0;
+}
+
+SEC("fexit.multi.s/bpf_fentry_test1")
+int BPF_PROG(test_fexit_s)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit, true);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_check.c b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
new file mode 100644
index 000000000000..333a3a7bae8a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int pid = 0;
+
+/* bpf_fentry_test1 is exported as kfunc via vmlinux.h */
+extern const void bpf_fentry_test2 __ksym;
+extern const void bpf_fentry_test3 __ksym;
+extern const void bpf_fentry_test4 __ksym;
+extern const void bpf_fentry_test5 __ksym;
+extern const void bpf_fentry_test6 __ksym;
+extern const void bpf_fentry_test7 __ksym;
+extern const void bpf_fentry_test8 __ksym;
+extern const void bpf_fentry_test9 __ksym;
+extern const void bpf_fentry_test10 __ksym;
+
+int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
+{
+	void *ip = (void *) bpf_get_func_ip(ctx);
+	__u64 value = 0, ret = 0;
+	long err = 0;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 1;
+
+	if (is_return)
+		err |= bpf_get_func_ret(ctx, &ret);
+
+	if (ip == &bpf_fentry_test1) {
+		int a;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (int) value;
+
+		err |= is_return ? ret != 2 : 0;
+
+		*test_result += err == 0 && a == 1;
+	} else if (ip == &bpf_fentry_test2) {
+		__u64 b;
+		int a;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (int) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = value;
+
+		err |= is_return ? ret != 5 : 0;
+
+		*test_result += err == 0 && a == 2 && b == 3;
+	} else if (ip == &bpf_fentry_test3) {
+		__u64 c;
+		char a;
+		int b;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (char) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (int) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = value;
+
+		err |= is_return ? ret != 15 : 0;
+
+		*test_result += err == 0 && a == 4 && b == 5 && c == 6;
+	} else if (ip == &bpf_fentry_test4) {
+		void *a;
+		char b;
+		int c;
+		__u64 d;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (void *) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (char) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = (int) value;
+		err |= bpf_get_func_arg(ctx, 3, &value);
+		d = value;
+
+		err |= is_return ? ret != 34 : 0;
+
+		*test_result += err == 0 && a == (void *) 7 && b == 8 && c == 9 && d == 10;
+	} else if (ip == &bpf_fentry_test5) {
+		__u64 a;
+		void *b;
+		short c;
+		int d;
+		__u64 e;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (void *) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = (short) value;
+		err |= bpf_get_func_arg(ctx, 3, &value);
+		d = (int) value;
+		err |= bpf_get_func_arg(ctx, 4, &value);
+		e = value;
+
+		err |= is_return ? ret != 65 : 0;
+
+		*test_result += err == 0 && a == 11 && b == (void *) 12 && c == 13 && d == 14 && e == 15;
+	} else if (ip == &bpf_fentry_test6) {
+		__u64 a;
+		void *b;
+		short c;
+		int d;
+		void *e;
+		__u64 f;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (void *) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = (short) value;
+		err |= bpf_get_func_arg(ctx, 3, &value);
+		d = (int) value;
+		err |= bpf_get_func_arg(ctx, 4, &value);
+		e = (void *) value;
+		err |= bpf_get_func_arg(ctx, 5, &value);
+		f = value;
+
+		err |= is_return ? ret != 111 : 0;
+
+		*test_result += err == 0 && a == 16 && b == (void *) 17 && c == 18 && d == 19 && e == (void *) 20 && f == 21;
+	} else if (ip == &bpf_fentry_test7) {
+		err |= is_return ? ret != 0 : 0;
+
+		*test_result += err == 0 ? 1 : 0;
+	} else if (ip == &bpf_fentry_test8) {
+		err |= is_return ? ret != 0 : 0;
+
+		*test_result += err == 0 ? 1 : 0;
+	} else if (ip == &bpf_fentry_test9) {
+		err |= is_return ? ret != 0 : 0;
+
+		*test_result += err == 0 ? 1 : 0;
+	} else if (ip == &bpf_fentry_test10) {
+		err |= is_return ? ret != 0 : 0;
+
+		*test_result += err == 0 ? 1 : 0;
+	}
+
+	return 0;
+}
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 20/29] libbpf: Add support to create tracing multi link
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Adding bpf_program__attach_tracing_multi function for attaching
tracing program to multiple functions.

  struct bpf_link *
  bpf_program__attach_tracing_multi(const struct bpf_program *prog,
                                    const char *pattern,
                                    const struct bpf_tracing_multi_opts *opts);

User can specify functions to attach with 'pattern' argument that
allows wildcards ('*' and '?' supported) or provide BTF ids of
functions in array directly via opts argument. These options are
mutually exclusive.

When using BTF ids, user can also provide cookie value for each
provided id/function, that can be retrieved later in bpf program
with bpf_get_attach_cookie helper. Each cookie value is paired with
provided BTF id with the same array index.

Adding support to auto attach programs with following sections:

  fsession.multi/<pattern>
  fsession.multi.s/<pattern>
  fentry.multi/<pattern>
  fexit.multi/<pattern>
  fentry.multi.s/<pattern>
  fexit.multi.s/<pattern>

The provided <pattern> is used as 'pattern' argument in
bpf_program__attach_tracing_multi function.

The <pattern> allows to specify optional kernel module name with
following syntax:

  <module>:<function_pattern>

In order to attach tracing_multi link to module functions:
- program must be loaded with 'module' btf fd
  (in attr::attach_btf_obj_fd)
- bpf_program__attach_tracing_multi must either have
  pattern with module spec or BTF ids from the module

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/lib/bpf/libbpf.c   | 276 +++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.h   |  15 +++
 tools/lib/bpf/libbpf.map |   1 +
 3 files changed, 292 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 42f0efd70327..1368752aa13c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7772,6 +7772,69 @@ static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program
 static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attach_name,
 				     int *btf_obj_fd, int *btf_type_id);
 
+static inline bool is_tracing_multi(enum bpf_attach_type type)
+{
+	return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI ||
+	       type == BPF_TRACE_FSESSION_MULTI;
+}
+
+static const struct module_btf *find_attach_module(struct bpf_object *obj, const char *attach)
+{
+	const char *sep, *mod_name = NULL;
+	int i, mod_len, err;
+
+	/*
+	 * We expect attach string in the form of either
+	 * - function_pattern or
+	 * - <module>:function_pattern
+	 */
+	sep = strchr(attach, ':');
+	if (sep) {
+		mod_name = attach;
+		mod_len = sep - mod_name;
+	}
+	if (!mod_name)
+		return NULL;
+
+	err = load_module_btfs(obj);
+	if (err)
+		return NULL;
+
+	for (i = 0; i < obj->btf_module_cnt; i++) {
+		const struct module_btf *mod = &obj->btf_modules[i];
+
+		if (strncmp(mod->name, mod_name, mod_len) == 0 && mod->name[mod_len] == '\0')
+			return mod;
+	}
+	return NULL;
+}
+
+static int tracing_multi_mod_fd(struct bpf_program *prog, int *btf_obj_fd)
+{
+	const char *attach_name, *sep;
+	const struct module_btf *mod;
+
+	*btf_obj_fd = 0;
+	attach_name = strchr(prog->sec_name, '/');
+
+	/* Program with no details in spec, using kernel btf. */
+	if (!attach_name)
+		return 0;
+
+	/* Program with no module section, using kernel btf. */
+	sep = strchr(++attach_name, ':');
+	if (!sep)
+		return 0;
+
+	/* Program with module specified, get its btf fd. */
+	mod = find_attach_module(prog->obj, attach_name);
+	if (!mod)
+		return -EINVAL;
+
+	*btf_obj_fd = mod->fd;
+	return 0;
+}
+
 /* this is called as prog->sec_def->prog_prepare_load_fn for libbpf-supported sec_defs */
 static int libbpf_prepare_prog_load(struct bpf_program *prog,
 				    struct bpf_prog_load_opts *opts, long cookie)
@@ -7835,6 +7898,18 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
 		opts->attach_btf_obj_fd = btf_obj_fd;
 		opts->attach_btf_id = btf_type_id;
 	}
+
+	if (is_tracing_multi(prog->expected_attach_type)) {
+		int err, btf_obj_fd = 0;
+
+		err = tracing_multi_mod_fd(prog, &btf_obj_fd);
+		if (err < 0)
+			return err;
+
+		prog->attach_btf_obj_fd = btf_obj_fd;
+		opts->attach_btf_obj_fd = btf_obj_fd;
+	}
+
 	return 0;
 }
 
@@ -9996,6 +10071,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, st
 static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
+static int attach_tracing_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 
 static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("socket",		SOCKET_FILTER, 0, SEC_NONE),
@@ -10049,6 +10125,12 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("fexit.s+",		TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
 	SEC_DEF("fsession+",		TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF, attach_trace),
 	SEC_DEF("fsession.s+",		TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
+	SEC_DEF("fsession.multi+",	TRACING, BPF_TRACE_FSESSION_MULTI, 0, attach_tracing_multi),
+	SEC_DEF("fsession.multi.s+",	TRACING, BPF_TRACE_FSESSION_MULTI, SEC_SLEEPABLE, attach_tracing_multi),
+	SEC_DEF("fentry.multi+",	TRACING, BPF_TRACE_FENTRY_MULTI, 0, attach_tracing_multi),
+	SEC_DEF("fexit.multi+",		TRACING, BPF_TRACE_FEXIT_MULTI, 0, attach_tracing_multi),
+	SEC_DEF("fentry.multi.s+",	TRACING, BPF_TRACE_FENTRY_MULTI, SEC_SLEEPABLE, attach_tracing_multi),
+	SEC_DEF("fexit.multi.s+",	TRACING, BPF_TRACE_FEXIT_MULTI, SEC_SLEEPABLE, attach_tracing_multi),
 	SEC_DEF("freplace+",		EXT, 0, SEC_ATTACH_BTF, attach_trace),
 	SEC_DEF("lsm+",			LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
 	SEC_DEF("lsm.s+",		LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
@@ -12529,6 +12611,200 @@ bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t)
 	return true;
 }
 
+static int
+collect_btf_func_ids_by_glob(const struct btf *btf, const char *pattern, __u32 **ids)
+{
+	__u32 type_id, nr_types = btf__type_cnt(btf);
+	size_t cap = 0, cnt = 0;
+
+	if (!pattern)
+		return -EINVAL;
+
+	for (type_id = 1; type_id < nr_types; type_id++) {
+		const struct btf_type *t = btf__type_by_id(btf, type_id);
+		const char *name;
+		int err;
+
+		if (btf_kind(t) != BTF_KIND_FUNC)
+			continue;
+		name = btf__name_by_offset(btf, t->name_off);
+		if (!name)
+			continue;
+
+		if (!glob_match(name, pattern))
+			continue;
+		if (!btf_type_is_traceable_func(btf, t))
+			continue;
+
+		err = libbpf_ensure_mem((void **) ids, &cap, sizeof(**ids), cnt + 1);
+		if (err) {
+			free(*ids);
+			return -ENOMEM;
+		}
+		(*ids)[cnt++] = type_id;
+	}
+
+	return cnt;
+}
+
+static int collect_func_ids_by_glob(const struct bpf_program *prog, const char *pattern, __u32 **ids)
+{
+	struct bpf_object *obj = prog->obj;
+	const struct module_btf *mod;
+	struct btf *btf = NULL;
+	const char *sep;
+	int err;
+
+	err = bpf_object__load_vmlinux_btf(obj, true);
+	if (err)
+		return err;
+
+	/* In case we have module specified, we will find its btf and use that. */
+	sep = strchr(pattern, ':');
+	if (sep) {
+		mod = find_attach_module(obj, pattern);
+		if (!mod) {
+			err = -EINVAL;
+			goto cleanup;
+		}
+		btf = mod->btf;
+		pattern = sep + 1;
+	} else {
+		/* Program is loaded for kernel module. */
+		if (prog->attach_btf_obj_fd) {
+			err = -EINVAL;
+			goto cleanup;
+		}
+		btf = obj->btf_vmlinux;
+	}
+
+	err = collect_btf_func_ids_by_glob(btf, pattern, ids);
+
+cleanup:
+	bpf_object_cleanup_btf(obj);
+	return err;
+}
+
+struct bpf_link *
+bpf_program__attach_tracing_multi(const struct bpf_program *prog, const char *pattern,
+				  const struct bpf_tracing_multi_opts *opts)
+{
+	LIBBPF_OPTS(bpf_link_create_opts, lopts);
+	int prog_fd, link_fd, err, cnt;
+	__u32 *free_ids = NULL;
+	struct bpf_link *link;
+	const __u64 *cookies;
+	const __u32 *ids;
+
+	if (!OPTS_VALID(opts, bpf_tracing_multi_opts))
+		return libbpf_err_ptr(-EINVAL);
+
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n",
+			prog->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+
+	cnt = OPTS_GET(opts, cnt, 0);
+	ids = OPTS_GET(opts, ids, NULL);
+	cookies = OPTS_GET(opts, cookies, NULL);
+
+	if (!!ids != !!cnt)
+		return libbpf_err_ptr(-EINVAL);
+	if (pattern && (ids || cookies))
+		return libbpf_err_ptr(-EINVAL);
+	if (!pattern && !ids)
+		return libbpf_err_ptr(-EINVAL);
+
+	if (pattern) {
+		cnt = collect_func_ids_by_glob(prog, pattern, &free_ids);
+		if (cnt < 0)
+			return libbpf_err_ptr(cnt);
+		if (cnt == 0)
+			return libbpf_err_ptr(-EINVAL);
+		ids = (const __u32 *) free_ids;
+	}
+
+	lopts.tracing_multi.ids = ids;
+	lopts.tracing_multi.cookies = cookies;
+	lopts.tracing_multi.cnt = cnt;
+
+	link = calloc(1, sizeof(*link));
+	if (!link) {
+		err = -ENOMEM;
+		goto error;
+	}
+	link->detach = &bpf_link__detach_fd;
+
+	link_fd = bpf_link_create(prog_fd, 0, prog->expected_attach_type, &lopts);
+	if (link_fd < 0) {
+		err = -errno;
+		pr_warn("prog '%s': failed to attach: %s\n", prog->name, errstr(err));
+		goto error;
+	}
+	link->fd = link_fd;
+	free(free_ids);
+	return link;
+
+error:
+	free(link);
+	free(free_ids);
+	return libbpf_err_ptr(err);
+}
+
+static int attach_tracing_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link)
+{
+	static const char *const prefixes[] = {
+		"fentry.multi",
+		"fexit.multi",
+		"fsession.multi",
+		"fentry.multi.s",
+		"fexit.multi.s",
+		"fsession.multi.s",
+	};
+	const char *spec = NULL;
+	char *pattern;
+	size_t i;
+	int n;
+
+	*link = NULL;
+
+	for (i = 0; i < ARRAY_SIZE(prefixes); i++) {
+		size_t pfx_len;
+
+		if (!str_has_pfx(prog->sec_name, prefixes[i]))
+			continue;
+
+		pfx_len = strlen(prefixes[i]);
+		/* no auto-attach case of, e.g., SEC("fentry.multi") */
+		if (prog->sec_name[pfx_len] == '\0')
+			return 0;
+
+		if (prog->sec_name[pfx_len] != '/')
+			continue;
+
+		spec = prog->sec_name + pfx_len + 1;
+		break;
+	}
+
+	if (!spec) {
+		pr_warn("prog '%s': invalid section name '%s'\n",
+			prog->name, prog->sec_name);
+		return -EINVAL;
+	}
+
+	n = sscanf(spec, "%m[a-zA-Z0-9_.*?:]", &pattern);
+	if (n < 1) {
+		pr_warn("tracing multi pattern is invalid: %s\n", spec);
+		return -EINVAL;
+	}
+
+	*link = bpf_program__attach_tracing_multi(prog, pattern, NULL);
+	free(pattern);
+	return libbpf_get_error(*link);
+}
+
 static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe,
 					  const char *binary_path, size_t offset)
 {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index bba4e8464396..b965ad571540 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -726,6 +726,21 @@ bpf_program__attach_ksyscall(const struct bpf_program *prog,
 			     const char *syscall_name,
 			     const struct bpf_ksyscall_opts *opts);
 
+struct bpf_tracing_multi_opts {
+	/* size of this struct, for forward/backward compatibility */
+	size_t sz;
+	const __u32 *ids;	/* NOTE(review): presumably BTF ids of the target functions - confirm */
+	const __u64 *cookies;	/* NOTE(review): presumably one cookie per ids[] entry - confirm */
+	size_t cnt;	/* NOTE(review): presumably the number of ids[]/cookies[] entries - confirm */
+	size_t :0;
+};
+
+#define bpf_tracing_multi_opts__last_field cnt
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_tracing_multi(const struct bpf_program *prog, const char *pattern,
+				  const struct bpf_tracing_multi_opts *opts);
+
 struct bpf_uprobe_opts {
 	/* size of this struct, for forward/backward compatibility */
 	size_t sz;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index dfed8d60af05..b731df19ae69 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -458,6 +458,7 @@ LIBBPF_1.7.0 {
 
 LIBBPF_1.8.0 {
 	global:
+		bpf_program__attach_tracing_multi;
 		bpf_program__clone;
 		btf__new_empty_opts;
 } LIBBPF_1.7.0;
-- 
2.54.0


^ permalink raw reply related

* [PATCHv8 bpf-next 19/29] libbpf: Add btf_type_is_traceable_func function
From: Jiri Olsa @ 2026-06-06 12:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: bpf, linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <20260606123955.345967-1-jolsa@kernel.org>

Add the btf_type_is_traceable_func() function, which performs the same
checks as the kernel's btf_distill_func_proto() function, to prevent
attachment to functions that do not pass them.

Export the function via libbpf_internal.h because it will be used
by a benchmark test in the following changes.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/lib/bpf/libbpf.c          | 79 +++++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf_internal.h |  1 +
 2 files changed, 80 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5bdaa5eb1f50..42f0efd70327 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -12450,6 +12450,85 @@ static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, stru
 	return ret;
 }
 
+#define MAX_BPF_FUNC_ARGS 12
+
+static bool btf_type_is_modifier(const struct btf_type *t)
+{
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_CONST:
+	case BTF_KIND_RESTRICT:
+	case BTF_KIND_TYPE_TAG:
+		return true;	/* kinds btf_get_type_size() steps through via t->type */
+	default:
+		return false;
+	}
+}
+
+#define MAX_RESOLVE_DEPTH 32
+
+static int btf_get_type_size(const struct btf *btf, __u32 type_id,
+			     const struct btf_type **ret_type)
+{
+	const struct btf_type *t;
+	int i;
+
+	*ret_type = btf__type_by_id(btf, 0);	/* default result: type id 0 */
+	if (!type_id)
+		return 0;	/* id 0 reports size 0; the caller treats that as void */
+	t = btf__type_by_id(btf, type_id);
+	for (i = 0; i < MAX_RESOLVE_DEPTH && t && btf_type_is_modifier(t); i++)
+		t = btf__type_by_id(btf, t->type);	/* step past one modifier */
+	if (!t || i == MAX_RESOLVE_DEPTH)
+		return -EINVAL;	/* lookup failed or depth limit reached */
+	*ret_type = t;
+	if (btf_is_ptr(t))
+		return btf__pointer_size(btf);
+	if (btf_is_int(t) || btf_is_any_enum(t) || btf_is_struct(t) || btf_is_union(t))
+		return t->size;
+	return -EINVAL;	/* no size is reported for any other kind */
+}
+
+bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t)
+{
+	const struct btf_param *args;
+	const struct btf_type *proto;
+	__u32 i, nargs;
+	int ret;
+
+	if (!btf_is_func(t))
+		return false;
+	proto = btf__type_by_id(btf, t->type);
+	if (!proto || !btf_is_func_proto(proto))
+		return false;
+
+	args = (const struct btf_param *)(proto + 1);
+	nargs = btf_vlen(proto);
+	if (nargs > MAX_BPF_FUNC_ARGS)
+		return false;
+
+	/* No support for struct or union return type. */
+	ret = btf_get_type_size(btf, proto->type, &t);	/* t is reused for the resolved type */
+	if (ret < 0 || btf_is_struct(t) || btf_is_union(t))
+		return false;
+
+	for (i = 0; i < nargs; i++) {
+		/* No support for variable args. */
+		if (i == nargs - 1 && args[i].type == 0)
+			return false;
+		ret = btf_get_type_size(btf, args[i].type, &t);
+		/* No support for arguments larger than 16 bytes. */
+		if (ret < 0 || ret > 16)
+			return false;
+		/* No support for void argument. */
+		if (ret == 0)
+			return false;
+	}
+
+	return true;
+}
+
 static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe,
 					  const char *binary_path, size_t offset)
 {
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 7d93c6c01d60..04cd303fb5a8 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -250,6 +250,7 @@ const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, _
 const struct btf_header *btf_header(const struct btf *btf);
 void btf_set_base_btf(struct btf *btf, const struct btf *base_btf);
 int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **id_map);
+bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t);
 
 static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t)
 {
-- 
2.54.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox