* [PATCH 5/9] rtla/tests: Extend timerlat top --aa-only coverage
From: Tomas Glozar @ 2026-04-23 13:05 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, LKML, linux-trace-kernel
In-Reply-To: <20260423130558.882022-1-tglozar@redhat.com>
rtla-timerlat-top's --aa-only option is currently only tested for return
value.
Extend the tests to also check that only auto-analysis is being done via
a negative match for the "Timer Latency" text in the top header, and
further split the test case into two:
- one test case for --aa-only stopping on threshold
- one test case for --aa-only exiting without threshold being hit
For both cases, the expected output ("analyzing it" or "Max latency was",
respectively) is also verified, in addition to the negative match.
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
tools/tracing/rtla/tests/timerlat.t | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index f47a82c115c7..28c01d8b299d 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -35,8 +35,10 @@ check_top_hist "set the automatic trace mode" \
"timerlat TOOL -a 5" 2 "analyzing it"
check_top_hist "dump tasks" \
"timerlat TOOL -a 5 --dump-tasks" 2 "Printing CPU tasks"
-check "print the auto-analysis if hits the stop tracing condition" \
- "timerlat top --aa-only 5" 2
+check "verify --aa-only stop on threshold" \
+ "timerlat top --aa-only 5" 2 "analyzing it" "Timer Latency"
+check "verify --aa-only max latency" \
+ "timerlat top --aa-only 2000000 -d 1s" 0 "^ Max latency was" "Timer Latency"
check_top_hist "disable auto-analysis" \
"timerlat TOOL -s 3 -T 10 -t --no-aa" 2 "" "analyzing it"
check_top_q_hist "verify -c/--cpus" \
--
2.53.0
^ permalink raw reply related
* [PATCH 6/9] rtla/tests: Cover all hist options in runtime tests
From: Tomas Glozar @ 2026-04-23 13:05 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, LKML, linux-trace-kernel
In-Reply-To: <20260423130558.882022-1-tglozar@redhat.com>
Cover all options regarding histogram formatting for both
rtla-osnoise-hist and rtla-timerlat-hist tools. All options also have
output checking using positive or negative match, except for
-b/--bucket-size and -E/--entries, which cannot be tested in isolation
because their output depends on the actual data collected.
Old -E/--entries test for rtla-osnoise was replaced with a new one
equivalent to the timerlat one.
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
tools/tracing/rtla/tests/osnoise.t | 18 ++++++++++++++++--
tools/tracing/rtla/tests/timerlat.t | 20 ++++++++++++++++++++
2 files changed, 36 insertions(+), 2 deletions(-)
diff --git a/tools/tracing/rtla/tests/osnoise.t b/tools/tracing/rtla/tests/osnoise.t
index 5edffb23981b..773a46e2dc5f 100644
--- a/tools/tracing/rtla/tests/osnoise.t
+++ b/tools/tracing/rtla/tests/osnoise.t
@@ -16,11 +16,25 @@ check_top_q_hist "verify the --stop/-s param" \
"osnoise TOOL -s 30 -T 1" 2 "osnoise hit stop tracing"
check_top_q_hist "verify the --trace param" \
"osnoise TOOL -s 30 -T 1 -t" 2 "Saving trace to osnoise_trace.txt"
-check "verify the --entries/-E param" \
- "osnoise hist -P F:1 -c 0 -r 900000 -d 10s -b 10 -E 25"
check_top_q_hist "verify the -c/--cpus param" \
"osnoise TOOL -P F:1 -c 0 -r 900000 -d 10s -S 1 --on-threshold shell,command=tests/scripts/check-cpus.sh" 2 "^Affinity of threads: 0$"
+# Histogram tests
+check "hist with -b/--bucket-size" \
+ "osnoise hist -b 1 -d 1s"
+check "hist with -E/--entries" \
+ "osnoise hist -E 10 -d 1s"
+check "hist with -E/--entries out of range" \
+ "osnoise hist -E 1 -d 1s" 1 "^Entries must be > 10 and < 9999999$"
+check "hist with --no-header" \
+ "osnoise hist --no-header -d 1s" 0 "" "RTLA osnoise histogram"
+check "hist with --with-zeros" \
+ "osnoise hist --with-zeros -b 100000 -E 21 -d 1s" 0 '^2000000\s+0\s+'
+check "hist with --no-index" \
+ "osnoise hist --no-index --with-zeros -d 1s" 0 "" "^count:"
+check "hist with --no-summary" \
+ "osnoise hist --no-summary -d 1s" 0 "" "^count:"
+
# Test setting default period by putting an absurdly high period
# and stopping on threshold.
# If default period is not set, this will time out.
diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index 28c01d8b299d..a14d9ec32ede 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -44,6 +44,26 @@ check_top_hist "disable auto-analysis" \
check_top_q_hist "verify -c/--cpus" \
"timerlat TOOL -c 0 -d 10s -T 1 --on-threshold shell,command=tests/scripts/check-cpus.sh" 2 "^Affinity of threads: 0$"
+# Histogram tests
+check "hist with -b/--bucket-size" \
+ "timerlat hist -b 1 -d 1s"
+check "hist with -E/--entries" \
+ "timerlat hist -E 10 -d 1s"
+check "hist with -E/--entries out of range" \
+ "timerlat hist -E 1 -d 1s" 1 "^Entries must be > 10 and < 9999999$"
+check "hist with --no-header" \
+ "timerlat hist --no-header -d 1s" 0 "" "RTLA timerlat histogram"
+check "hist with --with-zeros" \
+ "timerlat hist --with-zeros -b 100000 -E 21 -d 1s" 0 '^2000000\s+0\s+'
+check "hist with --no-index" \
+ "timerlat hist --no-index --with-zeros -d 1s" 0 "" "^count:"
+check "hist with --no-summary" \
+ "timerlat hist --no-summary -d 1s" 0 "" "^ALL:"
+check "hist with --no-irq" \
+ "timerlat hist --no-irq -d 1s" 0 "" "IRQ-"
+check "hist with --no-thread" \
+ "timerlat hist --no-thread -d 1s" 0 "" "Thr-"
+
# Actions tests
check_top_q_hist "trace output through -t" \
"timerlat TOOL -T 2 -t" 2 "^ Saving trace to timerlat_trace.txt$"
--
2.53.0
^ permalink raw reply related
* [PATCH 7/9] rtla/tests: Add runtime test for -H/--house-keeping
From: Tomas Glozar @ 2026-04-23 13:05 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, LKML, linux-trace-kernel
In-Reply-To: <20260423130558.882022-1-tglozar@redhat.com>
Add a runtime test for -H/--house-keeping option for both osnoise and
timerlat tools, with affinity checking similar to what is done for
-c/--cpus.
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
tools/tracing/rtla/tests/osnoise.t | 2 ++
tools/tracing/rtla/tests/scripts/check-housekeeping-cpus.sh | 4 ++++
tools/tracing/rtla/tests/timerlat.t | 2 ++
3 files changed, 8 insertions(+)
create mode 100755 tools/tracing/rtla/tests/scripts/check-housekeeping-cpus.sh
diff --git a/tools/tracing/rtla/tests/osnoise.t b/tools/tracing/rtla/tests/osnoise.t
index 773a46e2dc5f..cdea84914345 100644
--- a/tools/tracing/rtla/tests/osnoise.t
+++ b/tools/tracing/rtla/tests/osnoise.t
@@ -18,6 +18,8 @@ check_top_q_hist "verify the --trace param" \
"osnoise TOOL -s 30 -T 1 -t" 2 "Saving trace to osnoise_trace.txt"
check_top_q_hist "verify the -c/--cpus param" \
"osnoise TOOL -P F:1 -c 0 -r 900000 -d 10s -S 1 --on-threshold shell,command=tests/scripts/check-cpus.sh" 2 "^Affinity of threads: 0$"
+check_top_q_hist "verify the -H/--house-keeping param" \
+ "osnoise TOOL -P F:1 -H 0 -r 900000 -d 10s -S 1 --on-threshold shell,command=tests/scripts/check-housekeeping-cpus.sh" 2 "^Affinity of threads: 0$"
# Histogram tests
check "hist with -b/--bucket-size" \
diff --git a/tools/tracing/rtla/tests/scripts/check-housekeeping-cpus.sh b/tools/tracing/rtla/tests/scripts/check-housekeeping-cpus.sh
new file mode 100755
index 000000000000..4742f34efb49
--- /dev/null
+++ b/tools/tracing/rtla/tests/scripts/check-housekeeping-cpus.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+pid=$(ps -o ppid= $$)
+echo "Affinity of threads:$(taskset -c -p $pid | cut -d ':' -f 2)"
diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index a14d9ec32ede..20f68bcbcb27 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -43,6 +43,8 @@ check_top_hist "disable auto-analysis" \
"timerlat TOOL -s 3 -T 10 -t --no-aa" 2 "" "analyzing it"
check_top_q_hist "verify -c/--cpus" \
"timerlat TOOL -c 0 -d 10s -T 1 --on-threshold shell,command=tests/scripts/check-cpus.sh" 2 "^Affinity of threads: 0$"
+check_top_q_hist "verify -H/--house-keeping" \
+ "timerlat TOOL -H 0 -d 10s -T 1 --on-threshold shell,command=tests/scripts/check-housekeeping-cpus.sh" 2 "^Affinity of threads: 0$"
# Histogram tests
check "hist with -b/--bucket-size" \
--
2.53.0
^ permalink raw reply related
* [PATCH 8/9] rtla/tests: Add runtime test for -k and -u options
From: Tomas Glozar @ 2026-04-23 13:05 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, LKML, linux-trace-kernel
In-Reply-To: <20260423130558.882022-1-tglozar@redhat.com>
Add runtime test for rtla-timerlat's -k/--kernel-threads and
-u/--user-threads options using get_workload_pids.sh to check whether
the appropriate threads are being created.
The tests are implemented for both top and hist. Additionally, all tests
related to timerlat threads are moved to a separate section in the test
files. The latter is also done for rtla-osnoise tests.
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
tools/tracing/rtla/tests/osnoise.t | 8 +++++---
.../tests/scripts/check-user-kernel-threads.sh | 16 ++++++++++++++++
tools/tracing/rtla/tests/timerlat.t | 12 +++++++++---
3 files changed, 30 insertions(+), 6 deletions(-)
create mode 100755 tools/tracing/rtla/tests/scripts/check-user-kernel-threads.sh
diff --git a/tools/tracing/rtla/tests/osnoise.t b/tools/tracing/rtla/tests/osnoise.t
index cdea84914345..d0b623233db5 100644
--- a/tools/tracing/rtla/tests/osnoise.t
+++ b/tools/tracing/rtla/tests/osnoise.t
@@ -9,13 +9,15 @@ check "verify help page" \
"osnoise --help" 0 "osnoise version"
check_top_hist "verify help page" \
"osnoise TOOL --help" 0 "rtla osnoise"
-check_top_q_hist "verify the --priority/-P param" \
- "osnoise TOOL -P F:1 -c 0 -r 900000 -d 10s -S 1 --on-threshold shell,command=\"tests/scripts/check-priority.sh SCHED_FIFO 1\"" \
- 2 "Priorities are set correctly"
check_top_q_hist "verify the --stop/-s param" \
"osnoise TOOL -s 30 -T 1" 2 "osnoise hit stop tracing"
check_top_q_hist "verify the --trace param" \
"osnoise TOOL -s 30 -T 1 -t" 2 "Saving trace to osnoise_trace.txt"
+
+# Thread tests
+check_top_q_hist "verify the --priority/-P param" \
+ "osnoise TOOL -P F:1 -c 0 -r 900000 -d 10s -S 1 --on-threshold shell,command=\"tests/scripts/check-priority.sh SCHED_FIFO 1\"" \
+ 2 "Priorities are set correctly"
check_top_q_hist "verify the -c/--cpus param" \
"osnoise TOOL -P F:1 -c 0 -r 900000 -d 10s -S 1 --on-threshold shell,command=tests/scripts/check-cpus.sh" 2 "^Affinity of threads: 0$"
check_top_q_hist "verify the -H/--house-keeping param" \
diff --git a/tools/tracing/rtla/tests/scripts/check-user-kernel-threads.sh b/tools/tracing/rtla/tests/scripts/check-user-kernel-threads.sh
new file mode 100755
index 000000000000..bb7ac510a735
--- /dev/null
+++ b/tools/tracing/rtla/tests/scripts/check-user-kernel-threads.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+. "$(dirname $0)/lib/get_workload_pids.sh"
+kthreadd_pid=$(pgrep ^kthreadd$)
+cnt_kernel=0
+cnt_user=0
+for pid in $(get_workload_pids)
+do
+ if [ "$(echo $(ps -o ppid= $pid))" = "$kthreadd_pid" ]
+ then
+ ((++cnt_kernel))
+ else
+ ((++cnt_user))
+ fi
+done
+echo "$cnt_kernel kernel threads, $cnt_user user threads"
diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index 20f68bcbcb27..3557adbdebae 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -26,9 +26,6 @@ check_top_hist "verify help page" \
"timerlat TOOL --help" 0 "rtla timerlat"
check_top_hist "verify -s/--stack" \
"timerlat TOOL -s 3 -T 10 -t" 2 "Blocking thread stack trace"
-check_top_hist "verify -P/--priority" \
- "timerlat TOOL -P F:1 -c 0 -d 10s -T 1 --on-threshold shell,command=\"tests/scripts/check-priority.sh SCHED_FIFO 1\"" \
- 2 "Priorities are set correctly"
check_top_hist "test in nanoseconds" \
"timerlat TOOL -i 2 -c 0 -n -d 10s" 2 "ns"
check_top_hist "set the automatic trace mode" \
@@ -41,10 +38,19 @@ check "verify --aa-only max latency" \
"timerlat top --aa-only 2000000 -d 1s" 0 "^ Max latency was" "Timer Latency"
check_top_hist "disable auto-analysis" \
"timerlat TOOL -s 3 -T 10 -t --no-aa" 2 "" "analyzing it"
+
+# Thread tests
+check_top_hist "verify -P/--priority" \
+ "timerlat TOOL -P F:1 -c 0 -d 10s -T 1 --on-threshold shell,command=\"tests/scripts/check-priority.sh SCHED_FIFO 1\"" \
+ 2 "Priorities are set correctly"
check_top_q_hist "verify -c/--cpus" \
"timerlat TOOL -c 0 -d 10s -T 1 --on-threshold shell,command=tests/scripts/check-cpus.sh" 2 "^Affinity of threads: 0$"
check_top_q_hist "verify -H/--house-keeping" \
"timerlat TOOL -H 0 -d 10s -T 1 --on-threshold shell,command=tests/scripts/check-housekeeping-cpus.sh" 2 "^Affinity of threads: 0$"
+check_top_q_hist "verify -k/--kernel-threads" \
+ "timerlat TOOL -k -c 0 -d 10s -T 1 --on-threshold shell,command=tests/scripts/check-user-kernel-threads.sh" 2 "1 kernel threads, 0 user threads"
+check_top_q_hist "verify -u/--user-threads" \
+ "timerlat TOOL -u -c 0 -d 10s -T 1 --on-threshold shell,command=tests/scripts/check-user-kernel-threads.sh" 2 "0 kernel threads, 1 user threads"
# Histogram tests
check "hist with -b/--bucket-size" \
--
2.53.0
^ permalink raw reply related
* [PATCH 9/9] rtla/tests: Add runtime tests for -C/--cgroup
From: Tomas Glozar @ 2026-04-23 13:05 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, LKML, linux-trace-kernel
In-Reply-To: <20260423130558.882022-1-tglozar@redhat.com>
Add a new script check-cgroup-match.sh that retrieves the cgroup of the
main rtla process and compares it to the cgroup of the rtla workload
threads.
Add a new test based on this script, for both osnoise and timerlat
tools, testing the variant of -C without argument (which sets the cgroup
of the workload to the cgroup of the rtla main process).
Note that this has to be tested in kernel mode to be significant for
timerlat tool, as user workloads inherit the parent rtla process cgroup
even without the option.
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
tools/tracing/rtla/tests/osnoise.t | 3 +++
.../rtla/tests/scripts/check-cgroup-match.sh | 17 +++++++++++++++++
tools/tracing/rtla/tests/timerlat.t | 3 +++
3 files changed, 23 insertions(+)
create mode 100755 tools/tracing/rtla/tests/scripts/check-cgroup-match.sh
diff --git a/tools/tracing/rtla/tests/osnoise.t b/tools/tracing/rtla/tests/osnoise.t
index d0b623233db5..06787471d0e8 100644
--- a/tools/tracing/rtla/tests/osnoise.t
+++ b/tools/tracing/rtla/tests/osnoise.t
@@ -18,6 +18,9 @@ check_top_q_hist "verify the --trace param" \
check_top_q_hist "verify the --priority/-P param" \
"osnoise TOOL -P F:1 -c 0 -r 900000 -d 10s -S 1 --on-threshold shell,command=\"tests/scripts/check-priority.sh SCHED_FIFO 1\"" \
2 "Priorities are set correctly"
+check_top_q_hist "verify the -C/--cgroup param" \
+ "osnoise TOOL -C -c 0 -r 900000 -d 10s -S 1 --on-threshold shell,command=\"tests/scripts/check-cgroup-match.sh\"" \
+ 2 "cgroup matches for all workload PIDs"
check_top_q_hist "verify the -c/--cpus param" \
"osnoise TOOL -P F:1 -c 0 -r 900000 -d 10s -S 1 --on-threshold shell,command=tests/scripts/check-cpus.sh" 2 "^Affinity of threads: 0$"
check_top_q_hist "verify the -H/--house-keeping param" \
diff --git a/tools/tracing/rtla/tests/scripts/check-cgroup-match.sh b/tools/tracing/rtla/tests/scripts/check-cgroup-match.sh
new file mode 100755
index 000000000000..fdc2c68c5957
--- /dev/null
+++ b/tools/tracing/rtla/tests/scripts/check-cgroup-match.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+. "$(dirname $0)/lib/get_workload_pids.sh"
+rtla_pid=$(echo $(ps -o ppid= $$))
+rtla_cgroup=$(</proc/$rtla_pid/cgroup)
+echo "RTLA cgroup: $rtla_cgroup"
+for pid in $(get_workload_pids)
+do
+ pid_cgroup=$(</proc/$pid/cgroup)
+ echo "PID $pid cgroup: $pid_cgroup"
+ if ! [ "$pid_cgroup" = "$rtla_cgroup" ]
+ then
+ echo "Mismatch!"
+ exit 0
+ fi
+done
+echo "cgroup matches for all workload PIDs"
diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index 3557adbdebae..3ebfe316b39e 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -43,6 +43,9 @@ check_top_hist "disable auto-analysis" \
check_top_hist "verify -P/--priority" \
"timerlat TOOL -P F:1 -c 0 -d 10s -T 1 --on-threshold shell,command=\"tests/scripts/check-priority.sh SCHED_FIFO 1\"" \
2 "Priorities are set correctly"
+check_top_hist "verify -C/--cgroup" \
+ "timerlat TOOL -k -C -c 0 -d 10s -T 1 --on-threshold shell,command=\"tests/scripts/check-cgroup-match.sh\"" \
+ 2 "cgroup matches for all workload PIDs"
check_top_q_hist "verify -c/--cpus" \
"timerlat TOOL -c 0 -d 10s -T 1 --on-threshold shell,command=tests/scripts/check-cpus.sh" 2 "^Affinity of threads: 0$"
check_top_q_hist "verify -H/--house-keeping" \
--
2.53.0
^ permalink raw reply related
* [PATCH] rtla: Document tests in README
From: Tomas Glozar @ 2026-04-23 13:07 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, LKML, linux-trace-kernel
RTLA tests are not documented anywhere. Mention both runtime and unit
tests in the README, with instructions on how to run them and a list of
dependencies and required system configuration.
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
tools/tracing/rtla/README.txt | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/tools/tracing/rtla/README.txt b/tools/tracing/rtla/README.txt
index a9faee4dbb3a..8a782cd2c171 100644
--- a/tools/tracing/rtla/README.txt
+++ b/tools/tracing/rtla/README.txt
@@ -42,4 +42,34 @@ For development, we suggest the following steps for compiling rtla:
$ make
$ sudo make install
+Running tests
+
+RTLA has two test suites: a runtime test suite and a unit test suite.
+
+The runtime test suite is available as "make check" (root required) and has
+the following dependencies, in addition to RTLA build dependencies:
+
+- Perl
+- Test::Harness / TAP::Harness
+- bash
+- coreutils
+- ldd
+- util-linux
+- procps(-ng)
+- bpftool (if rtla is built against libbpf)
+
+as well as the following required system configuration:
+
+- CONFIG_OSNOISE_TRACER=y
+- CONFIG_TIMERLAT_TRACER=y
+- tracefs mounted and readable at /sys/kernel/tracing
+
+The unit test suite is available as "make unit-tests" and has the following
+dependencies:
+
+- libcheck
+
+Unlike the runtime test suite, root is not required to run unit tests, nor is
+a tracefs/osnoise/timerlat-capable kernel required.
+
For further information, please refer to the rtla man page.
--
2.53.0
^ permalink raw reply related
* Re: [PATCH v2 2/2] module/kallsyms: sort function symbols and use binary search
From: Petr Pavlu @ 2026-04-23 14:00 UTC (permalink / raw)
To: Stanislaw Gruszka
Cc: linux-modules, Sami Tolvanen, Luis Chamberlain, linux-kernel,
linux-trace-kernel, live-patching, Daniel Gomez, Aaron Tomlin,
Steven Rostedt, Masami Hiramatsu, Jordan Rome, Viktor Malik
In-Reply-To: <20260327110005.16499-2-stf_xl@wp.pl>
On 3/27/26 12:00 PM, Stanislaw Gruszka wrote:
> Module symbol lookup via find_kallsyms_symbol() performs a linear scan
> over the entire symtab when resolving an address. The number of symbols
> in module symtabs has grown over the years, largely due to additional
> metadata in non-standard sections, making this lookup very slow.
>
> Improve this by separating function symbols during module load, placing
> them at the beginning of the symtab, sorting them by address, and using
> binary search when resolving addresses in module text.
>
> This also should improve times for linear symbol name lookups, as valid
> function symbols are now located at the beginning of the symtab.
>
> The cost of sorting is small relative to module load time. In repeated
> module load tests [1], depending on .config options, this change
> increases load time between 2% and 4%. With cold caches, the difference
> is not measurable, as memory access latency dominates.
>
> The sorting theoretically could be done in compile time, but much more
> complicated as we would have to simulate kernel addresses resolution
> for symbols, and then correct relocation entries. That would be risky
> if get out of sync.
>
> The improvement can be observed when listing ftrace filter functions.
>
> Before:
>
> root@nano:~# time cat /sys/kernel/tracing/available_filter_functions | wc -l
> 74908
>
> real 0m1.315s
> user 0m0.000s
> sys 0m1.312s
>
> After:
>
> root@nano:~# time cat /sys/kernel/tracing/available_filter_functions | wc -l
> 74911
>
> real 0m0.167s
> user 0m0.004s
> sys 0m0.175s
>
> (there are three more symbols introduced by the patch)
>
> For livepatch modules, the symtab layout is preserved and the existing
> linear search is used. For this case, it should be possible to keep
> the original ELF symtab instead of copying it 1:1, but that is outside
> the scope of this patch.
>
> Link: https://gist.github.com/sgruszka/09f3fb1dad53a97b1aad96e1927ab117 [1]
> Signed-off-by: Stanislaw Gruszka <stf_xl@wp.pl>
Sorry for the delay reviewing this patch.
> ---
> v1 -> v2:
> - fix searching data symbols for CONFIG_KALLSYMS_ALL
> - use kallsyms_symbol_value() in elf_sym_cmp()
>
> include/linux/module.h | 1 +
> kernel/module/internal.h | 1 +
> kernel/module/kallsyms.c | 171 +++++++++++++++++++++++++++++----------
> 3 files changed, 130 insertions(+), 43 deletions(-)
>
> diff --git a/include/linux/module.h b/include/linux/module.h
> index ac254525014c..67c053afa882 100644
> --- a/include/linux/module.h
> +++ b/include/linux/module.h
> @@ -379,6 +379,7 @@ struct module_memory {
> struct mod_kallsyms {
> Elf_Sym *symtab;
> unsigned int num_symtab;
> + unsigned int num_func_syms;
> char *strtab;
> char *typetab;
> };
> diff --git a/kernel/module/internal.h b/kernel/module/internal.h
> index 618202578b42..6a4d498619b1 100644
> --- a/kernel/module/internal.h
> +++ b/kernel/module/internal.h
> @@ -73,6 +73,7 @@ struct load_info {
> bool sig_ok;
> #ifdef CONFIG_KALLSYMS
> unsigned long mod_kallsyms_init_off;
> + unsigned long num_func_syms;
> #endif
> #ifdef CONFIG_MODULE_DECOMPRESS
> #ifdef CONFIG_MODULE_STATS
> diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
> index f23126d804b2..d69e99e67707 100644
> --- a/kernel/module/kallsyms.c
> +++ b/kernel/module/kallsyms.c
> @@ -10,6 +10,7 @@
> #include <linux/kallsyms.h>
> #include <linux/buildid.h>
> #include <linux/bsearch.h>
> +#include <linux/sort.h>
> #include "internal.h"
>
> /* Lookup exported symbol in given range of kernel_symbols */
> @@ -103,6 +104,95 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
> return true;
> }
>
> +static inline bool is_func_symbol(const Elf_Sym *sym)
> +{
> + return sym->st_shndx != SHN_UNDEF && sym->st_size != 0 &&
> + ELF_ST_TYPE(sym->st_info) == STT_FUNC;
> +}
> +
> +static unsigned int bsearch_func_symbol(struct mod_kallsyms *kallsyms,
> + unsigned long addr,
> + unsigned long *bestval,
> + unsigned long *nextval)
> +
> +{
> + unsigned int mid, low = 1, high = kallsyms->num_func_syms + 1;
> + unsigned int best = 0;
> + unsigned long thisval;
> +
> + while (low < high) {
> + mid = low + (high - low) / 2;
> + thisval = kallsyms_symbol_value(&kallsyms->symtab[mid]);
> +
> + if (thisval <= addr) {
> + *bestval = thisval;
> + best = mid;
> + low = mid + 1;
If thisval == addr, the search moves to the right and finds the last
symbol with the same address. I believe it should do the opposite and
return the first symbol to match the behavior of
search_kallsyms_symbol().
> + } else {
> + *nextval = thisval;
> + high = mid;
> + }
> + }
> +
> + return best;
> +}
> +
> +static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms,
> + unsigned int symnum)
> +{
> + return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
> +}
> +
> +static unsigned int search_kallsyms_symbol(struct mod_kallsyms *kallsyms,
> + unsigned long addr,
> + unsigned long *bestval,
> + unsigned long *nextval)
> +{
> + unsigned int i, best = 0;
> +
> + /*
> + * Scan for closest preceding symbol and next symbol. (ELF starts
> + * real symbols at 1). Skip the initial function symbols range
> + * if num_func_syms is non-zero, those are handled separately for
> + * the core TEXT segment lookup.
> + */
> + for (i = 1 + kallsyms->num_func_syms; i < kallsyms->num_symtab; i++) {
> + const Elf_Sym *sym = &kallsyms->symtab[i];
> + unsigned long thisval = kallsyms_symbol_value(sym);
> +
> + if (sym->st_shndx == SHN_UNDEF)
> + continue;
> +
> + /*
> + * We ignore unnamed symbols: they're uninformative
> + * and inserted at a whim.
> + */
> + if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
> + is_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
> + continue;
> +
> + if (thisval <= addr && thisval > *bestval) {
> + best = i;
> + *bestval = thisval;
> + }
> + if (thisval > addr && thisval < *nextval)
> + *nextval = thisval;
> + }
> +
> + return best;
> +}
> +
> +static int elf_sym_cmp(const void *a, const void *b)
> +{
> + unsigned long val_a = kallsyms_symbol_value((const Elf_Sym *)a);
> + unsigned long val_b = kallsyms_symbol_value((const Elf_Sym *)b);
> +
> + if (val_a < val_b)
> + return -1;
> +
> + return val_a > val_b;
Do this comparison function and the sort() call result in stable
sorting? If val_a and val_b are the same, the sorting should preserve
the original order.
> +}
> +
> /*
> * We only allocate and copy the strings needed by the parts of symtab
> * we keep. This is simple, but has the effect of making multiple
> @@ -115,9 +205,10 @@ void layout_symtab(struct module *mod, struct load_info *info)
> Elf_Shdr *symsect = info->sechdrs + info->index.sym;
> Elf_Shdr *strsect = info->sechdrs + info->index.str;
> const Elf_Sym *src;
> - unsigned int i, nsrc, ndst, strtab_size = 0;
> + unsigned int i, nsrc, ndst, nfunc, strtab_size = 0;
> struct module_memory *mod_mem_data = &mod->mem[MOD_DATA];
> struct module_memory *mod_mem_init_data = &mod->mem[MOD_INIT_DATA];
> + bool is_lp_mod = is_livepatch_module(mod);
>
> /* Put symbol section at end of init part of module. */
> symsect->sh_flags |= SHF_ALLOC;
> @@ -129,12 +220,14 @@ void layout_symtab(struct module *mod, struct load_info *info)
> nsrc = symsect->sh_size / sizeof(*src);
>
> /* Compute total space required for the core symbols' strtab. */
> - for (ndst = i = 0; i < nsrc; i++) {
> - if (i == 0 || is_livepatch_module(mod) ||
> + for (ndst = nfunc = i = 0; i < nsrc; i++) {
> + if (i == 0 || is_lp_mod ||
> is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
> info->index.pcpu)) {
> strtab_size += strlen(&info->strtab[src[i].st_name]) + 1;
> ndst++;
> + if (!is_lp_mod && is_func_symbol(src + i))
> + nfunc++;
> }
> }
>
> @@ -156,6 +249,7 @@ void layout_symtab(struct module *mod, struct load_info *info)
> mod_mem_init_data->size = ALIGN(mod_mem_init_data->size,
> __alignof__(struct mod_kallsyms));
> info->mod_kallsyms_init_off = mod_mem_init_data->size;
> + info->num_func_syms = nfunc;
>
> mod_mem_init_data->size += sizeof(struct mod_kallsyms);
> info->init_typeoffs = mod_mem_init_data->size;
> @@ -169,7 +263,7 @@ void layout_symtab(struct module *mod, struct load_info *info)
> */
> void add_kallsyms(struct module *mod, const struct load_info *info)
> {
> - unsigned int i, ndst;
> + unsigned int i, di, nfunc, ndst;
> const Elf_Sym *src;
> Elf_Sym *dst;
> char *s;
> @@ -178,6 +272,7 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
> void *data_base = mod->mem[MOD_DATA].base;
> void *init_data_base = mod->mem[MOD_INIT_DATA].base;
> struct mod_kallsyms *kallsyms;
> + bool is_lp_mod = is_livepatch_module(mod);
>
> kallsyms = init_data_base + info->mod_kallsyms_init_off;
This code is followed by the initialization of kallsyms:
kallsyms->symtab = (void *)symsec->sh_addr;
kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
/* Make sure we get permanent strtab: don't use info->strtab. */
kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
kallsyms->typetab = init_data_base + info->init_typeoffs;
I suggest adding 'kallsyms->num_func_syms = 0;' after the initialization
of kallsyms->num_symtab.
>
> @@ -194,19 +289,28 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
> mod->core_kallsyms.symtab = dst = data_base + info->symoffs;
> mod->core_kallsyms.strtab = s = data_base + info->stroffs;
> mod->core_kallsyms.typetab = data_base + info->core_typeoffs;
> +
> strtab_size = info->core_typeoffs - info->stroffs;
> src = kallsyms->symtab;
> - for (ndst = i = 0; i < kallsyms->num_symtab; i++) {
> + ndst = info->num_func_syms + 1;
> +
> + for (nfunc = i = 0; i < kallsyms->num_symtab; i++) {
> kallsyms->typetab[i] = elf_type(src + i, info);
> - if (i == 0 || is_livepatch_module(mod) ||
> + if (i == 0 || is_lp_mod ||
> is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
> info->index.pcpu)) {
> ssize_t ret;
>
> - mod->core_kallsyms.typetab[ndst] =
> - kallsyms->typetab[i];
> - dst[ndst] = src[i];
> - dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
> + if (i == 0)
> + di = 0;
> + else if (!is_lp_mod && is_func_symbol(src + i))
> + di = 1 + nfunc++;
> + else
> + di = ndst++;
> +
> + mod->core_kallsyms.typetab[di] = kallsyms->typetab[i];
> + dst[di] = src[i];
> + dst[di].st_name = s - mod->core_kallsyms.strtab;
> ret = strscpy(s, &kallsyms->strtab[src[i].st_name],
> strtab_size);
> if (ret < 0)
> @@ -216,9 +320,13 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
> }
> }
>
> + WARN_ON_ONCE(nfunc != info->num_func_syms);
> + sort(dst + 1, nfunc, sizeof(Elf_Sym), elf_sym_cmp, NULL);
> +
The code sorts mod->core_kallsyms.symtab but mod->core_kallsyms.typetab
is not reordered accordingly.
> /* Set up to point into init section. */
> rcu_assign_pointer(mod->kallsyms, kallsyms);
> mod->core_kallsyms.num_symtab = ndst;
> + mod->core_kallsyms.num_func_syms = nfunc;
> }
>
> #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
> @@ -241,11 +349,6 @@ void init_build_id(struct module *mod, const struct load_info *info)
> }
> #endif
>
> -static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
> -{
> - return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
> -}
> -
> /*
> * Given a module and address, find the corresponding symbol and return its name
> * while providing its size and offset if needed.
> @@ -255,7 +358,10 @@ static const char *find_kallsyms_symbol(struct module *mod,
> unsigned long *size,
> unsigned long *offset)
> {
> - unsigned int i, best = 0;
> + unsigned int (*search)(struct mod_kallsyms *kallsyms,
> + unsigned long addr, unsigned long *bestval,
> + unsigned long *nextval);
> + unsigned int best;
> unsigned long nextval, bestval;
> struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms);
> struct module_memory *mod_mem = NULL;
> @@ -266,6 +372,11 @@ static const char *find_kallsyms_symbol(struct module *mod,
> continue;
> #endif
> if (within_module_mem_type(addr, mod, type)) {
> + if (type == MOD_TEXT && kallsyms->num_func_syms > 0)
> + search = bsearch_func_symbol;
I'm not sure if it is ok to limit the search only to function symbols
when the address lies in MOD_TEXT. The text can theoretically contain
non-function symbols. Could this optimization be adjusted to sort all
MOD_TEXT symbols (excluding anonymous and mapping symbols) and move them
to the front of the symbol table?
> + else
> + search = search_kallsyms_symbol;
> +
> mod_mem = &mod->mem[type];
> break;
> }
> @@ -278,33 +389,7 @@ static const char *find_kallsyms_symbol(struct module *mod,
> nextval = (unsigned long)mod_mem->base + mod_mem->size;
> bestval = (unsigned long)mod_mem->base - 1;
>
> - /*
> - * Scan for closest preceding symbol, and next symbol. (ELF
> - * starts real symbols at 1).
> - */
> - for (i = 1; i < kallsyms->num_symtab; i++) {
> - const Elf_Sym *sym = &kallsyms->symtab[i];
> - unsigned long thisval = kallsyms_symbol_value(sym);
> -
> - if (sym->st_shndx == SHN_UNDEF)
> - continue;
> -
> - /*
> - * We ignore unnamed symbols: they're uninformative
> - * and inserted at a whim.
> - */
> - if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
> - is_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
> - continue;
> -
> - if (thisval <= addr && thisval > bestval) {
> - best = i;
> - bestval = thisval;
> - }
> - if (thisval > addr && thisval < nextval)
> - nextval = thisval;
> - }
> -
> + best = search(kallsyms, addr, &bestval, &nextval);
> if (!best)
> return NULL;
>
--
Thanks,
Petr
^ permalink raw reply
* [PATCH 1/1] tools/rv: ensure monitor name and desc are NUL-terminated
From: unknownbbqrx @ 2026-04-23 14:19 UTC (permalink / raw)
To: rostedt, gmonaco; +Cc: linux-trace-kernel, linux-kernel, unknownbbqrx
ikm_fill_monitor_definition() copies monitor name and description with
strncpy(), but does not guarantee NUL termination when source strings are
equal to or longer than the destination buffers.
Clamp copies to sizeof(dst) - 1 and explicitly append '\0' for both fields
to keep them safe for later string operations.
Signed-off-by: unknownbbqrx <dev@unknownbbqr.xyz>
---
tools/verification/rv/src/in_kernel.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/tools/verification/rv/src/in_kernel.c b/tools/verification/rv/src/in_kernel.c
index 4bb746ea6..d32453824 100644
--- a/tools/verification/rv/src/in_kernel.c
+++ b/tools/verification/rv/src/in_kernel.c
@@ -215,10 +215,11 @@ static int ikm_fill_monitor_definition(char *name, struct monitor *ikm, char *co
return -1;
}
- strncpy(ikm->name, nested_name, MAX_DA_NAME_LEN);
+ strncpy(ikm->name, nested_name, sizeof(ikm->name) - 1);
+ ikm->name[sizeof(ikm->name) - 1] = '\0';
ikm->enabled = enabled;
- strncpy(ikm->desc, desc, MAX_DESCRIPTION);
-
+ strncpy(ikm->desc, desc, sizeof(ikm->desc) - 1);
+ ikm->desc[sizeof(ikm->desc) - 1] = '\0';
free(desc);
return 0;
--
2.53.0
^ permalink raw reply related
* [PATCH] tools/rv: harden monitor name lookup bounds checks
From: unknownbbqrx @ 2026-04-23 14:44 UTC (permalink / raw)
To: rostedt, gmonaco; +Cc: linux-trace-kernel, linux-kernel, unknownbbqrx
Bound monitor-name derived copies in __ikm_find_monitor_name() and avoid unbounded writes from sprintf()/memcpy().
Pass the output buffer size from the caller, validate extracted line length from rv/available_monitors, and use snprintf() with truncation checks when building container monitor names.
Signed-off-by: unknownbbqrx <dev@unknownbbqr.xyz>
---
tools/verification/rv/src/in_kernel.c | 34 +++++++++++++++++++++------
1 file changed, 27 insertions(+), 7 deletions(-)
diff --git a/tools/verification/rv/src/in_kernel.c b/tools/verification/rv/src/in_kernel.c
index d32453824..f17eac9b6 100644
--- a/tools/verification/rv/src/in_kernel.c
+++ b/tools/verification/rv/src/in_kernel.c
@@ -56,9 +56,12 @@ static int __ikm_read_enable(char *monitor_name)
* The string out_name is populated with the full name, which can be
* equal to monitor_name or container/monitor_name if nested
*/
-static int __ikm_find_monitor_name(char *monitor_name, char *out_name)
+static int __ikm_find_monitor_name(char *monitor_name, char *out_name,
+ size_t out_name_size)
{
- char *available_monitors, container[MAX_DA_NAME_LEN+1], *cursor, *end;
+ char *available_monitors, container[MAX_DA_NAME_LEN + 2], *cursor, *end;
+ size_t len;
+ int n;
int retval = 1;
available_monitors = tracefs_instance_file_read(NULL, "rv/available_monitors", NULL);
@@ -72,17 +75,34 @@ static int __ikm_find_monitor_name(char *monitor_name, char *out_name)
}
for (; cursor > available_monitors; cursor--)
- if (*(cursor-1) == '\n')
+ if (*(cursor - 1) == '\n')
break;
+
end = strstr(cursor, "\n");
- memcpy(out_name, cursor, end-cursor);
- out_name[end-cursor] = '\0';
+ if (!end) {
+ retval = -1;
+ goto out_free;
+ }
+
+ len = end - cursor;
+ if (len >= out_name_size) {
+ retval = -1;
+ goto out_free;
+ }
+
+ memcpy(out_name, cursor, len);
+ out_name[len] = '\0';
cursor = strstr(out_name, ":");
if (cursor)
*cursor = '/';
else {
- sprintf(container, "%s:", monitor_name);
+ n = snprintf(container, sizeof(container), "%s:", monitor_name);
+ if (n < 0 || (size_t)n >= sizeof(container)) {
+ retval = -1;
+ goto out_free;
+ }
+
if (strstr(available_monitors, container))
config_is_container = 1;
}
@@ -782,7 +802,7 @@ int ikm_run_monitor(char *monitor_name, int argc, char **argv)
else
nested_name = monitor_name;
- retval = __ikm_find_monitor_name(monitor_name, full_name);
+ retval = __ikm_find_monitor_name(monitor_name, full_name, sizeof(full_name));
if (!retval)
return 0;
if (retval < 0) {
base-commit: 2e68039281932e6dc37718a1ea7cbb8e2cda42e6
prerequisite-patch-id: b61dd51dee390277603975bf729a687113185c3a
--
2.53.0
^ permalink raw reply related
* [PATCH v3 0/3] Enable perf tracing for unprivileged users
From: Anubhav Shelat @ 2026-04-23 15:17 UTC (permalink / raw)
To: peterz, mingo, mhiramat, rostedt, acme, namhyung
Cc: mathieu.desnoyers, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, james.clark, linux-kernel,
linux-trace-kernel, linux-perf-users, Anubhav Shelat
Enable users to use perf-trace to trace their own processes, like strace
but without the overhead of ptrace(). Ensure that users cannot access
other users' or systemwide tracing data.
Changes in v3:
- Don't set PERF_SAMPLE_IP for unprivileged tracepoints. This allows us
to exclude PERF_SAMPLE_IP from kaddr_leak without weakening KASLR.
- Mount tracefs as world-traversable so users can access eventfs
directories.
v2: https://lore.kernel.org/lkml/20260410133529.21947-1-ashelat@redhat.com/
Anubhav Shelat (3):
perf evsel: don't set PERF_SAMPLE_IP for unprivileged tracepoints
perf: enable unprivileged syscall tracing with perf trace
tracefs: make root directory world-traversable
fs/tracefs/inode.c | 2 +-
kernel/events/core.c | 23 ++++++++++++++++++++---
kernel/trace/trace_event_perf.c | 12 +++++++++++-
kernel/trace/trace_events.c | 8 ++++++--
tools/perf/util/evsel.c | 4 +++-
5 files changed, 41 insertions(+), 8 deletions(-)
--
2.53.0
^ permalink raw reply
* [PATCH v3 1/3] perf evsel: don't set PERF_SAMPLE_IP for unprivileged tracepoints
From: Anubhav Shelat @ 2026-04-23 15:17 UTC (permalink / raw)
To: peterz, mingo, mhiramat, rostedt, acme, namhyung
Cc: mathieu.desnoyers, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, james.clark, linux-kernel,
linux-trace-kernel, linux-perf-users, Anubhav Shelat
In-Reply-To: <20260423151746.16258-1-ashelat@redhat.com>
For tracepoint events the IP is a static kernel address.
It doesn't vary by sample and provides no useful information for
unprivileged users. Skipping setting PERF_SAMPLE_IP for unprivileged
tracepoints avoids exposing a kernel address that reveals the KASLR base
offset and slightly reduces sample record size.
Assisted-by: Claude:claude-sonnet-4.5
Signed-off-by: Anubhav Shelat <ashelat@redhat.com>
---
tools/perf/util/evsel.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index f59228c1a39e..a1091d937ff9 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1503,7 +1503,9 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts,
attr->write_backward = opts->overwrite ? 1 : 0;
attr->read_format = PERF_FORMAT_LOST;
- evsel__set_sample_bit(evsel, IP);
+ if (attr->type != PERF_TYPE_TRACEPOINT || perf_event_paranoid_check(1))
+ evsel__set_sample_bit(evsel, IP);
+
evsel__set_sample_bit(evsel, TID);
if (evsel->sample_read) {
--
2.53.0
^ permalink raw reply related
* [PATCH v3 2/3] perf: enable unprivileged syscall tracing with perf trace
From: Anubhav Shelat @ 2026-04-23 15:17 UTC (permalink / raw)
To: peterz, mingo, mhiramat, rostedt, acme, namhyung
Cc: mathieu.desnoyers, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, james.clark, linux-kernel,
linux-trace-kernel, linux-perf-users, Anubhav Shelat
In-Reply-To: <20260423151746.16258-1-ashelat@redhat.com>
Allow unprivileged users to trace their own processes' syscalls using
perf trace, similar to strace without the intrusive overhead of ptrace().
Currently, perf trace requires CAP_PERFMON or paranoid level ≤ 1 even
though the kernel has existing infrastructure (TRACE_EVENT_FL_CAP_ANY)
specifically designed to mark syscall tracepoints as safe for
unprivileged access. To fix this:
1. Loosen the condition in perf_event_open() which requires privileges
for all events with exclude_kernel=0. This allows perf_event_open() to
bypass the paranoid check for task-attached tracepoint events. Ensure
that sample types which can expose kernel addresses to unprivileged
users are blocked.
2. Make the format and id tracefs files world-readable only for tracepoints
with TRACE_EVENT_FL_CAP_ANY, allowing unprivileged users to see syscall
tracepoint ids without exposing sensitive information.
Also add a check to perf_trace_event_perm() to ensure only
TRACE_EVENT_FL_CAP_ANY events can be traced.
Example usage after this change:
$ perf trace ls # works as unprivileged user
$ perf trace # system-wide, still requires privileges
$ perf trace -p 1234 # requires ptrace permission on pid 1234
Assisted-by: Claude:claude-sonnet-4.5
Signed-off-by: Anubhav Shelat <ashelat@redhat.com>
---
kernel/events/core.c | 24 +++++++++++++++++++++---
kernel/trace/trace_event_perf.c | 12 +++++++++++-
kernel/trace/trace_events.c | 8 ++++++--
3 files changed, 38 insertions(+), 6 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6d1f8bad7e1c..e9c53758574d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13833,9 +13833,27 @@ SYSCALL_DEFINE5(perf_event_open,
return err;
if (!attr.exclude_kernel) {
- err = perf_allow_kernel();
- if (err)
- return err;
+ bool tp_bypass = false;
+
+ if (attr.type == PERF_TYPE_TRACEPOINT && pid != -1) {
+ /*
+ * Block sample types that expose kernel addresses to
+ * prevent KASLR bypass
+ */
+ u64 kaddr_leak = PERF_SAMPLE_CALLCHAIN |
+ PERF_SAMPLE_BRANCH_STACK |
+ PERF_SAMPLE_ADDR |
+ PERF_SAMPLE_REGS_INTR |
+ PERF_SAMPLE_IP;
+
+ tp_bypass = !(attr.sample_type & kaddr_leak);
+ }
+
+ if (!tp_bypass) {
+ err = perf_allow_kernel();
+ if (err)
+ return err;
+ }
}
if (attr.namespaces) {
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a6bb7577e8c5..e8347df7ede5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -73,8 +73,18 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
}
/* No tracing, just counting, so no obvious leak */
- if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
+ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) {
+ /*
+ * Only allow CAP_ANY tracepoints for unprivileged
+ * task-attached events in case kernel context is exposed.
+ */
+ if (!p_event->attr.exclude_kernel && !perfmon_capable()) {
+ if (!(p_event->attach_state == PERF_ATTACH_TASK &&
+ (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)))
+ return -EACCES;
+ }
return 0;
+ }
/* Some events are ok to be traced by non-root users... */
if (p_event->attach_state == PERF_ATTACH_TASK) {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa422dc80ae8..69be5561d0b8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3054,7 +3054,9 @@ static int event_callback(const char *name, umode_t *mode, void **data,
struct trace_event_call *call = file->event_call;
if (strcmp(name, "format") == 0) {
- *mode = TRACE_MODE_READ;
+ *mode = (call->flags & TRACE_EVENT_FL_CAP_ANY) ?
+ (TRACE_MODE_READ | 0004) :
+ TRACE_MODE_READ;
*fops = &ftrace_event_format_fops;
return 1;
}
@@ -3090,7 +3092,9 @@ static int event_callback(const char *name, umode_t *mode, void **data,
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg &&
strcmp(name, "id") == 0) {
- *mode = TRACE_MODE_READ;
+ *mode = (call->flags & TRACE_EVENT_FL_CAP_ANY) ?
+ (TRACE_MODE_READ | 0004) :
+ TRACE_MODE_READ;
*data = (void *)(long)call->event.type;
*fops = &ftrace_event_id_fops;
return 1;
--
2.53.0
^ permalink raw reply related
* [PATCH v3 3/3] tracefs: make root directory world-traversable
From: Anubhav Shelat @ 2026-04-23 15:17 UTC (permalink / raw)
To: peterz, mingo, mhiramat, rostedt, acme, namhyung
Cc: mathieu.desnoyers, mark.rutland, alexander.shishkin, jolsa,
irogers, adrian.hunter, james.clark, linux-kernel,
linux-trace-kernel, linux-perf-users, Anubhav Shelat
In-Reply-To: <20260423151746.16258-1-ashelat@redhat.com>
Change the default tracefs mount mode from 0700 to 0755. This allows
unprivileged users to access the eventfs directories underneath which
already use 0755.
This does not expose any tracing data since access to the files
themselves is controlled by individual permissions.
Signed-off-by: Anubhav Shelat <ashelat@redhat.com>
---
fs/tracefs/inode.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 03f768536fd5..9506450fbc91 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -23,7 +23,7 @@
#include <linux/slab.h>
#include "internal.h"
-#define TRACEFS_DEFAULT_MODE 0700
+#define TRACEFS_DEFAULT_MODE 0755
static struct kmem_cache *tracefs_inode_cachep __ro_after_init;
static struct vfsmount *tracefs_mount;
--
2.53.0
^ permalink raw reply related
* Re: [PATCH] mm/vmscan: add balance_pgdat begin/end tracepoints
From: Shakeel Butt @ 2026-04-23 17:46 UTC (permalink / raw)
To: Bunyod Suvonov
Cc: akpm, hannes, rostedt, mhiramat, david, mhocko, zhengqi.arch, ljs,
mathieu.desnoyers, linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <20260423103753.546582-1-b.suvonov@sjtu.edu.cn>
On Thu, Apr 23, 2026 at 06:37:53PM +0800, Bunyod Suvonov wrote:
> Vmscan has six main reclaim entry points: try_to_free_pages() for
> direct reclaim, try_to_free_mem_cgroup_pages() for memcg reclaim,
> mem_cgroup_shrink_node() for memcg soft limit reclaim, node_reclaim()
> for node reclaim, shrink_all_memory() for hibernation reclaim, and
> balance_pgdat() for kswapd reclaim.
>
> All of them, except for shrink_all_memory() and balance_pgdat(), already
> have begin/end tracepoints. This makes it harder to trace which reclaim
> path is responsible for memory reclaim activity, because kswapd reclaim
> cannot be identified as cleanly as other reclaim entry points, even
> though it is the main background reclaim path under memory pressure.
> There may be no need to trace shrink_all_memory() as it is primarily
> used during hibernation. So this patch adds the missing tracepoint pair
> for balance_pgdat().
>
> The begin tracepoint records the node id, requested reclaim order, and
> highest_zoneidx. The end tracepoint records the node id, reclaim order
> that balance_pgdat() finished with, highest_zoneidx, and nr_reclaimed.
Do we need to trace highest_zoneidx at the end? Can it change within
balance_pgdat()?
> Together, they show the requested reclaim order and zone bound, whether
> reclaim fell back to a lower order, and how much reclaim work was done.
>
> Signed-off-by: Bunyod Suvonov <b.suvonov@sjtu.edu.cn>
Overall looks good.
^ permalink raw reply
* Re: [PATCH 7.2 v16 00/13] khugepaged: mTHP support
From: Andrew Morton @ 2026-04-23 20:30 UTC (permalink / raw)
To: Nico Pache
Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
zokeefe
In-Reply-To: <20260419185750.260784-1-npache@redhat.com>
On Sun, 19 Apr 2026 12:57:37 -0600 Nico Pache <npache@redhat.com> wrote:
> The following series provides khugepaged with the capability to collapse
> anonymous memory regions to mTHPs.
Thanks, I added this to mm.git's mm-new branch for testing while review
is being completed. I added notes regarding Usama's comments, so they
don't get lost.
^ permalink raw reply
* Re: [PATCH] rtla: Document tests in README
From: Crystal Wood @ 2026-04-23 22:44 UTC (permalink / raw)
To: Tomas Glozar, Steven Rostedt
Cc: John Kacur, Luis Goncalves, Costa Shulyupin, Wander Lairson Costa,
LKML, linux-trace-kernel
In-Reply-To: <20260423130759.882247-1-tglozar@redhat.com>
On Thu, 2026-04-23 at 15:07 +0200, Tomas Glozar wrote:
> RTLA tests are not documented anywhere. Mention both runtime and unit
> tests in the README, with instructions on how to run them and a list of
> dependencies and required system configuration.
>
> Signed-off-by: Tomas Glozar <tglozar@redhat.com>
> ---
> tools/tracing/rtla/README.txt | 30 ++++++++++++++++++++++++++++++
> 1 file changed, 30 insertions(+)
>
> diff --git a/tools/tracing/rtla/README.txt b/tools/tracing/rtla/README.txt
> index a9faee4dbb3a..8a782cd2c171 100644
> --- a/tools/tracing/rtla/README.txt
> +++ b/tools/tracing/rtla/README.txt
> @@ -42,4 +42,34 @@ For development, we suggest the following steps for compiling rtla:
> $ make
> $ sudo make install
>
> +Running tests
> +
> +RTLA has two test suites: a runtime test suite and a unit test suite.
> +
> +The runtime test suite is available as "make check" (root required) and has
> +the following dependencies, in addition to RTLA build dependencies:
> +
> +- Perl
> +- Test::Harness / TAP::Harness
> +- bash
> +- coreutils
> +- ldd
> +- util-linux
> +- procps(-ng)
> +- bpftool (if rtla is built against libbpf)
> +
> +as well as the following required system configuration:
> +
> +- CONFIG_OSNOISE_TRACER=y
> +- CONFIG_TIMERLAT_TRACER=y
> +- tracefs mounted and readable at /sys/kernel/tracing
> +
> +The unit test suite is available as "make unit-tests" and has the following
> +dependencies:
> +
> +- libcheck
> +
> +Unlike the runtime test suite, root is not required to run unit tests, nor is
> +a tracefs/osnoise/timerlat-capable kernel required.
> +
Should add something explaining how to install "Test::Harness /
TAP::Harness" for those who aren't familiar with the Perl ecosystem.
-Crystal
^ permalink raw reply
* [PATCH v5] tracing: Bound synthetic-field strings with seq_buf
From: Pengpeng Hou @ 2026-04-23 15:33 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu
Cc: Mathieu Desnoyers, Tom Zanussi, linux-trace-kernel, linux-kernel,
pengpeng
In-Reply-To: <20260417223001.1-tracing-synth-v4-pengpeng@iscas.ac.cn>
The synthetic field helpers build a prefixed synthetic variable name and
a generated hist command in fixed MAX_FILTER_STR_VAL buffers. The
current code appends those strings with raw strcat(), so long key lists,
field names, or saved filters can run past the end of the staging
buffers.
Build both strings with seq_buf and propagate -E2BIG if either the
synthetic variable name or the generated command exceeds
MAX_FILTER_STR_VAL. This keeps the existing tracing-side limit while
using the helper intended for bounded command construction.
Fixes: 02205a6752f2 ("tracing: Add support for 'field variables'")
Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
---
Changes since v4:
- add the requested blank lines around seq_buf_str() comments
- add the seq_buf_str() comment for the generated command buffer too
- keep saved_filter scoped next to its point of use
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0dbbf6cca9bc..87429567417f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/security.h>
+#include <linux/seq_buf.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
@@ -2968,14 +2969,24 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
char *system, char *event_name, char *field_name)
{
struct hist_field *event_var;
+ struct seq_buf s;
char *synthetic_name;
synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
if (!synthetic_name)
return ERR_PTR(-ENOMEM);
- strcpy(synthetic_name, "synthetic_");
- strcat(synthetic_name, field_name);
+ seq_buf_init(&s, synthetic_name, MAX_FILTER_STR_VAL);
+ seq_buf_puts(&s, "synthetic_");
+ seq_buf_puts(&s, field_name);
+
+ /* Terminate synthetic_name with a NUL. */
+ seq_buf_str(&s);
+
+ if (seq_buf_has_overflowed(&s)) {
+ kfree(synthetic_name);
+ return ERR_PTR(-E2BIG);
+ }
event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
@@ -3020,7 +3031,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
struct trace_event_file *file;
struct hist_field *key_field;
struct hist_field *event_var;
- char *saved_filter;
+ struct seq_buf s;
char *cmd;
int ret;
@@ -3065,28 +3076,37 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
return ERR_PTR(-ENOMEM);
}
+ seq_buf_init(&s, cmd, MAX_FILTER_STR_VAL);
+
/* Use the same keys as the compatible histogram */
- strcat(cmd, "keys=");
+ seq_buf_puts(&s, "keys=");
for_each_hist_key_field(i, hist_data) {
key_field = hist_data->fields[i];
if (!first)
- strcat(cmd, ",");
- strcat(cmd, key_field->field->name);
+ seq_buf_putc(&s, ',');
+ seq_buf_puts(&s, key_field->field->name);
first = false;
}
/* Create the synthetic field variable specification */
- strcat(cmd, ":synthetic_");
- strcat(cmd, field_name);
- strcat(cmd, "=");
- strcat(cmd, field_name);
+ seq_buf_printf(&s, ":synthetic_%s=%s", field_name, field_name);
/* Use the same filter as the compatible histogram */
- saved_filter = find_trigger_filter(hist_data, file);
- if (saved_filter) {
- strcat(cmd, " if ");
- strcat(cmd, saved_filter);
+ {
+ char *saved_filter = find_trigger_filter(hist_data, file);
+
+ if (saved_filter)
+ seq_buf_printf(&s, " if %s", saved_filter);
+ }
+
+ /* Terminate cmd with a NUL. */
+ seq_buf_str(&s);
+
+ if (seq_buf_has_overflowed(&s)) {
+ kfree(cmd);
+ kfree(var_hist);
+ return ERR_PTR(-E2BIG);
}
var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
--
2.50.1 (Apple Git-155)
^ permalink raw reply related
* Re: [PATCH] mm/vmscan: add balance_pgdat begin/end tracepoints
From: SUVONOV BUNYOD @ 2026-04-24 0:46 UTC (permalink / raw)
To: Shakeel Butt
Cc: akpm, hannes, rostedt, mhiramat, david, mhocko, zhengqi arch, ljs,
mathieu desnoyers, linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <aepavbLy7H3odp6p@linux.dev>
Thank you for reviewing Shakeel,
> Do we need to trace highest_zoneidx at the end? Can it change within
> balance_pgdat()?
highest_zoneidx does not change within a balance_pgdat() invocation. It
is passed in as an argument and remains the classzone bound used for the
balancing checks throughout the function.
I kept highest_zoneidx in the end tracepoint to make the outcome event
self-contained. In principle, begin/end correlation is possible, but
under sustained memory pressure kswapd reclaim can be frequent enough
that consumers may prefer to analyze end events directly, and any
dependence on matching begin/end becomes less convenient and less robust
in the presence of filtering or dropped trace records.
Since nr_reclaimed and the final order are only known at the end, having
highest_zoneidx there allows end-only analysis without correlating with
the begin event.
For example, it lets users answer questions like:
- this pass reclaimed too much or too little memory; what highest_zoneidx
did that result correspond to?
- how much reclaim was done when balancing up to ZONE_NORMAL vs other
classzone bounds?
- when highest_zoneidx == ZONE_NORMAL, how often did reclaim finish at
order=0?
So it is there because it provides context for the end-of-reclaim result.
Do you think this is sufficient justification? If not, then I can drop it
from the end tracepoint in v2.
----- Original Message -----
From: "Shakeel Butt" <shakeel.butt@linux.dev>
To: "Bunyod Suvonov" <b.suvonov@sjtu.edu.cn>
Cc: akpm@linux-foundation.org, hannes@cmpxchg.org, rostedt@goodmis.org, mhiramat@kernel.org, david@kernel.org, mhocko@kernel.org, "zhengqi arch" <zhengqi.arch@bytedance.com>, ljs@kernel.org, "mathieu desnoyers" <mathieu.desnoyers@efficios.com>, linux-mm@kvack.org, linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org
Sent: Friday, April 24, 2026 1:46:55 AM
Subject: Re: [PATCH] mm/vmscan: add balance_pgdat begin/end tracepoints
On Thu, Apr 23, 2026 at 06:37:53PM +0800, Bunyod Suvonov wrote:
> Vmscan has six main reclaim entry points: try_to_free_pages() for
> direct reclaim, try_to_free_mem_cgroup_pages() for memcg reclaim,
> mem_cgroup_shrink_node() for memcg soft limit reclaim, node_reclaim()
> for node reclaim, shrink_all_memory() for hibernation reclaim, and
> balance_pgdat() for kswapd reclaim.
>
> All of them, except for shrink_all_memory() and balance_pgdat(), already
> have begin/end tracepoints. This makes it harder to trace which reclaim
> path is responsible for memory reclaim activity, because kswapd reclaim
> cannot be identified as cleanly as other reclaim entry points, even
> though it is the main background reclaim path under memory pressure.
> There may be no need to trace shrink_all_memory() as it is primarily
> used during hibernation. So this patch adds the missing tracepoint pair
> for balance_pgdat().
>
> The begin tracepoint records the node id, requested reclaim order, and
> highest_zoneidx. The end tracepoint records the node id, reclaim order
> that balance_pgdat() finished with, highest_zoneidx, and nr_reclaimed.
Do we need to trace highest_zoneidx at the end? Can it change within
balance_pgdat()?
> Together, they show the requested reclaim order and zone bound, whether
> reclaim fell back to a lower order, and how much reclaim work was done.
>
> Signed-off-by: Bunyod Suvonov <b.suvonov@sjtu.edu.cn>
Overall looks good.
^ permalink raw reply
* Re: [PATCH] mm/vmscan: add balance_pgdat begin/end tracepoints
From: Shakeel Butt @ 2026-04-24 2:15 UTC (permalink / raw)
To: SUVONOV BUNYOD
Cc: akpm, hannes, rostedt, mhiramat, david, mhocko, zhengqi arch, ljs,
mathieu desnoyers, linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <1868971658.1970916.1776991584726.JavaMail.zimbra@sjtu.edu.cn>
On Fri, Apr 24, 2026 at 08:46:24AM +0800, SUVONOV BUNYOD wrote:
> Thank you for reviewing Shakeel,
>
> > Do we need to trace highest_zoneidx at the end? Can it change within
> > balance_pgdat()?
>
> highest_zoneidx does not change within a balance_pgdat() invocation. It
> is passed in as an argument and remains the classzone bound used for the
> balancing checks throughout the function.
>
> I kept highest_zoneidx in the end tracepoint to make the outcome event
> self-contained. In principle, begin/end correlation is possible, but
> under sustained memory pressure kswapd reclaim can be frequent enough
> that consumers may prefer to analyze end events directly, and any
> dependence on matching begin/end becomes less convenient and less robust
> in the presence of filtering or dropped trace records.
>
> Since nr_reclaimed and the final order are only known at the end, having
> highest_zoneidx there allows end-only analysis without correlating with
> the begin event.
>
> For example, it lets users answer questions like:
> - this pass reclaimed too much or too little memory; what highest_zoneidx
> did that result correspond to?
> - how much reclaim was done when balancing up to ZONE_NORMAL vs other
> classzone bounds?
> - when highest_zoneidx == ZONE_NORMAL, how often did reclaim finish at
> order=0?
>
> So it is there because it provides context for the end-of-reclaim result.
> Do you think this is sufficient justification? If not, then I can drop it
> from the end tracepoint in v2.
I think it is ok but let's add this reasoning in the commit message.
^ permalink raw reply
* [PATCH v2] mm/vmscan: add balance_pgdat begin/end tracepoints
From: Bunyod Suvonov @ 2026-04-24 3:14 UTC (permalink / raw)
To: akpm, hannes, rostedt, mhiramat
Cc: david, mhocko, zhengqi.arch, shakeel.butt, ljs, mathieu.desnoyers,
linux-mm, linux-trace-kernel, linux-kernel, Bunyod Suvonov
In-Reply-To: <20260423103753.546582-1-b.suvonov@sjtu.edu.cn>
Vmscan has six main reclaim entry points: try_to_free_pages() for
direct reclaim, try_to_free_mem_cgroup_pages() for memcg reclaim,
mem_cgroup_shrink_node() for memcg soft limit reclaim, node_reclaim()
for node reclaim, shrink_all_memory() for hibernation reclaim, and
balance_pgdat() for kswapd reclaim.
All of them, except for shrink_all_memory() and balance_pgdat(), already
have begin/end tracepoints. This makes it harder to trace which reclaim
path is responsible for memory reclaim activity, because kswapd reclaim
cannot be identified as cleanly as other reclaim entry points, even
though it is the main background reclaim path under memory pressure.
There may be no need to trace shrink_all_memory() as it is primarily
used during hibernation. So this patch adds the missing tracepoint pair
for balance_pgdat().
The begin tracepoint records the node id, requested reclaim order, and
the requested classzone bound (highest_zoneidx). The end tracepoint
records the node id, the reclaim order that balance_pgdat() finished
with, the requested classzone bound, and nr_reclaimed. Together, they
show the requested reclaim order and classzone bound, whether reclaim
fell back to a lower order, and how much reclaim work was done.
The end tracepoint also records highest_zoneidx even though it does not
change within a balance_pgdat() invocation. This keeps the end event
self-contained, so users can analyze reclaim results directly from end
events without depending on begin/end correlation, which is less
convenient when tracing is filtered or records are dropped. It also
makes it straightforward to relate nr_reclaimed and the final reclaim
order to the requested classzone bound.
Signed-off-by: Bunyod Suvonov <b.suvonov@sjtu.edu.cn>
---
v2:
- explain why highest_zoneidx is kept in the end tracepoint
include/trace/events/vmscan.h | 52 +++++++++++++++++++++++++++++++++++
mm/vmscan.c | 5 ++++
2 files changed, 57 insertions(+)
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 4445a8d9218d..b4bf7b8def1f 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -96,6 +96,58 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
__entry->order)
);
+TRACE_EVENT(mm_vmscan_balance_pgdat_begin,
+
+ TP_PROTO(int nid, int order, int highest_zoneidx),
+
+ TP_ARGS(nid, order, highest_zoneidx),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(int, highest_zoneidx)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->highest_zoneidx = highest_zoneidx;
+ ),
+
+ TP_printk("nid=%d order=%d highest_zoneidx=%-8s",
+ __entry->nid,
+ __entry->order,
+ __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE))
+);
+
+TRACE_EVENT(mm_vmscan_balance_pgdat_end,
+
+ TP_PROTO(int nid, int order, int highest_zoneidx,
+ unsigned long nr_reclaimed),
+
+ TP_ARGS(nid, order, highest_zoneidx, nr_reclaimed),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(int, highest_zoneidx)
+ __field(unsigned long, nr_reclaimed)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->highest_zoneidx = highest_zoneidx;
+ __entry->nr_reclaimed = nr_reclaimed;
+ ),
+
+ TP_printk("nid=%d order=%d highest_zoneidx=%-8s nr_reclaimed=%lu",
+ __entry->nid,
+ __entry->order,
+ __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE),
+ __entry->nr_reclaimed)
+);
+
TRACE_EVENT(mm_vmscan_wakeup_kswapd,
TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd1b1aa12581..b2d89ed69d22 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7121,6 +7121,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
.may_unmap = 1,
};
+ trace_mm_vmscan_balance_pgdat_begin(pgdat->node_id, order,
+ highest_zoneidx);
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire(_THIS_IP_);
@@ -7314,6 +7316,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
+ trace_mm_vmscan_balance_pgdat_end(pgdat->node_id, sc.order,
+ highest_zoneidx, sc.nr_reclaimed);
+
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
--
2.53.0
^ permalink raw reply related
* Re: [PATCH v2] mm/vmscan: add balance_pgdat begin/end tracepoints
From: Shakeel Butt @ 2026-04-24 3:16 UTC (permalink / raw)
To: Bunyod Suvonov, akpm, hannes, rostedt, mhiramat
Cc: david, mhocko, zhengqi.arch, ljs, mathieu.desnoyers, linux-mm,
linux-trace-kernel, linux-kernel, Bunyod Suvonov
In-Reply-To: <20260424031418.174597-1-b.suvonov@sjtu.edu.cn>
April 23, 2026 at 8:14 PM, "Bunyod Suvonov" <b.suvonov@sjtu.edu.cn> wrote:
>
> Vmscan has six main reclaim entry points: try_to_free_pages() for
> direct reclaim, try_to_free_mem_cgroup_pages() for memcg reclaim,
> mem_cgroup_shrink_node() for memcg soft limit reclaim, node_reclaim()
> for node reclaim, shrink_all_memory() for hibernation reclaim, and
> balance_pgdat() for kswapd reclaim.
>
> All of them, except for shrink_all_memory() and balance_pgdat(), already
> have begin/end tracepoints. This makes it harder to trace which reclaim
> path is responsible for memory reclaim activity, because kswapd reclaim
> cannot be identified as cleanly as other reclaim entry points, even
> though it is the main background reclaim path under memory pressure.
> There may be no need to trace shrink_all_memory() as it is primarily
> used during hibernation. So this patch adds the missing tracepoint pair
> for balance_pgdat().
>
> The begin tracepoint records the node id, requested reclaim order, and
> the requested classzone bound (highest_zoneidx). The end tracepoint
> records the node id, the reclaim order that balance_pgdat() finished
> with, the requested classzone bound, and nr_reclaimed. Together, they
> show the requested reclaim order and classzone bound, whether reclaim
> fell back to a lower order, and how much reclaim work was done.
>
> The end tracepoint also records highest_zoneidx even though it does not
> change within a balance_pgdat() invocation. This keeps the end event
> self-contained, so users can analyze reclaim results directly from end
> events without depending on begin/end correlation, which is less
> convenient when tracing is filtered or records are dropped. It also
> makes it straightforward to relate nr_reclaimed and the final reclaim
> order to the requested classzone bound.
>
> Signed-off-by: Bunyod Suvonov <b.suvonov@sjtu.edu.cn>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
^ permalink raw reply
* [PATCH v18 0/8] ring-buffer: Making persistent ring buffers robust
From: Masami Hiramatsu (Google) @ 2026-04-24 6:51 UTC (permalink / raw)
To: Steven Rostedt, Catalin Marinas, Will Deacon
Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, Ian Rogers, linux-arm-kernel
Hi,
Here is the 18th version of improvement patches for making persistent
ring buffers robust to failures.
The previous version is here:
https://lore.kernel.org/all/177687458572.932171.10907864814735342737.stgit@mhiramat.tok.corp.google.com/
This version fixes a newly found bug, addresses some review comments from
Sashiko[1], and adds 2 cleanups. The changes include:
[1/8] Do not double count the reader_page when verifying persistent
ring buffer.
[2/8] Add Geert's Ack (Thanks!)
[3/8] Fix to subtract BUF_PAGE_HDR_SIZE from meta->subbuf_size
to make the limit of commit size.
[4/8] Reset timestamp of reader_page when the entire cpu_buffer is
invalid.
[5/8] In rb_test_inject_invalid_pages(), changed entry_bytes and
idx to unsigned long.
[7/8] Cleanup persistent ring buffer validation code.
[8/8] Cleanup buffer_data_page related code.
[1] https://sashiko.dev/#/patchset/177687458572.932171.10907864814735342737.stgit%40mhiramat.tok.corp.google.com
Thank you,
Masami Hiramatsu (Google) (8):
ring-buffer: Do not double count the reader_page
ring-buffer: Flush and stop persistent ring buffer on panic
ring-buffer: Skip invalid sub-buffers when validating persistent ring buffer
ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
ring-buffer: Add persistent ring buffer invalid-page inject test
ring-buffer: Show commit numbers in buffer_meta file
ring-buffer: Cleanup persistent ring buffer validation
ring-buffer: Cleanup buffer_data_page related code
arch/alpha/include/asm/Kbuild | 1
arch/arc/include/asm/Kbuild | 1
arch/arm/include/asm/Kbuild | 1
arch/arm64/include/asm/ring_buffer.h | 10 +
arch/csky/include/asm/Kbuild | 1
arch/hexagon/include/asm/Kbuild | 1
arch/loongarch/include/asm/Kbuild | 1
arch/m68k/include/asm/Kbuild | 1
arch/microblaze/include/asm/Kbuild | 1
arch/mips/include/asm/Kbuild | 1
arch/nios2/include/asm/Kbuild | 1
arch/openrisc/include/asm/Kbuild | 1
arch/parisc/include/asm/Kbuild | 1
arch/powerpc/include/asm/Kbuild | 1
arch/riscv/include/asm/Kbuild | 1
arch/s390/include/asm/Kbuild | 1
arch/sh/include/asm/Kbuild | 1
arch/sparc/include/asm/Kbuild | 1
arch/um/include/asm/Kbuild | 1
arch/x86/include/asm/Kbuild | 1
arch/xtensa/include/asm/Kbuild | 1
include/asm-generic/ring_buffer.h | 13 +
include/linux/ring_buffer.h | 1
kernel/trace/Kconfig | 34 ++
kernel/trace/ring_buffer.c | 472 +++++++++++++++++++++++-----------
kernel/trace/trace.c | 4
26 files changed, 395 insertions(+), 159 deletions(-)
create mode 100644 arch/arm64/include/asm/ring_buffer.h
create mode 100644 include/asm-generic/ring_buffer.h
base-commit: 6170922f137231b98fc568571befef63e1edff3f
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply
* [PATCH v18 1/8] ring-buffer: Do not double count the reader_page
From: Masami Hiramatsu (Google) @ 2026-04-24 6:52 UTC (permalink / raw)
To: Steven Rostedt, Catalin Marinas, Will Deacon
Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, Ian Rogers, linux-arm-kernel
In-Reply-To: <177701351903.2223789.17087009302463188638.stgit@mhiramat.tok.corp.google.com>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
The cpu_buffer->reader_page is updated if there are unwound
pages. After that update, we should skip the page if it is the
original reader_page, because the original reader_page has already
been checked.
Fixes: ca296d32ece3 ("tracing: ring_buffer: Rewind persistent ring buffer on reboot")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v18:
- Newly added.
---
kernel/trace/ring_buffer.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cef49f8871d2..5326924615a4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1884,7 +1884,7 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
- struct buffer_page *head_page, *orig_head;
+ struct buffer_page *head_page, *orig_head, *orig_reader;
unsigned long entry_bytes = 0;
unsigned long entries = 0;
int ret;
@@ -1895,16 +1895,17 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
return;
orig_head = head_page = cpu_buffer->head_page;
+ orig_reader = cpu_buffer->reader_page;
/* Do the reader page first */
- ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+ ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu);
if (ret < 0) {
pr_info("Ring buffer reader page is invalid\n");
goto invalid;
}
entries += ret;
- entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
- local_set(&cpu_buffer->reader_page->entries, ret);
+ entry_bytes += local_read(&orig_reader->page->commit);
+ local_set(&orig_reader->entries, ret);
ts = head_page->page->time_stamp;
@@ -2007,8 +2008,8 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
/* Iterate until finding the commit page */
for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
- /* Reader page has already been done */
- if (head_page == cpu_buffer->reader_page)
+ /* The original reader page has already been checked/counted. */
+ if (head_page == orig_reader)
continue;
ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
^ permalink raw reply related
* [PATCH v18 2/8] ring-buffer: Flush and stop persistent ring buffer on panic
From: Masami Hiramatsu (Google) @ 2026-04-24 6:52 UTC (permalink / raw)
To: Steven Rostedt, Catalin Marinas, Will Deacon
Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, Ian Rogers, linux-arm-kernel
In-Reply-To: <177701351903.2223789.17087009302463188638.stgit@mhiramat.tok.corp.google.com>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
On real hardware, panic and machine reboot may not flush hardware cache
to memory. This means the persistent ring buffer, which relies on a
coherent state of memory, may not have its events written to the buffer
and they may be lost. Moreover, there may be inconsistency with the
counters which are used for validation of the integrity of the
persistent ring buffer which may cause all data to be discarded.
To avoid this issue, stop recording of the ring buffer on panic and
flush the cache of the ring buffer's memory.
Fixes: e645535a954a ("tracing: Add option to use memmapped memory for trace boot instance")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
Changes in v13:
- Fix a rebase conflict.
Changes in v11:
- Do nothing by default since flush_cache_vmap() does nothing on x86
but it can cause deadlock on some architectures via on_each_cpu()
because other CPUs will be stopped when panic notifier is called.
Changes in v9:
- Fix typo of & to &&.
- Fix typo of "Generic"
Changes in v6:
- Introduce asm/ring_buffer.h for arch_ring_buffer_flush_range().
- Use flush_cache_vmap() instead of flush_cache_all().
Changes in v5:
- Use ring_buffer_record_off() instead of ring_buffer_record_disable().
- Use flush_cache_all() to ensure flush all cache.
Changes in v3:
- update patch description.
---
arch/alpha/include/asm/Kbuild | 1 +
arch/arc/include/asm/Kbuild | 1 +
arch/arm/include/asm/Kbuild | 1 +
arch/arm64/include/asm/ring_buffer.h | 10 ++++++++++
arch/csky/include/asm/Kbuild | 1 +
arch/hexagon/include/asm/Kbuild | 1 +
arch/loongarch/include/asm/Kbuild | 1 +
arch/m68k/include/asm/Kbuild | 1 +
arch/microblaze/include/asm/Kbuild | 1 +
arch/mips/include/asm/Kbuild | 1 +
arch/nios2/include/asm/Kbuild | 1 +
arch/openrisc/include/asm/Kbuild | 1 +
arch/parisc/include/asm/Kbuild | 1 +
arch/powerpc/include/asm/Kbuild | 1 +
arch/riscv/include/asm/Kbuild | 1 +
arch/s390/include/asm/Kbuild | 1 +
arch/sh/include/asm/Kbuild | 1 +
arch/sparc/include/asm/Kbuild | 1 +
arch/um/include/asm/Kbuild | 1 +
arch/x86/include/asm/Kbuild | 1 +
arch/xtensa/include/asm/Kbuild | 1 +
include/asm-generic/ring_buffer.h | 13 +++++++++++++
kernel/trace/ring_buffer.c | 22 ++++++++++++++++++++++
23 files changed, 65 insertions(+)
create mode 100644 arch/arm64/include/asm/ring_buffer.h
create mode 100644 include/asm-generic/ring_buffer.h
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 483965c5a4de..b154b4e3dfa8 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -5,4 +5,5 @@ generic-y += agp.h
generic-y += asm-offsets.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
generic-y += text-patching.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 4c69522e0328..483caacc6988 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -5,5 +5,6 @@ generic-y += extable.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
generic-y += parport.h
+generic-y += ring_buffer.h
generic-y += user.h
generic-y += text-patching.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 03657ff8fbe3..decad5f2c826 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -3,6 +3,7 @@ generic-y += early_ioremap.h
generic-y += extable.h
generic-y += flat.h
generic-y += parport.h
+generic-y += ring_buffer.h
generated-y += mach-types.h
generated-y += unistd-nr.h
diff --git a/arch/arm64/include/asm/ring_buffer.h b/arch/arm64/include/asm/ring_buffer.h
new file mode 100644
index 000000000000..62316c406888
--- /dev/null
+++ b/arch/arm64/include/asm/ring_buffer.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_ARM64_RING_BUFFER_H
+#define _ASM_ARM64_RING_BUFFER_H
+
+#include <asm/cacheflush.h>
+
+/* Flush D-cache on persistent ring buffer */
+#define arch_ring_buffer_flush_range(start, end) dcache_clean_pop(start, end)
+
+#endif /* _ASM_ARM64_RING_BUFFER_H */
diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index 3a5c7f6e5aac..7dca0c6cdc84 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += qrwlock.h
generic-y += qrwlock_types.h
generic-y += qspinlock.h
generic-y += parport.h
+generic-y += ring_buffer.h
generic-y += user.h
generic-y += vmlinux.lds.h
generic-y += text-patching.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 1efa1e993d4b..0f887d4238ed 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -5,4 +5,5 @@ generic-y += extable.h
generic-y += iomap.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
generic-y += text-patching.h
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 9034b583a88a..7e92957baf6a 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -10,5 +10,6 @@ generic-y += qrwlock.h
generic-y += user.h
generic-y += ioctl.h
generic-y += mmzone.h
+generic-y += ring_buffer.h
generic-y += statfs.h
generic-y += text-patching.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index b282e0dd8dc1..62543bf305ff 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -3,5 +3,6 @@ generated-y += syscall_table.h
generic-y += extable.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
generic-y += spinlock.h
generic-y += text-patching.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 7178f990e8b3..0030309b47ad 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += extable.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
generic-y += parport.h
+generic-y += ring_buffer.h
generic-y += syscalls.h
generic-y += tlb.h
generic-y += user.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 684569b2ecd6..9771c3d85074 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -12,5 +12,6 @@ generic-y += mcs_spinlock.h
generic-y += parport.h
generic-y += qrwlock.h
generic-y += qspinlock.h
+generic-y += ring_buffer.h
generic-y += user.h
generic-y += text-patching.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 28004301c236..0a2530964413 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += cmpxchg.h
generic-y += extable.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
generic-y += spinlock.h
generic-y += user.h
generic-y += text-patching.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index cef49d60d74c..8aa34621702d 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -8,4 +8,5 @@ generic-y += spinlock_types.h
generic-y += spinlock.h
generic-y += qrwlock_types.h
generic-y += qrwlock.h
+generic-y += ring_buffer.h
generic-y += user.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 4fb596d94c89..d48d158f7241 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -4,4 +4,5 @@ generated-y += syscall_table_64.h
generic-y += agp.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
generic-y += user.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 2e23533b67e3..805b5aeebb6f 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -5,4 +5,5 @@ generated-y += syscall_table_spu.h
generic-y += agp.h
generic-y += mcs_spinlock.h
generic-y += qrwlock.h
+generic-y += ring_buffer.h
generic-y += early_ioremap.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index bd5fc9403295..7721b63642f4 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -14,5 +14,6 @@ generic-y += ticket_spinlock.h
generic-y += qrwlock.h
generic-y += qrwlock_types.h
generic-y += qspinlock.h
+generic-y += ring_buffer.h
generic-y += user.h
generic-y += vmlinux.lds.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 80bad7de7a04..0c1fc47c3ba0 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -7,3 +7,4 @@ generated-y += unistd_nr.h
generic-y += asm-offsets.h
generic-y += mcs_spinlock.h
generic-y += mmzone.h
+generic-y += ring_buffer.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 4d3f10ed8275..f0403d3ee8ab 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -3,4 +3,5 @@ generated-y += syscall_table.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
generic-y += parport.h
+generic-y += ring_buffer.h
generic-y += text-patching.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 17ee8a273aa6..49c6bb326b75 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -4,4 +4,5 @@ generated-y += syscall_table_64.h
generic-y += agp.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
+generic-y += ring_buffer.h
generic-y += text-patching.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 1b9b82bbe322..2a1629ba8140 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += module.lds.h
generic-y += parport.h
generic-y += percpu.h
generic-y += preempt.h
+generic-y += ring_buffer.h
generic-y += runtime-const.h
generic-y += softirq_stack.h
generic-y += switch_to.h
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4566000e15c4..078fd2c0d69d 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -14,3 +14,4 @@ generic-y += early_ioremap.h
generic-y += fprobe.h
generic-y += mcs_spinlock.h
generic-y += mmzone.h
+generic-y += ring_buffer.h
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 13fe45dea296..e57af619263a 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -6,5 +6,6 @@ generic-y += mcs_spinlock.h
generic-y += parport.h
generic-y += qrwlock.h
generic-y += qspinlock.h
+generic-y += ring_buffer.h
generic-y += user.h
generic-y += text-patching.h
diff --git a/include/asm-generic/ring_buffer.h b/include/asm-generic/ring_buffer.h
new file mode 100644
index 000000000000..201d2aee1005
--- /dev/null
+++ b/include/asm-generic/ring_buffer.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic arch dependent ring_buffer macros.
+ */
+#ifndef __ASM_GENERIC_RING_BUFFER_H__
+#define __ASM_GENERIC_RING_BUFFER_H__
+
+#include <linux/cacheflush.h>
+
+/* Flush cache on ring buffer range if needed. Do nothing by default. */
+#define arch_ring_buffer_flush_range(start, end) do { } while (0)
+
+#endif /* __ASM_GENERIC_RING_BUFFER_H__ */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5326924615a4..7288383b1f27 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7,6 +7,7 @@
#include <linux/ring_buffer_types.h>
#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -31,6 +32,7 @@
#include <linux/oom.h>
#include <linux/mm.h>
+#include <asm/ring_buffer.h>
#include <asm/local64.h>
#include <asm/local.h>
#include <asm/setup.h>
@@ -559,6 +561,7 @@ struct trace_buffer {
unsigned long range_addr_start;
unsigned long range_addr_end;
+ struct notifier_block flush_nb;
struct ring_buffer_meta *meta;
@@ -2521,6 +2524,16 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
+/* Stop recording on a persistent buffer and flush cache if needed. */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+ struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
+
+ ring_buffer_record_off(buffer);
+ arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
+ return NOTIFY_DONE;
+}
+
static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
@@ -2651,6 +2664,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
+ /* Persistent ring buffer needs to flush cache before reboot. */
+ if (start && end) {
+ buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+ atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+ }
+
return_ptr(buffer);
fail_free_buffers:
@@ -2749,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer)
{
int cpu;
+ if (buffer->range_addr_start && buffer->range_addr_end)
+ atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
irq_work_sync(&buffer->irq_work.work);
^ permalink raw reply related
* [PATCH v18 3/8] ring-buffer: Skip invalid sub-buffers when validating persistent ring buffer
From: Masami Hiramatsu (Google) @ 2026-04-24 6:52 UTC (permalink / raw)
To: Steven Rostedt, Catalin Marinas, Will Deacon
Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, Ian Rogers, linux-arm-kernel
In-Reply-To: <177701351903.2223789.17087009302463188638.stgit@mhiramat.tok.corp.google.com>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Skip invalid sub-buffers when validating the persistent ring buffer
instead of discarding the entire ring buffer. Only skipped buffers
are invalidated (cleared).
If the cache data in memory fails to be synchronized during a reboot,
the persistent ring buffer may become partially corrupted, but other
sub-buffers may still contain readable event data. Only discard the
subbuffers that are found to be corrupted.
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v18:
- Minor update by the new fix.
- Fix to subtract BUF_PAGE_HDR_SIZE from meta->subbuf_size
to make the limit of commit size.
Changes in v17:
- Fix to use rb_page_size() of rewound pages for entry_bytes.
Changes in v15:
- Skip reader_page loop check on persistent ring buffer because
there can be contiguous empty(invalidated) pages.
- Do not show discarded page number information if it is 0.
Changes in v11:
- Fix a typo.
Changes in v9:
- Add meta->subbuf_size check.
- Fix a typo.
- Handle invalid reader_page case.
Changes in v8:
- Add comment in rb_validate_buffer()
- Clear the RB_MISSED_* flags in rb_validate_buffer() instead of
skipping subbuf.
- Remove unused subbuf local variable from rb_cpu_meta_valid().
Changes in v7:
- Combined with Handling RB_MISSED_* flags patch, focus on validation at boot.
- Remove checking subbuffer data when validating metadata, because it should be done
later.
- Do not mark the discarded sub buffer page but just reset it.
Changes in v6:
- Show invalid page detection message once per CPU.
Changes in v5:
- Instead of showing errors for each page, just show the number
of discarded pages at last.
Changes in v3:
- Record missed data event on commit.
---
kernel/trace/ring_buffer.c | 111 ++++++++++++++++++++++++++------------------
1 file changed, 66 insertions(+), 45 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7288383b1f27..404c1fcac0ae 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -370,6 +370,12 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
return local_read(&bpage->page->commit);
}
+/* Size is determined by what has been committed */
+static __always_inline unsigned int rb_page_size(struct buffer_page *bpage)
+{
+ return rb_page_commit(bpage) & ~RB_MISSED_MASK;
+}
+
static void free_buffer_page(struct buffer_page *bpage)
{
/* Range pages are not to be freed */
@@ -1762,7 +1768,6 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
unsigned long *subbuf_mask)
{
int subbuf_size = PAGE_SIZE;
- struct buffer_data_page *subbuf;
unsigned long buffers_start;
unsigned long buffers_end;
int i;
@@ -1770,6 +1775,11 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
if (!subbuf_mask)
return false;
+ if (meta->subbuf_size != PAGE_SIZE) {
+ pr_info("Ring buffer boot meta [%d] invalid subbuf_size\n", cpu);
+ return false;
+ }
+
buffers_start = meta->first_buffer;
buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
@@ -1786,11 +1796,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
return false;
}
- subbuf = rb_subbufs_from_meta(meta);
-
bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
- /* Is the meta buffers and the subbufs themselves have correct data? */
+ /*
+ * Ensure the meta::buffers array has correct data. The data in each subbufs
+ * are checked later in rb_meta_validate_events().
+ */
for (i = 0; i < meta->nr_subbufs; i++) {
if (meta->buffers[i] < 0 ||
meta->buffers[i] >= meta->nr_subbufs) {
@@ -1798,18 +1809,12 @@ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
return false;
}
- if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
- pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
- return false;
- }
-
if (test_bit(meta->buffers[i], subbuf_mask)) {
pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
return false;
}
set_bit(meta->buffers[i], subbuf_mask);
- subbuf = (void *)subbuf + subbuf_size;
}
return true;
@@ -1873,13 +1878,22 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
return events;
}
-static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu,
+ struct ring_buffer_cpu_meta *meta)
{
unsigned long long ts;
+ unsigned long tail;
u64 delta;
- int tail;
- tail = local_read(&dpage->commit);
+ /*
+ * When a sub-buffer is recovered from a read, the commit value may
+ * have RB_MISSED_* bits set, as these bits are reset on reuse.
+ * Even after clearing these bits, a commit value greater than the
+ * subbuf_size is considered invalid.
+ */
+ tail = local_read(&dpage->commit) & ~RB_MISSED_MASK;
+ if (tail > meta->subbuf_size - BUF_PAGE_HDR_SIZE)
+ return -1;
return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
}
@@ -1890,6 +1904,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
struct buffer_page *head_page, *orig_head, *orig_reader;
unsigned long entry_bytes = 0;
unsigned long entries = 0;
+ int discarded = 0;
int ret;
u64 ts;
int i;
@@ -1901,14 +1916,19 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
orig_reader = cpu_buffer->reader_page;
/* Do the reader page first */
- ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu);
+ ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu, meta);
if (ret < 0) {
- pr_info("Ring buffer reader page is invalid\n");
- goto invalid;
+ pr_info("Ring buffer meta [%d] invalid reader page detected\n",
+ cpu_buffer->cpu);
+ discarded++;
+ /* Instead of discard whole ring buffer, discard only this sub-buffer. */
+ local_set(&orig_reader->entries, 0);
+ local_set(&orig_reader->page->commit, 0);
+ } else {
+ entries += ret;
+ entry_bytes += rb_page_size(orig_reader);
+ local_set(&orig_reader->entries, ret);
}
- entries += ret;
- entry_bytes += local_read(&orig_reader->page->commit);
- local_set(&orig_reader->entries, ret);
ts = head_page->page->time_stamp;
@@ -1936,7 +1956,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
break;
/* Stop rewind if the page is invalid. */
- ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
if (ret < 0)
break;
@@ -1945,7 +1965,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
if (ret)
local_inc(&cpu_buffer->pages_touched);
entries += ret;
- entry_bytes += rb_page_commit(head_page);
+ entry_bytes += rb_page_size(head_page);
}
if (i)
pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
@@ -2015,21 +2035,24 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
if (head_page == orig_reader)
continue;
- ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu, meta);
if (ret < 0) {
- pr_info("Ring buffer meta [%d] invalid buffer page\n",
- cpu_buffer->cpu);
- goto invalid;
- }
-
- /* If the buffer has content, update pages_touched */
- if (ret)
- local_inc(&cpu_buffer->pages_touched);
-
- entries += ret;
- entry_bytes += local_read(&head_page->page->commit);
- local_set(&head_page->entries, ret);
+ if (!discarded)
+ pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
+ cpu_buffer->cpu);
+ discarded++;
+ /* Instead of discard whole ring buffer, discard only this sub-buffer. */
+ local_set(&head_page->entries, 0);
+ local_set(&head_page->page->commit, 0);
+ } else {
+ /* If the buffer has content, update pages_touched */
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+ entries += ret;
+ entry_bytes += rb_page_size(head_page);
+ local_set(&head_page->entries, ret);
+ }
if (head_page == cpu_buffer->commit_page)
break;
}
@@ -2043,7 +2066,10 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->entries, entries);
local_set(&cpu_buffer->entries_bytes, entry_bytes);
- pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+ pr_info("Ring buffer meta [%d] is from previous boot!", cpu_buffer->cpu);
+ if (discarded)
+ pr_cont(" (%d pages discarded)", discarded);
+ pr_cont("\n");
return;
invalid:
@@ -3330,12 +3356,6 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
return NULL;
}
-/* Size is determined by what has been committed */
-static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
-{
- return rb_page_commit(bpage) & ~RB_MISSED_MASK;
-}
-
static __always_inline unsigned
rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -5648,11 +5668,12 @@ __rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
again:
/*
* This should normally only loop twice. But because the
- * start of the reader inserts an empty page, it causes
- * a case where we will loop three times. There should be no
- * reason to loop four times (that I know of).
+ * start of the reader inserts an empty page, it causes a
+ * case where we will loop three times. There should be no
+ * reason to loop four times unless the ring buffer is a
+ * recovered persistent ring buffer.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3 && !cpu_buffer->ring_meta)) {
reader = NULL;
goto out;
}
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox