Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v9 2/3] tracing: Remove the backup instance automatically after read
From: Masami Hiramatsu (Google) @ 2026-03-31 16:32 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177497473558.569199.6527680985537865638.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Since the backup instance is readonly, after reading all data
via pipe, no data is left on the instance. Thus it can be
removed safely after closing all files.
This also removes it if the user resets the ring buffer manually
via the 'trace' file.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v9:
   - Fix to initialize autoremove workqueue only for backup.
   - Fix to return -ENODEV if trace_array_get() refers freeing instance.
 Changes in v6:
   - Fix typo in comment.
   - Only when there is a readonly trace array, initialize autoremove_wq.
   - Fix to exit loop in trace_array_get() if tr is found in the list.
 Changes in v4:
   - Update description.
---
 kernel/trace/trace.c |   85 ++++++++++++++++++++++++++++++++++++++++++++++----
 kernel/trace/trace.h |    6 ++++
 2 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8cec7bd70438..1d73400a01c7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -539,8 +539,65 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
 	tr->ring_buffer_expanded = true;
 }
 
+static int __remove_instance(struct trace_array *tr);
+
+static void trace_array_autoremove(struct work_struct *work)
+{
+	struct trace_array *tr = container_of(work, struct trace_array, autoremove_work);
+
+	guard(mutex)(&event_mutex);
+	guard(mutex)(&trace_types_lock);
+
+	/*
+	 * This can fail if someone gets @tr before starting this
+	 * function, but in that case, this will be kicked again when
+	 * putting it. So we don't care about the result.
+	 */
+	__remove_instance(tr);
+}
+
+static struct workqueue_struct *autoremove_wq;
+
+static void trace_array_kick_autoremove(struct trace_array *tr)
+{
+	if (autoremove_wq && !work_pending(&tr->autoremove_work))
+		queue_work(autoremove_wq, &tr->autoremove_work);
+}
+
+static void trace_array_cancel_autoremove(struct trace_array *tr)
+{
+	if (autoremove_wq && work_pending(&tr->autoremove_work))
+		cancel_work(&tr->autoremove_work);
+}
+
+static void trace_array_init_autoremove(struct trace_array *tr)
+{
+	INIT_WORK(&tr->autoremove_work, trace_array_autoremove);
+}
+
+static void trace_array_start_autoremove(void)
+{
+	if (autoremove_wq)
+		return;
+
+	autoremove_wq = alloc_workqueue("tr_autoremove_wq",
+					WQ_UNBOUND | WQ_HIGHPRI, 0);
+	if (!autoremove_wq)
+		pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n");
+}
+
 LIST_HEAD(ftrace_trace_arrays);
 
+static int __trace_array_get(struct trace_array *this_tr)
+{
+	/* When free_on_close is set, this is not available anymore. */
+	if (autoremove_wq && this_tr->free_on_close)
+		return -ENODEV;
+
+	this_tr->ref++;
+	return 0;
+}
+
 int trace_array_get(struct trace_array *this_tr)
 {
 	struct trace_array *tr;
@@ -548,8 +605,7 @@ int trace_array_get(struct trace_array *this_tr)
 	guard(mutex)(&trace_types_lock);
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		if (tr == this_tr) {
-			tr->ref++;
-			return 0;
+			return __trace_array_get(tr);
 		}
 	}
 
@@ -560,6 +616,12 @@ static void __trace_array_put(struct trace_array *this_tr)
 {
 	WARN_ON(!this_tr->ref);
 	this_tr->ref--;
+	/*
+	 * When free_on_close is set, prepare removing the array
+	 * when the last reference is released.
+	 */
+	if (this_tr->ref == 1 && this_tr->free_on_close)
+		trace_array_kick_autoremove(this_tr);
 }
 
 /**
@@ -4829,6 +4891,10 @@ static void update_last_data(struct trace_array *tr)
 	/* Only if the buffer has previous boot data clear and update it. */
 	tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT;
 
+	/* If this is a backup instance, mark it for autoremove. */
+	if (tr->flags & TRACE_ARRAY_FL_VMALLOC)
+		tr->free_on_close = true;
+
 	/* Reset the module list and reload them */
 	if (tr->scratch) {
 		struct trace_scratch *tscratch = tr->scratch;
@@ -8442,8 +8508,8 @@ struct trace_array *trace_array_find_get(const char *instance)
 
 	guard(mutex)(&trace_types_lock);
 	tr = trace_array_find(instance);
-	if (tr)
-		tr->ref++;
+	if (tr && __trace_array_get(tr) < 0)
+		tr = NULL;
 
 	return tr;
 }
@@ -8540,6 +8606,8 @@ trace_array_create_systems(const char *name, const char *systems,
 	if (ftrace_allocate_ftrace_ops(tr) < 0)
 		goto out_free_tr;
 
+	trace_array_init_autoremove(tr);
+
 	ftrace_init_trace_array(tr);
 
 	init_trace_flags_index(tr);
@@ -8650,7 +8718,9 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
 
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		if (tr->name && strcmp(tr->name, name) == 0) {
-			tr->ref++;
+			/* if this fails, @tr is going to be removed. */
+			if (__trace_array_get(tr) < 0)
+				tr = NULL;
 			return tr;
 		}
 	}
@@ -8689,6 +8759,7 @@ static int __remove_instance(struct trace_array *tr)
 			set_tracer_flag(tr, 1ULL << i, 0);
 	}
 
+	trace_array_cancel_autoremove(tr);
 	tracing_set_nop(tr);
 	clear_ftrace_function_probes(tr);
 	event_trace_del_tracer(tr);
@@ -9653,8 +9724,10 @@ __init static void enable_instances(void)
 		/*
 		 * Backup buffers can be freed but need vfree().
 		 */
-		if (backup)
+		if (backup) {
 			tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY;
+			trace_array_start_autoremove();
+		}
 
 		if (start || backup) {
 			tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2d9d26d423f1..60e079177492 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -455,6 +455,12 @@ struct trace_array {
 	 * we do not waste memory on systems that are not using tracing.
 	 */
 	bool ring_buffer_expanded;
+	/*
+	 * If the ring buffer is a read only backup instance, it will be
+	 * removed after dumping all data via pipe, since no readable data is left.
+	 */
+	bool free_on_close;
+	struct work_struct	autoremove_work;
 };
 
 enum {


^ permalink raw reply related

* [PATCH v9 1/3] tracing: Make the backup instance non-reusable
From: Masami Hiramatsu (Google) @ 2026-03-31 16:32 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177497473558.569199.6527680985537865638.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Since there is no reason to reuse the backup instance, make it
readonly (but erasable).
Note that only backup instances are readonly, because
other trace instances will be empty unless they are writable.
Only backup instances have entries copied from the original.

With this change, most of the trace control files are removed
from the backup instance, including eventfs enable/filter etc.

 # find /sys/kernel/tracing/instances/backup/events/ | wc -l
 4093
 # find /sys/kernel/tracing/instances/boot_map/events/ | wc -l
 9573

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v9:
   - Add forcibly readonly check in open() operations.
 Changes in v8:
   - Remove read-only checks in read() operation.
 Changes in v7:
   - Return -EACCES instead of -EPERM.
 Changes in v6:
   - Remove tracing_on file from readonly instances.
   - Remove unused writable_mode from tracing_init_tracefs_percpu().
   - Cleanup init_tracer_tracefs() and create_event_toplevel_files().
   - Remove TRACE_MODE_WRITE_MASK.
   - Add TRACE_ARRAY_FL_RDONLY.
 Changes in v5:
   - Rebased on the latest for-next (and hide show_event_filters/triggers
     if the instance is readonly).
 Changes in v4:
  - Make trace data erasable. (not reusable)
 Changes in v3:
  - Reuse the beginning part of event_entries for readonly files.
  - Remove readonly file_operations and checking readonly flag in
    each write operation.
 Changes in v2:
  - Use readonly file_operations to prohibit writing instead of
    checking flags in write() callbacks.
  - Remove writable files from eventfs.
---
 kernel/trace/trace.c        |   81 +++++++++++++++++++++++++++----------------
 kernel/trace/trace.h        |    7 ++++
 kernel/trace/trace_boot.c   |    5 ++-
 kernel/trace/trace_events.c |   76 +++++++++++++++++++++++-----------------
 4 files changed, 104 insertions(+), 65 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7b9dd6378849..8cec7bd70438 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3414,6 +3414,11 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
 	if (ret)
 		return ret;
 
+	if ((filp->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+		trace_array_put(tr);
+		return -EACCES;
+	}
+
 	filp->private_data = inode->i_private;
 
 	return 0;
@@ -6435,6 +6440,11 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
 	if (ret)
 		return ret;
 
+	if ((file->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+		trace_array_put(tr);
+		return -EACCES;
+	}
+
 	ret = single_open(file, tracing_clock_show, inode->i_private);
 	if (ret < 0)
 		trace_array_put(tr);
@@ -8771,17 +8781,22 @@ static __init void create_trace_instances(struct dentry *d_tracer)
 static void
 init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
+	umode_t writable_mode = TRACE_MODE_WRITE;
 	int cpu;
 
+	if (trace_array_is_readonly(tr))
+		writable_mode = TRACE_MODE_READ;
+
 	trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
-			tr, &show_traces_fops);
+			  tr, &show_traces_fops);
 
-	trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
-			tr, &set_tracer_fops);
+	trace_create_file("current_tracer", writable_mode, d_tracer,
+			  tr, &set_tracer_fops);
 
-	trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
+	trace_create_file("tracing_cpumask", writable_mode, d_tracer,
 			  tr, &tracing_cpumask_fops);
 
+	/* Options are used for changing print-format even for readonly instance. */
 	trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
 			  tr, &tracing_iter_fops);
 
@@ -8791,12 +8806,36 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
 			  tr, &tracing_pipe_fops);
 
-	trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
+	trace_create_file("buffer_size_kb", writable_mode, d_tracer,
 			  tr, &tracing_entries_fops);
 
 	trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
 			  tr, &tracing_total_entries_fops);
 
+	trace_create_file("trace_clock", writable_mode, d_tracer, tr,
+			  &trace_clock_fops);
+
+	trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
+			  &trace_time_stamp_mode_fops);
+
+	tr->buffer_percent = 50;
+
+	trace_create_file("buffer_subbuf_size_kb", writable_mode, d_tracer,
+			  tr, &buffer_subbuf_size_fops);
+
+	create_trace_options_dir(tr);
+
+	if (tr->range_addr_start)
+		trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+				  tr, &last_boot_fops);
+
+	for_each_tracing_cpu(cpu)
+		tracing_init_tracefs_percpu(tr, cpu);
+
+	/* Read-only instance has above files only. */
+	if (trace_array_is_readonly(tr))
+		return;
+
 	trace_create_file("free_buffer", 0200, d_tracer,
 			  tr, &tracing_free_buffer_fops);
 
@@ -8808,49 +8847,29 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("trace_marker_raw", 0220, d_tracer,
 			  tr, &tracing_mark_raw_fops);
 
-	trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
-			  &trace_clock_fops);
-
-	trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
-			  tr, &rb_simple_fops);
-
-	trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
-			  &trace_time_stamp_mode_fops);
-
-	tr->buffer_percent = 50;
-
 	trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
-			tr, &buffer_percent_fops);
-
-	trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
-			  tr, &buffer_subbuf_size_fops);
+			  tr, &buffer_percent_fops);
 
 	trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer,
-			 tr, &tracing_syscall_buf_fops);
+			  tr, &tracing_syscall_buf_fops);
 
-	create_trace_options_dir(tr);
+	trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
+			  tr, &rb_simple_fops);
 
 	trace_create_maxlat_file(tr, d_tracer);
 
 	if (ftrace_create_function_files(tr, d_tracer))
 		MEM_FAIL(1, "Could not allocate function filter files");
 
-	if (tr->range_addr_start) {
-		trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
-				  tr, &last_boot_fops);
 #ifdef CONFIG_TRACER_SNAPSHOT
-	} else {
+	if (!tr->range_addr_start)
 		trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
 				  tr, &snapshot_fops);
 #endif
-	}
 
 	trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
 			  tr, &tracing_err_log_fops);
 
-	for_each_tracing_cpu(cpu)
-		tracing_init_tracefs_percpu(tr, cpu);
-
 	ftrace_init_tracefs(tr, d_tracer);
 }
 
@@ -9635,7 +9654,7 @@ __init static void enable_instances(void)
 		 * Backup buffers can be freed but need vfree().
 		 */
 		if (backup)
-			tr->flags |= TRACE_ARRAY_FL_VMALLOC;
+			tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY;
 
 		if (start || backup) {
 			tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a3ea735a9ef6..2d9d26d423f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -464,6 +464,7 @@ enum {
 	TRACE_ARRAY_FL_MOD_INIT		= BIT(3),
 	TRACE_ARRAY_FL_MEMMAP		= BIT(4),
 	TRACE_ARRAY_FL_VMALLOC		= BIT(5),
+	TRACE_ARRAY_FL_RDONLY		= BIT(6),
 };
 
 #ifdef CONFIG_MODULES
@@ -493,6 +494,12 @@ extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long
 
 extern struct trace_array *printk_trace;
 
+static inline bool trace_array_is_readonly(struct trace_array *tr)
+{
+	/* backup instance is read only. */
+	return tr->flags & TRACE_ARRAY_FL_RDONLY;
+}
+
 /*
  * The global tracer (top) should be the first trace array added,
  * but we check the flag anyway.
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index dbe29b4c6a7a..2ca2541c8a58 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -61,7 +61,8 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
 		v = memparse(p, NULL);
 		if (v < PAGE_SIZE)
 			pr_err("Buffer size is too small: %s\n", p);
-		if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
+		if (trace_array_is_readonly(tr) ||
+		    tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
 			pr_err("Failed to resize trace buffer to %s\n", p);
 	}
 
@@ -597,7 +598,7 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node)
 
 	p = xbc_node_find_value(node, "tracer", NULL);
 	if (p && *p != '\0') {
-		if (tracing_set_tracer(tr, p) < 0)
+		if (trace_array_is_readonly(tr) || tracing_set_tracer(tr, p) < 0)
 			pr_err("Failed to set given tracer: %s\n", p);
 	}
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index de807a9e2371..7ddcee312471 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1401,6 +1401,9 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
 {
 	int ret;
 
+	if (trace_array_is_readonly(tr))
+		return -EACCES;
+
 	mutex_lock(&event_mutex);
 	ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod);
 	mutex_unlock(&event_mutex);
@@ -2969,8 +2972,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 	} else
 		__get_system(system);
 
-	/* ftrace only has directories no files */
-	if (strcmp(name, "ftrace") == 0)
+	/* ftrace only has directories no files, readonly instance too. */
+	if (strcmp(name, "ftrace") == 0 || trace_array_is_readonly(tr))
 		nr_entries = 0;
 	else
 		nr_entries = ARRAY_SIZE(system_entries);
@@ -3135,28 +3138,30 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 	int ret;
 	static struct eventfs_entry event_entries[] = {
 		{
-			.name		= "enable",
+			.name		= "format",
 			.callback	= event_callback,
-			.release	= event_release,
 		},
+#ifdef CONFIG_PERF_EVENTS
 		{
-			.name		= "filter",
+			.name		= "id",
 			.callback	= event_callback,
 		},
+#endif
+#define NR_RO_EVENT_ENTRIES	(1 + IS_ENABLED(CONFIG_PERF_EVENTS))
+/* Readonly files must be above this line and counted by NR_RO_EVENT_ENTRIES. */
 		{
-			.name		= "trigger",
+			.name		= "enable",
 			.callback	= event_callback,
+			.release	= event_release,
 		},
 		{
-			.name		= "format",
+			.name		= "filter",
 			.callback	= event_callback,
 		},
-#ifdef CONFIG_PERF_EVENTS
 		{
-			.name		= "id",
+			.name		= "trigger",
 			.callback	= event_callback,
 		},
-#endif
 #ifdef CONFIG_HIST_TRIGGERS
 		{
 			.name		= "hist",
@@ -3189,7 +3194,10 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 	if (!e_events)
 		return -ENOMEM;
 
-	nr_entries = ARRAY_SIZE(event_entries);
+	if (trace_array_is_readonly(tr))
+		nr_entries = NR_RO_EVENT_ENTRIES;
+	else
+		nr_entries = ARRAY_SIZE(event_entries);
 
 	name = trace_event_name(call);
 	ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
@@ -4532,31 +4540,44 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 	int nr_entries;
 	static struct eventfs_entry events_entries[] = {
 		{
-			.name		= "enable",
+			.name		= "header_page",
 			.callback	= events_callback,
 		},
 		{
-			.name		= "header_page",
+			.name		= "header_event",
 			.callback	= events_callback,
 		},
+#define NR_RO_TOP_ENTRIES	2
+/* Readonly files must be above this line and counted by NR_RO_TOP_ENTRIES. */
 		{
-			.name		= "header_event",
+			.name		= "enable",
 			.callback	= events_callback,
 		},
 	};
 
-	entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
-				  tr, &ftrace_set_event_fops);
-	if (!entry)
-		return -ENOMEM;
+	if (!trace_array_is_readonly(tr)) {
+		entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
+					tr, &ftrace_set_event_fops);
+		if (!entry)
+			return -ENOMEM;
 
-	trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
-			  &ftrace_show_event_filters_fops);
+		/* These are not as crucial, just warn if they are not created */
+		trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
+				&ftrace_show_event_filters_fops);
 
-	trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
-			  &ftrace_show_event_triggers_fops);
+		trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
+				&ftrace_show_event_triggers_fops);
 
-	nr_entries = ARRAY_SIZE(events_entries);
+		trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
+				tr, &ftrace_set_event_pid_fops);
+
+		trace_create_file("set_event_notrace_pid",
+				TRACE_MODE_WRITE, parent, tr,
+				&ftrace_set_event_notrace_pid_fops);
+		nr_entries = ARRAY_SIZE(events_entries);
+	} else {
+		nr_entries = NR_RO_TOP_ENTRIES;
+	}
 
 	e_events = eventfs_create_events_dir("events", parent, events_entries,
 					     nr_entries, tr);
@@ -4565,15 +4586,6 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 		return -ENOMEM;
 	}
 
-	/* There are not as crucial, just warn if they are not created */
-
-	trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
-			  tr, &ftrace_set_event_pid_fops);
-
-	trace_create_file("set_event_notrace_pid",
-			  TRACE_MODE_WRITE, parent, tr,
-			  &ftrace_set_event_notrace_pid_fops);
-
 	tr->event_dir = e_events;
 
 	return 0;


^ permalink raw reply related

* [PATCH v9 0/3] tracing: Remove backup instance after read all
From: Masami Hiramatsu (Google) @ 2026-03-31 16:32 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel

Hi,

Here is the v9 of the series to improve backup instances of
the persistent ring buffer. The previous version is here:

https://lore.kernel.org/all/177071300558.2293046.12057922262682243630.stgit@mhiramat.tok.corp.google.com/

In this version, I removed bugfixes (those are actual fixes/minor
updates) and the force permission check in tracefs
because superuser can modify the permission by itself. Instead,
simply add read-only and FMODE_WRITE check in the related open()
file operations[1/3]. Also, I fixed 2 bugs in autoremove patch
to init dedicated workqueue correctly [2/3].

Series Description
------------------
Since backup instances are a kind of snapshot of the persistent
ring buffer, they should be readonly. And if an instance is readonly,
there is no reason to keep it after reading all data via trace_pipe
because the data has been consumed. But the user should be able to remove
the readonly instance by rmdir or by truncating the `trace` file.

Thus, [1/3] makes backup instances readonly (not able to write any
events, cleanup trace, change buffer size). Also, [2/3] removes the
backup instance after consuming all data via trace_pipe.
With these improvements, even if we make a backup instance (using
the same amount of memory as the persistent ring buffer), it will
be removed automatically after reading the data.

Thanks,

---

Masami Hiramatsu (Google) (3):
      tracing: Make the backup instance non-reusable
      tracing: Remove the backup instance automatically after read
      tracing/Documentation: Add a section about backup instance

 Documentation/trace/debugging.rst |   19 ++++
 kernel/trace/trace.c              |  166 +++++++++++++++++++++++++++++--------
 kernel/trace/trace.h              |   13 +++
 kernel/trace/trace_boot.c         |    5 +
 kernel/trace/trace_events.c       |   76 ++++++++++-------
 5 files changed, 208 insertions(+), 71 deletions(-)

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH] tracing: always provide a prototype for tracing_alloc_snapshot()
From: Steven Rostedt @ 2026-03-31 16:13 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Bartosz Golaszewski, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260331182036.e62143d5a9a59776d8cf7ae2@kernel.org>

On Tue, 31 Mar 2026 18:20:36 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Tue, 31 Mar 2026 10:20:01 +0200
> Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com> wrote:
> 
> > The tracing_alloc_snapshot() symbol is always exported even with
> > !CONFIG_TRACER_SNAPSHOT so the prototype too must be always visible or
> > we'll see the following warning:
> > 
> > kernel/trace/trace.c:820:5: warning: no previous prototype for ‘tracing_alloc_snapshot’ [-Wmissing-prototypes]
> >   820 | int tracing_alloc_snapshot(void)
> >       |     ^~~~~~~~~~~~~~~~~~~~~~
> > 
> > Fixes: bade44fe5462 ("tracing: Move snapshot code out of trace.c and into trace_snapshot.c")
> > Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>  
> 
> Good catch!
> 
> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

But it is the wrong fix. I already fixed it properly:

  https://patch.msgid.link/20260328101946.2c4ef4a5@robin

It's still in the testing phase but will be going to linux-next this week.

-- Steve

^ permalink raw reply

* Re: [PATCH v2] bootconfig: Apply early options from embedded config
From: Breno Leitao @ 2026-03-31 15:27 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Jonathan Corbet, Shuah Khan, linux-kernel, linux-trace-kernel,
	linux-doc, oss, paulmck, rostedt, kernel-team
In-Reply-To: <20260331125827.157a833882830007ea9b0b31@kernel.org>

hello Masami,

On Tue, Mar 31, 2026 at 12:58:27PM +0900, Masami Hiramatsu wrote:

> > 3) Ensure that early bootconfig parameters don't overwrite the boot command
> >    line. For example, if the boot command line has foo=bar and bootconfig
> >    later has foo=baz, the command line value should take precedence.
> >    This prevents early boot code (in setup_arch()) from seeing a parameter
> >    value that will be changed later.
>
> OK, this also needs to be considered. Currently we just pass the bootconfig
> parameters right before bootloader given parameters as "extra_command_line"
> if "bootconfig" in cmdline or CONFIG_BOOT_CONFIG_FORCE=y.
>
> [boot_config(.kernel)]<command_line>[ -- [boot_config(.init)][init_command_line]]
>
> This is currently expected behavior. The bootconfig parameters are
> expected to be overridden by command_line or command_line are appended.

That's correct, and I have no intention of changing this behavior. Here's
the current approach:

1) Early parameters from the bootloader are parsed first in setup_arch()

2) Subsequently, bootconfig_apply_early_params() is invoked. Any early
   parameter that was already parsed from the bootloader (in setup_arch())
   will be skipped at this stage.

> If we change this for early params, we also should change the expected
> output of /proc/cmdline too. I think we have 2 options;
>
>  - As before, we expect the parameters provided by the boot configuration
>    to be processed first and then overridden later by the command line.
>
> Or,
>
>  - ignore all parameters which is given from the command line, this also
>    updates existing setup_boot_config() (means xbc_snprint_cmdline() ).
>
> Anyway, this behavior change will also be a bit critical... We have
> to announce it.

As mentioned above, I don't anticipate any changes to existing behavior.
Bootconfig parsing remains unchanged. The only modification is that
bootconfig_apply_early_params() will skip any early config parameter
that's already present in the bootloader command line.

> > +Note that embedded bootconfig is parsed after ``setup_arch()``, so
> > +early options that are consumed during architecture initialization
> > +(e.g., ``mem=``, ``memmap=``, ``earlycon``, ``noapic``, ``nolapic``,
> > +``acpi=``, ``numa=``, ``iommu=``) may not take effect from bootconfig.
> > +
>
> This is easy to explain, but it's quite troublesome for users to
> determine which parameters are unavailable.

Agreed. This turned out to be significantly more complex than I
initially anticipated.

I'm uncertain whether we can accomplish this without examining every
early_param() implementation in depth.

> Currently we can identify
> it by `git grep early_param -- arch/${ARCH}`. But it is setup in
> setup_arch() we need to track the source code. (Or ask AI :))

The challenge extends beyond that. There are numerous early_param()
definitions scattered throughout the kernel that may or may not be
utilized by setup_arch().

For example, consider `early_param("mitigations", ..)` in
./kernel/cpu.c. This modifies the cpu_mitigations global variable, which
is referenced in various locations across different architectures.

It's worth noting that we have over 300 early_param() instances in
the kernel.

Given this, analyzing all these early parameters and examining each one
individually represents a substantial amount of work.

Are there alternative approaches? At this point, I'm leaning toward
breaking bootconfig's dependency on memblock, allowing us to invoke it
before setup_arch(). Is this the only practical solution available?!

Thanks,
--breno

^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Nam Cao @ 2026-03-31 15:23 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <58674d7f10c260369f5cb78599ba6ecb3804358f.camel@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
>> diff --git a/kernel/trace/rv/monitors/sleep/sleep.c
>> b/kernel/trace/rv/monitors/sleep/sleep.c
>> index c1347da69e9d..59091863c17c 100644
>> --- a/kernel/trace/rv/monitors/sleep/sleep.c
>> +++ b/kernel/trace/rv/monitors/sleep/sleep.c
>> @@ -162,6 +164,11 @@ static void handle_sys_enter(void *data, struct pt_regs
>> *regs, long id)
>>  			break;
>>  		}
>>  		break;
>> +#ifdef __NR_epoll_wait
>> +	case __NR_epoll_wait:
>> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, true);
>> +		break;
>> +#endif
>
> Sashiko (the AI bot) wonders why this isn't ltl_atom_update() like other things
> around here. Is that intentional?

No that's not intentional. It does not affect verification result, but
still should be fixed. I will send v2.

Funnily a colleague just told me earlier today about how good AIs are at
reviewing..

Nam

^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Gabriele Monaco @ 2026-03-31 15:15 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <20260331104918.2710853-1-namcao@linutronix.de>


On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.c
> b/kernel/trace/rv/monitors/sleep/sleep.c
> index c1347da69e9d..59091863c17c 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.c
> +++ b/kernel/trace/rv/monitors/sleep/sleep.c
> @@ -162,6 +164,11 @@ static void handle_sys_enter(void *data, struct pt_regs
> *regs, long id)
>  			break;
>  		}
>  		break;
> +#ifdef __NR_epoll_wait
> +	case __NR_epoll_wait:
> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, true);
> +		break;
> +#endif

Sashiko (the AI bot) wonders why this isn't ltl_atom_update() like other things
around here. Is that intentional?

>  	}
>  }
>  
> @@ -174,6 +181,7 @@ static void handle_sys_exit(void *data, struct pt_regs
> *regs, long ret)
>  	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
> +	ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  	ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
>  }
>  
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.h
> b/kernel/trace/rv/monitors/sleep/sleep.h
> index 2ab46fd218d2..95dc2727c059 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.h
> +++ b/kernel/trace/rv/monitors/sleep/sleep.h
> @@ -15,6 +15,7 @@ enum ltl_atom {
>  	LTL_ABORT_SLEEP,
>  	LTL_BLOCK_ON_RT_MUTEX,
>  	LTL_CLOCK_NANOSLEEP,
> +	LTL_EPOLL_WAIT,
>  	LTL_FUTEX_LOCK_PI,
>  	LTL_FUTEX_WAIT,
>  	LTL_KERNEL_THREAD,
> @@ -40,6 +41,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
>  		"ab_sl",
>  		"bl_on_rt_mu",
>  		"cl_na",
> +		"ep_wa",
>  		"fu_lo_pi",
>  		"fu_wa",
>  		"ker_th",
> @@ -75,39 +77,41 @@ static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
>  
>  static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
>  {
> -	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> -	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> -	bool val40 = task_is_rcu || task_is_migration;
> -	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> -	bool val41 = futex_lock_pi || val40;
> -	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> -	bool val5 = block_on_rt_mutex || val41;
> -	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> -	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val32 = abort_sleep || kthread_should_stop;
>  	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
> -	bool val33 = woken_by_nmi || val32;
>  	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
> -	bool val34 = woken_by_hardirq || val33;
>  	bool woken_by_equal_or_higher_prio =
> test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
>  	     mon->atoms);
> -	bool val14 = woken_by_equal_or_higher_prio || val34;
>  	bool wake = test_bit(LTL_WAKE, mon->atoms);
> -	bool val13 = !wake;
> -	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
> +	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> +	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> +	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
> +	bool rt = test_bit(LTL_RT, mon->atoms);
> +	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
>  	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
>  	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> -	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool val25 = nanosleep_timer_abstime && val24;
> -	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> -	bool val18 = clock_nanosleep && val25;
> +	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> +	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> -	bool val9 = futex_wait || val18;
> +	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> +	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
> +	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> +	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> +	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> +	bool val42 = task_is_rcu || task_is_migration;
> +	bool val43 = futex_lock_pi || val42;
> +	bool val5 = block_on_rt_mutex || val43;
> +	bool val34 = abort_sleep || kthread_should_stop;
> +	bool val35 = woken_by_nmi || val34;
> +	bool val36 = woken_by_hardirq || val35;
> +	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val13 = !wake;
> +	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> +	bool val27 = nanosleep_timer_abstime && val26;
> +	bool val18 = clock_nanosleep && val27;
> +	bool val20 = val18 || epoll_wait;
> +	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> -	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool val2 = !sleep;
> -	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool val1 = !rt;
>  	bool val3 = val1 || val2;
>  
> @@ -124,39 +128,41 @@ static void ltl_start(struct task_struct *task, struct
> ltl_monitor *mon)
>  static void
>  ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state,
> unsigned long *next)
>  {
> -	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> -	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> -	bool val40 = task_is_rcu || task_is_migration;
> -	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> -	bool val41 = futex_lock_pi || val40;
> -	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> -	bool val5 = block_on_rt_mutex || val41;
> -	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> -	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val32 = abort_sleep || kthread_should_stop;
>  	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
> -	bool val33 = woken_by_nmi || val32;
>  	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
> -	bool val34 = woken_by_hardirq || val33;
>  	bool woken_by_equal_or_higher_prio =
> test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
>  	     mon->atoms);
> -	bool val14 = woken_by_equal_or_higher_prio || val34;
>  	bool wake = test_bit(LTL_WAKE, mon->atoms);
> -	bool val13 = !wake;
> -	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
> +	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> +	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> +	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
> +	bool rt = test_bit(LTL_RT, mon->atoms);
> +	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
>  	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
>  	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> -	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool val25 = nanosleep_timer_abstime && val24;
> -	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> -	bool val18 = clock_nanosleep && val25;
> +	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> +	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> -	bool val9 = futex_wait || val18;
> +	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> +	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
> +	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> +	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> +	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> +	bool val42 = task_is_rcu || task_is_migration;
> +	bool val43 = futex_lock_pi || val42;
> +	bool val5 = block_on_rt_mutex || val43;
> +	bool val34 = abort_sleep || kthread_should_stop;
> +	bool val35 = woken_by_nmi || val34;
> +	bool val36 = woken_by_hardirq || val35;
> +	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val13 = !wake;
> +	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> +	bool val27 = nanosleep_timer_abstime && val26;
> +	bool val18 = clock_nanosleep && val27;
> +	bool val20 = val18 || epoll_wait;
> +	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> -	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool val2 = !sleep;
> -	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool val1 = !rt;
>  	bool val3 = val1 || val2;
>  
> diff --git a/tools/verification/models/rtapp/sleep.ltl
> b/tools/verification/models/rtapp/sleep.ltl
> index 6379bbeb6212..6f26c4810f78 100644
> --- a/tools/verification/models/rtapp/sleep.ltl
> +++ b/tools/verification/models/rtapp/sleep.ltl
> @@ -5,6 +5,7 @@ RT_FRIENDLY_SLEEP = (RT_VALID_SLEEP_REASON or KERNEL_THREAD)
>  
>  RT_VALID_SLEEP_REASON = FUTEX_WAIT
>                       or RT_FRIENDLY_NANOSLEEP
> +                     or EPOLL_WAIT
>  
>  RT_FRIENDLY_NANOSLEEP = CLOCK_NANOSLEEP
>                      and NANOSLEEP_TIMER_ABSTIME


^ permalink raw reply

* Re: [PATCH v4 3/5] locking: Add contended_release tracepoint to sleepable locks
From: Usama Arif @ 2026-03-31 14:11 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-kernel, linux-mips,
	virtualization, linux-arch, linux-mm, linux-trace-kernel,
	kernel-team
In-Reply-To: <acu7LdegiZ5_-dEW@shell.ilvokhin.com>



On 31/03/2026 15:16, Dmitry Ilvokhin wrote:
> On Tue, Mar 31, 2026 at 03:34:50AM -0700, Usama Arif wrote:
>> On Thu, 26 Mar 2026 15:10:02 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:
>>
>>> Add the contended_release trace event. This tracepoint fires on the
>>> holder side when a contended lock is released, complementing the
>>> existing contention_begin/contention_end tracepoints which fire on the
>>> waiter side.
>>>
>>> This enables correlating lock hold time under contention with waiter
>>> events by lock address.
>>>
>>> Add trace_contended_release() calls to the slowpath unlock paths of
>>> sleepable locks: mutex, rtmutex, semaphore, rwsem, percpu-rwsem, and
>>> RT-specific rwbase locks.
>>>
>>> Where possible, trace_contended_release() fires before the lock is
>>> released and before the waiter is woken. For some lock types, the
>>> tracepoint fires after the release but before the wake. Making the
>>> placement consistent across all lock types is not worth the added
>>> complexity.
>>>
>>> For reader/writer locks, the tracepoint fires for every reader releasing
>>> while a writer is waiting, not only for the last reader.
>>>
>>> Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
>>> ---
>>>  include/trace/events/lock.h   | 17 +++++++++++++++++
>>>  kernel/locking/mutex.c        |  4 ++++
>>>  kernel/locking/percpu-rwsem.c | 11 +++++++++++
>>>  kernel/locking/rtmutex.c      |  1 +
>>>  kernel/locking/rwbase_rt.c    |  6 ++++++
>>>  kernel/locking/rwsem.c        | 10 ++++++++--
>>>  kernel/locking/semaphore.c    |  4 ++++
>>>  7 files changed, 51 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
>>> index da978f2afb45..1ded869cd619 100644
>>> --- a/include/trace/events/lock.h
>>> +++ b/include/trace/events/lock.h
>>> @@ -137,6 +137,23 @@ TRACE_EVENT(contention_end,
>>>  	TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret)
>>>  );
>>>  
>>> +TRACE_EVENT(contended_release,
>>> +
>>> +	TP_PROTO(void *lock),
>>> +
>>> +	TP_ARGS(lock),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(void *, lock_addr)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		__entry->lock_addr = lock;
>>> +	),
>>> +
>>> +	TP_printk("%p", __entry->lock_addr)
>>> +);
>>> +
>>>  #endif /* _TRACE_LOCK_H */
>>>  
>>>  /* This part must be outside protection */
>>> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
>>> index 427187ff02db..6c2c9312eb8f 100644
>>> --- a/kernel/locking/mutex.c
>>> +++ b/kernel/locking/mutex.c
>>> @@ -997,6 +997,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
>>>  		wake_q_add(&wake_q, next);
>>>  	}
>>>  
>>> +	if (trace_contended_release_enabled() && waiter)
>>> +		trace_contended_release(lock);
>>> +
>>
>> This won't compile? waiter is declared in the if block, so you are using
>> it outside scope here.
>>
> 
> Thanks for the feedback, Usama.
> 
> waiter is declared at function scope, right on top. It's also assigned
> before the if block, so it's still in scope at the tracepoint.

Ah ok, I was reviewing on top of mm-new branch from today where waiter
is declared in the if block. Probably something changed related to
locking/tracing and it's not in mm-new yet.

^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Gabriele Monaco @ 2026-03-31 13:47 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <87y0j86rq8.fsf@yellow.woof>

On Tue, 2026-03-31 at 15:41 +0200, Nam Cao wrote:
> Gabriele Monaco <gmonaco@redhat.com> writes:
> 
> > On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
> > > Since commit 0c43094f8cc9 ("eventpoll: Replace rwlock with spinlock"),
> > > epoll_wait is real-time-safe syscall for sleeping.
> > > 
> > > Add epoll_wait to the list of rt-safe sleeping APIs.
> > > 
> > > Signed-off-by: Nam Cao <namcao@linutronix.de>
> > 
> > Thanks for the patch, looks reasonable.
> > I tried re-generating the header (sleep.h) with rvgen based on the new
> > specification and I'm getting a different order.
> > 
> > Is what you're committing the result of rvgen on your computer?
> > We probably still have some unpredictable result in the rvgen's output if
> > that's
> > the case (no big deal then, though it triggers me a bit).
> 
> Right, fixing this is in my list. The script uses set and set's order is
> not deterministic. You get different (but equivalent) results every
> time. I should start working on that..

Reasonable, no rush. I just noticed this behaviour when trying to write some
selftests for rvgen and didn't manage to make it deterministic with trivial
changes.

> 
> > I would still like to run some tests on this, how urgently would you like
> > this
> > patch through? I was really about to send Steve a PR with the other changes
> > so
> > this might need to wait for the next merge window.
> 
> The earlier the better, but no one will die because it misses a merge
> window.

Alright I'll see if I can squeeze it in within this week, if not, it'll have to
wait. For now:

Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>

Thanks,
Gabriele


^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Nam Cao @ 2026-03-31 13:41 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <4b47d5e7e9dde0c76beb1a9383a13553c2455d92.camel@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:

> On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
>> Since commit 0c43094f8cc9 ("eventpoll: Replace rwlock with spinlock"),
>> epoll_wait is real-time-safe syscall for sleeping.
>> 
>> Add epoll_wait to the list of rt-safe sleeping APIs.
>> 
>> Signed-off-by: Nam Cao <namcao@linutronix.de>
>
> Thanks for the patch, looks reasonable.
> I tried re-generating the header (sleep.h) with rvgen based on the new
> specification and I'm getting a different order.
>
> Is what you're committing the result of rvgen on your computer?
> We probably still have some unpredictable result in the rvgen's output if that's
> the case (no big deal then, though it triggers me a bit).

Right, fixing this is on my list. The script uses a set, and a set's
iteration order is not deterministic. You get different (but equivalent)
results every time. I should start working on that..

> I would still like to run some tests on this, how urgently would you like this
> patch through? I was really about to send Steve a PR with the other changes so
> this might need to wait for the next merge window.

The earlier the better, but no one will die because it misses a merge
window.

Nam

^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Gabriele Monaco @ 2026-03-31 12:50 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <20260331104918.2710853-1-namcao@linutronix.de>

On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
> Since commit 0c43094f8cc9 ("eventpoll: Replace rwlock with spinlock"),
> epoll_wait is real-time-safe syscall for sleeping.
> 
> Add epoll_wait to the list of rt-safe sleeping APIs.
> 
> Signed-off-by: Nam Cao <namcao@linutronix.de>

Thanks for the patch, looks reasonable.
I tried re-generating the header (sleep.h) with rvgen based on the new
specification and I'm getting a different order.

Is what you're committing the result of rvgen on your computer?
We probably still have some unpredictable result in the rvgen's output if that's
the case (no big deal then, though it triggers me a bit).

I would still like to run some tests on this, how urgently would you like this
patch through? I was really about to send Steve a PR with the other changes so
this might need to wait for the next merge window.

Thanks,
Gabriele

> ---
>  kernel/trace/rv/monitors/sleep/sleep.c    |  8 ++
>  kernel/trace/rv/monitors/sleep/sleep.h    | 98 ++++++++++++-----------
>  tools/verification/models/rtapp/sleep.ltl |  1 +
>  3 files changed, 61 insertions(+), 46 deletions(-)
> 
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.c
> b/kernel/trace/rv/monitors/sleep/sleep.c
> index c1347da69e9d..59091863c17c 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.c
> +++ b/kernel/trace/rv/monitors/sleep/sleep.c
> @@ -49,6 +49,7 @@ static void ltl_atoms_init(struct task_struct *task, struct
> ltl_monitor *mon, bo
>  		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
>  		ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
>  		ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  		ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
>  		ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
>  	}
> @@ -63,6 +64,7 @@ static void ltl_atoms_init(struct task_struct *task, struct
> ltl_monitor *mon, bo
>  		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
>  		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
>  		ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  
>  		if (strstarts(task->comm, "migration/"))
>  			ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true);
> @@ -162,6 +164,11 @@ static void handle_sys_enter(void *data, struct pt_regs
> *regs, long id)
>  			break;
>  		}
>  		break;
> +#ifdef __NR_epoll_wait
> +	case __NR_epoll_wait:
> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, true);
> +		break;
> +#endif
>  	}
>  }
>  
> @@ -174,6 +181,7 @@ static void handle_sys_exit(void *data, struct pt_regs
> *regs, long ret)
>  	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
> +	ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  	ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
>  }
>  
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.h
> b/kernel/trace/rv/monitors/sleep/sleep.h
> index 2ab46fd218d2..95dc2727c059 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.h
> +++ b/kernel/trace/rv/monitors/sleep/sleep.h
> @@ -15,6 +15,7 @@ enum ltl_atom {
>  	LTL_ABORT_SLEEP,
>  	LTL_BLOCK_ON_RT_MUTEX,
>  	LTL_CLOCK_NANOSLEEP,
> +	LTL_EPOLL_WAIT,
>  	LTL_FUTEX_LOCK_PI,
>  	LTL_FUTEX_WAIT,
>  	LTL_KERNEL_THREAD,
> @@ -40,6 +41,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
>  		"ab_sl",
>  		"bl_on_rt_mu",
>  		"cl_na",
> +		"ep_wa",
>  		"fu_lo_pi",
>  		"fu_wa",
>  		"ker_th",
> @@ -75,39 +77,41 @@ static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
>  
>  static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
>  {
> -	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> -	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> -	bool val40 = task_is_rcu || task_is_migration;
> -	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> -	bool val41 = futex_lock_pi || val40;
> -	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> -	bool val5 = block_on_rt_mutex || val41;
> -	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> -	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val32 = abort_sleep || kthread_should_stop;
>  	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
> -	bool val33 = woken_by_nmi || val32;
>  	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
> -	bool val34 = woken_by_hardirq || val33;
>  	bool woken_by_equal_or_higher_prio =
> test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
>  	     mon->atoms);
> -	bool val14 = woken_by_equal_or_higher_prio || val34;
>  	bool wake = test_bit(LTL_WAKE, mon->atoms);
> -	bool val13 = !wake;
> -	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
> +	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> +	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> +	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
> +	bool rt = test_bit(LTL_RT, mon->atoms);
> +	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
>  	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
>  	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> -	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool val25 = nanosleep_timer_abstime && val24;
> -	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> -	bool val18 = clock_nanosleep && val25;
> +	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> +	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> -	bool val9 = futex_wait || val18;
> +	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> +	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
> +	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> +	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> +	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> +	bool val42 = task_is_rcu || task_is_migration;
> +	bool val43 = futex_lock_pi || val42;
> +	bool val5 = block_on_rt_mutex || val43;
> +	bool val34 = abort_sleep || kthread_should_stop;
> +	bool val35 = woken_by_nmi || val34;
> +	bool val36 = woken_by_hardirq || val35;
> +	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val13 = !wake;
> +	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> +	bool val27 = nanosleep_timer_abstime && val26;
> +	bool val18 = clock_nanosleep && val27;
> +	bool val20 = val18 || epoll_wait;
> +	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> -	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool val2 = !sleep;
> -	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool val1 = !rt;
>  	bool val3 = val1 || val2;
>  
> @@ -124,39 +128,41 @@ static void ltl_start(struct task_struct *task, struct
> ltl_monitor *mon)
>  static void
>  ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state,
> unsigned long *next)
>  {
> -	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> -	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> -	bool val40 = task_is_rcu || task_is_migration;
> -	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> -	bool val41 = futex_lock_pi || val40;
> -	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> -	bool val5 = block_on_rt_mutex || val41;
> -	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> -	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val32 = abort_sleep || kthread_should_stop;
>  	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
> -	bool val33 = woken_by_nmi || val32;
>  	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
> -	bool val34 = woken_by_hardirq || val33;
>  	bool woken_by_equal_or_higher_prio =
> test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
>  	     mon->atoms);
> -	bool val14 = woken_by_equal_or_higher_prio || val34;
>  	bool wake = test_bit(LTL_WAKE, mon->atoms);
> -	bool val13 = !wake;
> -	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
> +	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> +	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> +	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
> +	bool rt = test_bit(LTL_RT, mon->atoms);
> +	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
>  	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
>  	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> -	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool val25 = nanosleep_timer_abstime && val24;
> -	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> -	bool val18 = clock_nanosleep && val25;
> +	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> +	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> -	bool val9 = futex_wait || val18;
> +	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> +	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
> +	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> +	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> +	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> +	bool val42 = task_is_rcu || task_is_migration;
> +	bool val43 = futex_lock_pi || val42;
> +	bool val5 = block_on_rt_mutex || val43;
> +	bool val34 = abort_sleep || kthread_should_stop;
> +	bool val35 = woken_by_nmi || val34;
> +	bool val36 = woken_by_hardirq || val35;
> +	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val13 = !wake;
> +	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> +	bool val27 = nanosleep_timer_abstime && val26;
> +	bool val18 = clock_nanosleep && val27;
> +	bool val20 = val18 || epoll_wait;
> +	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> -	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool val2 = !sleep;
> -	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool val1 = !rt;
>  	bool val3 = val1 || val2;
>  
> diff --git a/tools/verification/models/rtapp/sleep.ltl
> b/tools/verification/models/rtapp/sleep.ltl
> index 6379bbeb6212..6f26c4810f78 100644
> --- a/tools/verification/models/rtapp/sleep.ltl
> +++ b/tools/verification/models/rtapp/sleep.ltl
> @@ -5,6 +5,7 @@ RT_FRIENDLY_SLEEP = (RT_VALID_SLEEP_REASON or KERNEL_THREAD)
>  
>  RT_VALID_SLEEP_REASON = FUTEX_WAIT
>                       or RT_FRIENDLY_NANOSLEEP
> +                     or EPOLL_WAIT
>  
>  RT_FRIENDLY_NANOSLEEP = CLOCK_NANOSLEEP
>                      and NANOSLEEP_TIMER_ABSTIME


^ permalink raw reply

* Re: [PATCH] tracing: Move snapshot code out of trace.c and into trace_snapshot.c
From: Steven Rostedt @ 2026-03-31 12:34 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: kernel test robot, LKML, Linux trace kernel, llvm, oe-kbuild-all,
	Masami Hiramatsu, Mathieu Desnoyers
In-Reply-To: <6efebcb6-194e-4c21-8808-fdf09160eac0@app.fastmail.com>

On Tue, 31 Mar 2026 12:53:38 +0200
"Arnd Bergmann" <arnd@arndb.de> wrote:

> Right, I assumed it had to be something like that, just didn't immediately
> see what. I've sent a patch to just remove the duplicate inline
> function now.

Ah, I had already sent a patch to fix the duplicate because I saw the
kernel test robot's report.

  https://lore.kernel.org/all/20260330205859.24c0aae3@gandalf.local.home/

-- Steve

^ permalink raw reply

* Re: [PATCH v8 12/12] rv: Add nomiss deadline monitor
From: Juri Lelli @ 2026-03-31 12:32 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: linux-kernel, Steven Rostedt, Nam Cao, Juri Lelli,
	Jonathan Corbet, Masami Hiramatsu, linux-trace-kernel, linux-doc,
	Tomas Glozar, Clark Williams, John Kacur
In-Reply-To: <20260330111010.153663-13-gmonaco@redhat.com>

Hello,

On 30/03/26 13:10, Gabriele Monaco wrote:
> Add the deadline monitors collection to validate the deadline scheduler,
> both for deadline tasks and servers.
> 
> The currently implemented monitors are:
> * nomiss:
>     validate dl entities run to completion before their deadline
> 
> Reviewed-by: Nam Cao <namcao@linutronix.de>
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>

Looks good to me.

Reviewed-by: Juri Lelli <juri.lelli@redhat.com>

Best,
Juri


^ permalink raw reply

* Re: [PATCH v4 0/5] locking: contended_release tracepoint instrumentation
From: Dmitry Ilvokhin @ 2026-03-31 12:32 UTC (permalink / raw)
  To: Usama Arif
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-kernel, linux-mips,
	virtualization, linux-arch, linux-mm, linux-trace-kernel,
	kernel-team
In-Reply-To: <20260331102704.921355-1-usama.arif@linux.dev>

On Tue, Mar 31, 2026 at 03:27:03AM -0700, Usama Arif wrote:
> On Thu, 26 Mar 2026 15:09:59 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:
> 
> > The existing contention_begin/contention_end tracepoints fire on the
> > waiter side. The lock holder's identity and stack can be captured at
> > contention_begin time (e.g. perf lock contention --lock-owner), but
> > this reflects the holder's state when a waiter arrives, not when the
> > lock is actually released.
> > 
> > This series adds a contended_release tracepoint that fires on the
> > holder side when a lock with waiters is released. This provides:
> > 
> > - Hold time estimation: when the holder's own acquisition was
> >   contended, its contention_end (acquisition) and contended_release
> >   can be correlated to measure how long the lock was held under
> >   contention.
> > 
> > - The holder's stack at release time, which may differ from what perf lock
> >   contention --lock-owner captures if the holder does significant work between
> >   the waiter's arrival and the unlock.
> > 
> > Note: for reader/writer locks, the tracepoint fires for every reader
> > releasing while a writer is waiting, not only for the last reader.
> > 
> 
> Would it be better to reorder the patches? It would help with git
> bisectability as well. Move the refactoring work in patch 4 and
> 5 (excluding adding the tracepoints of course) earlier, and then add
> all the tracepoints in the same commit at the end? It would help
> in the future with git blame to see where all the tracepoints
> were added as well.

Thanks for the suggestion, Usama.

I'd prefer to keep the current ordering: each refactoring commit is
immediately followed by the commit that uses it. For example,
queued_spin_release() is factored out right before the commit that adds
the tracepoint to spinning locks. This makes the motivation for the
refactoring clear and should also ease the review since the context is
still fresh.

Bisectability should be fine as-is, each commit compiles and works
independently, since the refactoring patches do not introduce behavioral
changes on their own.

^ permalink raw reply

* Re: [PATCH v4 3/5] locking: Add contended_release tracepoint to sleepable locks
From: Dmitry Ilvokhin @ 2026-03-31 12:16 UTC (permalink / raw)
  To: Usama Arif
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-kernel, linux-mips,
	virtualization, linux-arch, linux-mm, linux-trace-kernel,
	kernel-team
In-Reply-To: <20260331103451.1070175-1-usama.arif@linux.dev>

On Tue, Mar 31, 2026 at 03:34:50AM -0700, Usama Arif wrote:
> On Thu, 26 Mar 2026 15:10:02 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:
> 
> > Add the contended_release trace event. This tracepoint fires on the
> > holder side when a contended lock is released, complementing the
> > existing contention_begin/contention_end tracepoints which fire on the
> > waiter side.
> > 
> > This enables correlating lock hold time under contention with waiter
> > events by lock address.
> > 
> > Add trace_contended_release() calls to the slowpath unlock paths of
> > sleepable locks: mutex, rtmutex, semaphore, rwsem, percpu-rwsem, and
> > RT-specific rwbase locks.
> > 
> > Where possible, trace_contended_release() fires before the lock is
> > released and before the waiter is woken. For some lock types, the
> > tracepoint fires after the release but before the wake. Making the
> > placement consistent across all lock types is not worth the added
> > complexity.
> > 
> > For reader/writer locks, the tracepoint fires for every reader releasing
> > while a writer is waiting, not only for the last reader.
> > 
> > Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
> > ---
> >  include/trace/events/lock.h   | 17 +++++++++++++++++
> >  kernel/locking/mutex.c        |  4 ++++
> >  kernel/locking/percpu-rwsem.c | 11 +++++++++++
> >  kernel/locking/rtmutex.c      |  1 +
> >  kernel/locking/rwbase_rt.c    |  6 ++++++
> >  kernel/locking/rwsem.c        | 10 ++++++++--
> >  kernel/locking/semaphore.c    |  4 ++++
> >  7 files changed, 51 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
> > index da978f2afb45..1ded869cd619 100644
> > --- a/include/trace/events/lock.h
> > +++ b/include/trace/events/lock.h
> > @@ -137,6 +137,23 @@ TRACE_EVENT(contention_end,
> >  	TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret)
> >  );
> >  
> > +TRACE_EVENT(contended_release,
> > +
> > +	TP_PROTO(void *lock),
> > +
> > +	TP_ARGS(lock),
> > +
> > +	TP_STRUCT__entry(
> > +		__field(void *, lock_addr)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__entry->lock_addr = lock;
> > +	),
> > +
> > +	TP_printk("%p", __entry->lock_addr)
> > +);
> > +
> >  #endif /* _TRACE_LOCK_H */
> >  
> >  /* This part must be outside protection */
> > diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> > index 427187ff02db..6c2c9312eb8f 100644
> > --- a/kernel/locking/mutex.c
> > +++ b/kernel/locking/mutex.c
> > @@ -997,6 +997,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
> >  		wake_q_add(&wake_q, next);
> >  	}
> >  
> > +	if (trace_contended_release_enabled() && waiter)
> > +		trace_contended_release(lock);
> > +
> 
> This won't compile? waiter is declared in the if block, so you are using
> it outside scope here.
>

Thanks for the feedback, Usama.

waiter is declared at function scope, right on top. It's also assigned
before the if block, so it's still in scope at the tracepoint.

^ permalink raw reply

* Re: [PATCH] tracing: Move snapshot code out of trace.c and into trace_snapshot.c
From: Arnd Bergmann @ 2026-03-31 10:53 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: kernel test robot, LKML, Linux trace kernel, llvm, oe-kbuild-all,
	Masami Hiramatsu, Mathieu Desnoyers
In-Reply-To: <20260330120516.73aede9b@gandalf.local.home>

On Mon, Mar 30, 2026, at 18:05, Steven Rostedt wrote:
> On Mon, 30 Mar 2026 16:06:44 +0200 "Arnd Bergmann" <arnd@arndb.de> wrote:
>
>> I saw the same thing and worked around it by removing the function.
>> I then noticed that a bunch of code surrounding it is also unused
>> and I removed that as well (see below). This version passes
>> my randconfig build tests, but I suspect it is still wrong,
>> since the code never had any callers and I don't understand
>> why.
>
> Note, this code is in include/linux/tracing_printk.h, and is for debugging
> purposes (just like trace_printk() is). Hence, it shouldn't be removed.
>
> The purpose is to call tracing_snapshot() when your code detects something
> isn't right (but it doesn't crash), and this will take a snapshot of the
> current trace that led up to the anomaly.

Right, I assumed it had to be something like that, just didn't immediately
see what. I've sent a patch to just remove the duplicate inline
function now.

> If anything, I should add more to Documentation/trace/debugging.rst about it.

Or maybe a samples module that serves to show how the interface
gets used?

      Arnd

^ permalink raw reply

* [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Nam Cao @ 2026-03-31 10:49 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Nam Cao

Since commit 0c43094f8cc9 ("eventpoll: Replace rwlock with spinlock"),
epoll_wait is a real-time-safe syscall for sleeping.

Add epoll_wait to the list of rt-safe sleeping APIs.

Signed-off-by: Nam Cao <namcao@linutronix.de>
---
 kernel/trace/rv/monitors/sleep/sleep.c    |  8 ++
 kernel/trace/rv/monitors/sleep/sleep.h    | 98 ++++++++++++-----------
 tools/verification/models/rtapp/sleep.ltl |  1 +
 3 files changed, 61 insertions(+), 46 deletions(-)

diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index c1347da69e9d..59091863c17c 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -49,6 +49,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo
 		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
 		ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
 		ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+		ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
 		ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
 		ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
 	}
@@ -63,6 +64,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo
 		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
 		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
 		ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
+		ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
 
 		if (strstarts(task->comm, "migration/"))
 			ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true);
@@ -162,6 +164,11 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
 			break;
 		}
 		break;
+#ifdef __NR_epoll_wait
+	case __NR_epoll_wait:
+		ltl_atom_set(mon, LTL_EPOLL_WAIT, true);
+		break;
+#endif
 	}
 }
 
@@ -174,6 +181,7 @@ static void handle_sys_exit(void *data, struct pt_regs *regs, long ret)
 	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
 	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
 	ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+	ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
 	ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
 }
 
diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h
index 2ab46fd218d2..95dc2727c059 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.h
+++ b/kernel/trace/rv/monitors/sleep/sleep.h
@@ -15,6 +15,7 @@ enum ltl_atom {
 	LTL_ABORT_SLEEP,
 	LTL_BLOCK_ON_RT_MUTEX,
 	LTL_CLOCK_NANOSLEEP,
+	LTL_EPOLL_WAIT,
 	LTL_FUTEX_LOCK_PI,
 	LTL_FUTEX_WAIT,
 	LTL_KERNEL_THREAD,
@@ -40,6 +41,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
 		"ab_sl",
 		"bl_on_rt_mu",
 		"cl_na",
+		"ep_wa",
 		"fu_lo_pi",
 		"fu_wa",
 		"ker_th",
@@ -75,39 +77,41 @@ static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
 
 static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
 {
-	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
-	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
-	bool val40 = task_is_rcu || task_is_migration;
-	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
-	bool val41 = futex_lock_pi || val40;
-	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
-	bool val5 = block_on_rt_mutex || val41;
-	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
-	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
-	bool val32 = abort_sleep || kthread_should_stop;
 	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
-	bool val33 = woken_by_nmi || val32;
 	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
-	bool val34 = woken_by_hardirq || val33;
 	bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
 	     mon->atoms);
-	bool val14 = woken_by_equal_or_higher_prio || val34;
 	bool wake = test_bit(LTL_WAKE, mon->atoms);
-	bool val13 = !wake;
-	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+	bool rt = test_bit(LTL_RT, mon->atoms);
+	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
 	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
 	bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
-	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
-	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
-	bool val25 = nanosleep_timer_abstime && val24;
-	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
-	bool val18 = clock_nanosleep && val25;
+	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
 	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
-	bool val9 = futex_wait || val18;
+	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
+	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+	bool val42 = task_is_rcu || task_is_migration;
+	bool val43 = futex_lock_pi || val42;
+	bool val5 = block_on_rt_mutex || val43;
+	bool val34 = abort_sleep || kthread_should_stop;
+	bool val35 = woken_by_nmi || val34;
+	bool val36 = woken_by_hardirq || val35;
+	bool val14 = woken_by_equal_or_higher_prio || val36;
+	bool val13 = !wake;
+	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+	bool val27 = nanosleep_timer_abstime && val26;
+	bool val18 = clock_nanosleep && val27;
+	bool val20 = val18 || epoll_wait;
+	bool val9 = futex_wait || val20;
 	bool val11 = val9 || kernel_thread;
-	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
 	bool val2 = !sleep;
-	bool rt = test_bit(LTL_RT, mon->atoms);
 	bool val1 = !rt;
 	bool val3 = val1 || val2;
 
@@ -124,39 +128,41 @@ static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
 static void
 ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
 {
-	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
-	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
-	bool val40 = task_is_rcu || task_is_migration;
-	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
-	bool val41 = futex_lock_pi || val40;
-	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
-	bool val5 = block_on_rt_mutex || val41;
-	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
-	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
-	bool val32 = abort_sleep || kthread_should_stop;
 	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
-	bool val33 = woken_by_nmi || val32;
 	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
-	bool val34 = woken_by_hardirq || val33;
 	bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
 	     mon->atoms);
-	bool val14 = woken_by_equal_or_higher_prio || val34;
 	bool wake = test_bit(LTL_WAKE, mon->atoms);
-	bool val13 = !wake;
-	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+	bool rt = test_bit(LTL_RT, mon->atoms);
+	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
 	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
 	bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
-	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
-	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
-	bool val25 = nanosleep_timer_abstime && val24;
-	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
-	bool val18 = clock_nanosleep && val25;
+	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
 	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
-	bool val9 = futex_wait || val18;
+	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
+	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+	bool val42 = task_is_rcu || task_is_migration;
+	bool val43 = futex_lock_pi || val42;
+	bool val5 = block_on_rt_mutex || val43;
+	bool val34 = abort_sleep || kthread_should_stop;
+	bool val35 = woken_by_nmi || val34;
+	bool val36 = woken_by_hardirq || val35;
+	bool val14 = woken_by_equal_or_higher_prio || val36;
+	bool val13 = !wake;
+	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+	bool val27 = nanosleep_timer_abstime && val26;
+	bool val18 = clock_nanosleep && val27;
+	bool val20 = val18 || epoll_wait;
+	bool val9 = futex_wait || val20;
 	bool val11 = val9 || kernel_thread;
-	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
 	bool val2 = !sleep;
-	bool rt = test_bit(LTL_RT, mon->atoms);
 	bool val1 = !rt;
 	bool val3 = val1 || val2;
 
diff --git a/tools/verification/models/rtapp/sleep.ltl b/tools/verification/models/rtapp/sleep.ltl
index 6379bbeb6212..6f26c4810f78 100644
--- a/tools/verification/models/rtapp/sleep.ltl
+++ b/tools/verification/models/rtapp/sleep.ltl
@@ -5,6 +5,7 @@ RT_FRIENDLY_SLEEP = (RT_VALID_SLEEP_REASON or KERNEL_THREAD)
 
 RT_VALID_SLEEP_REASON = FUTEX_WAIT
                      or RT_FRIENDLY_NANOSLEEP
+                     or EPOLL_WAIT
 
 RT_FRIENDLY_NANOSLEEP = CLOCK_NANOSLEEP
                     and NANOSLEEP_TIMER_ABSTIME
-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH v4 3/5] locking: Add contended_release tracepoint to sleepable locks
From: Usama Arif @ 2026-03-31 10:34 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Usama Arif, Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng,
	Waiman Long, Thomas Bogendoerfer, Juergen Gross, Ajay Kaher,
	Alexey Makhalov, Broadcom internal kernel review list,
	Thomas Gleixner, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Arnd Bergmann, Dennis Zhou, Tejun Heo,
	Christoph Lameter, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel, linux-mips, virtualization,
	linux-arch, linux-mm, linux-trace-kernel, kernel-team
In-Reply-To: <d2e5763278812499335b22a013aafb4979e3324b.1774536681.git.d@ilvokhin.com>

On Thu, 26 Mar 2026 15:10:02 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:

> Add the contended_release trace event. This tracepoint fires on the
> holder side when a contended lock is released, complementing the
> existing contention_begin/contention_end tracepoints which fire on the
> waiter side.
> 
> This enables correlating lock hold time under contention with waiter
> events by lock address.
> 
> Add trace_contended_release() calls to the slowpath unlock paths of
> sleepable locks: mutex, rtmutex, semaphore, rwsem, percpu-rwsem, and
> RT-specific rwbase locks.
> 
> Where possible, trace_contended_release() fires before the lock is
> released and before the waiter is woken. For some lock types, the
> tracepoint fires after the release but before the wake. Making the
> placement consistent across all lock types is not worth the added
> complexity.
> 
> For reader/writer locks, the tracepoint fires for every reader releasing
> while a writer is waiting, not only for the last reader.
> 
> Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
> ---
>  include/trace/events/lock.h   | 17 +++++++++++++++++
>  kernel/locking/mutex.c        |  4 ++++
>  kernel/locking/percpu-rwsem.c | 11 +++++++++++
>  kernel/locking/rtmutex.c      |  1 +
>  kernel/locking/rwbase_rt.c    |  6 ++++++
>  kernel/locking/rwsem.c        | 10 ++++++++--
>  kernel/locking/semaphore.c    |  4 ++++
>  7 files changed, 51 insertions(+), 2 deletions(-)
> 
> diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
> index da978f2afb45..1ded869cd619 100644
> --- a/include/trace/events/lock.h
> +++ b/include/trace/events/lock.h
> @@ -137,6 +137,23 @@ TRACE_EVENT(contention_end,
>  	TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret)
>  );
>  
> +TRACE_EVENT(contended_release,
> +
> +	TP_PROTO(void *lock),
> +
> +	TP_ARGS(lock),
> +
> +	TP_STRUCT__entry(
> +		__field(void *, lock_addr)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->lock_addr = lock;
> +	),
> +
> +	TP_printk("%p", __entry->lock_addr)
> +);
> +
>  #endif /* _TRACE_LOCK_H */
>  
>  /* This part must be outside protection */
> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> index 427187ff02db..6c2c9312eb8f 100644
> --- a/kernel/locking/mutex.c
> +++ b/kernel/locking/mutex.c
> @@ -997,6 +997,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
>  		wake_q_add(&wake_q, next);
>  	}
>  
> +	if (trace_contended_release_enabled() && waiter)
> +		trace_contended_release(lock);
> +

This won't compile? waiter is declared in the if block, so you are using
it outside scope here.


^ permalink raw reply

* [PATCH] tracing: remove duplicate latency_fsnotify() stub
From: Arnd Bergmann @ 2026-03-31 10:30 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Arnd Bergmann, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel

From: Arnd Bergmann <arnd@arndb.de>

During the move, an extra copy of latency_fsnotify() crept in:

kernel/trace/trace_snapshot.c:395:20: error: redefinition of 'latency_fsnotify'
  395 | static inline void latency_fsnotify(struct trace_array *tr) { }
      |                    ^~~~~~~~~~~~~~~~
In file included from kernel/trace/trace_snapshot.c:6:
kernel/trace/trace.h:858:20: note: previous definition of 'latency_fsnotify' with type 'void(struct trace_array *)'
  858 | static inline void latency_fsnotify(struct trace_array *tr) { }
      |                    ^~~~~~~~~~~~~~~~

The function is still called from the hwlat and osnoise tracers, so the
copy in the header file is the one that has to stay.

Remove the extra one from trace_snapshot.c.

Fixes: bade44fe5462 ("tracing: Move snapshot code out of trace.c and into trace_snapshot.c")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
Please fold into the commit that moves the code, if possible
---
 kernel/trace/trace_snapshot.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/trace/trace_snapshot.c b/kernel/trace/trace_snapshot.c
index 8865b2ef2264..a54e9533e79d 100644
--- a/kernel/trace/trace_snapshot.c
+++ b/kernel/trace/trace_snapshot.c
@@ -391,8 +391,6 @@ void latency_fsnotify(struct trace_array *tr)
 	 */
 	irq_work_queue(&tr->fsnotify_irqwork);
 }
-#else
-static inline void latency_fsnotify(struct trace_array *tr) { }
 #endif /* LATENCY_FS_NOTIFY */
 static const struct file_operations tracing_max_lat_fops;
 
-- 
2.39.5


^ permalink raw reply related

* Re: [PATCH v4 0/5] locking: contended_release tracepoint instrumentation
From: Usama Arif @ 2026-03-31 10:27 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Usama Arif, Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng,
	Waiman Long, Thomas Bogendoerfer, Juergen Gross, Ajay Kaher,
	Alexey Makhalov, Broadcom internal kernel review list,
	Thomas Gleixner, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Arnd Bergmann, Dennis Zhou, Tejun Heo,
	Christoph Lameter, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel, linux-mips, virtualization,
	linux-arch, linux-mm, linux-trace-kernel, kernel-team
In-Reply-To: <cover.1774536681.git.d@ilvokhin.com>

On Thu, 26 Mar 2026 15:09:59 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:

> The existing contention_begin/contention_end tracepoints fire on the
> waiter side. The lock holder's identity and stack can be captured at
> contention_begin time (e.g. perf lock contention --lock-owner), but
> this reflects the holder's state when a waiter arrives, not when the
> lock is actually released.
> 
> This series adds a contended_release tracepoint that fires on the
> holder side when a lock with waiters is released. This provides:
> 
> - Hold time estimation: when the holder's own acquisition was
>   contended, its contention_end (acquisition) and contended_release
>   can be correlated to measure how long the lock was held under
>   contention.
> 
> - The holder's stack at release time, which may differ from what perf lock
>   contention --lock-owner captures if the holder does significant work between
>   the waiter's arrival and the unlock.
> 
> Note: for reader/writer locks, the tracepoint fires for every reader
> releasing while a writer is waiting, not only for the last reader.
> 

Would it be better to reorder the patches? It would help with git
bisectability as well. Move the refactoring work in patch 4 and
5 (excluding adding the tracepoints of course) earlier, and then add
all the tracepoints in the same commit at the end? It would help
in the future with git blame to see where all the tracepoints
were added as well.

^ permalink raw reply

* Re: [PATCH v2] bootconfig: Apply early options from embedded config
From: Kiryl Shutsemau @ 2026-03-31 10:18 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Breno Leitao, Jonathan Corbet, Shuah Khan, linux-kernel,
	linux-trace-kernel, linux-doc, oss, paulmck, rostedt, kernel-team
In-Reply-To: <20260325232204.05edbb21c7602b6408ca007b@kernel.org>

On Wed, Mar 25, 2026 at 11:22:04PM +0900, Masami Hiramatsu wrote:
> > diff --git a/init/main.c b/init/main.c
> > index 453ac9dff2da0..14a04c283fa48 100644
> > --- a/init/main.c
> > +++ b/init/main.c
> > @@ -416,9 +416,64 @@ static int __init warn_bootconfig(char *str)
> >  	return 0;
> >  }
> >  
> > +/*
> > + * do_early_param() is defined later in this file but called from
> > + * bootconfig_apply_early_params() below, so we need a forward declaration.
> > + */
> > +static int __init do_early_param(char *param, char *val,
> > +				 const char *unused, void *arg);
> > +
> > +/*
> > + * bootconfig_apply_early_params - dispatch kernel.* keys from the embedded
> > + * bootconfig as early_param() calls.
> > + *
> > + * early_param() handlers must run before most of the kernel initialises
> > + * (e.g. before the GIC driver reads irqchip.gicv3_pseudo_nmi).  A bootconfig
> > + * attached to the initrd arrives too late for this because the initrd is not
> > + * mapped yet when early params are processed.  The embedded bootconfig lives
> > + * in the kernel image itself (.init.data), so it is always reachable.
> > + *
> > + * This function is called from setup_boot_config() which runs in
> > + * start_kernel() before parse_early_param(), making the timing correct.
> > + */
> > +static void __init bootconfig_apply_early_params(void)
> 
> [sashiko comment]
> | Does this run early enough for architectural parameters?
> | While setup_boot_config() runs before parse_early_param() in start_kernel(),
> | it runs after setup_arch(). setup_boot_config() relies on xbc_init() which
> | uses the memblock allocator, requiring setup_arch() to have already
> | initialized it.
> | However, the kernel expects many early parameters (like mem=, earlycon,
> | noapic, and iommu) to be parsed during setup_arch() via the architecture's
> | call to parse_early_param(). Since setup_arch() completes before
> | setup_boot_config() runs, will these architectural early parameters be
> | silently ignored because the decisions they influence were already
> | finalized?
> 
> This is the major reason that I did not support early parameter
> in bootconfig. Some archs initialize kernel_cmdline in setup_arch()
> and setup early parameters in it.
> To fix this, we need to change setup_arch() for each architecture so
> that it calls this bootconfig_apply_early_params().

Hi Masami,

I have a question about bootconfig design. Is it necessary to parse the
bootconfig at boot time?

We could avoid a lot of complexity if we flattened the bootconfig into a
simple key-value list before embedding it in the kernel image or
attaching it to the initrd. That would eliminate the need for memory
allocations and allow the config to be used earlier during boot.

What am I missing?

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

^ permalink raw reply

* Re: [PATCH v4 1/5] tracing/lock: Remove unnecessary linux/sched.h include
From: Usama Arif @ 2026-03-31 10:11 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Usama Arif, Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng,
	Waiman Long, Thomas Bogendoerfer, Juergen Gross, Ajay Kaher,
	Alexey Makhalov, Broadcom internal kernel review list,
	Thomas Gleixner, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Arnd Bergmann, Dennis Zhou, Tejun Heo,
	Christoph Lameter, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel, linux-mips, virtualization,
	linux-arch, linux-mm, linux-trace-kernel, kernel-team
In-Reply-To: <5593ed9718b1a6e4ec51d99772c485734029d4d4.1774536681.git.d@ilvokhin.com>

On Thu, 26 Mar 2026 15:10:00 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:

> None of the trace events in lock.h reference anything from
> linux/sched.h. Remove the unnecessary include.
> 
> Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
> ---
>  include/trace/events/lock.h | 1 -
>  1 file changed, 1 deletion(-)
> 

Acked-by: Usama Arif <usama.arif@linux.dev>

^ permalink raw reply

* Re: [PATCH v8 3/6] tracefs: Check file permission even if user has CAP_DAC_OVERRIDE
From: Masami Hiramatsu @ 2026-03-31  9:24 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel
In-Reply-To: <20260305123314.41ab284d@gandalf.local.home>

On Thu, 5 Mar 2026 12:33:14 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Thu, 12 Feb 2026 15:15:15 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > > With this still not working this late in the game, it will need to wait
> > > until the next merge window. I'll take the first two patches of this
> > > series now though.  
> > 
> > OK. I will send the next version without the first 2 patches.
> 
> Hi Masami,
> 
> Did you send a new version of this yet?

Oops, I missed this reply. Let me update it.

Thanks,

> 
> -- Steve


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH] tracing: always provide a prototype for tracing_alloc_snapshot()
From: Masami Hiramatsu @ 2026-03-31  9:20 UTC (permalink / raw)
  To: Bartosz Golaszewski
  Cc: Steven Rostedt, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260331082001.31345-1-bartosz.golaszewski@oss.qualcomm.com>

On Tue, 31 Mar 2026 10:20:01 +0200
Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com> wrote:

> The tracing_alloc_snapshot() symbol is always exported even with
> !CONFIG_TRACER_SNAPSHOT so the prototype too must be always visible or
> we'll see the following warning:
> 
> kernel/trace/trace.c:820:5: warning: no previous prototype for ‘tracing_alloc_snapshot’ [-Wmissing-prototypes]
>   820 | int tracing_alloc_snapshot(void)
>       |     ^~~~~~~~~~~~~~~~~~~~~~
> 
> Fixes: bade44fe5462 ("tracing: Move snapshot code out of trace.c and into trace_snapshot.c")
> Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>

Good catch!

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks!

> ---
>  kernel/trace/trace.h | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 6abd9e16ef21..e8612b8b0a34 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -2275,6 +2275,8 @@ static inline void __init trace_event_init(void) { }
>  static inline void trace_event_update_all(struct trace_eval_map **map, int len) { }
>  #endif
>  
> +int tracing_alloc_snapshot(void);
> +
>  #ifdef CONFIG_TRACER_SNAPSHOT
>  extern const struct file_operations snapshot_fops;
>  extern const struct file_operations snapshot_raw_fops;
> @@ -2282,7 +2284,6 @@ extern const struct file_operations snapshot_raw_fops;
>  /* Used when creating instances */
>  int trace_allocate_snapshot(struct trace_array *tr, int size);
>  
> -int tracing_alloc_snapshot(void);
>  void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
>  int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);
>  int tracing_snapshot_cond_disable(struct trace_array *tr);
> -- 
> 2.47.3
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH] blktrace: reject buf_size smaller than blk_io_trace
From: Deepanshu Kartikey @ 2026-03-31  8:47 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: linux-block, linux-kernel, linux-trace-kernel,
	syzbot+ed8bc247f231c1a48e21
In-Reply-To: <20260322051838.1137822-1-kartikey406@gmail.com>

On Sun, Mar 22, 2026 at 10:48 AM Deepanshu Kartikey
<kartikey406@gmail.com> wrote:
>
> blk_trace_setup() accepts any non-zero buf_size.
> If buf_size < sizeof(struct blk_io_trace), relay_reserve()
> always returns NULL and all trace events are silently dropped.
>
> Reject such values early with -EINVAL.
>
> Reported-by: syzbot+ed8bc247f231c1a48e21@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=ed8bc247f231c1a48e21
> Signed-off-by: Deepanshu Kartikey <Kartikey406@gmail.com>
> ---
>  kernel/trace/blktrace.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
> index 8cd2520b4c99..6cc7d83ed1c2 100644
> --- a/kernel/trace/blktrace.c
> +++ b/kernel/trace/blktrace.c
> @@ -773,7 +773,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
>         if (ret)
>                 return -EFAULT;
>
> -       if (!buts.buf_size || !buts.buf_nr)
> +       if (buts.buf_size < sizeof(struct blk_io_trace) || !buts.buf_nr)
>                 return -EINVAL;
>
>         buts2 = (struct blk_user_trace_setup2) {
> --
> 2.43.0
>

Gentle ping on this patch. Let me know if anything else is required.

Thanks

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox