Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH 05/13] tracing/remotes: Add printk tracefs file
From: Vincent Donnefort @ 2026-06-02 17:11 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260602171146.2238998-1-vdonnefort@google.com>

When enabled, the printk tracefs file redirects all events to dmesg.
This is similar to tp_printk.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index 21583fae1bd9..1bf0ba159c92 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -21,6 +21,7 @@
 enum tri_type {
 	TRI_CONSUMING,
 	TRI_NONCONSUMING,
+	TRI_PRINTK,
 };
 
 struct trace_remote_iterator {
@@ -42,6 +43,7 @@ struct trace_remote {
 	void				*priv;
 	struct trace_buffer		*trace_buffer;
 	struct trace_buffer_desc	*trace_buffer_desc;
+	struct trace_remote_iterator	*printk;
 	struct dentry			*dentry;
 	struct eventfs_inode		*eventfs_root;
 	struct eventfs_inode		*eventfs_subdir;
@@ -335,6 +337,8 @@ static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
 	return 0;
 }
 
+static void trace_remote_do_printk(struct trace_remote *remote);
+
 static void __poll_remote(struct work_struct *work)
 {
 	struct delayed_work *dwork = to_delayed_work(work);
@@ -342,6 +346,7 @@ static void __poll_remote(struct work_struct *work)
 
 	remote = container_of(dwork, struct trace_remote, poll_work);
 	ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
+	trace_remote_do_printk(remote);
 
 	schedule_delayed_work(dwork, msecs_to_jiffies(remote->poll_ms));
 }
@@ -351,6 +356,8 @@ static void trace_remote_inc_poll(struct trace_remote *remote)
 	/* poll_cnt <= nr_readers, inherits its overflow protection */
 	if (!remote->poll_cnt++) {
 		ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
+		trace_remote_do_printk(remote);
+
 		schedule_delayed_work(&remote->poll_work, msecs_to_jiffies(remote->poll_ms));
 	}
 }
@@ -393,6 +400,14 @@ static struct trace_remote_iterator
 		trace_seq_init(&iter->seq);
 
 		switch (type) {
+		case TRI_PRINTK:
+			/* only one printk iter allowed */
+			if (WARN_ON_ONCE(remote->printk)) {
+				ret = -EBUSY;
+				break;
+			}
+			smp_store_release(&remote->printk, iter);
+			fallthrough;
 		case TRI_CONSUMING:
 			trace_remote_inc_poll(remote);
 			break;
@@ -427,6 +442,11 @@ static void trace_remote_iter_free(struct trace_remote_iterator *iter)
 	lockdep_assert_held(&remote->lock);
 
 	switch (iter->type) {
+	case TRI_PRINTK:
+		WARN_ON_ONCE(remote->printk != iter);
+		smp_store_release(&remote->printk, NULL);
+		flush_delayed_work(&remote->poll_work);
+		fallthrough;
 	case TRI_CONSUMING:
 		trace_remote_dec_poll(remote);
 		break;
@@ -504,6 +524,7 @@ __peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long
 	struct ring_buffer_iter *rb_iter;
 
 	switch (iter->type) {
+	case TRI_PRINTK:
 	case TRI_CONSUMING:
 		return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events);
 	case TRI_NONCONSUMING:
@@ -571,6 +592,7 @@ static void trace_remote_iter_move(struct trace_remote_iterator *iter)
 	struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
 
 	switch (iter->type) {
+	case TRI_PRINTK:
 	case TRI_CONSUMING:
 		ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL);
 		break;
@@ -814,6 +836,80 @@ static const struct file_operations trace_fops = {
 	.release	= trace_release,
 };
 
+static void trace_remote_do_printk(struct trace_remote *remote)
+{
+	struct trace_remote_iterator *iter = smp_load_acquire(&remote->printk);
+
+	if (!iter)
+		return;
+
+	trace_remote_iter_read_start(iter);
+
+	while (trace_remote_iter_read_event(iter)) {
+		trace_seq_init(&iter->seq);
+
+		trace_remote_iter_print_event(iter);
+		if (!pr_emerg("%s", iter->seq.buffer))
+			break;
+
+		trace_remote_iter_move(iter);
+	}
+
+	trace_remote_iter_read_finished(iter);
+}
+
+static int trace_remote_enable_printk(struct trace_remote *remote, bool enable)
+{
+	struct trace_remote_iterator *iter = remote->printk;
+
+	lockdep_assert_held(&remote->lock);
+
+	if (enable == !!iter)
+		return 0;
+
+	if (enable) {
+		iter = trace_remote_iter(remote, RING_BUFFER_ALL_CPUS, TRI_PRINTK);
+		if (IS_ERR(iter))
+			return PTR_ERR(iter);
+	} else {
+		trace_remote_iter_free(remote->printk);
+		/* trace_remote_iter_free has reset remote->printk */
+	}
+
+	return 0;
+}
+
+static ssize_t
+printk_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct seq_file *seq = filp->private_data;
+	struct trace_remote *remote = seq->private;
+	bool val;
+	int ret;
+
+	ret = kstrtobool_from_user(ubuf, cnt, &val);
+	if (ret)
+		return ret;
+
+	guard(mutex)(&remote->lock);
+
+	ret = trace_remote_enable_printk(remote, val);
+	if (ret)
+		return ret;
+
+	return cnt;
+}
+
+static int printk_show(struct seq_file *s, void *unused)
+{
+	struct trace_remote *remote = s->private;
+
+	seq_printf(s, "%d\n", !!remote->printk);
+
+	return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(printk);
+
 static struct dentry *tracefs_root;
 static DEFINE_MUTEX(tracefs_lock);
 static u64 tracefs_root_count;
@@ -858,6 +954,10 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
 	if (!d)
 		goto err;
 
+	d = trace_create_file("printk", TRACEFS_MODE_WRITE, remote_d, remote, &printk_fops);
+	if (!d)
+		goto err;
+
 	percpu_d = tracefs_create_dir("per_cpu", remote_d);
 	if (!percpu_d) {
 		pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name);
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related

* [PATCH 04/13] tracing/simple_ring_buffer: Add support for compressed length
From: Vincent Donnefort @ 2026-06-02 17:11 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260602171146.2238998-1-vdonnefort@google.com>

The array length is the total size in bytes of the data for the current
event. It is possible to compress this value into the event header type,
which has 28 unused types, saving 32 bits for sufficiently small events.

The compressed length is expressed as a multiple of the ring-buffer
alignment, 4 bytes by default. Enforce this alignment on the event length.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c
index f4642f5adda3..6be4aa19adca 100644
--- a/kernel/trace/simple_ring_buffer.c
+++ b/kernel/trace/simple_ring_buffer.c
@@ -207,7 +207,12 @@ static unsigned long rb_event_size(unsigned long length)
 {
 	struct ring_buffer_event *event;
 
-	return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]);
+	length = ALIGN(length, RB_ALIGNMENT);
+
+	if (length > RB_MAX_SMALL_DATA)
+		length += sizeof(event->array[0]);
+
+	return length + RB_EVNT_HDR_SIZE;
 }
 
 static struct ring_buffer_event *
@@ -223,12 +228,15 @@ rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta)
 static struct ring_buffer_event *
 simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp)
 {
-	unsigned long ts_ext_size = 0, event_size = rb_event_size(length);
 	struct simple_buffer_page *tail = cpu_buffer->tail_page;
+	unsigned long event_size, array_size, ts_ext_size = 0;
 	struct ring_buffer_event *event;
 	u32 write, prev_write;
 	u64 time_delta;
 
+	event_size = rb_event_size(length);
+	array_size = event_size - RB_EVNT_HDR_SIZE;
+
 	time_delta = timestamp - cpu_buffer->write_stamp;
 
 	if (test_time_stamp(time_delta))
@@ -259,9 +267,13 @@ simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long lengt
 		time_delta = 0;
 	}
 
-	event->type_len = 0;
+	if (length > RB_MAX_SMALL_DATA) {
+		event->type_len = 0;
+		event->array[0] = array_size;
+	} else {
+		event->type_len = DIV_ROUND_UP(array_size, RB_ALIGNMENT);
+	}
 	event->time_delta = time_delta;
-	event->array[0] = event_size - RB_EVNT_HDR_SIZE;
 
 	return event;
 }
@@ -284,7 +296,7 @@ void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned
 
 	rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp);
 
-	return &rb_event->array[1];
+	return rb_event->type_len ? &rb_event->array[0] : &rb_event->array[1];
 }
 EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve);
 
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related

* [PATCH 03/13] tracing/remotes: Use a single per-remote polling work
From: Vincent Donnefort @ 2026-06-02 17:11 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260602171146.2238998-1-vdonnefort@google.com>

Having a per-iterator polling work is wasteful when logging several
trace_remote per_cpu/trace_pipe files in parallel. This results in one
work running per-CPU, where only one would suffice.

Transition to a single per-remote polling work, scheduled on the first
consumer creation and stopped when the last consuming iterator is freed.

This blanket polls all CPUs, regardless of which ones are actually being
read. This is acceptable because the poll consists of reading the
meta-page, which is a fast operation. Also, it is more common to log all
CPUs in the system than only one, so this use-case should be favoured.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index 7fed18f28fa7..21583fae1bd9 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -26,7 +26,6 @@ enum tri_type {
 struct trace_remote_iterator {
 	struct trace_remote		*remote;
 	struct trace_seq		seq;
-	struct delayed_work		poll_work;
 	unsigned long			lost_events;
 	u64				ts;
 	struct ring_buffer_iter		*rb_iter;
@@ -55,6 +54,8 @@ struct trace_remote {
 	struct rw_semaphore		*pcpu_reader_locks;
 	unsigned int			nr_readers;
 	unsigned int			poll_ms;
+	struct delayed_work		poll_work;
+	unsigned int			poll_cnt;
 	bool				tracing_on;
 };
 
@@ -291,17 +292,6 @@ static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu)
 	return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0;
 }
 
-static void __poll_remote(struct work_struct *work)
-{
-	struct delayed_work *dwork = to_delayed_work(work);
-	struct trace_remote_iterator *iter;
-
-	iter = container_of(dwork, struct trace_remote_iterator, poll_work);
-	ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu);
-	schedule_delayed_work((struct delayed_work *)work,
-			      msecs_to_jiffies(iter->remote->poll_ms));
-}
-
 static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
 {
 	if (cpu != RING_BUFFER_ALL_CPUS) {
@@ -345,6 +335,36 @@ static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
 	return 0;
 }
 
+static void __poll_remote(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct trace_remote *remote;
+
+	remote = container_of(dwork, struct trace_remote, poll_work);
+	ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
+
+	schedule_delayed_work(dwork, msecs_to_jiffies(remote->poll_ms));
+}
+
+static void trace_remote_inc_poll(struct trace_remote *remote)
+{
+	/* poll_cnt <= nr_readers, inherits its overflow protection */
+	if (!remote->poll_cnt++) {
+		ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
+		schedule_delayed_work(&remote->poll_work, msecs_to_jiffies(remote->poll_ms));
+	}
+}
+
+static void trace_remote_dec_poll(struct trace_remote *remote)
+{
+	if (WARN_ON_ONCE(!remote->poll_cnt))
+		return;
+
+	remote->poll_cnt--;
+	if (!remote->poll_cnt)
+		cancel_delayed_work_sync(&remote->poll_work);
+}
+
 static struct trace_remote_iterator
 *trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type)
 {
@@ -374,9 +394,7 @@ static struct trace_remote_iterator
 
 		switch (type) {
 		case TRI_CONSUMING:
-			ring_buffer_poll_remote(remote->trace_buffer, cpu);
-			INIT_DELAYED_WORK(&iter->poll_work, __poll_remote);
-			schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms));
+			trace_remote_inc_poll(remote);
 			break;
 		case TRI_NONCONSUMING:
 			ret = __alloc_ring_buffer_iter(iter, cpu);
@@ -410,7 +428,7 @@ static void trace_remote_iter_free(struct trace_remote_iterator *iter)
 
 	switch (iter->type) {
 	case TRI_CONSUMING:
-		cancel_delayed_work_sync(&iter->poll_work);
+		trace_remote_dec_poll(remote);
 		break;
 	case TRI_NONCONSUMING:
 		__free_ring_buffer_iter(iter, iter->cpu);
@@ -939,6 +957,7 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs,
 	remote->poll_ms = 100;
 	mutex_init(&remote->lock);
 	init_rwsem(&remote->reader_lock);
+	INIT_DELAYED_WORK(&remote->poll_work, __poll_remote);
 
 	ret = trace_remote_init_tracefs(name, remote);
 	if (ret)
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related

* [PATCH 02/13] tracing/remotes: Use kstrtobool for boolean tracefs files
From: Vincent Donnefort @ 2026-06-02 17:11 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260602171146.2238998-1-vdonnefort@google.com>

Use kstrtobool in trace_remote.c where possible. This is more user-friendly
as it allows a better variety of input strings.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index cfe84e9b8fe6..7fed18f28fa7 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -176,10 +176,10 @@ tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t
 {
 	struct seq_file *seq = filp->private_data;
 	struct trace_remote *remote = seq->private;
-	unsigned long val;
+	bool val;
 	int ret;
 
-	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	ret = kstrtobool_from_user(ubuf, cnt, &val);
 	if (ret)
 		return ret;
 
@@ -1090,10 +1090,10 @@ static ssize_t remote_event_enable_write(struct file *filp, const char __user *u
 	struct seq_file *seq = filp->private_data;
 	struct remote_event *evt = seq->private;
 	struct trace_remote *remote = evt->remote;
-	u8 enable;
+	bool enable;
 	int ret;
 
-	ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+	ret = kstrtobool_from_user(ubuf, count, &enable);
 	if (ret)
 		return ret;
 
@@ -1174,10 +1174,10 @@ static ssize_t remote_events_dir_enable_write(struct file *filp, const char __us
 					      size_t count, loff_t *ppos)
 {
 	struct trace_remote *remote = file_inode(filp)->i_private;
+	bool enable;
 	int i, ret;
-	u8 enable;
 
-	ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+	ret = kstrtobool_from_user(ubuf, count, &enable);
 	if (ret)
 		return ret;
 
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related

* [PATCH 01/13] tracing/remotes: Release tracefs,eventfs on registration failure
From: Vincent Donnefort @ 2026-06-02 17:11 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260602171146.2238998-1-vdonnefort@google.com>

In trace_remote_register(), if registration of events or the init
callback fails, the created tracefs and eventfs directories are leaked.

Release the entire eventfs and tracefs hierarchy on trace_remote
registration failure.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index d6c3f94d67cd..cfe84e9b8fe6 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -44,7 +44,8 @@ struct trace_remote {
 	struct trace_buffer		*trace_buffer;
 	struct trace_buffer_desc	*trace_buffer_desc;
 	struct dentry			*dentry;
-	struct eventfs_inode		*eventfs;
+	struct eventfs_inode		*eventfs_root;
+	struct eventfs_inode		*eventfs_subdir;
 	struct remote_event		*events;
 	unsigned long			nr_events;
 	unsigned long			trace_buffer_size;
@@ -795,26 +796,28 @@ static const struct file_operations trace_fops = {
 	.release	= trace_release,
 };
 
+static struct dentry *tracefs_root;
+static DEFINE_MUTEX(tracefs_lock);
+static u64 tracefs_root_count;
+
 static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
 {
 	struct dentry *remote_d, *percpu_d, *d;
-	static struct dentry *root;
-	static DEFINE_MUTEX(lock);
 	bool root_inited = false;
 	int cpu;
 
-	guard(mutex)(&lock);
+	guard(mutex)(&tracefs_lock);
 
-	if (!root) {
-		root = tracefs_create_dir(TRACEFS_DIR, NULL);
-		if (!root) {
+	if (!tracefs_root) {
+		tracefs_root = tracefs_create_dir(TRACEFS_DIR, NULL);
+		if (!tracefs_root) {
 			pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n");
 			return -ENOMEM;
 		}
 		root_inited = true;
 	}
 
-	remote_d = tracefs_create_dir(name, root);
+	remote_d = tracefs_create_dir(name, tracefs_root);
 	if (!remote_d) {
 		pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name);
 		goto err;
@@ -866,14 +869,15 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
 			goto err;
 	}
 
+	tracefs_root_count++;
 	remote->dentry = remote_d;
 
 	return 0;
 
 err:
 	if (root_inited) {
-		tracefs_remove(root);
-		root = NULL;
+		tracefs_remove(tracefs_root);
+		tracefs_root = NULL;
 	} else {
 		tracefs_remove(remote_d);
 	}
@@ -881,8 +885,26 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
 	return -ENOMEM;
 }
 
+static void trace_remote_remove_tracefs(struct trace_remote *remote)
+{
+	guard(mutex)(&tracefs_lock);
+
+	if (!remote->dentry)
+		return;
+
+	tracefs_remove(remote->dentry);
+	remote->dentry = NULL;
+
+	tracefs_root_count--;
+	if (!tracefs_root_count) {
+		tracefs_remove(tracefs_root);
+		tracefs_root = NULL;
+	}
+}
+
 static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
 					struct remote_event *events, size_t nr_events);
+static void trace_remote_unregister_events(struct trace_remote *remote);
 
 /**
  * trace_remote_register() - Register a Tracefs remote
@@ -905,10 +927,9 @@ static int trace_remote_register_events(const char *remote_name, struct trace_re
 int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv,
 			  struct remote_event *events, size_t nr_events)
 {
-	struct trace_remote *remote;
+	struct trace_remote *remote __free(kfree) = kzalloc_obj(*remote);
 	int ret;
 
-	remote = kzalloc_obj(*remote);
 	if (!remote)
 		return -ENOMEM;
 
@@ -919,22 +940,30 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs,
 	mutex_init(&remote->lock);
 	init_rwsem(&remote->reader_lock);
 
-	if (trace_remote_init_tracefs(name, remote)) {
-		kfree(remote);
-		return -ENOMEM;
-	}
+	ret = trace_remote_init_tracefs(name, remote);
+	if (ret)
+		return ret;
 
 	ret = trace_remote_register_events(name, remote, events, nr_events);
 	if (ret) {
 		pr_err("Failed to register events for trace remote '%s' (%d)\n",
 		       name, ret);
-		return ret;
+		goto err_remove_tracefs;
 	}
 
 	ret = cbs->init ? cbs->init(remote->dentry, priv) : 0;
-	if (ret)
+	if (ret) {
 		pr_err("Init failed for trace remote '%s' (%d)\n", name, ret);
+		goto err_unregister_events;
+	}
+
+	no_free_ptr(remote);
+	return 0;
 
+err_unregister_events:
+	trace_remote_unregister_events(remote);
+err_remove_tracefs:
+	trace_remote_remove_tracefs(remote);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(trace_remote_register);
@@ -1267,7 +1296,6 @@ static int remote_events_dir_callback(const char *name, umode_t *mode, void **da
 static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote,
 				     struct remote_event *evt)
 {
-	struct eventfs_inode *eventfs = remote->eventfs;
 	static struct eventfs_entry dir_entries[] = {
 		{
 			.name		= "enable",
@@ -1292,35 +1320,37 @@ static int trace_remote_init_eventfs(const char *remote_name, struct trace_remot
 			.callback	= remote_event_callback,
 		}
 	};
-	bool eventfs_create = false;
+	struct eventfs_inode *eventfs_root, *eventfs_subdir, *e;
 
-	if (!eventfs) {
-		eventfs = eventfs_create_events_dir("events", remote->dentry, dir_entries,
-						    ARRAY_SIZE(dir_entries), remote);
-		if (IS_ERR(eventfs))
-			return PTR_ERR(eventfs);
+	eventfs_root = remote->eventfs_root;
+	eventfs_subdir = remote->eventfs_subdir;
+	if (!eventfs_root) {
+		eventfs_root = eventfs_create_events_dir("events", remote->dentry, dir_entries,
+							 ARRAY_SIZE(dir_entries), remote);
+		if (IS_ERR(eventfs_root))
+			return PTR_ERR(eventfs_root);
 
 		/*
 		 * Create similar hierarchy as local events even if a single system is supported at
 		 * the moment
 		 */
-		eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL);
-		if (IS_ERR(eventfs))
-			return PTR_ERR(eventfs);
-
-		remote->eventfs = eventfs;
-		eventfs_create = true;
+		eventfs_subdir = eventfs_create_dir(remote_name, eventfs_root, NULL, 0, NULL);
+		if (IS_ERR(eventfs_subdir)) {
+			eventfs_remove_events_dir(eventfs_root);
+			return PTR_ERR(eventfs_subdir);
+		}
 	}
 
-	eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt);
-	if (IS_ERR(eventfs)) {
-		if (eventfs_create) {
-			eventfs_remove_events_dir(remote->eventfs);
-			remote->eventfs = NULL;
-		}
-		return PTR_ERR(eventfs);
+	e = eventfs_create_dir(evt->name, eventfs_subdir, entries, ARRAY_SIZE(entries), evt);
+	if (IS_ERR(e)) {
+		if (!remote->eventfs_root)
+			eventfs_remove_events_dir(eventfs_root);
+		return PTR_ERR(e);
 	}
 
+	remote->eventfs_root = eventfs_root;
+	remote->eventfs_subdir = eventfs_subdir;
+
 	return 0;
 }
 
@@ -1335,11 +1365,11 @@ static int trace_remote_attach_events(struct trace_remote *remote, struct remote
 		if (evt->remote)
 			return -EEXIST;
 
-		evt->remote = remote;
-
 		/* We need events to be sorted for efficient lookup */
 		if (i && evt->id <= events[i - 1].id)
 			return -EINVAL;
+
+		evt->remote = remote;
 	}
 
 	remote->events = events;
@@ -1348,14 +1378,33 @@ static int trace_remote_attach_events(struct trace_remote *remote, struct remote
 	return 0;
 }
 
+static void trace_remote_detach_events(struct trace_remote *remote, struct remote_event *events,
+					size_t nr_events)
+{
+	int i;
+
+	for (i = 0; i < nr_events; i++) {
+		struct remote_event *evt = &events[i];
+
+		if (evt->remote == remote)
+			evt->remote = NULL;
+	}
+
+	remote->events = NULL;
+	remote->nr_events = 0;
+}
+
 static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
 					struct remote_event *events, size_t nr_events)
 {
 	int i, ret;
 
 	ret = trace_remote_attach_events(remote, events, nr_events);
-	if (ret)
+	if (ret) {
+		/* It is safe to call detach on a half-registered array */
+		trace_remote_detach_events(remote, events, nr_events);
 		return ret;
+	}
 
 	for (i = 0; i < nr_events; i++) {
 		struct remote_event *evt = &events[i];
@@ -1369,6 +1418,12 @@ static int trace_remote_register_events(const char *remote_name, struct trace_re
 	return 0;
 }
 
+static void trace_remote_unregister_events(struct trace_remote *remote)
+{
+	trace_remote_detach_events(remote, remote->events, remote->nr_events);
+	eventfs_remove_events_dir(remote->eventfs_root);
+}
+
 static int __cmp_events(const void *key, const void *data)
 {
 	const struct remote_event *evt = data;
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related

* [PATCH 00/13] tracing/remotes: Add printk, dump_on_oops and boot parameters
From: Vincent Donnefort @ 2026-06-02 17:11 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
  Cc: kernel-team, linux-kernel, Vincent Donnefort

This series extends the recently introduced trace remotes
infrastructure, bringing useful features for developers:

  * dump_on_oops: Dump the trace remote buffer on system panic.
  * printk: Redirect remote events to dmesg.
  * trace_remote=: Configure a trace_remote from the commandline.

It also brings a couple of optimisations:

  * Header compressed length support for small events.
  * Single work thread for remote polling.

And some misc improvements:

  * Use kstrtobool where possible
  * Fix trace remote unregistering

I didn't put a "Fixes:" tag on the commit which is fixing the remote
registration because I do not believe this is something any user can
trigger.

Vincent Donnefort (13):
  tracing/remotes: Release tracefs,eventfs on registration failure
  tracing/remotes: Use kstrtobool for boolean tracefs files
  tracing/remotes: Use a single per-remote polling work
  tracing/simple_ring_buffer: Add support for compressed length
  tracing/remotes: Add printk tracefs file
  tracing/remotes: selftests: Add a test for the printk tracefs file
  tracing/remotes: selftests: Prefix hypervisor folder
  ring-buffer: Add ring_buffer_read_remote_meta_page()
  tracing/remotes: Add dump_on_oops tracefs file
  tracing/remotes: selftests: Add a test for the dump_on_oops tracefs
    file
  Documentation: tracing/remotes: Add detailed tracefs layout
  tracing/remotes: Add trace_remote cmdline options
  Documentation/kernel-parameters: Add trace_remote

 .../admin-guide/kernel-parameters.txt         |  16 +
 Documentation/trace/remotes.rst               |  63 +-
 include/linux/ring_buffer.h                   |   1 +
 kernel/trace/ring_buffer.c                    |  53 ++
 kernel/trace/simple_ring_buffer.c             |  22 +-
 kernel/trace/trace_remote.c                   | 649 +++++++++++++++---
 .../buffer_size.tc                            |   0
 .../remotes/00hypervisor/dump_on_oops.tc      |  11 +
 .../{hypervisor => 00hypervisor}/hotplug.tc   |   0
 .../test.d/remotes/00hypervisor/printk.tc     |  11 +
 .../{hypervisor => 00hypervisor}/reset.tc     |   0
 .../{hypervisor => 00hypervisor}/trace.tc     |   0
 .../trace_pipe.tc                             |   0
 .../{hypervisor => 00hypervisor}/unloading.tc |   0
 .../ftrace/test.d/remotes/dump_on_oops.tc     |  51 ++
 .../selftests/ftrace/test.d/remotes/functions |   2 +
 .../selftests/ftrace/test.d/remotes/printk.tc |  72 ++
 17 files changed, 847 insertions(+), 104 deletions(-)
 rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/buffer_size.tc (100%)
 create mode 100644 tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/dump_on_oops.tc
 rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/hotplug.tc (100%)
 create mode 100644 tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/printk.tc
 rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/reset.tc (100%)
 rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/trace.tc (100%)
 rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/trace_pipe.tc (100%)
 rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/unloading.tc (100%)
 create mode 100644 tools/testing/selftests/ftrace/test.d/remotes/dump_on_oops.tc
 create mode 100644 tools/testing/selftests/ftrace/test.d/remotes/printk.tc


base-commit: e43ffb69e0438cddd72aaa30898b4dc446f664f8
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply

* Re: [PATCH mm-unstable v18 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Lance Yang @ 2026-06-02 16:34 UTC (permalink / raw)
  To: npache
  Cc: lance.yang, david, linux-doc, linux-kernel, linux-mm,
	linux-trace-kernel, aarcange, akpm, anshuman.khandual, apopple,
	baohua, baolin.wang, byungchul, catalin.marinas, cl, corbet,
	dave.hansen, dev.jain, gourry, hannes, hughd, jack, jackmanb,
	jannh, jglisse, joshua.hahnjy, kas, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe, usama.arif
In-Reply-To: <CAA1CXcD7peS3WHueVgAWhhRrjBO_1b19+Xc0CfZBSO8OwJJKQw@mail.gmail.com>


On Tue, Jun 02, 2026 at 09:30:06AM -0600, Nico Pache wrote:
>On Mon, Jun 1, 2026 at 4:48 AM Lance Yang <lance.yang@linux.dev> wrote:
>>
>>
>>
>> On 2026/6/1 18:23, David Hildenbrand (Arm) wrote:
>> > On 6/1/26 11:08, Lance Yang wrote:
>> >>
>> >>
>> >> On 2026/6/1 14:54, David Hildenbrand (Arm) wrote:
>> >>> On 6/1/26 05:28, Lance Yang wrote:
>> >>>>
>> >>>>
>> >>>> Ah, fair point.
>> >>>>
>> >>>> I was mostly worried about arch hooks that walk vma->vm_mm again, rather
>> >>>> than only using the pte pointer passed in. For example, mips does:
>> >>>
>> >>> Right, a re-walk would be the real problem.
>> >>>
>> >>>>
>> >>>>     update_mmu_cache_range()
>> >>>>       -> __update_tlb()
>> >>>>         -> pgd_offset(vma->vm_mm, address)
>> >>>>         -> pte_offset_map(...)
>> >>>>
>> >>>> and __update_tlb() has this assumption:
>> >>>>
>> >>>>          /*
>> >>>>           * update_mmu_cache() is called between pte_offset_map_lock()
>> >>>>           * and pte_unmap_unlock(), so we can assume that ptep is not
>> >>>>           * NULL here: and what should be done below if it were NULL?
>> >>>>           */
>> >>>>
>> >>>> So if khugepaged happens to run with current->active_mm == vma->vm_mm
>> >>>> here, could __update_tlb() hit the none PMD, get NULL from
>> >>>> pte_offset_map(), and then dereference it?
>> >>>
>> >>> Likely yes -- that MIPS code is horrible. And the comment in MIPS code
>> >>> even spells that out. :(
>> >>>
>> >>> Do you know about other code like that, or is MIPS the only one doing a
>> >>> re-walk and crossing fingers?
>> >>>
>> >>>>
>> >>>> Just wanted to raise it since some arch code may still have assumptions
>> >>>> like this, and the always-enable-mTHP work is getting closer ...
>> >>>
>> >>> Right. I assume set_pte_at() couldn't trigger something similar (re-walk) in
>> >>> arch code,
>> >>> because we simply provide the ptep. update_mmu_cache_range() only consumes the
>> >>> pte.
>> >>>
>> >>>>
>> >>>> Probably very very very hard to hit, though :)
>> >>>
>> >>> Delaying update_mmu_cache_range() is nasty, as we'd have to make sure that
>> >>> nobody can interfere in the meantime ... and the PMD lock will not be sufficient.
>> >>>
>> >>> Maybe we could reinstall the page table with the cleared (none) entries while
>> >>> still holding the PTL?
>> >>>
>> >>> Thinking out loud:
>> >>>
>> >>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> >>> index 5ba298d420b7..e39b750b1e6f 100644
>> >>> --- a/mm/khugepaged.c
>> >>> +++ b/mm/khugepaged.c
>> >>> @@ -1413,13 +1413,17 @@ static enum scan_result collapse_huge_page(struct
>> >>> mm_struct *mm, unsigned long s
>> >>>                   map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
>> >>>           } else {
>> >>>                   /*
>> >>> -                * set_ptes is called in map_anon_folio_pte_nopf with the
>> >>> -                * pmd_ptl lock still held; this is safe as the PMD is expected
>> >>> -                * to be none. The pmd entry is then repopulated below.
>> >>> +                * Re-insert the page table with the cleared entries, but
>> >>> +                * hold the PTL, such that no one can mess with the re-installed
>> >>> +                * page table until we updated the temporarily-cleared entries
>> >>> +                * through map_anon_folio_pte_nopf().
>> >>>                    */
>> >>> -               map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /
>> >>> *uffd_wp=*/ false);
>> >>> -               smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
>> >>
>> >> One small thing, I think we should probably keep the smp_wmb(), and just
>> >> move it before the earlier pmd_populate().
>> >>
>> >> IIUC, the ordering we want is still:
>> >>
>> >>    clear old PTEs
>> >>    smp_wmb()
>> >>    pmd_populate()
>> >>
>> >> so another CPU cannot walk through the re-installed PMD and still observe
>> >> the old PTEs, right?
>> >
>> > There is a smp_wmb() in __folio_mark_uptodate(), that should be sufficient?
>>
>> Ah, cool! __folio_mark_uptodate() already does the job :P
>>
>> So yeah, no extra smp_wmb() needed here!
>
>are we sure? that folio_mark_uptodate is done before the PTEs are
>reinstalled. Then we reinstall the PMD right after. Currently
>separated by the smp_wmb().

Reinstalling the PMD first makes the PTE table reachable again, right?

So before pmd_populate(), we only need to order the old PTE clears before
the PTE table is reachable again; __folio_mark_uptodate() already has the
smp_wmb() for that :)

The new PTEs are filled later under the PTL.

Hopefully I didn't miss something :)

>I was copying this from other THP code that performs similar PTE/PMD juggling.
>
>I can remove it, but I'd rather err on the side of caution with this.

Cheers, Lance

^ permalink raw reply

* Re: [syzbot] [trace?] KASAN: use-after-free Write in ring_buffer_read_page
From: Steven Rostedt @ 2026-06-02 16:28 UTC (permalink / raw)
  To: syzbot
  Cc: linux-kernel, linux-trace-kernel, mathieu.desnoyers, mhiramat,
	syzkaller-bugs
In-Reply-To: <6a1ede7b.b4221f80.1326c5.0003.GAE@google.com>

On Tue, 02 Jun 2026 06:45:31 -0700
syzbot <syzbot+2dd9d02f60775ce5c1fb@syzkaller.appspotmail.com> wrote:

> syzbot found the following issue on:
> 
> HEAD commit:    e7ae89a0c97c Linux 7.1-rc5
> git tree:       upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=16f06e2e580000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=58acee1ac5406016
> dashboard link: https://syzkaller.appspot.com/bug?extid=2dd9d02f60775ce5c1fb
> compiler:       gcc (Debian 14.2.0-19) 14.2.0, GNU ld (GNU Binutils for Debian) 2.44
> 
> Unfortunately, I don't have any reproducer for this issue yet.

Looks like the test was doing something really weird to trigger this.
Without a reproducer, it's pretty much impossible to find out what
happened. Maybe AI could do it?

-- Steve

^ permalink raw reply

* Re: [PATCH] rtla: Fix parsing of multi-character short options
From: John Kacur @ 2026-06-02 16:21 UTC (permalink / raw)
  To: Tomas Glozar; +Cc: linux-trace-kernel, Steven Rostedt, linux-kernel
In-Reply-To: <20260602125506.3325345-1-tglozar@redhat.com>

On Tue, 2 Jun 2026 14:55:06 +0200, Tomas Glozar wrote:
> rtla: Fix parsing of multi-character short options

Thanks for the fix! I've tested this patch with a comprehensive test suite
covering all rtla commands (timerlat hist/top, osnoise hist/top, hwnoise)
with the four option formats:
  -p 100        (short with space)
  -p100         (short attached - previously broken)
  --period=100  (long with equals)
  --period 100  (long with space)

All 20 tests pass. The fix correctly resolves the issue where -p100 was
being parsed as multiple separate options.

Tested-by: John Kacur <jkacur@redhat.com>

^ permalink raw reply

* Re: [PATCH v4 2/3] perf: enable unprivileged syscall tracing with perf trace
From: Anubhav Shelat @ 2026-06-02 16:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mpetlan, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	James Clark, Thomas Falcon, linux-kernel, linux-trace-kernel,
	linux-perf-users
In-Reply-To: <20260518214116.GZ3102624@noisy.programming.kicks-ass.net>

On Mon, May 18, 2026 at 5:41 PM Peter Zijlstra <peterz@infradead.org> wrote:
> Typically patches are supposed to do a single thing, you're listing 4
> things. What gives?
All four changes need to be made together to work properly. The second
point could be pulled out as a separate patch, but will be replaced
with the eventfs that Steve suggested. The other three points
represent a single logical change: selectively loosening the
perf_event_open() restrictions without exposing kernel data or
breaking uprobe functionality.

> PERF_SAMPLE_IP should be here too, no?
If PERF_SAMPLE_IP is added to the kaddr_leak mask it blocks uprobes,
so the PERF_SAMPLE_IP check is in the trace_event_perf.c changes where
I can exempt uprobes:
+       if ((p_event->attr.sample_type & PERF_SAMPLE_IP) &&
+           !p_event->attr.exclude_kernel &&
+           !(tp_event->flags & TRACE_EVENT_FL_UPROBE) &&
+           sysctl_perf_event_paranoid > 1 && !perfmon_capable())
+               return -EACCES;

> And I'm not sure if tracepoints can trigger it, but PHYS_ADDR also seems
> something we shouldn't allow.
There's a check for unprivileged access to PHYS_ADDR at core.c:13917
so I didn't add it to kaddr_leak.

> And we're sure RAW doesn't include pointers
PERF_SAMPLE_RAW for TRACE_EVENT_FL_CAP_ANY tracepoints doesn't include
kernel pointers.

> Again, you're doing the same thing in multiple places. If only there was
> something to re-use a previous expression.
>
> None of this gives me warm and fuzzy feelings.
You're right. I'll factor the checks out for the next version.

Anubhav


^ permalink raw reply

* [PATCH 2/2] rtla: Add tests for option parsing with attached arguments
From: John Kacur @ 2026-06-02 15:52 UTC (permalink / raw)
  To: linux-trace-kernel; +Cc: Steven Rostedt, Tomas Glozar, linux-kernel
In-Reply-To: <20260602155210.60439-1-jkacur@redhat.com>

Add tests to verify that short options with attached numeric arguments
work correctly for all rtla commands after the parsing fixes.

Tests verify four formats for each command:
- Short option with space: -p 100
- Short option attached: -p100
- Long option with equals: --period=100
- Long option with space: --period 100

For osnoise and hwnoise commands, the tests also include -r 100 (runtime)
to satisfy the osnoise constraint that runtime <= period.

These tests complement the existing timerlat hist tests added in commit
d489b602c669 ("rtla/timerlat: Add tests for option parsing with attached
arguments").

Assisted-by: Claude:claude-sonnet-4-5
Signed-off-by: John Kacur <jkacur@redhat.com>
---
 tools/tracing/rtla/tests/hwnoise.t  | 10 ++++++++++
 tools/tracing/rtla/tests/osnoise.t  | 18 ++++++++++++++++++
 tools/tracing/rtla/tests/timerlat.t |  8 ++++++++
 3 files changed, 36 insertions(+)

diff --git a/tools/tracing/rtla/tests/hwnoise.t b/tools/tracing/rtla/tests/hwnoise.t
index 23ce250a6852..c61f02bc42fe 100644
--- a/tools/tracing/rtla/tests/hwnoise.t
+++ b/tools/tracing/rtla/tests/hwnoise.t
@@ -19,4 +19,14 @@ check "enable a trace event trigger" \
 	"hwnoise -t -e osnoise:irq_noise --trigger=\"hist:key=desc,duration:sort=desc,duration:vals=hitcount\" -d 10s" \
 	0 "Saving event osnoise:irq_noise hist to osnoise_irq_noise_hist.txt"
 
+# Option parsing tests - verify attached numeric arguments work correctly
+check "verify -p with space" \
+	"hwnoise -p 100 -r 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify -p without space (attached argument)" \
+	"hwnoise -p100 -r 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify --period with equals" \
+	"hwnoise --period=100 -r 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify --period with space" \
+	"hwnoise --period 100 -r 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+
 test_end
diff --git a/tools/tracing/rtla/tests/osnoise.t b/tools/tracing/rtla/tests/osnoise.t
index 396334608920..1807236431df 100644
--- a/tools/tracing/rtla/tests/osnoise.t
+++ b/tools/tracing/rtla/tests/osnoise.t
@@ -17,6 +17,24 @@ check "verify the  --trace param" \
 check "verify the --entries/-E param" \
 	"osnoise hist -P F:1 -c 0 -r 900000 -d 10s -b 10 -E 25"
 
+# Option parsing tests - verify attached numeric arguments work correctly
+check "verify -p with space" \
+	"osnoise hist -p 100 -r 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify -p without space (attached argument)" \
+	"osnoise hist -p100 -r 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify --period with equals" \
+	"osnoise hist --period=100 -r 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify --period with space" \
+	"osnoise hist --period 100 -r 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify osnoise top -p with space" \
+	"osnoise top -p 100 -r 100 -c 0 -d 1s -q" 0 "" "no-irq and no-thread"
+check "verify osnoise top -p without space (attached argument)" \
+	"osnoise top -p100 -r 100 -c 0 -d 1s -q" 0 "" "no-irq and no-thread"
+check "verify osnoise top --period with equals" \
+	"osnoise top --period=100 -r 100 -c 0 -d 1s -q" 0 "" "no-irq and no-thread"
+check "verify osnoise top --period with space" \
+	"osnoise top --period 100 -r 100 -c 0 -d 1s -q" 0 "" "no-irq and no-thread"
+
 # Test setting default period by putting an absurdly high period
 # and stopping on threshold.
 # If default period is not set, this will time out.
diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index 1a63301f5d70..506227412027 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -51,6 +51,14 @@ check "verify --period with equals" \
 	"timerlat hist --period=100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
 check "verify --period with space" \
 	"timerlat hist --period 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify timerlat top -p with space" \
+	"timerlat top -p 100 -c 0 -d 1s -q" 0 "" "no-irq and no-thread"
+check "verify timerlat top -p without space (attached argument)" \
+	"timerlat top -p100 -c 0 -d 1s -q" 0 "" "no-irq and no-thread"
+check "verify timerlat top --period with equals" \
+	"timerlat top --period=100 -c 0 -d 1s -q" 0 "" "no-irq and no-thread"
+check "verify timerlat top --period with space" \
+	"timerlat top --period 100 -c 0 -d 1s -q" 0 "" "no-irq and no-thread"
 
 # Actions tests
 check "trace output through -t" \
-- 
2.54.0


^ permalink raw reply related

* [PATCH 1/2] rtla/timerlat: Add tests for option parsing with attached arguments
From: John Kacur @ 2026-06-02 15:52 UTC (permalink / raw)
  To: linux-trace-kernel; +Cc: Steven Rostedt, Tomas Glozar, linux-kernel
In-Reply-To: <20260602155210.60439-1-jkacur@redhat.com>

Add tests to verify that numeric arguments work correctly with both
attached and detached formats:
  -p 100        (short with space)
  -p100         (short without space)
  --period=100  (long with =)
  --period 100  (long with space)

These tests prevent regression of the bug fixed in commit eefa8af46ff7
("rtla/timerlat: Fix parsing of short options with attached arguments")
where -p100 was incorrectly parsed as multiple separate options.

The tests verify that:
1. All four argument formats succeed (exit code 0)
2. None trigger the "no-irq and no-thread" error that occurred when
   the bug was present

Assisted-by: Claude:claude-sonnet-4-5
Signed-off-by: John Kacur <jkacur@redhat.com>
---
 tools/tracing/rtla/tests/timerlat.t | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index fd4935fd7b49..1a63301f5d70 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -42,6 +42,16 @@ check "verify -c/--cpus" \
 check "hist test in nanoseconds" \
 	"timerlat hist -i 2 -c 0 -n -d 10s" 2 "ns"
 
+# Option parsing tests - verify attached numeric arguments work correctly
+check "verify -p with space" \
+	"timerlat hist -p 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify -p without space (attached argument)" \
+	"timerlat hist -p100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify --period with equals" \
+	"timerlat hist --period=100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+check "verify --period with space" \
+	"timerlat hist --period 100 -c 0 -d 1s" 0 "" "no-irq and no-thread"
+
 # Actions tests
 check "trace output through -t" \
 	"timerlat hist -T 2 -t" 2 "^  Saving trace to timerlat_trace.txt$"
-- 
2.54.0


^ permalink raw reply related

* [PATCH 0/2] rtla: Add tests for option parsing with attached arguments
From: John Kacur @ 2026-06-02 15:52 UTC (permalink / raw)
  To: linux-trace-kernel; +Cc: Steven Rostedt, Tomas Glozar, linux-kernel

This patch series adds comprehensive tests to verify that short options
with attached numeric arguments (e.g., -p100) work correctly across all
rtla commands.

These tests complement Tomas Glozar's fix "rtla: Fix parsing of
multi-character short options" which resolves the issue where options
like -p100 were incorrectly parsed as multiple separate options due to
getopt_long() being called twice.

The tests verify four option formats for each command:
  -p 100        (short with space)
  -p100         (short attached - previously broken)
  --period=100  (long with equals)
  --period 100  (long with space)

Commands tested:
- timerlat hist and top
- osnoise hist and top
- hwnoise

All 20 tests pass with Tomas's fix applied, confirming the issue is
resolved and preventing future regressions. These tests will continue to
work when rtla transitions to libsubcmd in the future, ensuring this
functionality remains correct across parsing implementations.

Note: Patch 1/2 is a resend of the timerlat hist tests sent previously.
Patch 2/2 adds tests for the remaining rtla commands.

Signed-off-by: John Kacur <jkacur@redhat.com>

John Kacur (2):
  rtla/timerlat: Add tests for option parsing with attached arguments
  rtla: Add tests for option parsing with attached arguments

 tools/tracing/rtla/tests/hwnoise.t  | 10 ++++++++++
 tools/tracing/rtla/tests/osnoise.t  | 18 ++++++++++++++++++
 tools/tracing/rtla/tests/timerlat.t | 18 ++++++++++++++++++
 3 files changed, 46 insertions(+)

-- 
2.54.0

^ permalink raw reply

* Re: [PATCH mm-unstable v18 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Lance Yang @ 2026-06-02 15:44 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat, mhocko,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <CAA1CXcA+oZmp=cxiC2_EBDxqGX94gAd335d9eFPNv=j_0=og7Q@mail.gmail.com>



On 2026/6/2 18:58, Nico Pache wrote:
> On Sun, May 31, 2026 at 1:19 AM Lance Yang <lance.yang@linux.dev> wrote:
>>
>>
>> On Fri, May 22, 2026 at 09:00:06AM -0600, Nico Pache wrote:
>> [...]
>>> @@ -1587,10 +1749,11 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>>>        if (result == SCAN_SUCCEED) {
>>>                /* collapse_huge_page expects the lock to be dropped before calling */
>>>                mmap_read_unlock(mm);
>>> -              result = collapse_huge_page(mm, start_addr, referenced,
>>> -                                          unmapped, cc, HPAGE_PMD_ORDER);
>>> -              /* collapse_huge_page will return with the mmap_lock released */
>>> +              nr_collapsed = mthp_collapse(mm, vma, start_addr, referenced,
>>> +                                           unmapped, cc, enabled_orders);
>>> +              /* mmap_lock was released above, set lock_dropped */
>>>                *lock_dropped = true;
>>> +              result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;
>>
>> Hmm ... don't we lose the allocation-failure result here?
>>
>> Previously collapse_scan_pmd() propagated SCAN_ALLOC_HUGE_PAGE_FAIL from
>> collapse_huge_page(), so khugepaged would call khugepaged_alloc_sleep()
>> in khugepaged_do_scan().
>>
>> Now if allocation fails and nr_collapsed stays 0, we just return
>> SCAN_FAIL. So we won't back off via khugepaged_alloc_sleep() anymore?
> 
> Ok I did the error propagation! I think I handled both of these cases
> you brought up pretty easily.

Thanks.

> However I don't know what to do in the following case: We successfully
> collapsed some portion of the PMD, but during that process, we also
> hit an allocation failure. Is it best to back off entirely? or can we
> treat some forward progress as a sign we can continue trying collapses
> without sleeping.
> 
> Basically, do we prioritize SCAN_ALLOC_HUGE_PAGE_FAIL or the
> successful collapses as the returned value?

Thinking out loud, forward progress should win here, the allocation
failure only matters if we made no progress at all?

> This is what I currently have:
> done:
>      if (collapsed)
>          return SCAN_SUCCEED;
>      if (alloc_failed)
>          return SCAN_ALLOC_HUGE_PAGE_FAIL;

I'd go with this ordering :)

Cheers, Lance

^ permalink raw reply

* Re: [PATCH 1/2] tracing: work around -Wmissing-format-attribute warning
From: Steven Rostedt @ 2026-06-02 15:40 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Masami Hiramatsu, Andrew Morton, Petr Mladek, Nathan Chancellor,
	Arnd Bergmann, Dennis Dalessandro, Jason Gunthorpe,
	Leon Romanovsky, Arend van Spriel, Miri Korenblit,
	Mathieu Desnoyers, Andy Shevchenko, Rasmus Villemoes,
	Sergey Senozhatsky, Nick Desaulniers, Bill Wendling, Justin Stitt,
	Vlastimil Babka, linux-rdma, linux-kernel, linux-wireless,
	brcm80211, brcm80211-dev-list.pdl, linux-trace-kernel, llvm
In-Reply-To: <20260602150904.2258624-1-arnd@kernel.org>

On Tue,  2 Jun 2026 17:07:05 +0200
Arnd Bergmann <arnd@kernel.org> wrote:

> @@ -2979,6 +2975,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt_str, va_list args)
>  }
>  EXPORT_SYMBOL(vsnprintf);
>  

Should add a comment here for why this is needed.

-- Steve

> +int __printf(3, 0) __vsnprintf(char *buf, size_t size, const char *fmt_str, va_list args)
> +{
> +	return vsnprintf(buf, size, fmt_str, args);
> +}
> +EXPORT_SYMBOL(__vsnprintf);
> +

^ permalink raw reply

* Re: [PATCH mm-unstable v18 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Nico Pache @ 2026-06-02 15:30 UTC (permalink / raw)
  To: Lance Yang
  Cc: David Hildenbrand (Arm), linux-doc, linux-kernel, linux-mm,
	linux-trace-kernel, aarcange, akpm, anshuman.khandual, apopple,
	baohua, baolin.wang, byungchul, catalin.marinas, cl, corbet,
	dave.hansen, dev.jain, gourry, hannes, hughd, jack, jackmanb,
	jannh, jglisse, joshua.hahnjy, kas, liam, ljs, mathieu.desnoyers,
	matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
	raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
	ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
	tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
	willy, yang, ying.huang, ziy, zokeefe, usama.arif
In-Reply-To: <baa0a462-46e0-44ab-b583-c722ad253afe@linux.dev>

On Mon, Jun 1, 2026 at 4:48 AM Lance Yang <lance.yang@linux.dev> wrote:
>
>
>
> On 2026/6/1 18:23, David Hildenbrand (Arm) wrote:
> > On 6/1/26 11:08, Lance Yang wrote:
> >>
> >>
> >> On 2026/6/1 14:54, David Hildenbrand (Arm) wrote:
> >>> On 6/1/26 05:28, Lance Yang wrote:
> >>>>
> >>>>
> >>>> Ah, fair point.
> >>>>
> >>>> I was mostly worried about arch hooks that walk vma->vm_mm again, rather
> >>>> than only using the pte pointer passed in. For example, mips does:
> >>>
> >>> Right, a re-walk would be the real problem.
> >>>
> >>>>
> >>>>     update_mmu_cache_range()
> >>>>       -> __update_tlb()
> >>>>         -> pgd_offset(vma->vm_mm, address)
> >>>>         -> pte_offset_map(...)
> >>>>
> >>>> and __update_tlb() has this assumption:
> >>>>
> >>>>          /*
> >>>>           * update_mmu_cache() is called between pte_offset_map_lock()
> >>>>           * and pte_unmap_unlock(), so we can assume that ptep is not
> >>>>           * NULL here: and what should be done below if it were NULL?
> >>>>           */
> >>>>
> >>>> So if khugepaged happens to run with current->active_mm == vma->vm_mm
> >>>> here, could __update_tlb() hit the none PMD, get NULL from
> >>>> pte_offset_map(), and then dereference it?
> >>>
> >>> Likely yes -- that MIPS code is horrible. And the comment in MIPS code
> >>> even spells that out. :(
> >>>
> >>> Do you know about other code like that, or is MIPS the only one doing a
> >>> re-walk and crossing fingers?
> >>>
> >>>>
> >>>> Just wanted to raise it since some arch code may still have assumptions
> >>>> like this, and the always-enable-mTHP work is getting closer ...
> >>>
> >>> Right. I assume set_pte_at() couldn't trigger something similar (re-walk) in
> >>> arch code,
> >>> because we simply provide the ptep. update_mmu_cache_range() only consumes the
> >>> pte.
> >>>
> >>>>
> >>>> Probably very very very hard to hit, though :)
> >>>
> >>> Delaying update_mmu_cache_range() is nasty, as we'd have to make sure that
> >>> nobody can interfere in the meantime ... and the PMD lock will not be sufficient.
> >>>
> >>> Maybe we could reinstall the page table with the cleared (none) entries while
> >>> still holding the PTL?
> >>>
> >>> Thinking out loud:
> >>>
> >>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >>> index 5ba298d420b7..e39b750b1e6f 100644
> >>> --- a/mm/khugepaged.c
> >>> +++ b/mm/khugepaged.c
> >>> @@ -1413,13 +1413,17 @@ static enum scan_result collapse_huge_page(struct
> >>> mm_struct *mm, unsigned long s
> >>>                   map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
> >>>           } else {
> >>>                   /*
> >>> -                * set_ptes is called in map_anon_folio_pte_nopf with the
> >>> -                * pmd_ptl lock still held; this is safe as the PMD is expected
> >>> -                * to be none. The pmd entry is then repopulated below.
> >>> +                * Re-insert the page table with the cleared entries, but
> >>> +                * hold the PTL, such that no one can mess with the re-installed
> >>> +                * page table until we updated the temporarily-cleared entries
> >>> +                * through map_anon_folio_pte_nopf().
> >>>                    */
> >>> -               map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /
> >>> *uffd_wp=*/ false);
> >>> -               smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
> >>
> >> One small thing, I think we should probably keep the smp_wmb(), and just
> >> move it before the earlier pmd_populate().
> >>
> >> IIUC, the ordering we want is still:
> >>
> >>    clear old PTEs
> >>    smp_wmb()
> >>    pmd_populate()
> >>
> >> so another CPU cannot walk through the re-installed PMD and still observe
> >> the old PTEs, right?
> >
> > There is a smp_wmb() in __folio_mark_uptodate(), that should be sufficient?
>
> Ah, cool! __folio_mark_uptodate() already does the job :P
>
> So yeah, no extra smp_wmb() needed here!

are we sure? that folio_mark_uptodate is done before the PTEs are
reinstalled. Then we reinstall the PMD right after. Currently
separated by the smp_wmb().

I was copying this from other THP code that performs similar PTE/PMD juggling.

I can remove it, but I'd rather err on the side of caution with this.

>
> Cheers, Lance
>


^ permalink raw reply

* [PATCH 2/2] tracing/osnoise: add printf attribute to osnoise_print
From: Arnd Bergmann @ 2026-06-02 15:07 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Crystal Wood
  Cc: Arnd Bergmann, Mathieu Desnoyers, Tomas Glozar, Wang Liang,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260602150904.2258624-1-arnd@kernel.org>

From: Arnd Bergmann <arnd@arndb.de>

gcc points out that the newly added function uses printf style arguments
and should get an attribute to allow verifying the format strings for
its callers:

kernel/trace/trace_osnoise.c: In function 'osnoise_print':
kernel/trace/trace_osnoise.c:96:17: error: function 'osnoise_print' might be a candidate for 'gnu_printf' format attribute [-Werror=suggest-attribute=format]
   96 |                 trace_array_vprintk(tr, _RET_IP_, fmt, ap);
      |                 ^~~~~~~~~~~~~~~~~~~

Add the attribute as suggested.

Fixes: 9cb99c598643 ("tracing/osnoise: Array printk init and cleanup")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 kernel/trace/trace_osnoise.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 1fbd8525ab54..6fa015e57899 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -83,7 +83,7 @@ struct osnoise_instance {
 
 static struct list_head osnoise_instances;
 
-static void osnoise_print(const char *fmt, ...)
+static __printf(1, 2) void osnoise_print(const char *fmt, ...)
 {
 	struct osnoise_instance *inst;
 	struct trace_array *tr;
-- 
2.39.5


^ permalink raw reply related

* [PATCH 1/2] tracing: work around -Wmissing-format-attribute warning
From: Arnd Bergmann @ 2026-06-02 15:07 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Andrew Morton, Petr Mladek,
	Nathan Chancellor
  Cc: Arnd Bergmann, Dennis Dalessandro, Jason Gunthorpe,
	Leon Romanovsky, Arend van Spriel, Miri Korenblit,
	Mathieu Desnoyers, Andy Shevchenko, Rasmus Villemoes,
	Sergey Senozhatsky, Nick Desaulniers, Bill Wendling, Justin Stitt,
	Vlastimil Babka, linux-rdma, linux-kernel, linux-wireless,
	brcm80211, brcm80211-dev-list.pdl, linux-trace-kernel, llvm

From: Arnd Bergmann <arnd@arndb.de>

A number of tracing headers turn off -Wsuggest-attribute=format for
gcc, but they don't turn it off for clang, so the same warning still
happens on new versions of clang that support the format attribute.

To avoid duplicating the same thing in each tracing header, as well
as changing all of them to also turn it off for clang, add a new
__vsnprintf() helper that is not annotated this way in linux/sprintf.h
but is defined to work the same way as the regular vsnprintf().

Aside from tracing, the same thing can be used in va_format(),
which is part of lib/vsprintf.c itself.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
This version is a fairly simplistic way to work around the warning
reliably. I have resent two more patches to address actually
missing annotations in device drivers, but with all of these
out of the way, we can move the warning from the 'make W=1'
into the default set.

I have also prototyped a variant of this patch that passes down
a 'struct va_format' throughout the tracing code. That patch is
a little more invasive and I have no idea if that actually works,
but the result looks simpler.
---
 drivers/infiniband/hw/hfi1/trace_dbg.h               |  7 -------
 .../broadcom/brcm80211/brcmfmac/tracepoint.h         |  7 -------
 .../brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h    |  7 -------
 drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c    |  3 ---
 include/linux/sprintf.h                              |  1 +
 include/linux/trace_events.h                         |  2 +-
 include/trace/events/qla.h                           |  7 -------
 include/trace/stages/stage6_event_callback.h         |  2 +-
 lib/vsprintf.c                                       | 12 +++++++-----
 samples/trace_events/trace-events-sample.c           |  2 --
 10 files changed, 10 insertions(+), 40 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h
index 58304b91380f..05c4f1354269 100644
--- a/drivers/infiniband/hw/hfi1/trace_dbg.h
+++ b/drivers/infiniband/hw/hfi1/trace_dbg.h
@@ -22,11 +22,6 @@
 
 #define MAX_MSG_LEN 512
 
-#pragma GCC diagnostic push
-#ifndef __clang__
-#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
-#endif
-
 DECLARE_EVENT_CLASS(hfi1_trace_template,
 		    TP_PROTO(const char *function, struct va_format *vaf),
 		    TP_ARGS(function, vaf),
@@ -41,8 +36,6 @@ DECLARE_EVENT_CLASS(hfi1_trace_template,
 			      __get_str(msg))
 );
 
-#pragma GCC diagnostic pop
-
 /*
  * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an
  * actual function to work and can not be in a macro.
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h
index 96032322b165..6c4e00e9ccd1 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h
@@ -28,11 +28,6 @@ static inline void trace_ ## name(proto) {}
 
 #define MAX_MSG_LEN		100
 
-#pragma GCC diagnostic push
-#ifndef __clang__
-#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
-#endif
-
 TRACE_EVENT(brcmf_err,
 	TP_PROTO(const char *func, struct va_format *vaf),
 	TP_ARGS(func, vaf),
@@ -128,8 +123,6 @@ TRACE_EVENT(brcmf_sdpcm_hdr,
 		  __entry->len, ((u8 *)__get_dynamic_array(hdr))[4])
 );
 
-#pragma GCC diagnostic pop
-
 #ifdef CONFIG_BRCM_TRACING
 
 #undef TRACE_INCLUDE_PATH
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h
index 908ce3c864fe..dc296d8bf775 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h
@@ -24,11 +24,6 @@
 
 #define MAX_MSG_LEN	100
 
-#pragma GCC diagnostic push
-#ifndef __clang__
-#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
-#endif
-
 DECLARE_EVENT_CLASS(brcms_msg_event,
 	TP_PROTO(struct va_format *vaf),
 	TP_ARGS(vaf),
@@ -77,8 +72,6 @@ TRACE_EVENT(brcms_dbg,
 	TP_printk("%s: %s", __get_str(func), __get_str(msg))
 );
 
-#pragma GCC diagnostic pop
-
 #endif /* __TRACE_BRCMSMAC_MSG_H */
 
 #ifdef CONFIG_BRCM_TRACING
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
index 7e686297963d..49a8196430a7 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
@@ -12,9 +12,6 @@
 #include "iwl-trans.h"
 
 #define CREATE_TRACE_POINTS
-#ifdef CONFIG_CC_IS_GCC
-#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
-#endif
 #include "iwl-devtrace.h"
 
 EXPORT_TRACEPOINT_SYMBOL(iwlwifi_dev_ucode_event);
diff --git a/include/linux/sprintf.h b/include/linux/sprintf.h
index f06f7b785091..036a247b7c1e 100644
--- a/include/linux/sprintf.h
+++ b/include/linux/sprintf.h
@@ -12,6 +12,7 @@ __printf(2, 3) int sprintf(char *buf, const char * fmt, ...);
 __printf(2, 0) int vsprintf(char *buf, const char *, va_list);
 __printf(3, 4) int snprintf(char *buf, size_t size, const char *fmt, ...);
 __printf(3, 0) int vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
+int __vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
 __printf(3, 4) int scnprintf(char *buf, size_t size, const char *fmt, ...);
 __printf(3, 0) int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
 __printf(2, 3) __malloc char *kasprintf(gfp_t gfp, const char *fmt, ...);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index d49338c44014..4715330c7b6b 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -962,7 +962,7 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 	int __ret;					\
 							\
 	va_copy(__ap, *(va));				\
-	__ret = vsnprintf(NULL, 0, fmt, __ap) + 1;	\
+	__ret = __vsnprintf(NULL, 0, fmt, __ap) + 1;	\
 	va_end(__ap);					\
 							\
 	min(__ret, TRACE_EVENT_STR_MAX);		\
diff --git a/include/trace/events/qla.h b/include/trace/events/qla.h
index 8800c35525a1..74a7534b99b6 100644
--- a/include/trace/events/qla.h
+++ b/include/trace/events/qla.h
@@ -9,11 +9,6 @@
 
 #define QLA_MSG_MAX 256
 
-#pragma GCC diagnostic push
-#ifndef __clang__
-#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
-#endif
-
 DECLARE_EVENT_CLASS(qla_log_event,
 	TP_PROTO(const char *buf,
 		struct va_format *vaf),
@@ -32,8 +27,6 @@ DECLARE_EVENT_CLASS(qla_log_event,
 	TP_printk("%s %s", __get_str(buf), __get_str(msg))
 );
 
-#pragma GCC diagnostic pop
-
 DEFINE_EVENT(qla_log_event, ql_dbg_log,
 	TP_PROTO(const char *buf, struct va_format *vaf),
 	TP_ARGS(buf, vaf)
diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h
index 1691676fd858..7d6a6ca6e779 100644
--- a/include/trace/stages/stage6_event_callback.h
+++ b/include/trace/stages/stage6_event_callback.h
@@ -45,7 +45,7 @@
 	do {								\
 		va_list __cp_va;					\
 		va_copy(__cp_va, *(va));				\
-		vsnprintf(__get_str(dst), TRACE_EVENT_STR_MAX, fmt, __cp_va); \
+		__vsnprintf(__get_str(dst), TRACE_EVENT_STR_MAX, fmt, __cp_va); \
 		va_end(__cp_va);					\
 	} while (0)
 
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index a3017bc58986..3caf0796f54d 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1702,9 +1702,6 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
 	return buf;
 }
 
-__diag_push();
-__diag_ignore(GCC, all, "-Wsuggest-attribute=format",
-	      "Not a valid __printf() conversion candidate.");
 static char *va_format(char *buf, char *end, struct va_format *va_fmt,
 		       struct printf_spec spec)
 {
@@ -1714,12 +1711,11 @@ static char *va_format(char *buf, char *end, struct va_format *va_fmt,
 		return buf;
 
 	va_copy(va, *va_fmt->va);
-	buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va);
+	buf += __vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va);
 	va_end(va);
 
 	return buf;
 }
-__diag_pop();
 
 static noinline_for_stack
 char *uuid_string(char *buf, char *end, const u8 *addr,
@@ -2979,6 +2975,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt_str, va_list args)
 }
 EXPORT_SYMBOL(vsnprintf);
 
+int __printf(3, 0) __vsnprintf(char *buf, size_t size, const char *fmt_str, va_list args)
+{
+	return vsnprintf(buf, size, fmt_str, args);
+}
+EXPORT_SYMBOL(__vsnprintf);
+
 /**
  * vscnprintf - Format a string and place it in a buffer
  * @buf: The buffer to place the result into
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index 9993fb5d5f98..ecc7db237f2e 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -9,8 +9,6 @@
  * creates the handles for the trace points.
  */
 #define CREATE_TRACE_POINTS
-__diag_ignore(GCC, all, "-Wsuggest-attribute=format",
-             "trace_event_get_offsets_foo_bar can't easily be annotated as __printf");
 #include "trace-events-sample.h"
 
 static const char *random_strings[] = {
-- 
2.39.5


^ permalink raw reply related

* Re: [PATCH v2] tracing: fix CFI violation in probestub test
From: Masami Hiramatsu @ 2026-06-02 14:33 UTC (permalink / raw)
  To: Eva Kurchatova
  Cc: rostedt, linux-trace-kernel, linux-kernel, mathieu.desnoyers,
	peterz, jpoimboe, samitolvanen
In-Reply-To: <20260602135425.542073-1-eva.kurchatova@virtuozzo.com>

On Tue,  2 Jun 2026 16:54:08 +0300
Eva Kurchatova <eva.kurchatova@virtuozzo.com> wrote:

> When multiple callbacks are registered on the same tracepoint,
> callbacks will be indirectly called via traceiter helper.
> 
> Pointers to __probestub_* callbacks reside in __tracepoints section,
> which is excluded from ENDBR checks in objtool, causing objtool to
> assume those functions are never indirectly called.
> 
> Registering multiple callbacks using sched_wakeup test will result
> in #CP exception due to missing ENDBR in __probestub_sched_wakeup
> on a CFI-enabled machine.
> 
> Fix this by adding CFI_NOSEAL annotation to probestub declaration.
> 

Thanks, this looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

> Fixes: d5173f753750 ("objtool: Exclude __tracepoints data from ENDBR checks")
> Signed-off-by: Eva Kurchatova <eva.kurchatova@virtuozzo.com>
> ---
>  include/linux/tracepoint.h | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
> index 763eea4d80d8..38e9f49a71b7 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -20,6 +20,7 @@
>  #include <linux/rcupdate_trace.h>
>  #include <linux/tracepoint-defs.h>
>  #include <linux/static_call.h>
> +#include <asm/cfi.h>
>  
>  struct module;
>  struct tracepoint;
> @@ -389,6 +390,13 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
>  	void __probestub_##_name(void *__data, proto)			\
>  	{								\
>  	}								\
> +	/*								\
> +	 * Annotate the probestub 'CFI_NOSEAL' to stop objtool from	\
> +	 * requesting the kernel remove the ENDBR, because the only	\
> +	 * references to the function are in the __tracepoint section,	\
> +	 * that objtool doesn't scan.					\
> +	 */								\
> +	CFI_NOSEAL(__probestub_##_name);				\
>  	DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);	\
>  	DEFINE_RUST_DO_TRACE(_name, TP_PROTO(proto), TP_ARGS(args))
>  
> -- 
> 2.54.0
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH v2] tracing: fix CFI violation in probestub test
From: Eva Kurchatova @ 2026-06-02 13:54 UTC (permalink / raw)
  To: mhiramat, rostedt
  Cc: linux-trace-kernel, linux-kernel, mathieu.desnoyers, peterz,
	jpoimboe, samitolvanen, eva.kurchatova

When multiple callbacks are registered on the same tracepoint,
callbacks will be indirectly called via traceiter helper.

Pointers to __probestub_* callbacks reside in __tracepoints section,
which is excluded from ENDBR checks in objtool, causing objtool to
assume those functions are never indirectly called.

Registering multiple callbacks using sched_wakeup test will result
in #CP exception due to missing ENDBR in __probestub_sched_wakeup
on a CFI-enabled machine.

Fix this by adding CFI_NOSEAL annotation to probestub declaration.

Fixes: d5173f753750 ("objtool: Exclude __tracepoints data from ENDBR checks")
Signed-off-by: Eva Kurchatova <eva.kurchatova@virtuozzo.com>
---
 include/linux/tracepoint.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 763eea4d80d8..38e9f49a71b7 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -20,6 +20,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/tracepoint-defs.h>
 #include <linux/static_call.h>
+#include <asm/cfi.h>
 
 struct module;
 struct tracepoint;
@@ -389,6 +390,13 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	void __probestub_##_name(void *__data, proto)			\
 	{								\
 	}								\
+	/*								\
+	 * Annotate the probestub 'CFI_NOSEAL' to stop objtool from	\
+	 * requesting the kernel remove the ENDBR, because the only	\
+	 * references to the function are in the __tracepoint section,	\
+	 * that objtool doesn't scan.					\
+	 */								\
+	CFI_NOSEAL(__probestub_##_name);				\
 	DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);	\
 	DEFINE_RUST_DO_TRACE(_name, TP_PROTO(proto), TP_ARGS(args))
 
-- 
2.54.0


^ permalink raw reply related

* [syzbot] [trace?] KASAN: use-after-free Write in ring_buffer_read_page
From: syzbot @ 2026-06-02 13:45 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel, mathieu.desnoyers, mhiramat,
	rostedt, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    e7ae89a0c97c Linux 7.1-rc5
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=16f06e2e580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=58acee1ac5406016
dashboard link: https://syzkaller.appspot.com/bug?extid=2dd9d02f60775ce5c1fb
compiler:       gcc (Debian 14.2.0-19) 14.2.0, GNU ld (GNU Binutils for Debian) 2.44

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/9b0c5b4e3645/disk-e7ae89a0.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/ed163d3ad68b/vmlinux-e7ae89a0.xz
kernel image: https://storage.googleapis.com/syzbot-assets/f2408b333334/bzImage-e7ae89a0.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+2dd9d02f60775ce5c1fb@syzkaller.appspotmail.com

==================================================================
BUG: KASAN: use-after-free in ring_buffer_read_page+0xd51/0x15a0 kernel/trace/ring_buffer.c:7059
Write of size 16308 at addr ffff88805ceb404c by task syz.3.1872/14532

CPU: 0 UID: 0 PID: 14532 Comm: syz.3.1872 Tainted: G             L      syzkaller #0 PREEMPT(full) 
Tainted: [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:94 [inline]
 dump_stack_lvl+0x100/0x190 lib/dump_stack.c:120
 print_address_description mm/kasan/report.c:378 [inline]
 print_report+0x13d/0x4b0 mm/kasan/report.c:482
 kasan_report+0xdf/0x1d0 mm/kasan/report.c:595
 check_region_inline mm/kasan/generic.c:186 [inline]
 kasan_check_range+0x10f/0x1e0 mm/kasan/generic.c:200
 __asan_memset+0x23/0x50 mm/kasan/shadow.c:84
 ring_buffer_read_page+0xd51/0x15a0 kernel/trace/ring_buffer.c:7059
 tracing_buffers_read+0x2bf/0xaf0 kernel/trace/trace.c:7129
 vfs_read+0x1e4/0xb30 fs/read_write.c:572
 ksys_read+0x12a/0x250 fs/read_write.c:717
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x10b/0x830 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7fb1aad9ce59
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007fb1abca5028 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 00007fb1ab015fa0 RCX: 00007fb1aad9ce59
RDX: 0000000000001000 RSI: 00002000000002c0 RDI: 0000000000000008
RBP: 00007fb1aae32d6f R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fb1ab016038 R14: 00007fb1ab015fa0 R15: 00007ffec139a1b8
 </TASK>

The buggy address belongs to the physical page:
page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5ceb4
flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000000000 0000000000000000 dead000000000122 0000000000000000
raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 0, migratetype Unmovable, gfp_mask 0x44dc0(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL|__GFP_COMP), pid 5959, tgid 5958 (syz.1.37), ts 95924498456, free_ts 95918916027
 set_page_owner include/linux/page_owner.h:32 [inline]
 post_alloc_hook+0xfd/0x120 mm/page_alloc.c:1858
 prep_new_page mm/page_alloc.c:1866 [inline]
 get_page_from_freelist+0x11a6/0x33b0 mm/page_alloc.c:3946
 __alloc_frozen_pages_noprof+0x27c/0x2bc0 mm/page_alloc.c:5226
 __alloc_pages_noprof+0xb/0x110 mm/page_alloc.c:5260
 __alloc_pages_node_noprof include/linux/gfp.h:289 [inline]
 alloc_pages_node_noprof include/linux/gfp.h:316 [inline]
 alloc_cpu_data+0x60/0x130 kernel/trace/ring_buffer.c:406
 ring_buffer_alloc_read_page+0x430/0x560 kernel/trace/ring_buffer.c:6801
 tracing_buffers_read+0x603/0xaf0 kernel/trace/trace.c:7110
 vfs_read+0x1e4/0xb30 fs/read_write.c:572
 ksys_read+0x12a/0x250 fs/read_write.c:717
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x10b/0x830 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
page last free pid 5946 tgid 5945 stack trace:
 reset_page_owner include/linux/page_owner.h:25 [inline]
 __free_pages_prepare mm/page_alloc.c:1402 [inline]
 __free_frozen_pages+0x747/0x1040 mm/page_alloc.c:2943
 tlb_batch_list_free mm/mmu_gather.c:161 [inline]
 tlb_finish_mmu+0x27d/0x810 mm/mmu_gather.c:552
 exit_mmap+0x454/0xa10 mm/mmap.c:1313
 __mmput+0x12a/0x410 kernel/fork.c:1178
 mmput+0x67/0x80 kernel/fork.c:1201
 exit_mm kernel/exit.c:582 [inline]
 do_exit+0x8b2/0x2af0 kernel/exit.c:964
 do_group_exit+0xd5/0x2a0 kernel/exit.c:1119
 get_signal+0x20ff/0x2210 kernel/signal.c:3037
 arch_do_signal_or_restart+0x91/0x7a0 arch/x86/kernel/signal.c:337
 __exit_to_user_mode_loop kernel/entry/common.c:64 [inline]
 exit_to_user_mode_loop+0x8b/0x4f0 kernel/entry/common.c:98
 __exit_to_user_mode_prepare include/linux/irq-entry-common.h:207 [inline]
 syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:230 [inline]
 syscall_exit_to_user_mode include/linux/entry-common.h:318 [inline]
 do_syscall_64+0x6f2/0x830 arch/x86/entry/syscall_64.c:100
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Memory state around the buggy address:
 ffff88805ceb4f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff88805ceb4f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>ffff88805ceb5000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
                   ^
 ffff88805ceb5080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ffff88805ceb5100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
==================================================================


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* Re: [PATCH] tracing/events: Expand ring buffer for in-kernel event enables
From: Steven Rostedt @ 2026-06-02 13:00 UTC (permalink / raw)
  To: Manjunath Patil
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260601233716.2517987-1-manjunath.b.patil@oracle.com>

On Mon,  1 Jun 2026 16:24:43 -0700
Manjunath Patil <manjunath.b.patil@oracle.com> wrote:

> Ftrace keeps trace arrays at a boot-minimum ring-buffer size until
> tracing is used. Tracefs event-enable paths already call
> tracing_update_buffers() before enabling events, but the exported
> in-kernel helpers trace_set_clr_event() and trace_array_set_clr_event()
> directly enable events through __ftrace_set_clr_event().
> 
> This can leave events enabled by in-kernel users recording into the tiny
> boot-minimum buffer instead of the configured default-sized buffer. Any
> caller that enables events through these exported helpers observes
> different buffer-expansion behavior than a userspace tracefs event enable.
> 
> Expand the relevant trace array before enabling events through the
> exported in-kernel helpers, matching the tracefs event-enable behavior.
> Disabling events remains unchanged.

The above explains everything correctly, but you left out one thing: what needs this?

Internal code should not be using the main ring buffer except for
debugging, in which case you can use trace_printk(), which will cause the
tracing buffers to be expanded by default.

Other areas of the kernel should create their own trace array which will be
created expanded by default too.

-- Steve

^ permalink raw reply

* Re: [PATCH] rtla: Fix parsing of multi-character short options
From: Tomas Glozar @ 2026-06-02 12:56 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar
  Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
	Wander Lairson Costa, LKML, linux-trace-kernel
In-Reply-To: <20260602125506.3325345-1-tglozar@redhat.com>

út 2. 6. 2026 v 14:55 odesílatel Tomas Glozar <tglozar@redhat.com> napsal:
>
> A bug was reported where the parsing of multi-character short options,
> be it a short option with an argument specified without space (e.g.
> "-p100") or multiple short options in one argument (e.g. -un), ignores
> options specific to individual tools.
>
> Furthermore, if the rest of the option is supposed to be an argument, it
> gets reinterpreted as a string of options. For example, -p100 gets
> interpreted as -100, which, due to a hackish implementation, timerlat
> hist reads as --no-thread --no-irq --no-irq, causing rtla to error out:
>
> $ rtla timerlat hist -p100
> no-irq and no-thread set, there is nothing to do here
>
> This behavior is caused by getopt_long() being called twice on each
> argument, once in common_parse_options(), once in [tool]_parse_args():
>
> - common_parse_options() calls getopt_long() with an array of options
>   common for all rtla tools, while suppressing errors (opterr = 0).
> - If the option fails to parse, common_parse_options() returns 0.
> - If 0 is returned from common_parse_options(), [tool]_parse_args()
>   calls getopt_long() again, with its own set of options.
>
> * [tool] means one of {osnoise,timerlat}_{top,hist}
>
> At least in glibc, getopt_long() increments its internal nextchar
> variable even if the option is not recognized. That means that in the
> case of "-p100", common_parse_options() sets nextchar pointing to '1',
> and timerlat_hist_parse_args() sees '1', not 'p'; the same then repeats
> for the first and second '0'.
>
> As there is no way to restore the correct internal state of
> getopt_long() reliably, fix the issue by merging the common options back
> to the longopt array and option string of the [tool]_parse_args()
> functions using a macro; only the switch part is left in the original
> function, which is renamed to set_common_option().
>
> Fixes: 850cd24cb6d6 ("tools/rtla: Add common_parse_options()")
> Reported-by: John Kacur <jkacur@redhat.com>
> Signed-off-by: Tomas Glozar <tglozar@redhat.com>
> ---

I forgot to add a note to the original email: this fix is only for 7.1.
For 7.0, the commit needs tweaking, and 7.2 will remove the command line
parsing logic entirely and replace it with libsubcmd, where this works.

Tomas


^ permalink raw reply

* [PATCH] rtla: Fix parsing of multi-character short options
From: Tomas Glozar @ 2026-06-02 12:55 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar
  Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
	Wander Lairson Costa, LKML, linux-trace-kernel

A bug was reported where the parsing of multi-character short options,
be it a short option with an argument specified without space (e.g.
"-p100") or multiple short options in one argument (e.g. -un), ignores
options specific to individual tools.

Furthermore, if the rest of the option is supposed to be an argument, it
gets reinterpreted as a string of options. For example, -p100 gets
interpreted as -100, which, due to a hackish implementation, timerlat
hist reads as --no-thread --no-irq --no-irq, causing rtla to error out:

$ rtla timerlat hist -p100
no-irq and no-thread set, there is nothing to do here

This behavior is caused by getopt_long() being called twice on each
argument, once in common_parse_options(), once in [tool]_parse_args():

- common_parse_options() calls getopt_long() with an array of options
  common for all rtla tools, while suppressing errors (opterr = 0).
- If the option fails to parse, common_parse_options() returns 0.
- If 0 is returned from common_parse_options(), [tool]_parse_args()
  calls getopt_long() again, with its own set of options.

* [tool] means one of {osnoise,timerlat}_{top,hist}

At least in glibc, getopt_long() increments its internal nextchar
variable even if the option is not recognized. That means that in the
case of "-p100", common_parse_options() sets nextchar pointing to '1',
and timerlat_hist_parse_args() sees '1', not 'p'; the same then repeats
for the first and second '0'.

As there is no way to restore the correct internal state of
getopt_long() reliably, fix the issue by merging the common options back
to the longopt array and option string of the [tool]_parse_args()
functions using a macro; only the switch part is left in the original
function, which is renamed to set_common_option().

Fixes: 850cd24cb6d6 ("tools/rtla: Add common_parse_options()")
Reported-by: John Kacur <jkacur@redhat.com>
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 28 +++++---------------------
 tools/tracing/rtla/src/common.h        | 12 ++++++++++-
 tools/tracing/rtla/src/osnoise_hist.c  |  7 ++++---
 tools/tracing/rtla/src/osnoise_top.c   |  7 ++++---
 tools/tracing/rtla/src/timerlat_hist.c |  7 ++++---
 tools/tracing/rtla/src/timerlat_top.c  |  7 ++++---
 6 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 35e3d3aa922e..bc9d01ddd102 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -84,37 +84,20 @@ int getopt_auto(int argc, char **argv, const struct option *long_opts)
 }
 
 /*
- * common_parse_options - parse common command line options
+ * set_common_option - set common options
  *
+ * @c: option character
  * @argc: argument count
  * @argv: argument vector
  * @common: common parameters structure
  *
  * Parse command line options that are common to all rtla tools.
  *
- * Returns: non zero if a common option was parsed, or 0
- * if the option should be handled by tool-specific parsing.
+ * Returns: 1 if the option was set, 0 otherwise.
  */
-int common_parse_options(int argc, char **argv, struct common_params *common)
+int set_common_option(int c, int argc, char **argv, struct common_params *common)
 {
 	struct trace_events *tevent;
-	int saved_state = optind;
-	int c;
-
-	static struct option long_options[] = {
-		{"cpus",                required_argument,      0, 'c'},
-		{"cgroup",              optional_argument,      0, 'C'},
-		{"debug",               no_argument,            0, 'D'},
-		{"duration",            required_argument,      0, 'd'},
-		{"event",               required_argument,      0, 'e'},
-		{"house-keeping",       required_argument,      0, 'H'},
-		{"priority",            required_argument,      0, 'P'},
-		{0, 0, 0, 0}
-	};
-
-	opterr = 0;
-	c = getopt_auto(argc, argv, long_options);
-	opterr = 1;
 
 	switch (c) {
 	case 'c':
@@ -154,11 +137,10 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 		common->set_sched = 1;
 		break;
 	default:
-		optind = saved_state;
 		return 0;
 	}
 
-	return c;
+	return 1;
 }
 
 /*
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index 51665db4ffce..8921807bda98 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -178,7 +178,17 @@ int osnoise_set_stop_total_us(struct osnoise_context *context,
 			      long long stop_total_us);
 
 int getopt_auto(int argc, char **argv, const struct option *long_opts);
-int common_parse_options(int argc, char **argv, struct common_params *common);
+
+#define COMMON_OPTIONS \
+	{"cpus",                required_argument,      0, 'c'},\
+	{"cgroup",              optional_argument,      0, 'C'},\
+	{"debug",               no_argument,            0, 'D'},\
+	{"duration",            required_argument,      0, 'd'},\
+	{"event",               required_argument,      0, 'e'},\
+	{"house-keeping",       required_argument,      0, 'H'},\
+	{"priority",            required_argument,      0, 'P'}
+int set_common_option(int c, int argc, char **argv, struct common_params *common);
+
 int common_apply_config(struct osnoise_tool *tool, struct common_params *params);
 int top_main_loop(struct osnoise_tool *tool);
 int hist_main_loop(struct osnoise_tool *tool);
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 8ad816b80265..cb4ce58c5987 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -475,6 +475,7 @@ static struct common_params
 
 	while (1) {
 		static struct option long_options[] = {
+			COMMON_OPTIONS,
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"entries",		required_argument,	0, 'E'},
@@ -498,15 +499,15 @@ static struct common_params
 			{0, 0, 0, 0}
 		};
 
-		if (common_parse_options(argc, argv, &params->common))
-			continue;
-
 		c = getopt_auto(argc, argv, long_options);
 
 		/* detect the end of the options. */
 		if (c == -1)
 			break;
 
+		if (set_common_option(c, argc, argv, &params->common))
+			continue;
+
 		switch (c) {
 		case 'a':
 			/* set sample stop to auto_thresh */
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 244bdce022ad..e65312ec26c4 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -328,6 +328,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 
 	while (1) {
 		static struct option long_options[] = {
+			COMMON_OPTIONS,
 			{"auto",		required_argument,	0, 'a'},
 			{"help",		no_argument,		0, 'h'},
 			{"period",		required_argument,	0, 'p'},
@@ -346,15 +347,15 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 			{0, 0, 0, 0}
 		};
 
-		if (common_parse_options(argc, argv, &params->common))
-			continue;
-
 		c = getopt_auto(argc, argv, long_options);
 
 		/* Detect the end of the options. */
 		if (c == -1)
 			break;
 
+		if (set_common_option(c, argc, argv, &params->common))
+			continue;
+
 		switch (c) {
 		case 'a':
 			/* set sample stop to auto_thresh */
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 79142af4f566..4b6708e333b8 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -785,6 +785,7 @@ static struct common_params
 
 	while (1) {
 		static struct option long_options[] = {
+			COMMON_OPTIONS,
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"entries",		required_argument,	0, 'E'},
@@ -819,11 +820,11 @@ static struct common_params
 			{0, 0, 0, 0}
 		};
 
-		if (common_parse_options(argc, argv, &params->common))
-			continue;
-
 		c = getopt_auto(argc, argv, long_options);
 
+		if (set_common_option(c, argc, argv, &params->common))
+			continue;
+
 		/* detect the end of the options. */
 		if (c == -1)
 			break;
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 64cbdcc878b0..91f88bbebad9 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -549,6 +549,7 @@ static struct common_params
 
 	while (1) {
 		static struct option long_options[] = {
+			COMMON_OPTIONS,
 			{"auto",		required_argument,	0, 'a'},
 			{"help",		no_argument,		0, 'h'},
 			{"irq",			required_argument,	0, 'i'},
@@ -577,11 +578,11 @@ static struct common_params
 			{0, 0, 0, 0}
 		};
 
-		if (common_parse_options(argc, argv, &params->common))
-			continue;
-
 		c = getopt_auto(argc, argv, long_options);
 
+		if (set_common_option(c, argc, argv, &params->common))
+			continue;
+
 		/* detect the end of the options. */
 		if (c == -1)
 			break;
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH v2 4/8] riscv: ftrace: always preserve s0 in dynamic ftrace register frame
From: Shuai Xue @ 2026-06-02 11:37 UTC (permalink / raw)
  To: Wang Han, Paul Walmsley, Palmer Dabbelt, Albert Ou
  Cc: Steven Rostedt, Alexandre Ghiti, Masami Hiramatsu, Mark Rutland,
	Catalin Marinas, Chen Pei, Andy Chiu, Björn Töpel,
	Deepak Gupta, Puranjay Mohan, Conor Dooley, Josh Poimboeuf,
	Jiri Kosina, Miroslav Benes, Petr Mladek, Joe Lawrence,
	Shuah Khan, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, oliver.yang, zhuo.song, jkchen, linux-riscv,
	linux-kernel, linux-trace-kernel, live-patching, linux-kselftest,
	linux-perf-users
In-Reply-To: <20260528082310.1994388-5-wanghan@linux.alibaba.com>

On 5/28/26 4:23 PM, Wang Han wrote:
> The dynamic ftrace entry/exit only saved s0 (the architectural frame
> pointer) when HAVE_FUNCTION_GRAPH_FP_TEST was selected. The upcoming
> reliable frame-pointer unwinder needs s0 to be present in
> ftrace_regs unconditionally so it can use the frame pointer as the
> function-graph return-address cookie regardless of FP_TEST.

Nit: A preferred commit log:

struct __arch_ftrace_regs declares s0 unconditionally, and both
ftrace_regs_get_frame_pointer() and ftrace_partial_regs() read it
unconditionally. But the SAVE_ABI_REGS / RESTORE_ABI_REGS macros in
mcount-dyn.S only stored s0 under HAVE_FUNCTION_GRAPH_FP_TEST
(CONFIG_FUNCTION_GRAPH_TRACER && CONFIG_FRAME_POINTER). With
CONFIG_FRAME_POINTER=n the slot held whatever was on the stack before,
so any callback going through ftrace_partial_regs() saw a garbage
regs->s0. RISC-V kernels default to FRAME_POINTER=y, which is why
this has not bitten in practice.

Save and restore s0 unconditionally in the dynamic ftrace ABI register
frame. This fixes the latent garbage-s0 case, brings the dynamic ftrace
path in line with the static _mcount path (mcount.S SAVE_ABI_STATE
already saves s0 unconditionally), and matches the frame layout already
documented in the comment above SAVE_ABI_REGS. It is also a
prerequisite for the upcoming reliable unwinder, which reads
ftrace_regs_get_frame_pointer(fregs) directly.

The cost is one extra REG_S/REG_L pair per traced call, negligible
compared to the overall ftrace cost; the existing FREGS_SIZE_ON_STACK
already reserved the slot, so no extra stack space is used.

Reviewed-by: Shuai Xue <xueshuai@linux.alibaba.com>

Thanks.
Shuai

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox