Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH rfc 0/2] Improvements to ftrace comm[] handling
From: David Laight @ 2026-06-26 21:23 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Michal Koutný
  Cc: David Laight

RFC because they are untested and I want to send them before going
on holiday for 2 weeks (should still have email access).

The first patch avoids a lot of 'potentially unsized' string
functions by embedding the char[] use to hold task->comm[] in a
structure.

The second adjsuts the data structure used to cache the task
names in the thread switch code.

David Laight (2):
  tracing: Embed 'char comm[16]' in a structure
  tracing: Keep pid and comm[] in the same structure

 kernel/trace/blktrace.c              |  28 +++----
 kernel/trace/trace.c                 |   3 +-
 kernel/trace/trace.h                 |   9 ++-
 kernel/trace/trace_events_filter.c   |   2 +-
 kernel/trace/trace_events_hist.c     |  26 +++---
 kernel/trace/trace_functions_graph.c |  10 +--
 kernel/trace/trace_output.c          |  24 +++---
 kernel/trace/trace_sched_switch.c    | 113 ++++++++++++---------------
 8 files changed, 101 insertions(+), 114 deletions(-)

-- 
2.39.5


^ permalink raw reply

* [PATCH 1/2] tracing: Embed 'char comm[16]' in a structure
From: David Laight @ 2026-06-26 21:23 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Michal Koutný
  Cc: David Laight
In-Reply-To: <20260626212356.64150-1-david.laight.linux@gmail.com>

Embedding the array in a stucture makes the size explicit and lets
structure copies be used.

Limit the size to 16 charatacters even if task_struct.comm is
made larger (there are plans to increase it).

Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
 kernel/trace/blktrace.c              | 28 +++++++++----------
 kernel/trace/trace.c                 |  3 +-
 kernel/trace/trace.h                 |  9 ++++--
 kernel/trace/trace_events_filter.c   |  2 +-
 kernel/trace/trace_events_hist.c     | 26 +++++++----------
 kernel/trace/trace_functions_graph.c | 10 +++----
 kernel/trace/trace_output.c          | 24 ++++++++--------
 kernel/trace/trace_sched_switch.c    | 42 ++++++++++++++++------------
 8 files changed, 75 insertions(+), 69 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8cd2520b4c99..68ffc95548b7 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1590,20 +1590,20 @@ static void blk_log_dump_pdu(struct trace_seq *s,
 
 static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
-	char cmd[TASK_COMM_LEN];
+	struct trace_comm cmd;
 
-	trace_find_cmdline(ent->pid, cmd);
+	trace_find_cmdline(ent->pid, &cmd);
 
 	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
 		trace_seq_printf(s, "%u ", t_bytes(ent));
 		blk_log_dump_pdu(s, ent, has_cg);
-		trace_seq_printf(s, "[%s]\n", cmd);
+		trace_seq_printf(s, "[%s]\n", cmd.comm);
 	} else {
 		if (t_sec(ent))
 			trace_seq_printf(s, "%llu + %u [%s]\n",
-						t_sector(ent), t_sec(ent), cmd);
+						t_sector(ent), t_sec(ent), cmd.comm);
 		else
-			trace_seq_printf(s, "[%s]\n", cmd);
+			trace_seq_printf(s, "[%s]\n", cmd.comm);
 	}
 }
 
@@ -1637,30 +1637,30 @@ static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bo
 
 static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
-	char cmd[TASK_COMM_LEN];
+	struct trace_comm cmd;
 
-	trace_find_cmdline(ent->pid, cmd);
+	trace_find_cmdline(ent->pid, &cmd);
 
-	trace_seq_printf(s, "[%s]\n", cmd);
+	trace_seq_printf(s, "[%s]\n", cmd.comm);
 }
 
 static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
-	char cmd[TASK_COMM_LEN];
+	struct trace_comm cmd;
 
-	trace_find_cmdline(ent->pid, cmd);
+	trace_find_cmdline(ent->pid, &cmd);
 
-	trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
+	trace_seq_printf(s, "[%s] %llu\n", cmd.comm, get_pdu_int(ent, has_cg));
 }
 
 static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
-	char cmd[TASK_COMM_LEN];
+	struct trace_comm cmd;
 
-	trace_find_cmdline(ent->pid, cmd);
+	trace_find_cmdline(ent->pid, &cmd);
 
 	trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
-			 get_pdu_int(ent, has_cg), cmd);
+			 get_pdu_int(ent, has_cg), cmd.comm);
 }
 
 static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6eb4d3097a4d..7de658b8ee0d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -52,6 +52,7 @@
 #include <linux/sort.h>
 #include <linux/io.h> /* vmap_page_range() */
 #include <linux/fs_context.h>
+#include <linux/trace_printk.h>
 
 #include <asm/setup.h> /* COMMAND_LINE_SIZE */
 
@@ -2972,7 +2973,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	seq_puts(m, "#    -----------------\n");
 	seq_printf(m, "#    | task: %.16s-%d "
 		   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
-		   data->comm, data->pid,
+		   data->comm.comm, data->pid,
 		   from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
 		   data->policy, data->rt_priority);
 	seq_puts(m, "#    -----------------\n");
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 80fe152af1dd..afd59d79e1fe 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -183,6 +183,11 @@ struct fexit_trace_entry_head {
 
 struct trace_array;
 
+/* task_struct->comm[] may be truncated to save memory/width */
+struct trace_comm {
+	char comm[16];
+};
+
 /*
  * The CPU trace array - it consists of thousands of trace entries
  * plus some other descriptor data: (for example which task started
@@ -203,7 +208,7 @@ struct trace_array_cpu {
 	u64			preempt_timestamp;
 	pid_t			pid;
 	kuid_t			uid;
-	char			comm[TASK_COMM_LEN];
+	struct trace_comm	comm;
 
 #ifdef CONFIG_FUNCTION_TRACER
 	int			ftrace_ignore_pid;
@@ -906,7 +911,7 @@ void trace_last_func_repeats(struct trace_array *tr,
 
 extern u64 ftrace_now(int cpu);
 
-extern void trace_find_cmdline(int pid, char comm[]);
+extern void trace_find_cmdline(int pid, struct trace_comm *comm);
 extern int trace_find_tgid(int pid);
 extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 609325f57942..749887aff315 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -994,7 +994,7 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)
 	int cmp;
 
 	cmp = pred->regex->match(current->comm, pred->regex,
-				TASK_COMM_LEN);
+				sizeof(current->comm));
 	return cmp ^ pred->not;
 }
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0dbbf6cca9bc..1b51491b2a41 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -680,7 +680,7 @@ struct track_data {
 };
 
 struct hist_elt_data {
-	char *comm;
+	struct trace_comm *comm;
 	u64 *var_ref_vals;
 	char **field_var_str;
 	int n_field_var_str;
@@ -756,7 +756,7 @@ static struct track_data *track_data_alloc(unsigned int key_len,
 
 	data->elt.private_data = elt_data;
 
-	elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL);
+	elt_data->comm = kzalloc_obj(*elt_data->comm);
 	if (!elt_data->comm) {
 		track_data_free(data);
 		return ERR_PTR(-ENOMEM);
@@ -1608,19 +1608,19 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
 	return ERR_PTR(ret);
 }
 
-static inline void save_comm(char *comm, struct task_struct *task)
+static inline void save_comm(struct trace_comm *comm, struct task_struct *task)
 {
 	if (!task->pid) {
-		strcpy(comm, "<idle>");
+		strcpy(comm->comm, "<idle>");
 		return;
 	}
 
 	if (WARN_ON_ONCE(task->pid < 0)) {
-		strcpy(comm, "<XXX>");
+		strcpy(comm->comm, "<XXX>");
 		return;
 	}
 
-	strscpy(comm, task->comm, TASK_COMM_LEN);
+	strscpy(comm->comm, task->comm);
 }
 
 static void hist_elt_data_free(struct hist_elt_data *elt_data)
@@ -1646,7 +1646,6 @@ static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
 static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
 {
 	struct hist_trigger_data *hist_data = elt->map->private_data;
-	unsigned int size = TASK_COMM_LEN;
 	struct hist_elt_data *elt_data;
 	struct hist_field *hist_field;
 	unsigned int i, n_str;
@@ -1659,7 +1658,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
 		hist_field = hist_data->fields[i];
 
 		if (hist_field->flags & HIST_FIELD_FL_EXECNAME) {
-			elt_data->comm = kzalloc(size, GFP_KERNEL);
+			elt_data->comm = kzalloc_obj(*elt_data->comm);
 			if (!elt_data->comm) {
 				kfree(elt_data);
 				return -ENOMEM;
@@ -1677,8 +1676,6 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
 
 	BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1));
 
-	size = STR_VAR_LEN_MAX;
-
 	elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL);
 	if (!elt_data->field_var_str) {
 		hist_elt_data_free(elt_data);
@@ -1687,7 +1684,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
 	elt_data->n_field_var_str = n_str;
 
 	for (i = 0; i < n_str; i++) {
-		elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
+		elt_data->field_var_str[i] = kzalloc(STR_VAR_LEN_MAX, GFP_KERNEL);
 		if (!elt_data->field_var_str[i]) {
 			hist_elt_data_free(elt_data);
 			return -ENOMEM;
@@ -3449,7 +3446,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
 	elt_data = context->elt->private_data;
 	track_elt_data = track_data->elt.private_data;
 	if (elt_data->comm)
-		strscpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
+		track_elt_data->comm = elt_data->comm;
 
 	track_data->updated = true;
 
@@ -5505,16 +5502,13 @@ static void hist_trigger_print_key(struct seq_file *m,
 				   uval, (void *)(uintptr_t)uval);
 		} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
 			struct hist_elt_data *elt_data = elt->private_data;
-			char *comm;
 
 			if (WARN_ON_ONCE(!elt_data))
 				return;
 
-			comm = elt_data->comm;
-
 			uval = *(u64 *)(key + key_field->offset);
 			seq_printf(m, "%s: %-16s[%10llu]", field_name,
-				   comm, uval);
+				   elt_data->comm->comm, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
 			const char *syscall_name;
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0d2d3a2ea7dd..e46c5aa0d4e4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -561,19 +561,19 @@ static void print_graph_cpu(struct trace_seq *s, int cpu)
 
 static void print_graph_proc(struct trace_seq *s, pid_t pid)
 {
-	char comm[TASK_COMM_LEN];
+	struct trace_comm comm;
 	/* sign + log10(MAX_INT) + '\0' */
 	char pid_str[12];
 	int spaces = 0;
 	int len;
 	int i;
 
-	trace_find_cmdline(pid, comm);
-	comm[7] = '\0';
+	trace_find_cmdline(pid, &comm);
+	comm.comm[7] = '\0';
 	sprintf(pid_str, "%d", pid);
 
 	/* 1 stands for the "-" character */
-	len = strlen(comm) + strlen(pid_str) + 1;
+	len = strlen(comm.comm) + strlen(pid_str) + 1;
 
 	if (len < TRACE_GRAPH_PROCINFO_LENGTH)
 		spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
@@ -582,7 +582,7 @@ static void print_graph_proc(struct trace_seq *s, pid_t pid)
 	for (i = 0; i < spaces / 2; i++)
 		trace_seq_putc(s, ' ');
 
-	trace_seq_printf(s, "%s-%s", comm, pid_str);
+	trace_seq_printf(s, "%s-%s", comm.comm, pid_str);
 
 	/* Last spaces to align center */
 	for (i = 0; i < spaces - (spaces / 2); i++)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a5ad76175d10..58405291b44f 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -554,12 +554,12 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 static int
 lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 {
-	char comm[TASK_COMM_LEN];
+	struct trace_comm comm;
 
-	trace_find_cmdline(entry->pid, comm);
+	trace_find_cmdline(entry->pid, &comm);
 
 	trace_seq_printf(s, "%8.8s-%-7d %3d",
-			 comm, entry->pid, cpu);
+			 comm.comm, entry->pid, cpu);
 
 	return trace_print_lat_fmt(s, entry);
 }
@@ -658,11 +658,11 @@ int trace_print_context(struct trace_iterator *iter)
 	struct trace_array *tr = iter->tr;
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry = iter->ent;
-	char comm[TASK_COMM_LEN];
+	struct trace_comm comm;
 
-	trace_find_cmdline(entry->pid, comm);
+	trace_find_cmdline(entry->pid, &comm);
 
-	trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid);
+	trace_seq_printf(s, "%16s-%-7d ", comm.comm, entry->pid);
 
 	if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) {
 		unsigned int tgid = trace_find_tgid(entry->pid);
@@ -700,13 +700,13 @@ int trace_print_lat_context(struct trace_iterator *iter)
 	entry = iter->ent;
 
 	if (verbose) {
-		char comm[TASK_COMM_LEN];
+		struct trace_comm comm;
 
-		trace_find_cmdline(entry->pid, comm);
+		trace_find_cmdline(entry->pid, &comm);
 
 		trace_seq_printf(
 			s, "%16s %7d %3d %d %08x %08lx ",
-			comm, entry->pid, iter->cpu, entry->flags,
+			comm.comm, entry->pid, iter->cpu, entry->flags,
 			entry->preempt_count & 0xf, iter->idx);
 	} else {
 		lat_print_generic(s, entry, iter->cpu);
@@ -1276,7 +1276,7 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
 					     char *delim)
 {
 	struct ctx_switch_entry *field;
-	char comm[TASK_COMM_LEN];
+	struct trace_comm comm;
 	int S, T;
 
 
@@ -1284,7 +1284,7 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
 
 	T = task_index_to_char(field->next_state);
 	S = task_index_to_char(field->prev_state);
-	trace_find_cmdline(field->next_pid, comm);
+	trace_find_cmdline(field->next_pid, &comm);
 	trace_seq_printf(&iter->seq,
 			 " %7d:%3d:%c %s [%03d] %7d:%3d:%c %s\n",
 			 field->prev_pid,
@@ -1293,7 +1293,7 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
 			 field->next_cpu,
 			 field->next_pid,
 			 field->next_prio,
-			 T, comm);
+			 T, comm.comm);
 
 	return trace_handle_return(&iter->seq);
 }
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e9f0ff962660..972883643097 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -172,27 +172,33 @@ struct saved_cmdlines_buffer {
 	unsigned *map_cmdline_to_pid;
 	unsigned cmdline_num;
 	int cmdline_idx;
-	char saved_cmdlines[];
+	struct trace_comm saved_cmdlines[];
 };
 static struct saved_cmdlines_buffer *savedcmd;
 
 /* Holds the size of a cmdline and pid element */
 #define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s)			\
-	(TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))
+	(sizeof(struct trace_comm) + sizeof((s)->map_cmdline_to_pid[0]))
 
-static inline char *get_saved_cmdlines(int idx)
+static inline struct trace_comm *get_saved_cmdlines(int idx)
 {
-	return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
+	return &savedcmd->saved_cmdlines[idx];
 }
 
-static inline void set_cmdline(int idx, const char *cmdline)
+static inline void set_cmdline(int idx, const struct task_struct *tsk)
 {
-	strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+	struct trace_comm *comm = get_saved_cmdlines(idx);
+
+	BUILD_BUG_ON(sizeof(comm->comm) > sizeof(tsk->comm));
+
+	memcpy(comm->comm, tsk->comm, sizeof comm->comm);
+	if (sizeof(comm->comm) != sizeof(tsk->comm))
+		comm->comm[ARRAY_SIZE(comm->comm) - 1] = 0;
 }
 
 static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
 {
-	int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
+	int order = get_order(sizeof(*s) + s->cmdline_num * sizeof(struct trace_comm));
 
 	kmemleak_free(s);
 	free_pages((unsigned long)s, order);
@@ -222,7 +228,7 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
 	s->cmdline_num = val;
 
 	/* Place map_cmdline_to_pid array right after saved_cmdlines */
-	s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];
+	s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val];
 
 	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
 	       sizeof(s->map_pid_to_cmdline));
@@ -273,25 +279,25 @@ int trace_save_cmdline(struct task_struct *tsk)
 	}
 
 	savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
-	set_cmdline(idx, tsk->comm);
+	set_cmdline(idx, tsk);
 
 	arch_spin_unlock(&trace_cmdline_lock);
 
 	return 1;
 }
 
-static void __trace_find_cmdline(int pid, char comm[])
+static void __trace_find_cmdline(int pid, struct trace_comm *comm)
 {
 	unsigned map;
 	int tpid;
 
 	if (!pid) {
-		strcpy(comm, "<idle>");
+		strcpy(comm->comm, "<idle>");
 		return;
 	}
 
 	if (WARN_ON_ONCE(pid < 0)) {
-		strcpy(comm, "<XXX>");
+		strcpy(comm->comm, "<XXX>");
 		return;
 	}
 
@@ -300,14 +306,14 @@ static void __trace_find_cmdline(int pid, char comm[])
 	if (map != NO_CMDLINE_MAP) {
 		tpid = savedcmd->map_cmdline_to_pid[map];
 		if (tpid == pid) {
-			strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+			*comm = *get_saved_cmdlines(map);
 			return;
 		}
 	}
-	strcpy(comm, "<...>");
+	strcpy(comm->comm, "<...>");
 }
 
-void trace_find_cmdline(int pid, char comm[])
+void trace_find_cmdline(int pid, struct trace_comm *comm)
 {
 	preempt_disable();
 	arch_spin_lock(&trace_cmdline_lock);
@@ -561,11 +567,11 @@ static void saved_cmdlines_stop(struct seq_file *m, void *v)
 
 static int saved_cmdlines_show(struct seq_file *m, void *v)
 {
-	char buf[TASK_COMM_LEN];
+	struct trace_comm buf;
 	unsigned int *pid = v;
 
-	__trace_find_cmdline(*pid, buf);
-	seq_printf(m, "%d %s\n", *pid, buf);
+	__trace_find_cmdline(*pid, &buf);
+	seq_printf(m, "%d %s\n", *pid, buf.comm);
 	return 0;
 }
 
-- 
2.39.5


^ permalink raw reply related

* [PATCH 2/2] tracing: Keep pid and comm[] in the same structure
From: David Laight @ 2026-06-26 21:23 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Michal Koutný
  Cc: David Laight
In-Reply-To: <20260626212356.64150-1-david.laight.linux@gmail.com>

Rather than have two separate dynamic arrays on the end of struct
saved_commandlines_buffer have a single dynamic array where each
entry contains the pid and associated task->comm[].
This simplifies the initialisation and lookup.

Don't bother trying to initialise the pid field no a non-zero value,
it only matters in the tracing_saved_cmdlines_seq_ops code.
Allocate entry [0] first so that the tracing_saved_cmdlines_seq_ops
code can just index the array with the file offset.

The code now uses the correct size when determining the page 'order'
to free the structure. The smaller size will always give the same
'order'.

Signed-off-by: David Laight <david.laight.linux@gmail.com>
---

Is there any reason why this code uses alloc_pages() rather
than vmalloc()?
map_pid_to_cmdline[] is 64k*sizeof(int) so the whole structure
expands to 512k with about 64k/20 (about 3200) pid entries even
though the default is 128.
AFAICT there is only one copy of the data - so it could be static.
Perhaps with pointers to map_pid_cmdline[] and (after this patch)
pid_comm[], both of which could be separately resized.

I also noticed that map_pid_to_cmdline[] contains indexes into
pid_comm[], restricting these to 16bits would half the data area.

 kernel/trace/trace_sched_switch.c | 97 +++++++++++++------------------
 1 file changed, 39 insertions(+), 58 deletions(-)

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 972883643097..5e7c8cf444b8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -167,27 +167,21 @@ static size_t tgid_map_max;
  * where interrupt is disabled.
  */
 static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+struct pid_comm {
+	pid_t pid;
+	struct trace_comm comm;
+};
 struct saved_cmdlines_buffer {
 	unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
-	unsigned *map_cmdline_to_pid;
 	unsigned cmdline_num;
 	int cmdline_idx;
-	struct trace_comm saved_cmdlines[];
+	struct pid_comm pid_comm[];
 };
 static struct saved_cmdlines_buffer *savedcmd;
 
-/* Holds the size of a cmdline and pid element */
-#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s)			\
-	(sizeof(struct trace_comm) + sizeof((s)->map_cmdline_to_pid[0]))
-
-static inline struct trace_comm *get_saved_cmdlines(int idx)
-{
-	return &savedcmd->saved_cmdlines[idx];
-}
-
-static inline void set_cmdline(int idx, const struct task_struct *tsk)
+static inline void set_cmdline(struct pid_comm *pid_comm, const struct task_struct *tsk)
 {
-	struct trace_comm *comm = get_saved_cmdlines(idx);
+	struct trace_comm *comm = &pid_comm->comm;
 
 	BUILD_BUG_ON(sizeof(comm->comm) > sizeof(tsk->comm));
 
@@ -212,7 +206,7 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
 	int order;
 
 	/* Figure out how much is needed to hold the given number of cmdlines */
-	orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
+	orig_size = sizeof(*s) + val * sizeof(s->pid_comm[0]);
 	order = get_order(orig_size);
 	size = 1 << (order + PAGE_SHIFT);
 	page = alloc_pages(GFP_KERNEL, order);
@@ -224,16 +218,11 @@ static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
 	memset(s, 0, sizeof(*s));
 
 	/* Round up to actual allocation */
-	val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
+	val = (size - sizeof(*s)) / sizeof(s->pid_comm[0]);
 	s->cmdline_num = val;
 
-	/* Place map_cmdline_to_pid array right after saved_cmdlines */
-	s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val];
-
 	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
 	       sizeof(s->map_pid_to_cmdline));
-	memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
-	       val * sizeof(*s->map_cmdline_to_pid));
 
 	return s;
 }
@@ -247,6 +236,7 @@ int trace_create_savedcmd(void)
 
 int trace_save_cmdline(struct task_struct *tsk)
 {
+	struct pid_comm *pid_comm;
 	unsigned tpid, idx;
 
 	/* treat recording of idle task as a success */
@@ -272,14 +262,16 @@ int trace_save_cmdline(struct task_struct *tsk)
 
 	idx = savedcmd->map_pid_to_cmdline[tpid];
 	if (idx == NO_CMDLINE_MAP) {
-		idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
-
+		idx = savedcmd->cmdline_idx;
 		savedcmd->map_pid_to_cmdline[tpid] = idx;
+		if (++idx >= savedcmd->cmdline_num)
+			idx = 0;
 		savedcmd->cmdline_idx = idx;
 	}
 
-	savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
-	set_cmdline(idx, tsk);
+	pid_comm = savedcmd->pid_comm + idx;
+	pid_comm->pid = tsk->pid;
+	set_cmdline(pid_comm, tsk);
 
 	arch_spin_unlock(&trace_cmdline_lock);
 
@@ -288,8 +280,8 @@ int trace_save_cmdline(struct task_struct *tsk)
 
 static void __trace_find_cmdline(int pid, struct trace_comm *comm)
 {
+	struct pid_comm *pid_comm;
 	unsigned map;
-	int tpid;
 
 	if (!pid) {
 		strcpy(comm->comm, "<idle>");
@@ -301,12 +293,11 @@ static void __trace_find_cmdline(int pid, struct trace_comm *comm)
 		return;
 	}
 
-	tpid = pid & (PID_MAX_DEFAULT - 1);
-	map = savedcmd->map_pid_to_cmdline[tpid];
+	map = savedcmd->map_pid_to_cmdline[pid & (PID_MAX_DEFAULT - 1)];
 	if (map != NO_CMDLINE_MAP) {
-		tpid = savedcmd->map_cmdline_to_pid[map];
-		if (tpid == pid) {
-			*comm = *get_saved_cmdlines(map);
+		pid_comm = savedcmd->pid_comm + map;
+		if (pid_comm->pid == pid) {
+			*comm = pid_comm->comm;;
 			return;
 		}
 	}
@@ -521,42 +512,34 @@ const struct file_operations tracing_saved_tgids_fops = {
 	.release	= seq_release,
 };
 
-static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
+static struct pid_comm *saved_cmdlines_entry(loff_t off)
 {
-	unsigned int *ptr = v;
+	struct pid_comm *pid_comm;
 
-	if (*pos || m->count)
-		ptr++;
-
-	(*pos)++;
+	if (off >= savedcmd->cmdline_num)
+		return NULL;
 
-	for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
-	     ptr++) {
-		if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
-			continue;
+	/* Entries are used in sequence and never freed */
+	pid_comm = &savedcmd->pid_comm[off];
+	return pid_comm->pid ? pid_comm : NULL;
+}
 
-		return ptr;
-	}
+static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	loff_t off = *pos;
 
-	return NULL;
+	v = saved_cmdlines_entry(off);
+	if (v)
+		*pos = off + 1;
+	return v;
 }
 
 static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
 {
-	void *v;
-	loff_t l = 0;
-
 	preempt_disable();
 	arch_spin_lock(&trace_cmdline_lock);
 
-	v = &savedcmd->map_cmdline_to_pid[0];
-	while (l <= *pos) {
-		v = saved_cmdlines_next(m, v, &l);
-		if (!v)
-			return NULL;
-	}
-
-	return v;
+	return saved_cmdlines_entry(*pos);
 }
 
 static void saved_cmdlines_stop(struct seq_file *m, void *v)
@@ -567,11 +550,9 @@ static void saved_cmdlines_stop(struct seq_file *m, void *v)
 
 static int saved_cmdlines_show(struct seq_file *m, void *v)
 {
-	struct trace_comm buf;
-	unsigned int *pid = v;
+	struct pid_comm *ptr = v;
 
-	__trace_find_cmdline(*pid, &buf);
-	seq_printf(m, "%d %s\n", *pid, buf.comm);
+	seq_printf(m, "%d %s\n", ptr->pid, ptr->comm.comm);
 	return 0;
 }
 
-- 
2.39.5


^ permalink raw reply related

* Re: [PATCH v10 6/6] selftests/mm: add hwpoison-panic destructive test
From: Mike Rapoport @ 2026-06-27  7:52 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Miaohe Lin, Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
	Vlastimil Babka, Suren Baghdasaryan, Michal Hocko, Shuah Khan,
	Naoya Horiguchi, Jonathan Corbet, Shuah Khan, Liam R. Howlett,
	lance.yang, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-mm, linux-kernel, linux-doc, linux-kselftest,
	linux-trace-kernel, kernel-team
In-Reply-To: <20260626-ecc_panic-v10-6-6dacb8ad024d@debian.org>

Hi Breno,

On Fri, Jun 26, 2026 at 08:33:20AM -0700, Breno Leitao wrote:
> Add a destructive selftest that verifies
> vm.panic_on_unrecoverable_memory_failure actually panics when a
> hwpoison error hits a kernel-owned page.

> +ksft_skip=4

...

> +ksft_print() { echo "# $*"; }
> +ksft_exit_skip() { ksft_print "$*"; exit "$ksft_skip"; }
> +ksft_exit_fail() { echo "not ok 1 $*"; exit 1; }

There is tools/testing/selftests/kselftest/ktap_helpers.sh that already
implements this :)

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH] tracing: eprobe: read the complete FILTER_PTR_STRING pointer
From: Steven Rostedt @ 2026-06-27  8:36 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Martin Kaiser, Masami Hiramatsu (Google), linux-trace-kernel,
	linux-kernel
In-Reply-To: <20260627004538.4cc8b34a4a20ee14f9a38c28@kernel.org>



On June 26, 2026 4:45:38 PM GMT+01:00, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>On Fri, 26 Jun 2026 06:42:23 -0400
>Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> .
>> 
>> I'll queue this up.
>
>I've already queued this on my probes/core branch.
>(which will be probes/fixes)


Awesome. Thanks Masami!

-- Steve


^ permalink raw reply

* Re: [PATCH] usb: typec: add trace point for typec_set_mode
From: Steven Rostedt @ 2026-06-27  9:23 UTC (permalink / raw)
  To: Heikki Krogerus
  Cc: Ahmad Fatoum, Greg Kroah-Hartman, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel, linux-usb, linux-trace-kernel,
	kernel
In-Reply-To: <ajPW9P6qZyhqtjvp@kuha>

On Thu, 18 Jun 2026 14:31:00 +0300
Heikki Krogerus <heikki.krogerus@linux.intel.com> wrote:

> > >  obj-$(CONFIG_TYPEC)            += typec.o
> > >  typec-y                                := class.o mux.o bus.o pd.o retimer.o mode_selection.o
> > >  typec-$(CONFIG_ACPI)           += port-mapper.o
> > > +typec-$(CONFIG_TRACING)                += trace.o  
> > 
> > Thanks for the suggestion. I will do that for v2.
> > 
> > I also saw there is Sashiko AI feedback on this patch[1], but I am not
> > familiar enough with how the event headers are used outside the kernel
> > to determine if that's actionable advice or if it can be ignored.
> > 
> > Do you have an opinion on that?
> > 
> > [1]:
> > https://sashiko.dev/#/patchset/20260617-typec_set_mode-tracepoint-v1-1-bdfbb39cfccd%40pengutronix.de  
> 
> It's correct. You need to use a private trace.h in this case, so just
> move it here: drivers/usb/typec/trace.h
> 
> And also make sure you include everything needed in that header like
> it's telling you.


You may need more updates to make this work in a different directory.

Please read the comments in:

  samples/trace_events/trace-events-sample.h

and

  samples/trace_events/Makefile

-- Steve

^ permalink raw reply

* Re: [RFC PATCH 00/40] mm: reliable 1GB page allocation
From: Lorenzo Stoakes @ 2026-06-27  9:28 UTC (permalink / raw)
  To: Rik van Riel
  Cc: linux-kernel, kernel-team, linux-mm, david, willy, surenb, hannes,
	ziy, usama.arif, fvdl, Andrew Morton, Jonathan Corbet,
	Chris Mason, David Sterba, Vlastimil Babka, Steven Rostedt,
	Masami Hiramatsu, Rafael J. Wysocki, Oscar Salvador,
	Mike Rapoport, linux-doc, linux-btrfs, linux-trace-kernel,
	linux-pm, linux-cxl, Linus Torvalds
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>

+cc missing maintainers.
+cc Linus for general issues raised.

(Apologies, this email is VERY long, even for me)

Hi Rik,

I appreciate this is an RFC, but obviously part of that is early
feedback. So if this is meant to be an (EXTREMELY) rough pre-RFC
proof-of-concept then fine (however your 'todo' suggests otherwise).

And of course, before I get critical :) thank you for looking into this,
it's very interesting work. I want this feature to land. BUT :)

So if this in any way resembles what you plan to send upstream un-RFC'd I
need to pour a fairly large bucket of very cold water over this.

TL;DR: The series is completely unmergeable as it stands. Not even close.

Before we get into the code, please please please make sure you cc- the
right people. You're doing very invasive, very major work here.

You failed to even cc- the page allocator maintainer (Vlastimil) for a
series that radically alters page allocation.

MAKE SURE you cc- Vlastimil and all other maintainers and reviewers + lists
on future (RFC!!) revisions of this please.

If you want to do a quiet off-list pre-RFC that's fine, but it's
unforgivable to leave Vlastimil out of this even for that.

It's 30 seconds running a script (filtering for maintainers alone to save
on noise):

$ scripts/get_maintainer.pl --nogit --nogit-fallback --nor 20260520150018.2491267-1-riel@surriel.com.mbx
Andrew Morton <akpm@linux-foundation.org> (maintainer:MEMORY MANAGEMENT - CORE)
David Hildenbrand <david@kernel.org> (maintainer:MEMORY MANAGEMENT - CORE)
Jonathan Corbet <corbet@lwn.net> (maintainer:DOCUMENTATION)
Chris Mason <clm@fb.com> (maintainer:BTRFS FILE SYSTEM)
David Sterba <dsterba@suse.com> (maintainer:BTRFS FILE SYSTEM)
Vlastimil Babka <vbabka@kernel.org> (maintainer:MEMORY MANAGEMENT - PAGE ALLOCATOR)
Steven Rostedt <rostedt@goodmis.org> (maintainer:TRACING)
Masami Hiramatsu <mhiramat@kernel.org> (maintainer:TRACING)
"Rafael J. Wysocki" <rafael@kernel.org> (maintainer:HIBERNATION (aka Software Suspend, aka swsusp))
Oscar Salvador <osalvador@suse.de> (maintainer:MEMORY HOT(UN)PLUG)
Mike Rapoport <rppt@kernel.org> (maintainer:MEMBLOCK AND MEMORY MANAGEMENT INITIALIZATION)
Johannes Weiner <hannes@cmpxchg.org> (maintainer:MEMORY MANAGEMENT - RECLAIM)
linux-mm@kvack.org (open list:MEMORY MANAGEMENT - CORE)
linux-doc@vger.kernel.org (open list:DOCUMENTATION)
linux-kernel@vger.kernel.org (open list)
linux-btrfs@vger.kernel.org (open list:BTRFS FILE SYSTEM)
linux-trace-kernel@vger.kernel.org (open list:TRACING)
linux-pm@vger.kernel.org (open list:HIBERNATION (aka Software Suspend, aka swsusp))
linux-cxl@vger.kernel.org (open list:MEMORY HOT(UN)PLUG)
HIBERNATION (aka Software Suspend, aka swsusp) status: Supported
SUSPEND TO RAM status: Supported

OK, so getting into the code. You didn't include an overall diffstat. So let's
see:

$ git checkout e1914add2799
...
$ b4 shazam 20260520150018.2491267-1-riel@surriel.com
...
$ git format-patch --cover-letter HEAD~40
...
$ tail -31 0000-cover-letter.patch
 Documentation/admin-guide/sysctl/vm.rst |   21 -
 Documentation/mm/physical_memory.rst    |   13 +-
 fs/btrfs/extent_io.c                    |   69 +-
 fs/btrfs/extent_io.h                    |    4 +-
 fs/btrfs/inode.c                        |    2 +-
 fs/btrfs/ioctl.c                        |    2 +-
 fs/btrfs/raid56.c                       |    6 +-
 fs/btrfs/relocation.c                   |    2 +-
 fs/btrfs/scrub.c                        |    3 +-
 include/linux/mmzone.h                  |  236 +-
 include/linux/page-flags.h              |    9 +
 include/linux/pageblock-flags.h         |   10 +
 include/linux/vm_event_item.h           |   10 +
 include/trace/events/kmem.h             |  373 ++
 kernel/power/snapshot.c                 |   35 +-
 mm/compaction.c                         |  360 +-
 mm/debug.c                              |   19 +-
 mm/internal.h                           |   39 +
 mm/memory_hotplug.c                     |    4 +
 mm/mm_init.c                            |  400 +-
 mm/page_alloc.c                         | 5064 ++++++++++++++++++++---
 mm/page_reporting.c                     |  149 +-
 mm/show_mem.c                           |   27 +-
 mm/sparse.c                             |    3 +-
 mm/vmscan.c                             |  115 +-
 mm/vmstat.c                             |   71 +-
 26 files changed, 6131 insertions(+), 915 deletions(-)

This is completely insane :) I do hope the omission of the diffstat was not
to hide this, but at any rate, adding ~5,000 lines of code to page_alloc.c
is a complete non-starter.

In any case I think clearly more files are required, mm/super_pageblock.c
or whatever it'll be, but at any rate this much code being added to me
clearly indicates something is terribly wrong here anyway.

And you added no tests whatsoever. I get that some things are hard to test
in the kernel and mm, but not even an attempt? Surely you have some
localised stuff that you've been using to exercise this, couldn't some of
that be made into selftests?

Certainly if tests are not present, this needs to be justified in the cover
letter.

(Again I appreciate this is an RFC, so perhaps those are coming.)

Anyway let's look at some of this code. Johannes's patches all look
reasonable, but as soon as we start seeing:

Assisted-by: Claude:claude-opus-4.7

Things start to go awry.

The series is full of extremely dense comments that contain far too much
specific information to be parseable. E.g.:

+	 *  - owner_cpu == this CPU (or no owner): take the local PCP
+	 *    lock with spin_trylock and enqueue normally. The trylock
+	 *    fails only on rare local self re-entry (IRQ/NMI fires
+	 *    while the interrupted task already holds the lock) or
+	 *    while a remote drain is active; either way, fall back to
+	 *    free_one_page (or the zone-llist for FPI_TRYLOCK). No
+	 *    irqsave: the trylock cannot block on self, and remote
+	 *    CPUs never take this pcp->lock (they go via free_llist),
+	 *    so an interruption cannot deadlock against another freer.

This isn't readable. This isn't acceptable. It matches exactly what I've
found LLMs to do, adding far too much detail in way that isn't
human-readable.

You must redo all of these.

The patches are all quite dense. 40 patches is already far too many IMO for
a series this massive, but you're also piling in tonnes of complexity in
every patch.

The series should really be broken out into separate series that each
slowly build up the prerequisites, so they are reviewable, parseable, can
be seen in action before we move to the superblock concept and each having
tests or means of asserting that they in fact function.

It feels that there are patches that indeed can be broken out sensibly like
this,
e.g. https://lore.kernel.org/all/20260520150018.2491267-5-riel@surriel.com/
(which also seems to be a fix? So maybe should be sent as one with
appropriate tags?), so I think that's really the only viable way to land
something this huge.

I see Usama commented on you breaking userspace, and you said you are not
going to do that after all (thanks)... but it worries me that you
essentially render the watermark boost broken? I don't think that's OK?

The code in general dumps large blocks of code into already existing huge
functions but, even more unforgivably, turns small, maintainable, easily
understood functions into completely disgusting horror shows.

For instance, __rmqueue_smallest() goes from ~20 lines to 604 lines. This
is completely insane and totally unacceptable.

The code is also full of software engineering 101 fails. For instance in
__rmqueue_sb_find_fallback():

	if (search_cats & SB_SEARCH_PREFERRED) {
		...
		for (full = SB_FULL; full < __NR_SB_FULLNESS; full++) {
			list_for_each_entry(sb,
					    &zone->spb_lists[cat][full], list) {
				...
				if (movable && cat == SB_TAINTED &&
				    sb->nr_free <= spb_tainted_reserve(sb))
					continue;
				...
				for (i = 0; i < MIGRATE_PCPTYPES - 1; i++) {
					...
					if (page) {
						...
					}
				}
			}
		}
	}

Note the mix of insane nesting AND guard clauses to make it even more
indecipherable.

There's no world in which I, or any other mm maintainer (I would venture to
say) want to maintain stuff like this.

A particularly concerning patch is
https://lore.kernel.org/all/20260520150018.2491267-15-riel@surriel.com/ -

The commit messages suffer from the same issue as the comments - incredibly
dense material that is far too detailed to the point of absolutely
clarifying nothing. E.g.:

"The SPB list heads (zone->spb_empty and the spb_lists[cat][full] matrix)
 are initialized only by setup_superpageblocks(), which is __init and runs
 only at boot.  Hot-add into a previously-empty zone invokes
 init_one_superpageblock() with zero-initialized list_heads, and the
 inlined list_add_tail() NULL-derefs walking ->next->prev."

Is just complete word salad. It may as well be written in Egyptian
hieroglyphs.

You're explaining spaghetti code using English language which makes it even
more indecipherable, and not getting to the root of what you are actually
doing.

Again this feels very LLM-generated. I've noticed that they tend to do this
'write out the code again but in English' stuff (and is why I do not
allow LLMs to generate code or comments for me!)

You need to make commit messages and comments as succinct as possible and
as clear as possible for _human beings_ :)

I also notice you have this huge blocks of word salad comments all over the
place but I haven't seen a single kdoc comment (EDIT: noticed at least 1 later!)

This is a far more helpful, structured, form of describing functions and
for anything put in a header you really do need to provide them.

Back to the patch. The diffstat is:

 include/linux/mmzone.h |  10 +
 mm/compaction.c        |  36 +-
 mm/internal.h          |  10 +
 mm/mm_init.c           | 146 +++++--
 mm/page_alloc.c        | 853 ++++++++++++++++++++++++++++++++---------
 mm/vmstat.c            |  66 ++--
 6 files changed, 883 insertions(+), 238 deletions(-)

This is (generally speaking, and certainly in this case) far, far too much
for a single patch.

But as per above this is the patch where you commit what are essentially
code war crimes against __rmqueue_smallest(), adding code so densely nested
that you have to lop off the end of code to fit:

+	if (!movable && !is_migrate_cma(migratetype)) {
+		for (full = SB_FULL; full < __NR_SB_FULLNESS; full++) {
+			list_for_each_entry(sb,
+				&zone->spb_lists[SB_TAINTED][full], list) {
+				if (!sb->nr_free)
+					continue;
+				for (current_order = max_t(unsigned int,
+						order, pageblock_order);
+				     current_order < NR_PAGE_ORDERS;
+				     ++current_order) {
+					area = &sb->free_area[current_order];
+					page = get_page_from_free_area(
+						area, MIGRATE_MOVABLE);
+					if (!page)
+						continue;
+					if (get_pageblock_isolate(page))
+						continue;
+					if (is_migrate_cma(
+					    get_pageblock_migratetype(page)))
+						continue;
+					page = claim_whole_block(zone, page,
+						current_order, order,
+						migratetype, MIGRATE_MOVABLE);
+					trace_mm_page_alloc_zone_locked(
+						page, order, migratetype,
+						pcp_allowed_order(order) &&
+						migratetype < MIGRATE_PCPTYPES);
+					return page;
+				}
+			}
+		}
+	}

I mean in what world are we taking code like this?

https://lore.kernel.org/all/20260520150018.2491267-22-riel@surriel.com/ has
what seem to be more human comments (good!) but then so densely nested that
they have to be cropped (bad).

using smaller, separate, functions and a structured programming approach
here is the way to go.

There's random smaller issues too like
https://lore.kernel.org/all/20260520150018.2491267-23-riel@surriel.com/
seems to assume the compiler can't figure out to remove dead code (note the
wild inconsistency too in comments)

You seem in some commits to undo or correct stuff you did in previous ones
like
https://lore.kernel.org/all/20260520150018.2491267-24-riel@surriel.com/

I mean perhaps I'm mistaken, but otherwise, can you instead rebase your
series please?

And again we see the world salad:

+ * Maximum tainted superpageblock candidates per spb_evacuate_for_order call.
+ * Collected under zone->lock, then evacuated without it. Larger than the
+ * contig-allocation candidate cap because evacuation runs from the slowpath
+ * after reclaim/compaction failed: we need a meaningful chance of freeing a
+ * non-MOV-claimable pageblock before the slowpath escalates to dropping
+ * ALLOC_NOFRAGMENT (which lets __rmqueue_claim taint clean SPBs). Sized to
+ * scan a meaningful fraction of a typical tainted-pool population.

No kdocs, some functions/complicated logic missing any comments at all,
then piles of words thrown at you like a rock.

This isn't pleasant to read, it adds no clarity, it just makes the whole
thing a mess.

The code in that one is at least vaguely ok (though still too nested).

I feel like where you've manually written code the quality substantially
improves, where the LLM has, the quality nosedives into oblivion.

For instance
https://lore.kernel.org/all/20260520150018.2491267-26-riel@surriel.com/
seems to be radically better, albeit adding far too much code. So I assume
that was more manual.

Stuff like
https://lore.kernel.org/all/20260520150018.2491267-27-riel@surriel.com/
shows more of a structural issue.

In:

+	 * Mark callers that have a cheap fallback if the page allocator returns
+	 * NULL, so __rmqueue can refuse to taint a clean SPB when an existing
+	 * tainted SPB still has free pageblocks waiting to be evacuated.
+	 *
+	 * Two shapes qualify:
+	 *
+	 *  1. Explicit fallback declaration: __GFP_NORETRY without
+	 *     __GFP_RETRY_MAYFAIL. Used by THP, slab high-order refill,
+	 *     skb_page_frag_refill on full sockets, etc.
+	 *
+	 *  2. Atomic-context shape: no __GFP_DIRECT_RECLAIM, no __GFP_NOMEMALLOC,
+	 *     no __GFP_NOFAIL. These callers (GFP_ATOMIC, GFP_NOWAIT, including
+	 *     ALLOC_HIGHATOMIC consumers) have implicit fallbacks: drop the
+	 *     packet, demote the slab order, return ENOMEM up the slowpath,
+	 *     retry from process context with GFP_KERNEL, etc. ALLOC_HIGHATOMIC
+	 *     callers also get a second crack at the dedicated MIGRATE_HIGHATOMIC
+	 *     reserve in rmqueue_buddy after __rmqueue returns NULL.
+	 *     Tainting a 1 GiB SPB to satisfy any of them is a long-lived
+	 *     fragmentation event for short-lived data.
+	 *

The comment is vastly better than most, but you seem to be tying far too
much up in assumptions about what particular workloads do.

This is indicative perhaps of a need to refactor to more reasonably
determine these.

Adding a function and state, say, that expresses these properties would
allow you to break out these concepts and have the code be self-documenting
(as well as adding suitable actual documentation in comments that would be
more succinct).

https://lore.kernel.org/all/20260520150018.2491267-29-riel@surriel.com/
also shows some real issues with how you've implemented this, e.g.:

+ * Called from each PASS_1/2/2B/2C/2D success path after a successful
+ * allocation against a tainted SPB. If the SPB is below its shrink
+ * high-water mark, queue the SPB-driven slab shrink and try to start
+ * the per-SPB defrag worker. Both have their own cooldown gates inside,
+ * so this is cheap to call on every such allocation.

Now you're putting information from one gargantuan function with labels
about 'passes' (rather reminiscent of VMA merge 'cases') into another.

This is a clear sign of a broken abstraction and the code not being
structured correctly.

You should _express_ state encoded in these 'passes' in the _actual code_,
break that code up sensibly and in a way that can be assessed for
correctness, not put a bit-rotting comment on top of a function whose
correctness is much harder to confirm.

Again this patch has similar issues with ridiculously dense indecipherable
comments, e.g.:

+	 * Last-chance defrag trigger before tainting a fresh clean SPB.
+	 * Walk the tainted-SPB list and try to wake the per-SPB defrag
+	 * worker on each. Catches SPBs that are stuck in expired-cooldown
+	 * state because no allocator activity has touched them recently
+	 * (the routine event-driven trigger from spb_update_list only
+	 * fires on bucket transitions, not on every alloc). Once the
+	 * cooldown has expired, spb_maybe_start_defrag() will requeue
+	 * work; otherwise the gate inside spb_needs_defrag() no-ops
+	 * cheaply. Bounded by nr_tainted_spbs and only runs when we are
+	 * already on the slow path of fragmenting the clean pool.

The spoondecker is montiplexed in the fradupple complex dedadderated in the
splunkyfied concratanator underfined by the transpontaculatoration
matrixifier... :)

I think this is symptomatic of the abstraction being fundamentally broken.

If you have to establish vast swathes of cognitive context like this mid
way through a huge function (again poor __rmqueue_smallest() who will never
forgive you), then you've basically failed to abstract it.

Please I beg you ADD FUNCTIONS :) structured programming is a thing,
struct's are a thing, abstraction is a thing :)

Pouring spaghetti into a single function is something you expect to see on
a 1990's PHP website, not in core mm code ;)

https://lore.kernel.org/all/20260520150018.2491267-30-riel@surriel.com/
seems to be another example of you just rethinking parts of what you
already submitted midway through the series.

Again I might be missing something here, but if you are doing this, please
just rebase your series to use $NEWIDEA from the start.

https://lore.kernel.org/all/20260520150018.2491267-35-riel@surriel.com/ is
another wall of text (newlines! Please :) but it seems like something you
can break out separately no?

It seems at least reviewable :)

https://lore.kernel.org/all/20260520150018.2491267-36-riel@surriel.com/
contains more of the same broken abstraction stuff, terrible nesting stuff,
word salad stuff but is at least mercifully smaller.

https://lore.kernel.org/all/20260520150018.2491267-37-riel@surriel.com/ is
almost emblematic of the terrible (LLM I hope?) comment issue in the
series. Overly dense hieroglyphs.

https://lore.kernel.org/all/20260520150018.2491267-38-riel@surriel.com/ is
another 'why didn't you rebase?' patch (again maybe I'm missing something
here!)

https://lore.kernel.org/all/20260520150018.2491267-39-riel@surriel.com/
seems more reasonable although newlines please :)

And finally
https://lore.kernel.org/all/20260520150018.2491267-40-riel@surriel.com/ -
while it's a do-not-merge patch in an RFC that changes tracing so maybe
unfair, but the comments are suffering the same symptoms as the rest of the
series.

~

So really, you need to start again, from scratch, and without the use of an
LLM for generating code, or at least with it kept on a (very very short)
leash.

And to be clear, I _want_ this concept of GB superpageblocks to land. It's
a really exciting concept.

Pulling compaction kicking and screaming into 2026 stands to significantly
benefit linux users and developers.

But the execution has to be _completely_ rethought.

I also worry about correctness - I simply cannot see how you can have sense
of it, given the state of the code. For something so invasive and so
critical to kernel functionality, code quality is simply not optional here.

Practical thoughts on how to rework the series:

- Properly abstract the concepts
- Properly separate out functions and data structures
- Add _human-readable_ comments that are succinct + clear as possible
- If a function becomes too nested, separate into smaller functions
- Add tests, if at all or in any way possible (and justify if you can't)
- Clarify commit messages, don't just rewrite code in English
- Use newlines for new paragraphs everywhere :)
- Refactor existing code in preparation for your changes
- Add a new file to contain your changes (+ add to page alloc MAINTAINERS
  section)
- Add kdocs for anything not static, and clearly describe static functions
- Split into separate series as much as possible, gradually building
  foundations for your changes
- Make everything less dense and more abstracted

In general, write the series with reviewers and other kernel developers in
mind - write clear explanations in comments and commit messages, have the
series slowly build to what you're implementing.

That way, we can see that correctness is maintained throughout, can review
what's there sensibly, and the series becomes upstreamable.

IOW I say we take off and nuke the entire site from orbit. It's the only
way to be sure :)

~

Another issue here is maintainer time - even this _extremely_ light-touch
review has taken me a few hours (of my weekend :). To review it in detail
would take probably DAYS of dedicated work.

In general, I'm concerned that we're going to get caught in a cycle of LLM
code sent - reviewers spending hours reviewing - LLM generates new revision
- etc.

This simply isn't scalable, maintainers cannot do this in the face of the
sheer quantity and complexity of code that LLMs can generate.

We are simply going to have to NAK. And that helps nobody.

Luckily for me, this isn't in a sub-(sub-)system I maintain, so I am not
obliged, but I do have empathy for my fellow maintainers, and am VERY
concerned about this trend.

There has recently been an absolute wave of LLM code, some acked as such
(and I think you for doing so here!) but others unacknowledged entirely,
and the workload, which was already too much, has risen significantly (Jon
has noted the rise in commit count for instance in LWN).

Treating maintainer time as without value was already an issue, but I fear
that we'll see a significant increase in maintainer stress and sadly,
burnout.

As such, I feel that we will have to implement measures at some point to
deprioritise/quickly dismiss such series, unfortunately.

But series from smart and capable engineers such as yourselfk who are well
respected in the communityk will likely not bek and thus will suffer from
this issue indefinitely.

So on that basis, I ask respectfully that you account for this when using
this tooling.

(Also, I appreciate that this is an RFC, but your recent non-RFC GUP series
https://lore.kernel.org/all/20260616190300.1509639-1-riel@surriel.com/
(revision v2 at
https://lore.kernel.org/all/20260625015053.2445008-1-riel@surriel.com/) has
all the same hallmarks as this one, so I feel this point needs to be
underlined).

~

Speaking more generally across the industry, I've been reading about
companies generating code blindly and running into terrible problems with
software that ends up becoming totally unmanageable.

I won't tolerate this happening happening in mm, and strongly object to the
concept (held by some AI proponents) that code is simply an unimportant
byproduct of AI.

The kernel's code (and especially mm's) is _critically_ important, quite
literally, as it forms the basis of the world's critical technical
infrastructure.

I am not opposed to the use of LLMs, but they _must_ serve as tools to
_assist_ experts at their job, not a means by which code bases are
degraded.

~

OK with all that said - to be absolutely clear - I respect you a great
deal, and I KNOW you're (much, much) better than this.

And, to repeat, this idea is very exciting and I _want_ to see this land.

But I feel you've rather let the LLM run amok and it's selling you (very,
very) short, given just how smart and capable you are.

Let's try again :)

Thanks, Lorenzo

^ permalink raw reply

page:              | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox