public inbox for linux-rt-users@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/12] sched_debug: Unify parsing methods for task_info
@ 2025-10-17  2:24 Clark Williams
  2025-10-17  2:24 ` [PATCH 02/12] sched_debug: Fix runqueue task parsing logic and state filtering Clark Williams
                   ` (11 more replies)
  0 siblings, 12 replies; 25+ messages in thread
From: Clark Williams @ 2025-10-17  2:24 UTC (permalink / raw)
  To: linux-rt-users
  Cc: Clark Williams, Clark Williams, Derek Barbosa, wander,
	marco.chiappero, chris.friesen, luochunsheng

From: Clark Williams <williams@redhat.com>

In the sched_debug backend code, there are two logical paths for parsing
the sched_debug file's task_info "running tasks". These are currently
divided into "OLD" and "NEW" parsing functions, each with their own
logic.

Unify these branching code paths by creating a line-based "word" parser
that stores the word-offset of the necessary fields in a struct.
Accommodate "legacy" behavior where needed using an enumerated type.

parse_task_lines() now can parse multiple formats of output from the
debugfs file which on modern systems is located at
/sys/kernel/debug/sched/debug, and on 3.X "legacy" systems, is located
at /proc/sched_debug.

The detect_task_format() function records field offsets which are used
by parse_task_lines() to pull fields out of the "runnable tasks:" section.

Signed-off-by: Clark Williams <clrkwllms@kernel.org>
Signed-off-by: Derek Barbosa <debarbos@redhat.com>
Signed-off-by: Clark Williams <williams@redhat.com>
---
 src/sched_debug.c | 405 +++++++++++++++++++++-------------------------
 src/sched_debug.h |  65 +++++++-
 2 files changed, 244 insertions(+), 226 deletions(-)

diff --git a/src/sched_debug.c b/src/sched_debug.c
index fa2f74bf36ed..180932ca7aa3 100644
--- a/src/sched_debug.c
+++ b/src/sched_debug.c
@@ -24,6 +24,9 @@
  */
 static int config_task_format;
 
+static struct task_format_offsets
+    config_task_format_offsets  = { 0, 0, 0, 0 };
+
 /*
  * Read the contents of sched_debug into the input buffer.
  */
@@ -88,8 +91,12 @@ static char *get_next_cpu_info_start(char *start)
 {
 	const char *next_cpu = "cpu#";
 
-	/* Skip the current CPU definition. */
-	start += 10;
+        /*
+         * Skip the current CPU definition.
+         * We want to move our "cursor" past the current "cpu#" definition.
+         * This number is arbitrary. It is purely to assist strstr().
+         */
+        start += 10;
 
 	return strstr(start, next_cpu);
 }
@@ -143,9 +150,13 @@ static inline char *skipchars(char *str)
 	return str;
 }
 
+/*
+ * Note, for our purposes newline is *not* a space
+ * and we want to stop when we hit it
+ */
 static inline char *skipspaces(char *str)
 {
-	while (*str && isspace(*str))
+	while (*str && isspace(*str) && (*str != '\n'))
 		str++;
 	return str;
 }
@@ -156,6 +167,20 @@ static inline char *nextline(char *str)
 	return ptr ? ptr+1 : NULL;
 }
 
+/*
+ * skip a specified number of words on a task line
+ */
+
+static inline char *skipwords(char *ptr, int nwords)
+{
+	int i;
+	for (i=0; i < nwords; i++) {
+		ptr = skipspaces(ptr);
+		ptr = skipchars(ptr);
+	}
+	return ptr;
+}
+
 /*
  * Read sched_debug and figure out if it's old or new format
  * done once so if we fail just exit the program.
@@ -173,6 +198,7 @@ static int detect_task_format(void)
 	char *ptr;
 	int status;
 	int fd;
+	int i, count=0;
 
 	bufsiz = bufincrement = BUFFER_PAGES * page_size;
 
@@ -204,122 +230,63 @@ static int detect_task_format(void)
 
 	ptr = strstr(buffer, TASK_MARKER);
 	if (ptr == NULL) {
-		fprintf(stderr, "unable to find 'runnable tasks' in buffer, invalid input\n");
+		die("unable to find 'runnable tasks' in buffer, invalid input\n");
 		exit(-1);
 	}
 
-	ptr += strlen(TASK_MARKER) + 1;
-	ptr = skipspaces(ptr);
+	ptr = nextline(ptr);
+	i = 0;
 
-	if (strncmp(ptr, "task", 4) == 0) {
-		retval = OLD_TASK_FORMAT;
-		log_msg("detected old task format\n");
-	} else if (strncmp(ptr, "S", 1) == 0) {
+	/*
+	 * Determine the TASK_FORMAT from the first "word" in the header
+	 * line.
+	 */
+	ptr = skipspaces(ptr);
+	if (strncmp(ptr, "S", strlen("S")) == 0) {
+		log_msg("detect_task_format: NEW_TASK_FORMAT detected\n");
 		retval = NEW_TASK_FORMAT;
-		log_msg("detected new task format\n");
 	}
-
-	free(buffer);
-	return retval;
-}
-
-/*
- * Parse the new sched_debug format.
- *
- * Example:
- * ' S           task   PID         tree-key  switches  prio     wait-time             sum-exec        sum-sleep'
- * '-----------------------------------------------------------------------------------------------------------'
- * ' I         rcu_gp     3        13.973264         2   100         0.000000         0.004469         0.000000 0 0 /
- */
-static int parse_new_task_format(char *buffer, struct task_info *task_info, int nr_entries)
-{
-	char *R, *X, *start = buffer;
-	struct task_info *task;
-	int tasks = 0;
-	int comm_size;
-	char *end;
+	else {
+		log_msg("detect_task_format: OLD_TASK_FORMAT detected\n");
+		retval = OLD_TASK_FORMAT;
+	}
 
 	/*
-	 * If we have less than two tasks on the CPU there is no
-	 * possibility of a stall.
+	 * Look for our header keywords and store their offset
+	 * we'll use the offsets when we actually parse the task
+	 * line data
 	 */
-	if (nr_entries < 2)
-		return 0;
-
-	while (tasks < nr_entries) {
-		task = &task_info[tasks];
-
-		/*
-		 * Runnable tasks.
-		 */
-		R = strstr(start, "\n R");
-
-		/*
-		 * Dying tasks.
-		 */
-		X = strstr(start, "\n X");
-
-		/*
-		 * Get the first one, the only one, or break.
-		 */
-		if (X && R) {
-			start = R < X ? R : X;
-		} else if (X || R) {
-			start = R ? R : X;
-		} else {
-			break;
+	while (*ptr != '\n') {
+		ptr = skipspaces(ptr);
+		if (strncmp(ptr, "task", strlen("task")) == 0) {
+			config_task_format_offsets.task = i;
+			count++;
+			log_msg("detect_task_format: found 'task' at word %d\n", i);
 		}
-
-		/* Skip '\n R' || '\n X'. */
-		start = &start[3];
-
-		/* Skip the spaces. */
-		start = skipspaces(start);
-
-		/* Find the end of the string. */
-		end = skipchars(start);
-
-		comm_size = end - start;
-
-		if (comm_size >= COMM_SIZE) {
-			warn("comm_size is too large: %d\n", comm_size);
-			comm_size = COMM_SIZE - 1;
+		else if (strncmp(ptr, "PID", strlen("PID")) == 0) {
+			config_task_format_offsets.pid = i;
+			count++;
+			log_msg("detect_task_format: found 'PID' at word %d\n", i);
 		}
-
-		strncpy(task->comm, start, comm_size);
-
-		task->comm[comm_size] = '\0';
-
-		/* Go to the end of the task comm. */
-		start=end;
-
-		task->pid = strtol(start, &end, 10);
-
-		/* Get the id of the thread group leader. */
-		task->tgid = get_tgid(task->pid);
-
-		/* Go to the end of the pid. */
-		start=end;
-
-		/* Skip the tree-key. */
-		start = skipspaces(start);
-		start = skipchars(start);
-
-		task->ctxsw = strtol(start, &end, 10);
-
-		start = end;
-
-		task->prio = strtol(start, &end, 10);
-
-		task->since = time(NULL);
-
-		/* Go to the end and try to find the next occurrence. */
-		start = end;
-
-		tasks++;
+		else if (strncmp(ptr, "switches", strlen("switches")) == 0) {
+			config_task_format_offsets.switches = i;
+			count++;
+			log_msg("detect_task_format: found 'switches' at word %d\n", i);
+		}
+		else if (strncmp(ptr, "prio", strlen("prio")) == 0) {
+			config_task_format_offsets.prio = i;
+			count++;
+			log_msg("detect_task_format: found 'prio' at word %d\n", i);
+		}
+		ptr = skipchars(ptr);
+		i++;
 	}
 
-	return tasks;
+	if (count != 4)
+		die("detect_task_format: did not detect all task line fields we need\n");
+
+	free(buffer);
+	return retval;
 }
 
 /*
@@ -387,104 +354,80 @@ static int is_runnable(int pid)
 	return runnable;
 }
 
-static int count_task_lines(char *buffer)
-{
-	int lines = 0;
-	char *ptr;
-	int len;
-
-	len = strlen(buffer);
-
-	/* Find the runnable tasks: header. */
-	ptr = strstr(buffer, TASK_MARKER);
-	if (ptr == NULL)
-		return 0;
-
-	/* Skip to the end of the dashed line separator. */
-	ptr = strstr(ptr, "-\n");
-	if (ptr == NULL)
-		return 0;
-
-	ptr += 2;
-	while(*ptr && ptr < (buffer+len)) {
-		lines++;
-		ptr = strchr(ptr, '\n');
-		if (ptr == NULL)
-			break;
-		ptr++;
-	}
-	return lines;
-}
-
-/*
- * Parse the old sched debug format:
- *
- * Example:
- * '            task   PID         tree-key  switches  prio     wait-time             sum-exec        sum-sleep
- * ' ----------------------------------------------------------------------------------------------------------
- * '     watchdog/35   296       -11.731402      4081     0         0.000000        44.052473         0.000000 /
- */
-static int parse_old_task_format(char *buffer, struct task_info *task_info, int nr_entries)
+static int parse_task_lines(char *buffer, struct task_info *task_info, int nr_entries)
 {
 	int pid, ctxsw, prio, comm_size;
-	char *start, *end, *buffer_end;
+	char *ptr, *line, *end;
 	struct task_info *task;
 	char comm[COMM_SIZE];
-	int waiting_tasks = 0;
-
-	start = buffer;
-	start = strstr(start, TASK_MARKER);
-	start = strstr(start, "-\n");
-	start++;
+	int tasks = 0;
 
-	buffer_end = buffer + strlen(buffer);
+	if ((ptr = strstr(buffer, TASK_MARKER)) == NULL)
+		die ("no runnable task section found!\n");
 
 	/*
-	 * We can't short-circuit using nr_entries, we have to scan the
-	 * entire list of processes that is on this CPU.
+	 * If we have less than two tasks on the CPU there is no
+	 * possibility of a stall.
 	 */
-	while (*start && start < buffer_end) {
-		task = &task_info[waiting_tasks];
+	if (nr_entries < 2)
+		return 0;
+	line = ptr;
+
+	/* skip header and divider */
+	line = nextline(line);
+	line = nextline(line);
+	
+	/* now loop over the task info */
+	while (tasks < nr_entries) {
+		task = &task_info[tasks];
 
-		/* Only care about tasks that are not R (running on a CPU). */
-		if (start[0] == 'R') {
+		/*
+		 * In 3.X kernels, only the singular RUNNING task receives
+		 * a "running state" label. Therefore, only care about
+		 * tasks that are not R (running on a CPU).
+		 */
+		if ((config_task_format == OLD_TASK_FORMAT) &&
+			(*ptr == 'R')) {
 			/* Go to the end of the line and ignore this task. */
-			start = strchr(start, '\n');
-			start++;
+			ptr = strchr(ptr, '\n');
+			ptr++;
 			continue;
 		}
 
-		/* Pick up the comm field. */
-		start = skipspaces(start);
-		end = skipchars(start);
-		comm_size = end - start;
+		/* get the task field */
+		ptr = skipwords(line, config_task_format_offsets.task);
+
+		/* Find the end of the task field */
+		end = skipchars(ptr);
+		comm_size = end - ptr;
+
+		/* make sure we don't overflow the comm array */
 		if (comm_size >= COMM_SIZE) {
 			warn("comm_size is too large: %d\n", comm_size);
 			comm_size = COMM_SIZE - 1;
 		}
-		strncpy(comm, start, comm_size);
-		comm[comm_size] = 0;
-
-		/* Go to the end of the task comm. */
-		start=end;
-
-		/* Now pick up the pid. */
-		pid = strtol(start, &end, 10);
-
-		/* Go to the end of the pid. */
-		start=end;
-
-		/* Skip the tree-key. */
-		start = skipspaces(start);
-		start = skipchars(start);
-
-		/* Pick up the context switch count. */
-		ctxsw = strtol(start, &end, 10);
-		start = end;
-
-		/* Get the priority. */
-		prio = strtol(start, &end, 10);
-		if (is_runnable(pid)) {
+		strncpy(comm, ptr, comm_size);
+		comm[comm_size] = '\0';
+		ptr = end;
+
+		/* get the PID field */
+		ptr = skipwords(line, config_task_format_offsets.pid);
+		pid = strtol(ptr, NULL, 10);
+
+		/* get the context switches field */
+		ptr = skipwords(line, config_task_format_offsets.switches);
+		ctxsw = strtol(ptr, NULL, 10);
+
+		/* get the prio field */
+		ptr = skipwords(line, config_task_format_offsets.prio);
+		prio = strtol(ptr, NULL, 10);
+
+                /*
+                 * In older formats, we must check to
+                 * see if the process is runnable prior to storing header
+                 * fields and incrementing task processing
+                 */
+                if ((config_task_format == NEW_TASK_FORMAT) || (is_runnable(pid))) {
 			strncpy(task->comm, comm, comm_size);
 			task->comm[comm_size] = 0;
 			task->pid = pid;
@@ -492,18 +435,44 @@ static int parse_old_task_format(char *buffer, struct task_info *task_info, int
 			task->ctxsw = ctxsw;
 			task->prio = prio;
 			task->since = time(NULL);
-			waiting_tasks++;
+			/* increment the count of tasks processed */
+			tasks++;
+		} else {
+			continue;
 		}
 
-		if ((start = nextline(start)) == NULL)
-			break;
+	}
+	return tasks;
+}
 
-		if (waiting_tasks >= nr_entries) {
+
+static int count_task_lines(char *buffer)
+{
+	int lines = 0;
+	char *ptr;
+	int len;
+
+	len = strlen(buffer);
+
+	/* Find the runnable tasks: header. */
+	ptr = strstr(buffer, TASK_MARKER);
+	if (ptr == NULL)
+		return 0;
+
+	/* Skip to the end of the dashed line separator. */
+	ptr = strstr(ptr, "-\n");
+	if (ptr == NULL)
+		return 0;
+
+	ptr += 2;
+	while(*ptr && ptr < (buffer+len)) {
+		lines++;
+		ptr = strchr(ptr, '\n');
+		if (ptr == NULL)
 			break;
-		}
+		ptr++;
 	}
-
-	return waiting_tasks;
+	return lines;
 }
 
 static int fill_waiting_task(char *buffer, struct cpu_info *cpu_info)
@@ -515,36 +484,23 @@ static int fill_waiting_task(char *buffer, struct cpu_info *cpu_info)
 		warn("NULL cpu_info pointer!\n");
 		return 0;
 	}
-	nr_entries = cpu_info->nr_running;
-
-	switch (config_task_format) {
-	case NEW_TASK_FORMAT:
-		cpu_info->starving = malloc(sizeof(struct task_info) * nr_entries);
-		if (cpu_info->starving == NULL) {
-			warn("failed to malloc %d task_info structs", nr_entries);
-			return 0;
-		}
-		nr_waiting = parse_new_task_format(buffer, cpu_info->starving, nr_entries);
-		break;
-	case OLD_TASK_FORMAT:
-		/*
-		 * The old task format does not output a correct value for
-		 * nr_running (the initializer for nr_entries) so count the
-		 * task lines for this CPU data and use that instead.
-		 */
+
+	if (config_task_format == OLD_TASK_FORMAT)
 		nr_entries = count_task_lines(buffer);
-		if (nr_entries <= 0)
-			return 0;
-		cpu_info->starving = malloc(sizeof(struct task_info) * nr_entries);
-		if (cpu_info->starving == NULL) {
-			warn("failed to malloc %d task_info structs", nr_entries);
-			return 0;
-		}
-		nr_waiting = parse_old_task_format(buffer, cpu_info->starving, nr_entries);
-		break;
-	default:
-		die("invalid value for config_task_format: %d\n", config_task_format);
+	else
+		nr_entries = cpu_info->nr_running;
+
+	if (nr_entries <= 0)
+		return 0;
+
+	cpu_info->starving = malloc(sizeof(struct task_info) * nr_entries);
+	if (cpu_info->starving == NULL) {
+		warn("failed to malloc %d task_info structs", nr_entries);
+		return 0;
 	}
+
+	nr_waiting = parse_task_lines(buffer, cpu_info->starving, nr_entries);
+
 	return nr_waiting;
 }
 
@@ -574,7 +530,7 @@ static int sched_debug_parse(struct cpu_info *cpu_info, char *buffer, size_t buf
 	}
 
 	/*
-	 * The NEW_TASK_FORMAT produces useful output values for nr_running and
+	 * NEW_TASK_FORMAT produces useful output values for nr_running and
 	 * rt_nr_running, so in this case use them. For the old format just leave
 	 * them initialized to zero.
 	 */
@@ -613,7 +569,8 @@ static int sched_debug_has_starving_task(struct cpu_info *cpu)
 static int sched_debug_init(void)
 {
 	find_sched_debug_path();
-	config_task_format = detect_task_format();
+	if ((config_task_format = detect_task_format()) == TASK_FORMAT_UNKNOWN)
+		die("Can't handle task format!\n");
 	return 0;
 }
 
diff --git a/src/sched_debug.h b/src/sched_debug.h
index 21f9da27866e..4b12c39bf7fe 100644
--- a/src/sched_debug.h
+++ b/src/sched_debug.h
@@ -1,6 +1,67 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-#define OLD_TASK_FORMAT  1
-#define NEW_TASK_FORMAT  2
 #define TASK_MARKER	"runnable tasks:"
+#define TASK_DIVIDER	"-\n"
+
+/*
+ * Over time, the various 'runnable task' output in SCHED_DEBUG has
+ * changed significantly.
+ *
+ * Depending on the version of the running kernel, the task formats can
+ * differ greatly.
+ *
+ * For example, in 3.X kernels, the sched_debug running tasks format denotes the current
+ * running task on the current CPU with a singular state label, 'R'. Other tasks do not
+ * receive a state label.
+ *
+ * example:
+ * '          task   PID         tree-key  switches  prio     wait-time             sum-exec        sum-sleep'
+ * ' ----------------------------------------------------------------------------------------------------------'
+ * '      watchdog/5    33        -8.984472       151     0         0.000000         0.535614         0.000000 0 /'
+ * ' R          less  9542      2382.087644        56   120         0.000000        16.444493         0.000000 0 /'
+ *
+ * In 4.18+ kernels, the sched_debug running tasks format included an additional 'S'
+ * state field to denote the state of the running tasks on said CPU.
+ *
+ * example:
+ * ' S           task   PID         tree-key  switches  prio     wait-time             sum-exec        sum-sleep'
+ * '-----------------------------------------------------------------------------------------------------------'
+ * ' I         rcu_gp     3        13.973264         2   100         0.000000         0.004469         0.000000 0 0 /'
+ *
+ * In 6.12+ kernels, commit 2cab4bd024d2 ("sched/debug: Fix the runnable
+ * tasks output") changed the sched_debug running tasks format to include
+ * four new EEVDF fields.
+ *
+ * Example:
+ *  'S            task   PID       vruntime   eligible    deadline             slice          sum-exec      switches  prio         wait-time        sum-sleep       sum-block  node   group-id  group-path'
+ *  '-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'
+ *  ' I kworker/R-rcu_g     4        -1.048576   E          -1.040501           0.700000         0.000000         2     100         0.000000         0.000000         0.000000   0      0        /'
+ *
+ * As there are considerable differences in the location of the fields
+ * needed to boost task priority, handle the logical code differences with
+ * an enumerated type.
+ */
+enum task_format {
+	TASK_FORMAT_UNKNOWN =0,
+	OLD_TASK_FORMAT,	// 3.10 kernel
+	NEW_TASK_FORMAT,	// 4.18+ kernel
+	TASK_FORMAT_LIMIT
+};
+
+
+/*
+ * Set of offsets in a task format line, as discovered by
+ * detect_task_format().
+ *
+ * Note: These are *NOT* character offsets, these are "word" offsets,
+ * which requires consumers of this struct to parse through the
+ * individual lines.
+ */
+struct task_format_offsets {
+	int task;
+	int pid;
+	int switches;
+	int prio;
+	int wait_time;
+};
 
 extern struct stalld_backend sched_debug_backend;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2025-10-21 17:46 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-17  2:24 [PATCH 01/12] sched_debug: Unify parsing methods for task_info Clark Williams
2025-10-17  2:24 ` [PATCH 02/12] sched_debug: Fix runqueue task parsing logic and state filtering Clark Williams
2025-10-21 15:58   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 03/12] sched_debug: Fix double-free crash in fill_waiting_task() Clark Williams
2025-10-21 16:01   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 04/12] stalld.c: remove noisy idle report and added report to should_skip_idle_cpus() Clark Williams
2025-10-21 16:03   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 05/12] stalld.c: initialize cpu_info->idle_time to be -1 Clark Williams
2025-10-21 16:15   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 06/12] stalld.c: get rid of misleading print about DL-Server Clark Williams
2025-10-21 16:16   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 07/12] stalld.c: Add starvation logging in single-threaded log-only mode Clark Williams
2025-10-21 16:27   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 08/12] stalld: Add -N/--no_idle_detect flag to disable idle detection Clark Williams
2025-10-21 16:33   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 09/12] stalld: Add defensive checks in print_boosted_info Clark Williams
2025-10-21 17:36   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 10/12] Makefile: Add support for legacy kernels Clark Williams
2025-10-17 12:50   ` Derek Barbosa
2025-10-21 17:43   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 11/12] scripts: fix run-local if bashism Clark Williams
2025-10-21 17:45   ` Wander Lairson Costa
2025-10-17  2:24 ` [PATCH 12/12] Fix segfault in adaptive/aggressive modes Clark Williams
2025-10-21 17:45   ` Wander Lairson Costa
2025-10-21 15:54 ` [PATCH 01/12] sched_debug: Unify parsing methods for task_info Wander Lairson Costa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox