Igt-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH i-g-t v6] runner/executor: Abort if dmesg is flooded
@ 2025-08-28 10:31 Kamil Konieczny
  2025-08-28 21:44 ` ✓ i915.CI.BAT: success for runner/executor: Abort if dmesg is flooded (rev6) Patchwork
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Kamil Konieczny @ 2025-08-28 10:31 UTC (permalink / raw)
  To: igt-dev
  Cc: Kamil Konieczny, Ewelina Musial, Karol Krol, Krzysztof Karas,
	Petri Latvala, Peter Senna Tschudin, Ryszard Knop, Vitaly Prosyak,
	Zbigniew Kempczyński

Current disk limit triggers once when it is exceeded during test
monitoring. After that happens executor no longer checks if
kernel is still printing plenty of messages.

Create a way to abort test in such scenarios with the help of
measuring kernel kmsg activity before first test is executed
and then also check it after exceeding disk limit.

Also print info about it after each test ends.

v2: fix error when kmsg open fails, fix reading proc (Kamil)
  changed calculation of max bps (Peter)
v3: abort only when disk limit was actually exceeded,
  print info instead of aborting after every test (Kamil)
v4: added abort at main monitoring loop, also cleaning up
  printing messages (Kamil)
v5: fix typo, use less vars in a loop  (Krzysztof)
  reused existing dmesg reading function (Kamil)
v6: check error in check_dmesg_ratio() (Vitaly)

Cc: Ewelina Musial <ewelina.musial@intel.com>
Cc: Karol Krol <karol.krol@intel.com>
Cc: Krzysztof Karas <krzysztof.karas@intel.com>
Cc: Petri Latvala <adrinael@adrinael.net>
Cc: Peter Senna Tschudin <peter.senna@linux.intel.com>
Cc: Ryszard Knop <ryszard.knop@intel.com>
Cc: Vitaly Prosyak <vitaly.prosyak@amd.com>
Cc: "Zbigniew Kempczyński" <zbigniew.kempczynski@intel.com>
Signed-off-by: Kamil Konieczny <kamil.konieczny@linux.intel.com>
Reviewed-by: Vitaly Prosyak <vitaly.prosyak@amd.com>
---
 runner/executor.c | 153 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 138 insertions(+), 15 deletions(-)

diff --git a/runner/executor.c b/runner/executor.c
index 847abe481..87fa69b1c 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -637,6 +637,11 @@ const char *get_out_filename(int fid)
 	return "output-filename-index-error";
 }
 
+static int open_kmsg_rdonly(void)
+{
+	return open("/dev/kmsg", O_RDONLY | O_CLOEXEC | O_NONBLOCK);
+}
+
 /* Returns the number of bytes written to disk, or a negative number on error */
 static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
 {
@@ -646,6 +651,8 @@ static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
 	 * /dev/kmsg doesn't support seeking to -1 from SEEK_END
 	 * so we need to use a second fd to read a message to
 	 *  match against, or stop when we reach EAGAIN.
+	 * If outfd==0 it will not write anything but still count
+	 * how many bytes was readed from dmesg.
 	 */
 
 	int comparefd;
@@ -711,7 +718,9 @@ static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
 			return written;
 		}
 
-		write(outfd, buf, r);
+		if (outfd)
+			write(outfd, buf, r);
+
 		written += r;
 
 		if (comparefd < 0 && sscanf(buf, "%u,%llu,%llu,%c;",
@@ -948,6 +957,86 @@ static size_t calc_last_dmesg_chunk(size_t limit, size_t disk_usage)
 	return dt != 0 ? dt : -1;
 }
 
+/*
+ * Tries to read from dmesg for 0.1 seconds
+ *
+ * Returns:
+ *  =>0.0 - Success, measured kmsg activity in bytes/second
+ *  -1.0 - Failure
+ */
+static double measure_dmesg_bytes_per_sec(int wrfd)
+{
+	struct timespec time_beg, time_now, nsec_sleep;
+	unsigned long long dmsg_read, rnow;
+	double time;
+	int kmsgfd;
+
+	if ((kmsgfd = open_kmsg_rdonly()) < 0) {
+		errf("Warning: Cannot open /dev/kmsg\n");
+
+		return -1.0;
+	}
+
+	lseek(kmsgfd, 0, SEEK_END);
+	nsec_sleep.tv_sec = 0;
+	nsec_sleep.tv_nsec = 10ULL * 1000ULL * 1000ULL; /* 10^7 nanoseconds = 10^-2 sec */
+	runner_gettime(&time_beg);
+	time = 0.0;
+	dmsg_read = 0.0;
+	while (1) {
+		rnow = dump_dmesg(kmsgfd, wrfd, 64 * 4096); /* 64KB max */
+		if (rnow > 0)
+			dmsg_read += rnow;
+
+		runner_gettime(&time_now);
+		time = igt_time_elapsed(&time_beg, &time_now);
+		if (time <= 0.0) {
+			errf("Warning: Time underflow\n");
+			return -1.0;
+		}
+
+		if (time >= 0.1)
+			break;
+
+		if (rnow == 0)
+			nanosleep(&nsec_sleep, NULL);
+	}
+
+	runner_gettime(&time_now);
+	time = igt_time_elapsed(&time_beg, &time_now);
+	if (time <= 0.0) {
+		errf("Warning: Time underflow\n");
+
+		return -1.0;
+	}
+
+	return (double)dmsg_read / time;
+}
+
+/* Returns: true if ratio lower or equal than maxratio */
+static bool check_dmesg_ratio(double maxratio, int log_level, bool sync, int wrfd)
+{
+	double new_dmesg_bps = measure_dmesg_bytes_per_sec(wrfd);
+
+	if (new_dmesg_bps < 0.0) {
+		if (log_level >= LOG_LEVEL_NORMAL) {
+			outf("Dmesg ratio: unavailable (no /dev/kmsg)\n");
+			if (sync)
+				fflush(stdout);
+		}
+
+	return true; /* don’t gate on missing measurement */
+	}
+
+	if (log_level >= LOG_LEVEL_NORMAL) {
+		outf("Dmesg ratio %0.3fKB/s\n", new_dmesg_bps / 1024);
+		if (sync)
+			fflush(stdout);
+	}
+
+	return new_dmesg_bps <= maxratio;
+}
+
 /*
  * Returns:
  *  =0 - Success
@@ -960,6 +1049,7 @@ static int monitor_output(pid_t child,
 			  int *outputs,
 			  double *time_spent,
 			  struct settings *settings,
+			  double max_dmesg_ratio,
 			  char **abortreason,
 			  bool *abort_already_written)
 {
@@ -1463,29 +1553,31 @@ static int monitor_output(pid_t child,
 				 * exceeded the disk usage limit.
 				 */
 				if (killed && disk_usage_limit_exceeded(settings, disk_usage)) {
+					char killmsg[256];
+
 					timeoutresult = false;
+					snprintf(killmsg, sizeof(killmsg),
+						 "runner: This test was killed due to exceeding disk usage limit. "
+						 "(Used %zd bytes, limit %zd)\n",
+						 disk_usage,
+						 settings->disk_usage_limit);
 
 					if (socket_comms_used) {
 						struct runnerpacket *message;
-						char killmsg[256];
 
-						snprintf(killmsg, sizeof(killmsg),
-							 "runner: This test was killed due to exceeding disk usage limit. "
-							 "(Used %zd bytes, limit %zd)\n",
-							 disk_usage,
-							 settings->disk_usage_limit);
 						message = runnerpacket_log(STDOUT_FILENO, killmsg);
 						write_packet_with_canary(outputs[_F_SOCKET], message, settings->sync);
 						free(message);
 					} else {
-						dprintf(outputs[_F_OUT],
-							"\nrunner: This test was killed due to exceeding disk usage limit. "
-							"(Used %zd bytes, limit %zd)\n",
-							disk_usage,
-							settings->disk_usage_limit);
+						dprintf(outputs[_F_OUT], "%s", killmsg);
 						if (settings->sync)
 							fdatasync(outputs[_F_OUT]);
 					}
+
+					if (!check_dmesg_ratio(max_dmesg_ratio, settings->log_level, settings->sync, outputs[_F_DMESG])) {
+						asprintf(abortreason, "Dmesg ratio exceeded during test run.");
+						aborting = true;
+					}
 				}
 
 				if (socket_comms_used) {
@@ -1587,7 +1679,7 @@ static int monitor_output(pid_t child,
 	dmesgwritten = dump_dmesg(kmsgfd, outputs[_F_DMESG], dmsg_chunk_size);
 	if (settings->sync)
 		fdatasync(outputs[_F_DMESG]);
-	if (dmesgwritten > 0) {
+	if (dmesgwritten > 0 && !aborting) {
 		disk_usage += dmesgwritten;
 		if (settings->disk_usage_limit && disk_usage > settings->disk_usage_limit) {
 			char disk[1024];
@@ -1599,6 +1691,11 @@ static int monitor_output(pid_t child,
 			} else if (killed) {
 				errf("%s", disk);
 			}
+
+			if (!check_dmesg_ratio(max_dmesg_ratio, settings->log_level, settings->sync, outputs[_F_DMESG])) {
+				asprintf(abortreason, "Dmesg ratio exceeded after test ends.");
+				aborting = true;
+			}
 		}
 	}
 
@@ -1766,6 +1863,7 @@ static int execute_next_entry(struct execute_state *state,
 			      size_t total,
 			      double *time_spent,
 			      struct settings *settings,
+			      double max_dmesg_ratio,
 			      struct job_list_entry *entry,
 			      int testdirfd, int resdirfd,
 			      int sigfd, sigset_t *sigmask,
@@ -1814,7 +1912,7 @@ static int execute_next_entry(struct execute_state *state,
 		goto out_pipe;
 	}
 
-	if ((kmsgfd = open("/dev/kmsg", O_RDONLY | O_CLOEXEC | O_NONBLOCK)) < 0) {
+	if ((kmsgfd = open_kmsg_rdonly()) < 0) {
 		errf("Warning: Cannot open /dev/kmsg\n");
 	} else {
 		/* TODO: Checking of abort conditions in pre-execute dmesg */
@@ -1885,7 +1983,7 @@ static int execute_next_entry(struct execute_state *state,
 
 	result = monitor_output(child, outfd, errfd, socketfd,
 				kmsgfd, sigfd,
-				outputs, time_spent, settings,
+				outputs, time_spent, settings, max_dmesg_ratio,
 				abortreason, abort_already_written);
 
 out_kmsgfd:
@@ -2418,6 +2516,8 @@ bool execute(struct execute_state *state,
 	     struct settings *settings,
 	     struct job_list *job_list)
 {
+	static double dmesg_bps = -1.0;
+	static double max_dmesg_bps = -1.0;
 	int resdirfd, testdirfd, unamefd, timefd, sigfd;
 	struct environment_variable *env_var;
 	struct utsname unamebuf;
@@ -2560,6 +2660,26 @@ bool execute(struct execute_state *state,
 		}
 	}
 
+	if (max_dmesg_bps < 0.0) {
+		double ncpu_bps = 4 * 1024 * max_t(size_t, sysconf(_SC_NPROCESSORS_ONLN), 4);
+		double set_bps = 0.0;
+
+		dmesg_bps = measure_dmesg_bytes_per_sec(0);
+
+		if (settings->disk_usage_limit > 0 && settings->per_test_timeout > 0)
+			set_bps = (double)settings->disk_usage_limit / (double)settings->per_test_timeout;
+
+		outf("Dmesg KB/s ratio settings:%0.3f ncpu:%0.3f current:%0.3f\n",
+		     set_bps / 1024, ncpu_bps / 1024, dmesg_bps / 1024);
+
+		if (set_bps > 0.0)
+			max_dmesg_bps = set_bps;
+		else
+			max_dmesg_bps = dmesg_bps > ncpu_bps ? dmesg_bps : ncpu_bps;
+
+		outf("Using max level dmesg ratio %0.3fKB/s\n", max_dmesg_bps / 1024);
+	}
+
 	for (; state->next < job_list->size;
 	     state->next++) {
 		char *reason = NULL;
@@ -2596,6 +2716,7 @@ bool execute(struct execute_state *state,
 						    job_list->size,
 						    &time_spent,
 						    settings,
+						    max_dmesg_bps,
 						    &job_list->entries[state->next],
 						    testdirfd, resdirfd,
 						    sigfd, &sigmask,
@@ -2651,6 +2772,8 @@ bool execute(struct execute_state *state,
 			break;
 		}
 
+		check_dmesg_ratio(max_dmesg_bps, settings->log_level, settings->sync, 0);
+
 		if (result > 0) {
 			double time_left = state->time_left;
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2025-09-18 13:41 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-28 10:31 [PATCH i-g-t v6] runner/executor: Abort if dmesg is flooded Kamil Konieczny
2025-08-28 21:44 ` ✓ i915.CI.BAT: success for runner/executor: Abort if dmesg is flooded (rev6) Patchwork
2025-08-28 21:49 ` ✓ Xe.CI.BAT: " Patchwork
2025-08-29  5:49 ` ✓ Xe.CI.Full: " Patchwork
2025-08-29  6:36 ` ✗ i915.CI.Full: failure " Patchwork
2025-08-29  8:23   ` Kamil Konieczny
2025-09-02 12:39 ` [PATCH i-g-t v6] runner/executor: Abort if dmesg is flooded Peter Senna Tschudin
2025-09-02 13:31   ` Peter Senna Tschudin
2025-09-18 13:40     ` Kamil Konieczny

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox