Igt-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Mauro Carvalho Chehab <mauro.chehab@linux.intel.com>
To: Kamil Konieczny <kamil.konieczny@linux.intel.com>
Cc: igt-dev@lists.freedesktop.org
Subject: Re: [igt-dev] [PATCH i-g-t 2/2] runner/executor: write GPU error on timeout
Date: Tue, 19 Sep 2023 09:29:44 +0200	[thread overview]
Message-ID: <20230919092944.6fe8f6a3@maurocar-mobl2> (raw)
In-Reply-To: <20230714153946.36448-3-kamil.konieczny@linux.intel.com>

On Fri, 14 Jul 2023 17:39:46 +0200
Kamil Konieczny <kamil.konieczny@linux.intel.com> wrote:

> When test is interrupted due to per-test timeout or inactivity
> kernel GPU state dump may be empty. In that case create dump of
> GPU state from driver, either drm error or dri debug info.
> 
> Signed-off-by: Kamil Konieczny <kamil.konieczny@linux.intel.com>
> ---
>  runner/executor.c | 68 +++++++++++++++++++++++++++++++++++++++++++++--
>  runner/executor.h |  2 ++
>  2 files changed, 68 insertions(+), 2 deletions(-)
> 
> diff --git a/runner/executor.c b/runner/executor.c
> index d3e6296dd..738d6c0bf 100644
> --- a/runner/executor.c
> +++ b/runner/executor.c
> @@ -39,6 +39,9 @@
>  #define KMSG_WARN 4
>  #define GRACEFUL_EXITCODE -SIGHUP
>  
> +#define MSG_TIMEOUT_PER_TEST "Per-test timeout exceeded. Killing the current test with SIGQUIT.\n"
> +#define MSG_TIMEOUT_INACTIVITY "Inactivity timeout exceeded. Killing the current test with SIGQUIT.\n"
> +
>  static struct {
>  	int *fds;
>  	size_t num_dogs;
> @@ -531,6 +534,8 @@ static const char *filenames[_F_LAST] = {
>  	[_F_ERR] = "err.txt",
>  	[_F_DMESG] = "dmesg.txt",
>  	[_F_SOCKET] = "comms",
> +	[_F_GPUERROR] = "gpu_error.dmp",
> +	[_F_GPUINFO] = "gpu_info.dmp",
>  };
>  
>  static int open_at_end(int dirfd, const char *name)
> @@ -745,6 +750,59 @@ static void kmsg_log(int severity, const char *msg)
>  	free(str);
>  }
>  
> +static size_t write_gpu_dump(int dump_fd, const char *gpu_error_path, char *buf, size_t bufsize)
> +{
> +	size_t s, written;
> +	int fd;
> +
> +	fd = open(gpu_error_path, O_RDONLY);
> +	if (fd < 0)
> +		return 0;
> +
> +	written = 0;
> +	do {
> +		s = read(fd, buf, bufsize);
> +		if (s > 0) {
> +			write(dump_fd, buf, s);
> +			written += s;
> +		}
> +	} while (s > 0);

Hmm... as you're foreseeing multiple chunks of 16MB here, maybe you could
use zlib here to reduce the dump size, storing it with a ".gz" suffix.

> +
> +	close(fd);
> +
> +	return written;
> +}
> +
> +static void dump_gpu_error(int drm_err_fd, int dri_dbg_fd, const char *dri_debug)
> +{
> +	char gpu_err_path[PATH_MAX];
> +	char dri_dbg_path[PATH_MAX];
> +	char *buf;
> +	size_t bufsize;
> +
> +	bufsize = 16 * 1024 * 1024; /* 16MB */
> +	buf = malloc(bufsize);
> +	if (!buf)
> +		return;
> +
> +	/* dump state of gpu with error */
> +	for (int i = 0; i < 256; ++i) {
> +		snprintf(gpu_err_path, sizeof(gpu_err_path),
> +			 "/sys/class/drm/card%d/error", i);

> +		if (access(gpu_err_path, R_OK))
> +			break; /* no more drm cards */

I don't think this shortcut actually works. On my TGL laptop, I can
see:

	# ls /sys/class/drm/
	card1  card1-DP-1  card1-DP-2  card1-eDP-1  card1-HDMI-A-1  card1-HDMI-A-2  renderD128  version
	# ls /sys/kernel/debug/dri/
	1  128

So, I guess you need to change it to:

		if (access(gpu_err_path, R_OK))
			continue;

> +		snprintf(dri_dbg_path, sizeof(dri_dbg_path),
> +			 "/sys/kernel/debug/dri/%d/%s", i, dri_debug);
> +		if (access(gpu_err_path, R_OK))
> +			continue; /* not our card */
> +
> +		if (write_gpu_dump(drm_err_fd, gpu_err_path, buf, bufsize))
> +			break;
> +		if (write_gpu_dump(dri_dbg_fd, dri_dbg_path, buf, bufsize))
> +			break;
> +	}
> +}
> +
>  static const char *show_kernel_task_state(const char *msg)
>  {
>  	kmsg_log(KMSG_WARN, msg);
> @@ -816,14 +874,14 @@ static const char *need_to_timeout(struct settings *settings,
>  	    time_since_subtest > settings->per_test_timeout / decrease) {
>  		if (decrease > 1)
>  			return "Killing the test because the kernel is tainted.\n";
> -		return show_kernel_task_state("Per-test timeout exceeded. Killing the current test with SIGQUIT.\n");
> +		return show_kernel_task_state(MSG_TIMEOUT_PER_TEST);
>  	}
>  
>  	if (settings->inactivity_timeout != 0 &&
>  	    time_since_activity > settings->inactivity_timeout / decrease ) {
>  		if (decrease > 1)
>  			return "Killing the test because the kernel is tainted.\n";
> -		return show_kernel_task_state("Inactivity timeout exceeded. Killing the current test with SIGQUIT.\n");
> +		return show_kernel_task_state(MSG_TIMEOUT_INACTIVITY);
>  	}
>  
>  	if (disk_usage_limit_exceeded(settings, disk_usage))
> @@ -1446,6 +1504,12 @@ static int monitor_output(pid_t child,
>  						 disk_usage);
>  
>  		if (timeout_reason) {
> +			if (settings->dump_gpu_on_timeout &&
> +			    (!strcmp(timeout_reason, MSG_TIMEOUT_PER_TEST) ||
> +			     !strcmp(timeout_reason, MSG_TIMEOUT_INACTIVITY))) {
> +				dump_gpu_error(outputs[_F_GPUERROR], outputs[_F_GPUINFO],
> +					       settings->dump_gpu_on_timeout);
> +			}
>  			if (killed == SIGKILL) {
>  				/* Nothing that can be done, really. Let's tell the caller we want to abort. */
>  
> diff --git a/runner/executor.h b/runner/executor.h
> index ab6a0c176..23a53c8ba 100644
> --- a/runner/executor.h
> +++ b/runner/executor.h
> @@ -22,6 +22,8 @@ enum {
>  	_F_ERR,
>  	_F_DMESG,
>  	_F_SOCKET,
> +	_F_GPUERROR,
> +	_F_GPUINFO,
>  	_F_LAST,
>  };
>  

  reply	other threads:[~2023-09-19  7:29 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-14 15:39 [igt-dev] [PATCH i-g-t 0/2] RFC: runner: dump GPU error state on timeout Kamil Konieczny
2023-07-14 15:39 ` [igt-dev] [PATCH i-g-t 1/2] runner/settings: add dump-gpu-on-timeout option Kamil Konieczny
2023-09-19  7:20   ` Mauro Carvalho Chehab
2023-07-14 15:39 ` [igt-dev] [PATCH i-g-t 2/2] runner/executor: write GPU error on timeout Kamil Konieczny
2023-09-19  7:29   ` Mauro Carvalho Chehab [this message]
2023-09-19  8:09     ` Kamil Konieczny
2023-07-14 15:58 ` [igt-dev] ✗ GitLab.Pipeline: warning for RFC: runner: dump GPU error state " Patchwork
2023-07-14 16:29 ` [igt-dev] ✓ Fi.CI.BAT: success " Patchwork
2023-07-14 17:01 ` [igt-dev] ○ CI.xeBAT: info " Patchwork
2023-07-14 18:30 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230919092944.6fe8f6a3@maurocar-mobl2 \
    --to=mauro.chehab@linux.intel.com \
    --cc=igt-dev@lists.freedesktop.org \
    --cc=kamil.konieczny@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox