public inbox for igt-dev@lists.freedesktop.org
 help / color / mirror / Atom feed
From: Carlos Santa <carlos.santa@intel.com>
To: igt-dev@lists.freedesktop.org
Cc: Chris@freedesktop.org, Ursulin Tvrtko <tvrtko.ursulin@intel.com>
Subject: [igt-dev] [RFC] tests/gem_watchdog: Initial set of tests for GPU watchdog
Date: Mon, 15 Apr 2019 11:22:51 -0700	[thread overview]
Message-ID: <20190415182251.22427-2-carlos.santa@intel.com> (raw)
In-Reply-To: <20190415182251.22427-1-carlos.santa@intel.com>

This test adds basic set of tests to reset the different
GPU engines through the watchdog timer.

Credits to Antonio for the original codebase this is based on.

Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
Cc: Antonio Argenziano <antonio.argenziano@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk
Signed-off-by: Carlos Santa <carlos.santa@intel.com>
---
 tests/Makefile.sources    |   3 +
 tests/i915/gem_watchdog.c | 439 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/meson.build         |   1 +
 3 files changed, 443 insertions(+)
 create mode 100644 tests/i915/gem_watchdog.c

diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 214698d..7f17f20 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
 TESTS_progs += gem_wait
 gem_wait_SOURCES = i915/gem_wait.c
 
+TESTS_progs += gem_watchdog
+gem_watchdog_SOURCES = i915/gem_watchdog.c
+
 TESTS_progs += gem_workarounds
 gem_workarounds_SOURCES = i915/gem_workarounds.c
 
diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
new file mode 100644
index 0000000..65e2f5b
--- /dev/null
+++ b/tests/i915/gem_watchdog.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "igt.h"
+#include "igt_sysfs.h"
+#include "sw_sync.h"
+
+#include <pthread.h>
+#include <fcntl.h>
+
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/signal.h>
+#include "i915/gem_ring.h"
+
+#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
+#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
+#define HIGH 1
+#define LOW 0
+#define LOCAL_EXEC_FENCE_OUT (1 << 17)
+#define WATCHDOG_THRESHOLD (100)
+#define MAX_ENGINES 5
+#define RENDER_CLASS 0
+#define VIDEO_DECODE_CLASS 1
+#define VIDEO_ENHANCEMENT_CLASS 2
+#define COPY_ENGINE_CLASS 3
+#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
+
+const uint64_t timeout_100ms = 100000000LL;
+static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
+
+struct drm_i915_gem_watchdog_timeout {
+	union {
+		struct {
+			/*
+			 * Engine class & instance to be configured or queried.
+			 */
+			__u16 engine_class;
+			__u16 engine_instance;
+		};
+		/* Index based addressing mode */
+		__u32 index;
+	};
+	/* GPU Engine watchdog resets timeout in us */
+	__u32 timeout_us;
+};
+
+static void clear_error_state(int fd)
+{
+	int dir;
+
+	dir = igt_sysfs_open(fd);
+
+	if (dir < 0)
+		return;
+
+	/* Any write to the error state clears it */
+	igt_sysfs_set(dir, "error", "");
+	close(dir);
+}
+
+static bool check_error_state(int fd)
+{
+	char *error, *str;
+	bool found = false;
+	int dir;
+
+	dir = igt_sysfs_open(fd);
+
+	error = igt_sysfs_get(dir, "error");
+	igt_sysfs_set(dir, "error", "Begone!");
+
+	igt_assert(error);
+	igt_debug("Error: %s\n", error);
+
+	if ((str = strstr(error, "GPU HANG"))) {
+		igt_debug("Found error state! GPU hang triggered! %s\n", str);
+		found = true;
+	}
+
+	close(dir);
+
+	return found;
+}
+
+static void context_set_watchdog(int fd, int engine_id,
+                                 unsigned ctx_id, unsigned threshold)
+{
+	struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
+	struct drm_i915_gem_context_param arg = {
+		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
+		.ctx_id = ctx_id,
+		.size = sizeof(engines_threshold),
+		.value = (uint64_t)&engines_threshold
+	};
+
+    memset(&engines_threshold, 0, sizeof(engines_threshold));
+
+    /* read existing values */
+    gem_context_get_param(fd, &arg);
+
+    switch (engine_id & I915_EXEC_RING_MASK) {
+    case I915_EXEC_RENDER:
+         engines_threshold[RENDER_CLASS].timeout_us = threshold;
+	 engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
+	 engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
+	 engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
+         break;
+    case I915_EXEC_BSD:
+         engines_threshold[RENDER_CLASS].timeout_us = 0;
+	 engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
+	 engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
+	 engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
+         break;
+    case I915_EXEC_VEBOX:
+         engines_threshold[RENDER_CLASS].timeout_us = 0;
+	 engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
+         engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = threshold;
+	 engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
+         break;
+    default:
+	engines_threshold[RENDER_CLASS].timeout_us = 0;
+	engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
+	engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
+	engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
+        break;
+    }
+	gem_context_set_param(fd, &arg);
+}
+
+static void send_canary(uint32_t fd, uint32_t ctx_id, unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle, uint64_t timeout)
+{
+    struct drm_i915_gem_exec_object2 obj[2];
+    struct drm_i915_gem_relocation_entry reloc;
+    struct drm_i915_gem_execbuffer2 execbuf;
+    igt_spin_t *spin = NULL;
+    
+    const uint32_t bbe = MI_BATCH_BUFFER_END;
+    int i = 0;
+    int fence;
+
+    gem_quiescent_gpu(fd);
+
+    memset(&execbuf, 0, sizeof(execbuf));
+    memset(&obj, 0, sizeof(obj));
+    memset(&reloc, 0, sizeof(reloc));
+
+    execbuf.buffers_ptr = to_user_pointer(obj);
+
+    execbuf.buffer_count = 2;
+    execbuf.flags = exec_id | LOCAL_EXEC_FENCE_OUT;
+
+    obj[0].handle = target;
+    obj[1].handle = gem_create(fd, 4096);
+
+    obj[1].relocation_count = 1;
+    obj[1].relocs_ptr = to_user_pointer(&reloc);
+
+    reloc.target_handle = obj[0].handle;
+    reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
+    reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
+    reloc.delta = offset * sizeof(uint32_t);
+
+    reloc.offset = i * sizeof(uint32_t);
+    gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
+
+    __sync_synchronize();
+
+    if (handle) {
+        *handle = obj[1].handle;
+        return;
+    }
+    gem_sync(fd, obj[1].handle);
+    execbuf.rsvd1 = ctx_id;
+    execbuf.rsvd2 = -1;
+
+    spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
+    igt_spin_batch_set_timeout(spin, timeout);
+    igt_assert(gem_bo_busy(fd, obj[0].handle));
+
+    gem_execbuf_wr(fd, &execbuf);
+    igt_spin_batch_free(fd, spin);
+
+    fence = execbuf.rsvd2 >> 32;
+
+#if 0
+    igt_info("fence:%d, fence status : %d EIO: %d ctx_id:%d\n",fence, sync_fence_status(fence),-EIO, ctx_id);
+#endif
+
+    close(fence);    
+
+    gem_close(fd, obj[1].handle);
+    gem_quiescent_gpu(fd);
+}
+
+static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
+{
+	uint32_t ctx = gem_context_create(fd);
+
+	switch (ctx_prio) {
+	case HIGH:
+		__gem_context_set_priority(fd, ctx, MAX_PRIO);
+		igt_info("Setting MAX priority %d\n", ctx_prio);
+		break;
+	case LOW:
+		__gem_context_set_priority(fd, ctx, MIN_PRIO);
+		igt_info("Setting MIN priority %d\n", ctx_prio);
+		break;
+	default:
+		igt_info("Ignoring context priority %d\n", ctx_prio);
+		break;
+	}
+	return ctx;
+}
+
+static void bb_factory(uint32_t fd, uint32_t *ctx, int prio, uint64_t timeout, int pid)
+{
+	uint32_t scratch;
+	unsigned int nengine = 0;
+	unsigned int engines[16];
+	int i, j;
+	unsigned int engine;
+
+	for_each_physical_engine(fd, engine)
+		engines[nengine++] = engine;
+
+	igt_require(nengine);
+
+	for (i = 0; i < nengine; i++) {
+		scratch = gem_create(fd, 4096);
+		pthread_mutex_lock(&list_lock);
+
+		if (prio == HIGH)
+			create_ctx_with_priority(fd, HIGH);
+		else if (prio == LOW)
+			create_ctx_with_priority(fd, LOW);
+		else
+			ctx[j] = gem_context_create(fd);
+
+		send_canary(fd, ctx[j], engines[i], scratch, 0, NULL, timeout);
+		gem_close(fd, scratch);
+		pthread_mutex_unlock(&list_lock);
+		j++;
+	}
+}
+
+static void inject_hang(uint32_t fd, unsigned ring, uint32_t ctx_id,  unsigned flags)
+{
+	igt_hang_t hang;
+	hang = igt_hang_ctx(fd, ctx_id, ring, flags);
+	gem_sync(fd, hang.spin->handle);
+}
+
+static void gpu_watchdog_long_batch_2_contexts(int fd)
+{
+	unsigned flags = HANG_ALLOW_CAPTURE;
+	int i = 0, j=0;
+
+	igt_fork(child, 1) {
+		/* Child process */
+		uint32_t ctx[5];
+		const uint64_t batch_timeout_ms = timeout_100ms*3;
+		memset(&ctx, 0, sizeof(ctx));
+		bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
+
+		sleep(6);
+
+	    	context_set_watchdog(fd, 1, 1, WATCHDOG_THRESHOLD);
+	    	clear_error_state(fd);
+
+		/* Engine ID: RCS0, Context ID: 1 */
+	   	inject_hang(fd, 1, 1, flags);
+
+		for (i = 0; i <5; i++) {
+			printf("ctx_child: %u ctx_cnt: %d tid:%d\n",ctx[i], syscall(SYS_gettid));
+			//gem_context_destroy(fd, ctx[i]);
+		}
+	}
+
+	{
+		/* Parent process */
+		uint32_t ctx[5];
+		const uint64_t batch_timeout_ms = timeout_100ms*1;
+		memset(&ctx, 0, sizeof(ctx));
+		printf("%s %d , tid: %d\n", __FUNCTION__,__LINE__, syscall(SYS_gettid));
+		
+		bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
+
+		sleep(6);
+
+	    	context_set_watchdog(fd, 4, 10, WATCHDOG_THRESHOLD);
+	    	clear_error_state(fd);
+
+        	/* Engine ID: VECS0, Context ID: 10 */
+   		inject_hang(fd, 4, 10, flags);
+
+		for (i = 0; i <5; i++) {
+			igt_info("ctx_dad: %u ctx_cnt: %d tid:%d \n",ctx[i], i, syscall(SYS_gettid));
+			//gem_context_destroy(fd, ctx[i]);
+		}
+	}
+
+	igt_waitchildren();
+}
+
+static void gpu_watchodg_hang_long_batch_single_engine(int fd, unsigned engine_id, int nengine)
+{
+	uint32_t *ctx;
+	int i;
+	unsigned flags = HANG_ALLOW_CAPTURE;
+	const uint64_t batch_timeout_ms = timeout_100ms*4;
+
+	igt_require(nengine);
+
+	ctx = (uint32_t *)malloc(sizeof(uint32_t)*nengine);
+
+	if (!ctx) {
+		igt_info("Out of memory\n");  
+		exit(1);
+	}
+
+	bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
+	
+	if (engine_id == 1) {
+		context_set_watchdog(fd, engine_id, 1, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+		inject_hang(fd, engine_id, 1, flags);
+	} else if (engine_id == 8194) {
+		context_set_watchdog(fd, engine_id, 2, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+		inject_hang(fd, engine_id, 2, flags);
+	} else if (engine_id == 16386) {
+		context_set_watchdog(fd, engine_id, 3, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+		inject_hang(fd, engine_id, 3, flags);
+	} else if (engine_id == 4) {
+		context_set_watchdog(fd, engine_id, 5, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+		inject_hang(fd, engine_id, 5, flags);
+	}
+
+	for (i = 0; i <nengine; i++)
+		gem_context_destroy(fd, ctx[i]);
+	free(ctx);
+}
+static void no_gpu_hang_long_batch_all_engines(int fd, int nengine, int prio)
+{
+	uint32_t *ctx;
+	int i;
+	const uint64_t batch_timeout_ms = timeout_100ms*4;
+
+	ctx = (uint32_t *)malloc(sizeof(uint32_t)*nengine);
+
+	if (!ctx) {
+		printf("Out of memory\n");  
+		exit(1);
+	}
+
+	bb_factory(fd, ctx, prio, batch_timeout_ms, syscall(SYS_gettid));
+
+	for (i = 0; i <nengine; i++)
+		gem_context_destroy(fd, ctx[i]);
+
+	free(ctx);
+}
+
+igt_main
+{
+	int fd;
+	unsigned int nengine = 0;
+	unsigned int engine;
+	unsigned int engines[16];
+
+	igt_skip_on_simulation();
+
+	igt_fixture {
+		fd = drm_open_driver(DRIVER_INTEL);
+		igt_require_gem(fd);
+
+		for_each_physical_engine(fd, engine)
+			engines[nengine++] = engine;
+		igt_require(nengine);
+	}
+
+	igt_subtest_group {
+		igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-no-priority") {
+			int prio = -1;
+			no_gpu_hang_long_batch_all_engines(fd, nengine, prio);
+		}
+
+		igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-low-priority") {
+			int prio = LOW;
+			no_gpu_hang_long_batch_all_engines(fd, nengine, prio);
+		}
+
+		igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-high-priority") {
+			int prio = HIGH;
+			no_gpu_hang_long_batch_all_engines(fd, nengine, prio);
+		}
+
+		for (const struct intel_execution_engine *e = intel_execution_engines; e->name; e++) {
+			/* no support for gpu watchdog on BLT */
+			if (e->exec_id == 0 || e->exec_id == I915_EXEC_BLT)
+				continue;
+
+			igt_subtest_f("gpu-watchdog-long-batch-%s", e->name) {
+				igt_require(gem_ring_has_physical_engine(fd, e->exec_id | e->flags));
+				gpu_watchodg_hang_long_batch_single_engine(fd, e->exec_id | e->flags, nengine);
+			}
+		}
+
+		igt_subtest_f("gpu-watchdog-long-batch-2-contexts") {
+			gpu_watchdog_long_batch_2_contexts(fd);
+		}
+    }
+
+    igt_fixture {
+	close(fd);
+    }
+}
diff --git a/tests/meson.build b/tests/meson.build
index 5167a6c..b281b75 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -210,6 +210,7 @@ i915_progs = [
 	'gem_unref_active_buffers',
 	'gem_userptr_blits',
 	'gem_wait',
+        'gem_watchdog',
 	'gem_workarounds',
 	'gem_write_read_ring_switch',
 	'i915_fb_tiling',
-- 
2.7.4

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

  reply	other threads:[~2019-04-15 18:23 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-15 18:22 [igt-dev] [RFC] IGT GPU watchdog Carlos Santa
2019-04-15 18:22 ` Carlos Santa [this message]
2019-04-15 21:50   ` [igt-dev] [RFC] tests/gem_watchdog: Initial set of tests for " Antonio Argenziano
2019-04-16  2:02     ` Carlos Santa
2019-04-16 20:16       ` Antonio Argenziano
2019-04-16 20:21         ` Chris Wilson
2019-04-16 20:29           ` Antonio Argenziano
2019-04-15 18:32 ` [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev3) Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190415182251.22427-2-carlos.santa@intel.com \
    --to=carlos.santa@intel.com \
    --cc=Chris@freedesktop.org \
    --cc=igt-dev@lists.freedesktop.org \
    --cc=tvrtko.ursulin@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox