Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH i-g-t 1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state
@ 2024-04-09 22:19 Rodrigo Vivi
  2024-04-09 22:19 ` [PATCH i-g-t 2/3] tests/intel/xe_wedged: Also add a simple exec to confirm GPU health Rodrigo Vivi
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Rodrigo Vivi @ 2024-04-09 22:19 UTC (permalink / raw)
  To: igt-dev; +Cc: intel-xe, Rodrigo Vivi, Lucas De Marchi, Himal Prasad Ghimiray

Let's inject a gt_reset failure that will put the Xe device in the
new wedged state, then we confirm the IOCTL is blocked and we
rebind the driver to get back to a clean state for other test
execution, since wedged state in Xe is a final state that can only
be cleared with a device rebind/reprobe.

The fault injection of this test is entirely based on xe_uevent
provided by Himal.

v2: Use rebind instead of module reload (Lucas)
    And other improvements also pointed out by Lucas.

Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc:  Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 tests/intel/xe_wedged.c | 108 ++++++++++++++++++++++++++++++++++++++++
 tests/meson.build       |   1 +
 2 files changed, 109 insertions(+)
 create mode 100644 tests/intel/xe_wedged.c

diff --git a/tests/intel/xe_wedged.c b/tests/intel/xe_wedged.c
new file mode 100644
index 000000000..f2587cc43
--- /dev/null
+++ b/tests/intel/xe_wedged.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+/**
+ * TEST: cause fake gt reset failure which puts Xe device in wedged state
+ * Category: Software building block
+ * Sub-category: driver
+ * Functionality: wedged
+ * Test category: functionality test
+ */
+
+#include <limits.h>
+#include <dirent.h>
+
+#include "igt.h"
+#include "igt_device.h"
+#include "igt_kmod.h"
+#include "igt_sysfs.h"
+
+#include "xe/xe_ioctl.h"
+
+static void force_wedged(int fd)
+{
+	igt_debugfs_write(fd, "fail_gt_reset/probability", "100");
+	igt_debugfs_write(fd, "fail_gt_reset/times", "2");
+	/* fail_gt_reset knobs are written above; now request the GT reset. */
+	xe_force_gt_reset(fd, 0);
+	sleep(1);	/* presumably lets the failed reset settle - TODO confirm */
+}
+
+/* Unbind and rebind the xe PCI device behind @fd; closes @fd, returns a new fd. */
+static int rebind_xe(int fd)
+{
+	char pci_slot[NAME_MAX];
+	int sysfs;
+
+	igt_device_get_pci_slot_name(fd, pci_slot);
+
+	sysfs = open("/sys/bus/pci/drivers/xe", O_DIRECTORY);
+	/* open() reports failure as -1, which a plain truth test would accept. */
+	igt_assert(sysfs >= 0);
+
+	igt_assert(igt_sysfs_set(sysfs, "unbind", pci_slot));
+	/*
+	 * We need to close the client for a proper release, before
+	 * binding back again.
+	 */
+	close(fd);
+
+	igt_assert(igt_sysfs_set(sysfs, "bind", pci_slot));
+	close(sysfs);
+
+	/* Renew the client connection */
+	fd = drm_open_driver(DRIVER_XE);
+	igt_assert(fd >= 0);
+	return fd;
+}
+
+/* Probe @fd with a VM-create ioctl and return igt_ioctl()'s result unchanged. */
+static int simple_ioctl(int fd)
+{
+	int ret;
+	struct drm_xe_vm_create create = {
+		.extensions = 0,
+		.flags = 0,
+	};
+
+	ret = igt_ioctl(fd, DRM_IOCTL_XE_VM_CREATE, &create);
+	/* Only a zero return leaves a VM behind that has to be destroyed. */
+	if (ret == 0)
+		xe_vm_destroy(fd, create.vm_id);
+
+	return ret;
+}
+
+/**
+ * SUBTEST: basic-wedged
+ * Description: Force Xe device wedged after injecting a failure in GT reset
+ */
+igt_main
+{
+	int fd;
+
+	igt_fixture {
+		fd = drm_open_driver(DRIVER_XE);
+	}
+	/* Wedge the device through force_wedged(), then recover it with rebind_xe(). */
+	igt_subtest("basic-wedged") {
+		igt_require(igt_debugfs_exists(fd, "fail_gt_reset/probability",
+					       O_RDWR));
+		/* ioctl works -> wedge -> ioctl fails -> rebind -> ioctl works */
+		igt_assert_eq(simple_ioctl(fd), 0);
+		force_wedged(fd);
+		igt_assert_neq(simple_ioctl(fd), 0);
+		fd = rebind_xe(fd);
+		igt_assert_eq(simple_ioctl(fd), 0);
+	}
+
+	igt_fixture {
+		if (igt_debugfs_exists(fd, "fail_gt_reset/probability", O_RDWR)) {
+			igt_debugfs_write(fd, "fail_gt_reset/probability", "0");
+			igt_debugfs_write(fd, "fail_gt_reset/times", "1");
+		}
+		drm_close_driver(fd);
+	}
+}
diff --git a/tests/meson.build b/tests/meson.build
index a856510fc..65b8bf23b 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -274,6 +274,7 @@ intel_kms_progs = [
 ]
 
 intel_xe_progs = [
+	'xe_wedged',
 	'xe_ccs',
 	'xe_create',
 	'xe_compute',
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH i-g-t 2/3] tests/intel/xe_wedged: Also add a simple exec to confirm GPU health
  2024-04-09 22:19 [PATCH i-g-t 1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state Rodrigo Vivi
@ 2024-04-09 22:19 ` Rodrigo Vivi
  2024-04-18 14:35   ` Ghimiray, Himal Prasad
  2024-04-09 22:19 ` [PATCH i-g-t 3/3] tests/intel/xe_wedged: Introduce test for wedged_mode=2 Rodrigo Vivi
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 7+ messages in thread
From: Rodrigo Vivi @ 2024-04-09 22:19 UTC (permalink / raw)
  To: igt-dev; +Cc: intel-xe, Rodrigo Vivi

Besides confirming that the rebind puts the device in a state
where we can send IOCTLs, let's also ensure it can really
perform some basic exec functions.

Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 tests/intel/xe_wedged.c | 90 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/tests/intel/xe_wedged.c b/tests/intel/xe_wedged.c
index f2587cc43..ab9bf23d5 100644
--- a/tests/intel/xe_wedged.c
+++ b/tests/intel/xe_wedged.c
@@ -17,9 +17,13 @@
 #include "igt.h"
 #include "igt_device.h"
 #include "igt_kmod.h"
+#include "igt_syncobj.h"
 #include "igt_sysfs.h"
 
+#include "xe_drm.h"
 #include "xe/xe_ioctl.h"
+#include "xe/xe_query.h"
+#include "xe/xe_spin.h"
 
 static void force_wedged(int fd)
 {
@@ -75,12 +79,96 @@ static int simple_ioctl(int fd)
 	return ret;
 }
 
+static void
+simple_exec(int fd, struct drm_xe_engine_class_instance *eci)
+{
+	uint32_t vm;
+	uint64_t addr = 0x1a0000;
+	struct drm_xe_sync sync[2] = {
+		{ .type = DRM_XE_SYNC_TYPE_SYNCOBJ, .flags = DRM_XE_SYNC_FLAG_SIGNAL, },
+		{ .type = DRM_XE_SYNC_TYPE_SYNCOBJ, .flags = DRM_XE_SYNC_FLAG_SIGNAL, },
+	};
+	struct drm_xe_exec exec = {
+		.num_batch_buffer = 1,
+		.num_syncs = 2,
+		.syncs = to_user_pointer(sync),
+	};
+	uint64_t batch_offset, batch_addr, sdi_offset, sdi_addr;
+	uint32_t exec_queue;
+	uint32_t syncobjs;
+	size_t bo_size;
+	uint32_t bo = 0;
+	struct {
+		uint32_t batch[16];
+		uint64_t pad;
+		uint32_t data;
+	} *data;
+	int b;
+	/* Run one store-dword batch on @eci and check the value landed in the BO. */
+	vm = xe_vm_create(fd, 0, 0);
+
+	bo_size = sizeof(*data) * 2;
+	bo_size = xe_bb_size(fd, bo_size);
+	bo = xe_bo_create(fd, vm, bo_size,
+			  vram_if_possible(fd, eci->gt_id),
+			  DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
+	data = xe_bo_map(fd, bo, bo_size);
+
+	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
+
+	syncobjs = syncobj_create(fd, 0);
+	sync[0].handle = syncobj_create(fd, 0);
+	/* Only sync[0], still flagged SIGNAL, is handed to the bind. */
+	xe_vm_bind_async(fd, vm, 0, bo, 0, addr,
+			 bo_size, sync, 1);
+
+	batch_offset = (char *)&data[0].batch - (char *)data;
+	batch_addr = addr + batch_offset;
+	sdi_offset = (char *)&data[0].data - (char *)data;
+	sdi_addr = addr + sdi_offset;
+	/* Batch: store 0xc0ffee at sdi_addr (the 'data' field), then end. */
+	b = 0;
+	data[0].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
+	data[0].batch[b++] = sdi_addr;
+	data[0].batch[b++] = sdi_addr >> 32;
+	data[0].batch[b++] = 0xc0ffee;
+	data[0].batch[b++] = MI_BATCH_BUFFER_END;
+	igt_assert(b <= ARRAY_SIZE(data[0].batch));
+	/* Exec gets both syncs: sync[0] with SIGNAL cleared, sync[1] using syncobjs. */
+	sync[0].flags &= ~DRM_XE_SYNC_FLAG_SIGNAL;
+	sync[1].flags |= DRM_XE_SYNC_FLAG_SIGNAL;
+	sync[1].handle = syncobjs;
+
+	exec.exec_queue_id = exec_queue;
+	exec.address = batch_addr;
+
+	syncobj_reset(fd, &syncobjs, 1);
+
+	xe_exec(fd, &exec);
+	/* Wait on the exec's syncobj before reading back through the mapping. */
+	igt_assert(syncobj_wait(fd, &syncobjs, 1, INT64_MAX, 0, NULL));
+	igt_assert_eq(data[0].data, 0xc0ffee);
+	igt_assert(syncobj_wait(fd, &sync[0].handle, 1, INT64_MAX, 0, NULL));
+	sync[0].flags |= DRM_XE_SYNC_FLAG_SIGNAL;
+	xe_vm_unbind_async(fd, vm, 0, 0, addr, bo_size, sync, 1);
+	igt_assert(syncobj_wait(fd, &sync[0].handle, 1, INT64_MAX, 0, NULL));
+	igt_assert_eq(data[0].data, 0xc0ffee);
+
+	syncobj_destroy(fd, sync[0].handle);
+	syncobj_destroy(fd, syncobjs);
+	xe_exec_queue_destroy(fd, exec_queue);
+	munmap(data, bo_size);
+	gem_close(fd, bo);
+	xe_vm_destroy(fd, vm);
+}
+
 /**
  * SUBTEST: basic-wedged
  * Description: Force Xe device wedged after injecting a failure in GT reset
  */
 igt_main
 {
+	struct drm_xe_engine_class_instance *hwe;
 	int fd;
 
 	igt_fixture {
@@ -96,6 +184,8 @@ igt_main
 		igt_assert_neq(simple_ioctl(fd), 0);
 		fd = rebind_xe(fd);
 		igt_assert_eq(simple_ioctl(fd), 0);
+		xe_for_each_engine(fd, hwe)
+			simple_exec(fd, hwe);
 	}
 
 	igt_fixture {
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH i-g-t 3/3] tests/intel/xe_wedged: Introduce test for wedged_mode=2
  2024-04-09 22:19 [PATCH i-g-t 1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state Rodrigo Vivi
  2024-04-09 22:19 ` [PATCH i-g-t 2/3] tests/intel/xe_wedged: Also add a simple exec to confirm GPU health Rodrigo Vivi
@ 2024-04-09 22:19 ` Rodrigo Vivi
  2024-04-09 23:07 ` ✗ CI.Patch_applied: failure for series starting with [i-g-t,1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state Patchwork
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Rodrigo Vivi @ 2024-04-09 22:19 UTC (permalink / raw)
  To: igt-dev; +Cc: intel-xe, Rodrigo Vivi

In this mode, selected with debugfs, the GPU will be declared
as wedged at any timeout. So, let's also introduce a command
that will surely timeout. Based on the xe_exec_threads hang.

Then we confirm the GPU is back alive after a rebind.

Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 tests/intel/xe_wedged.c | 69 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/tests/intel/xe_wedged.c b/tests/intel/xe_wedged.c
index ab9bf23d5..35fc905e7 100644
--- a/tests/intel/xe_wedged.c
+++ b/tests/intel/xe_wedged.c
@@ -162,10 +162,60 @@ simple_exec(int fd, struct drm_xe_engine_class_instance *eci)
 	xe_vm_destroy(fd, vm);
 }
 
+static void
+simple_hang(int fd)
+{
+	struct drm_xe_engine_class_instance *eci = &xe_engine(fd, 0)->instance;
+	uint32_t vm;
+	uint64_t addr = 0x1a0000;
+	struct drm_xe_exec exec_hang = {
+		.num_batch_buffer = 1,
+	};
+	uint64_t spin_offset;
+	uint32_t hang_exec_queue;
+	size_t bo_size;
+	uint32_t bo = 0;
+	struct {
+		struct xe_spin spin;
+		uint32_t batch[16];
+		uint64_t pad;
+		uint32_t data;
+	} *data;
+	struct xe_spin_opts spin_opts = { .preempt = false };
+	int err;
+	/* Submit a non-preemptible spinner on xe_engine(fd, 0); no wait afterwards. */
+	vm = xe_vm_create(fd, 0, 0);
+	bo_size = xe_bb_size(fd, sizeof(*data));
+	bo = xe_bo_create(fd, vm, bo_size,
+			  vram_if_possible(fd, eci->gt_id),
+			  DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
+	data = xe_bo_map(fd, bo, bo_size);
+	hang_exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
+	/* NOTE(review): no VM bind of the BO at addr is visible here - confirm intended. */
+	spin_offset = (char *)&data[0].spin - (char *)data;
+	spin_opts.addr = addr + spin_offset;
+	xe_spin_init(&data[0].spin, &spin_opts);
+	exec_hang.exec_queue_id = hang_exec_queue;
+	exec_hang.address = spin_opts.addr;
+	/* Retry only on ENOMEM; any other exec result is not checked here. */
+	do {
+		err =  igt_ioctl(fd, DRM_IOCTL_XE_EXEC, &exec_hang);
+	} while (err && errno == ENOMEM);
+
+	xe_exec_queue_destroy(fd, hang_exec_queue);
+	munmap(data, bo_size);
+	gem_close(fd, bo);
+	xe_vm_destroy(fd, vm);
+}
+
 /**
  * SUBTEST: basic-wedged
  * Description: Force Xe device wedged after injecting a failure in GT reset
  */
+/**
+ * SUBTEST: wedged-at-any-timeout
+ * Description: Force Xe device wedged after a simple guc timeout
+ */
 igt_main
 {
 	struct drm_xe_engine_class_instance *hwe;
@@ -188,6 +238,25 @@ igt_main
 			simple_exec(fd, hwe);
 	}
 
+	igt_subtest_f("wedged-at-any-timeout") {
+		igt_require(igt_debugfs_exists(fd, "wedged_mode", O_RDWR));
+
+		igt_debugfs_write(fd, "wedged_mode", "2");
+		simple_hang(fd);
+		/*
+		 * Any ioctl after the first timeout on wedged_mode=2 is blocked
+		 * so we cannot rely on sync objects. Let's wait a bit for
+		 * things to settle before we confirm device as wedged and
+		 * rebind.
+		 */
+		sleep(1);
+		igt_assert_neq(simple_ioctl(fd), 0);
+		fd = rebind_xe(fd);
+		igt_assert_eq(simple_ioctl(fd), 0);
+		xe_for_each_engine(fd, hwe)
+			simple_exec(fd, hwe);
+	}
+
 	igt_fixture {
 		if (igt_debugfs_exists(fd, "fail_gt_reset/probability", O_RDWR)) {
 			igt_debugfs_write(fd, "fail_gt_reset/probability", "0");
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* ✗ CI.Patch_applied: failure for series starting with [i-g-t,1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state
  2024-04-09 22:19 [PATCH i-g-t 1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state Rodrigo Vivi
  2024-04-09 22:19 ` [PATCH i-g-t 2/3] tests/intel/xe_wedged: Also add a simple exec to confirm GPU health Rodrigo Vivi
  2024-04-09 22:19 ` [PATCH i-g-t 3/3] tests/intel/xe_wedged: Introduce test for wedged_mode=2 Rodrigo Vivi
@ 2024-04-09 23:07 ` Patchwork
  2024-04-10  4:17 ` [PATCH i-g-t 1/3] " Ghimiray, Himal Prasad
  2024-04-18 14:28 ` Ghimiray, Himal Prasad
  4 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2024-04-09 23:07 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: intel-xe

== Series Details ==

Series: series starting with [i-g-t,1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state
URL   : https://patchwork.freedesktop.org/series/132233/
State : failure

== Summary ==

=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: 057ec21a54cd drm-tip: 2024y-04m-09d-21h-23m-50s UTC integration manifest
=== git am output follows ===
error: tests/meson.build: does not exist in index
hint: Use 'git am --show-current-patch=diff' to see the failed patch
Applying: tests/intel/xe_wedged: Introduce a new test for Xe device wedged state
Patch failed at 0001 tests/intel/xe_wedged: Introduce a new test for Xe device wedged state
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH i-g-t 1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state
  2024-04-09 22:19 [PATCH i-g-t 1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state Rodrigo Vivi
                   ` (2 preceding siblings ...)
  2024-04-09 23:07 ` ✗ CI.Patch_applied: failure for series starting with [i-g-t,1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state Patchwork
@ 2024-04-10  4:17 ` Ghimiray, Himal Prasad
  2024-04-18 14:28 ` Ghimiray, Himal Prasad
  4 siblings, 0 replies; 7+ messages in thread
From: Ghimiray, Himal Prasad @ 2024-04-10  4:17 UTC (permalink / raw)
  To: Rodrigo Vivi, igt-dev; +Cc: intel-xe, Lucas De Marchi

[-- Attachment #1: Type: text/plain, Size: 3884 bytes --]


On 10-04-2024 03:49, Rodrigo Vivi wrote:
> Let's inject a gt_reset failure that will put Xe device in the
> new wedged state, then we confirm the IOCTL is blocked and we
> reload the driver to get back to a clean state for other test
> execution, since wedged state in Xe is a final state that can only
> be cleared with a device rebind/reprobe.
>
> The fault injection of this test is entirely based on xe_uevent
> provided by Himal.
>
> v2: Use rebind instead of module reload (Lucas)
>      And other improvements also pointed out by Lucas.
>
> Cc: Lucas De Marchi<lucas.demarchi@intel.com>
> Cc:  Himal Prasad Ghimiray<himal.prasad.ghimiray@intel.com>
> Signed-off-by: Rodrigo Vivi<rodrigo.vivi@intel.com>
> ---
>   tests/intel/xe_wedged.c | 108 ++++++++++++++++++++++++++++++++++++++++
>   tests/meson.build       |   1 +
>   2 files changed, 109 insertions(+)
>   create mode 100644 tests/intel/xe_wedged.c
>
> diff --git a/tests/intel/xe_wedged.c b/tests/intel/xe_wedged.c
> new file mode 100644
> index 000000000..f2587cc43
> --- /dev/null
> +++ b/tests/intel/xe_wedged.c
> @@ -0,0 +1,108 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +/**
> + * TEST: cause fake gt reset failure which put Xe device in wedged state
> + * Category: Software building block
> + * Sub-category: driver
> + * Functionality: wedged
> + * Test category: functionality test
> + */
> +
> +#include <limits.h>
> +#include <dirent.h>
> +
> +#include "igt.h"
> +#include "igt_device.h"
> +#include "igt_kmod.h"
> +#include "igt_sysfs.h"
> +
> +#include "xe/xe_ioctl.h"
> +
> +static void force_wedged(int fd)
> +{
> +	igt_debugfs_write(fd, "fail_gt_reset/probability", "100");
> +	igt_debugfs_write(fd, "fail_gt_reset/times", "2");
> +
> +	xe_force_gt_reset(fd, 0);
> +	sleep(1);
> +}
> +
> +static int rebind_xe(int fd)
> +{
> +	char pci_slot[NAME_MAX];
> +	int sysfs;
> +
> +	igt_device_get_pci_slot_name(fd, pci_slot);
> +
> +	sysfs = open("/sys/bus/pci/drivers/xe", O_DIRECTORY);
> +	igt_assert(sysfs);
> +
> +        igt_assert(igt_sysfs_set(sysfs, "unbind", pci_slot));
> +
> +	/*
> +	 * We need to close the client for a proper release, before
> +	 * binding back again.
> +	 */
> +	close(fd);
> +
> +        igt_assert(igt_sysfs_set(sysfs, "bind", pci_slot));
> +	close(sysfs);
> +
> +	/* Renew the client connection */
> +	fd = drm_open_driver(DRIVER_XE);
> +	igt_assert(fd);
> +
> +	return fd;
> +}
> +
> +static int simple_ioctl(int fd)
> +{
> +	int ret;
> +
> +	struct drm_xe_vm_create create = {
> +		.extensions = 0,
> +		.flags = 0,
> +	};
> +
> +	ret = igt_ioctl(fd, DRM_IOCTL_XE_VM_CREATE, &create);
> +
> +	if (ret == 0)
> +		xe_vm_destroy(fd, create.vm_id);
> +
> +	return ret;
> +}
> +
> +/**
> + * SUBTEST: basic-wedged
> + * Description: Force Xe device wedged after injecting a failure in GT reset
> + */
> +igt_main
> +{
> +	int fd;
> +
> +	igt_fixture {
> +		fd = drm_open_driver(DRIVER_XE);
> +	}
> +
> +	igt_subtest("basic-wedged") {
> +		igt_require(igt_debugfs_exists(fd, "fail_gt_reset/probability",
> +					       O_RDWR));
> +
> +		igt_assert_eq(simple_ioctl(fd), 0);
> +		force_wedged(fd);
> +		igt_assert_neq(simple_ioctl(fd), 0);
> +		fd = rebind_xe(fd);
> +		igt_assert_eq(simple_ioctl(fd), 0);
> +	}
> +
> +	igt_fixture {
> +		if (igt_debugfs_exists(fd, "fail_gt_reset/probability", O_RDWR)) {
> +			igt_debugfs_write(fd, "fail_gt_reset/probability", "0");
> +			igt_debugfs_write(fd, "fail_gt_reset/times", "1");
> +		}
> +		drm_close_driver(fd);
> +	}
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index a856510fc..65b8bf23b 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -274,6 +274,7 @@ intel_kms_progs = [
>   ]
>   
>   intel_xe_progs = [
> +	'xe_wedged',
>   	'xe_ccs',
>   	'xe_create',
>   	'xe_compute',

Acked-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>

[-- Attachment #2: Type: text/html, Size: 40107 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH i-g-t 1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state
  2024-04-09 22:19 [PATCH i-g-t 1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state Rodrigo Vivi
                   ` (3 preceding siblings ...)
  2024-04-10  4:17 ` [PATCH i-g-t 1/3] " Ghimiray, Himal Prasad
@ 2024-04-18 14:28 ` Ghimiray, Himal Prasad
  4 siblings, 0 replies; 7+ messages in thread
From: Ghimiray, Himal Prasad @ 2024-04-18 14:28 UTC (permalink / raw)
  To: Rodrigo Vivi, igt-dev; +Cc: intel-xe, Lucas De Marchi


On 10-04-2024 03:49, Rodrigo Vivi wrote:
> Let's inject a gt_reset failure that will put Xe device in the
> new wedged state, then we confirm the IOCTL is blocked and we
> reload the driver to get back to a clean state for other test
> execution, since wedged state in Xe is a final state that can only
> be cleared with a device rebind/reprobe.
>
> The fault injection of this test is entirely based on xe_uevent
> provided by Himal.
>
> v2: Use rebind instead of module reload (Lucas)
>      And other improvements also pointed out by Lucas.
>
> Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> Cc:  Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> ---
>   tests/intel/xe_wedged.c | 108 ++++++++++++++++++++++++++++++++++++++++
>   tests/meson.build       |   1 +
>   2 files changed, 109 insertions(+)
>   create mode 100644 tests/intel/xe_wedged.c
>
> diff --git a/tests/intel/xe_wedged.c b/tests/intel/xe_wedged.c
> new file mode 100644
> index 000000000..f2587cc43
> --- /dev/null
> +++ b/tests/intel/xe_wedged.c
> @@ -0,0 +1,108 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +/**
> + * TEST: cause fake gt reset failure which put Xe device in wedged state
> + * Category: Software building block
> + * Sub-category: driver
> + * Functionality: wedged
> + * Test category: functionality test
> + */
> +
> +#include <limits.h>
> +#include <dirent.h>
> +
> +#include "igt.h"
> +#include "igt_device.h"
> +#include "igt_kmod.h"
> +#include "igt_sysfs.h"
> +
> +#include "xe/xe_ioctl.h"
> +
> +static void force_wedged(int fd)
> +{
> +	igt_debugfs_write(fd, "fail_gt_reset/probability", "100");
> +	igt_debugfs_write(fd, "fail_gt_reset/times", "2");
> +
> +	xe_force_gt_reset(fd, 0);
> +	sleep(1);
> +}
> +
> +static int rebind_xe(int fd)
> +{
> +	char pci_slot[NAME_MAX];
> +	int sysfs;
> +
> +	igt_device_get_pci_slot_name(fd, pci_slot);
> +
> +	sysfs = open("/sys/bus/pci/drivers/xe", O_DIRECTORY);
> +	igt_assert(sysfs);
> +
> +        igt_assert(igt_sysfs_set(sysfs, "unbind", pci_slot));
> +
> +	/*
> +	 * We need to close the client for a proper release, before
> +	 * binding back again.
> +	 */
> +	close(fd);
> +
> +        igt_assert(igt_sysfs_set(sysfs, "bind", pci_slot));
> +	close(sysfs);
> +
> +	/* Renew the client connection */
> +	fd = drm_open_driver(DRIVER_XE);
> +	igt_assert(fd);
> +
> +	return fd;
> +}
> +
> +static int simple_ioctl(int fd)
> +{
> +	int ret;
> +
> +	struct drm_xe_vm_create create = {
> +		.extensions = 0,
> +		.flags = 0,
> +	};
> +
> +	ret = igt_ioctl(fd, DRM_IOCTL_XE_VM_CREATE, &create);
> +
> +	if (ret == 0)
> +		xe_vm_destroy(fd, create.vm_id);
> +
> +	return ret;
> +}
> +
> +/**
> + * SUBTEST: basic-wedged
> + * Description: Force Xe device wedged after injecting a failure in GT reset
> + */
> +igt_main
> +{
> +	int fd;
> +
> +	igt_fixture {
> +		fd = drm_open_driver(DRIVER_XE);
> +	}
> +
> +	igt_subtest("basic-wedged") {
> +		igt_require(igt_debugfs_exists(fd, "fail_gt_reset/probability",
> +					       O_RDWR));
> +
> +		igt_assert_eq(simple_ioctl(fd), 0);
> +		force_wedged(fd);
> +		igt_assert_neq(simple_ioctl(fd), 0);
> +		fd = rebind_xe(fd);
> +		igt_assert_eq(simple_ioctl(fd), 0);
> +	}
> +
> +	igt_fixture {
> +		if (igt_debugfs_exists(fd, "fail_gt_reset/probability", O_RDWR)) {
> +			igt_debugfs_write(fd, "fail_gt_reset/probability", "0");
> +			igt_debugfs_write(fd, "fail_gt_reset/times", "1");
> +		}
> +		drm_close_driver(fd);
> +	}
> +}


Patch LGTM.

Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>


> diff --git a/tests/meson.build b/tests/meson.build
> index a856510fc..65b8bf23b 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -274,6 +274,7 @@ intel_kms_progs = [
>   ]
>   
>   intel_xe_progs = [
> +	'xe_wedged',
>   	'xe_ccs',
>   	'xe_create',
>   	'xe_compute',

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH i-g-t 2/3] tests/intel/xe_wedged: Also add a simple exec to confirm GPU health
  2024-04-09 22:19 ` [PATCH i-g-t 2/3] tests/intel/xe_wedged: Also add a simple exec to confirm GPU health Rodrigo Vivi
@ 2024-04-18 14:35   ` Ghimiray, Himal Prasad
  0 siblings, 0 replies; 7+ messages in thread
From: Ghimiray, Himal Prasad @ 2024-04-18 14:35 UTC (permalink / raw)
  To: Rodrigo Vivi, igt-dev; +Cc: intel-xe


On 10-04-2024 03:49, Rodrigo Vivi wrote:
> Besides confirming that the rebind puts the device in a state
> where we can send IOCTLs, let's also ensure it can really
> perform some basic exec functions.
>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> ---
>   tests/intel/xe_wedged.c | 90 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 90 insertions(+)
>
> diff --git a/tests/intel/xe_wedged.c b/tests/intel/xe_wedged.c
> index f2587cc43..ab9bf23d5 100644
> --- a/tests/intel/xe_wedged.c
> +++ b/tests/intel/xe_wedged.c
> @@ -17,9 +17,13 @@
>   #include "igt.h"
>   #include "igt_device.h"
>   #include "igt_kmod.h"
> +#include "igt_syncobj.h"
>   #include "igt_sysfs.h"
>   
> +#include "xe_drm.h"
>   #include "xe/xe_ioctl.h"
> +#include "xe/xe_query.h"
> +#include "xe/xe_spin.h"
>   
>   static void force_wedged(int fd)
>   {
> @@ -75,12 +79,96 @@ static int simple_ioctl(int fd)
>   	return ret;
>   }
>   
> +static void
> +simple_exec(int fd, struct drm_xe_engine_class_instance *eci)
> +{
> +	uint32_t vm;
> +	uint64_t addr = 0x1a0000;
> +	struct drm_xe_sync sync[2] = {
> +		{ .type = DRM_XE_SYNC_TYPE_SYNCOBJ, .flags = DRM_XE_SYNC_FLAG_SIGNAL, },
> +		{ .type = DRM_XE_SYNC_TYPE_SYNCOBJ, .flags = DRM_XE_SYNC_FLAG_SIGNAL, },
> +	};
> +	struct drm_xe_exec exec = {
> +		.num_batch_buffer = 1,
> +		.num_syncs = 2,
> +		.syncs = to_user_pointer(sync),
> +	};
> +	uint64_t batch_offset, batch_addr, sdi_offset, sdi_addr;
> +	uint32_t exec_queue;
> +	uint32_t syncobjs;
> +	size_t bo_size;
> +	uint32_t bo = 0;
> +	struct {
> +		uint32_t batch[16];
> +		uint64_t pad;
> +		uint32_t data;
> +	} *data;
> +	int b;
> +
> +	vm = xe_vm_create(fd, 0, 0);
> +
> +	bo_size = sizeof(*data) * 2;
> +	bo_size = xe_bb_size(fd, bo_size);
> +	bo = xe_bo_create(fd, vm, bo_size,
> +			  vram_if_possible(fd, eci->gt_id),
> +			  DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> +	data = xe_bo_map(fd, bo, bo_size);
> +
> +	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
> +
> +	syncobjs = syncobj_create(fd, 0);
> +	sync[0].handle = syncobj_create(fd, 0);
> +
> +	xe_vm_bind_async(fd, vm, 0, bo, 0, addr,
> +			 bo_size, sync, 1);
> +
> +	batch_offset = (char *)&data[0].batch - (char *)data;
> +	batch_addr = addr + batch_offset;
> +	sdi_offset = (char *)&data[0].data - (char *)data;
> +	sdi_addr = addr + sdi_offset;
> +
> +	b = 0;
> +	data[0].batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> +	data[0].batch[b++] = sdi_addr;
> +	data[0].batch[b++] = sdi_addr >> 32;
> +	data[0].batch[b++] = 0xc0ffee;
> +	data[0].batch[b++] = MI_BATCH_BUFFER_END;
> +	igt_assert(b <= ARRAY_SIZE(data[0].batch));
> +
> +	sync[0].flags &= ~DRM_XE_SYNC_FLAG_SIGNAL;
> +	sync[1].flags |= DRM_XE_SYNC_FLAG_SIGNAL;
> +	sync[1].handle = syncobjs;
> +
> +	exec.exec_queue_id = exec_queue;
> +	exec.address = batch_addr;
> +
> +	syncobj_reset(fd, &syncobjs, 1);
> +
> +	xe_exec(fd, &exec);
> +
> +	igt_assert(syncobj_wait(fd, &syncobjs, 1, INT64_MAX, 0, NULL));
> +	igt_assert_eq(data[0].data, 0xc0ffee);
> +	igt_assert(syncobj_wait(fd, &sync[0].handle, 1, INT64_MAX, 0, NULL));
> +	sync[0].flags |= DRM_XE_SYNC_FLAG_SIGNAL;
> +	xe_vm_unbind_async(fd, vm, 0, 0, addr, bo_size, sync, 1);
> +	igt_assert(syncobj_wait(fd, &sync[0].handle, 1, INT64_MAX, 0, NULL));
> +	igt_assert_eq(data[0].data, 0xc0ffee);
> +
> +	syncobj_destroy(fd, sync[0].handle);
> +	syncobj_destroy(fd, syncobjs);
> +	xe_exec_queue_destroy(fd, exec_queue);
> +	munmap(data, bo_size);
> +	gem_close(fd, bo);
> +	xe_vm_destroy(fd, vm);
> +}
> +
>   /**
>    * SUBTEST: basic-wedged
>    * Description: Force Xe device wedged after injecting a failure in GT reset
>    */
>   igt_main
>   {
> +	struct drm_xe_engine_class_instance *hwe;
>   	int fd;
>   
>   	igt_fixture {
> @@ -96,6 +184,8 @@ igt_main
>   		igt_assert_neq(simple_ioctl(fd), 0);
>   		fd = rebind_xe(fd);
>   		igt_assert_eq(simple_ioctl(fd), 0);
> +		xe_for_each_engine(fd, hwe)
> +			simple_exec(fd, hwe);


LGTM

Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>


>   	}
>   
>   	igt_fixture {

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-04-18 14:36 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-04-09 22:19 [PATCH i-g-t 1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state Rodrigo Vivi
2024-04-09 22:19 ` [PATCH i-g-t 2/3] tests/intel/xe_wedged: Also add a simple exec to confirm GPU health Rodrigo Vivi
2024-04-18 14:35   ` Ghimiray, Himal Prasad
2024-04-09 22:19 ` [PATCH i-g-t 3/3] tests/intel/xe_wedged: Introduce test for wedged_mode=2 Rodrigo Vivi
2024-04-09 23:07 ` ✗ CI.Patch_applied: failure for series starting with [i-g-t,1/3] tests/intel/xe_wedged: Introduce a new test for Xe device wedged state Patchwork
2024-04-10  4:17 ` [PATCH i-g-t 1/3] " Ghimiray, Himal Prasad
2024-04-18 14:28 ` Ghimiray, Himal Prasad

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox