All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: igt-dev@lists.freedesktop.org
Cc: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>,
	"Matthew Brost" <matthew.brost@intel.com>,
	"Maarten Lankhorst" <maarten.lankhorst@linux.intel.com>,
	"Michal Mrozek" <michal.mrozek@intel.com>,
	"John Falkowski" <john.falkowski@intel.com>,
	"Rodrigo Vivi" <rodrigo.vivi@intel.com>,
	"Lahtinen Joonas" <joonas.lahtinen@intel.com>
Subject: [PATCH i-g-t 4/4] tests/intel/xe_exec_compute_mode: Restart VM on ENOMEM/ENOSPC errors
Date: Fri, 12 Jun 2026 13:06:19 +0200	[thread overview]
Message-ID: <20260612110619.103198-5-thomas.hellstrom@linux.intel.com> (raw)
In-Reply-To: <20260612110619.103198-1-thomas.hellstrom@linux.intel.com>

When a DRM_XE_EVENT_VM_ERR event is received with error code -ENOMEM or
-ENOSPC, call DRM_IOCTL_XE_VM_RESTART to attempt recovery via the
preempt-rebind worker.

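To pass the file descriptor into the event callback, introduce struct
xe_watch_ctx embedding the existing struct xe_watch_event alongside an
fd field and an atomic restart counter, following the container_of()
pattern documented by the xe_watch library.  The counter is incremented
on each successful restart and logged in the closing fixture.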

The restart uses __xe_vm_restart() (failable) rather than the asserting
xe_vm_restart() since the callback runs on the background listener
thread.  -ENOENT is treated as non-fatal (event arrived after VM
destruction); other errors are logged as warnings.

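Alongside the restart handling, create the test VMs with
DRM_XE_VM_CREATE_FLAG_RESTARTABLE, destroy the listener before closing
the fd, and update the DRM_XE_VM_RESTART and DRM_XE_WATCH_QUEUE
documentation in include/drm-uapi/xe_drm.h, taken from the xe_event
kernel branch.  The xe_vm_restart ioctl helpers in lib/xe/xe_ioctl are
added by patch 1/4 of this series.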

Assisted-by: GitHub Copilot:claude-sonnet-4.6
---
 include/drm-uapi/xe_drm.h          |  6 ++---
 tests/intel/xe_exec_compute_mode.c | 37 +++++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/include/drm-uapi/xe_drm.h b/include/drm-uapi/xe_drm.h
index 1b7857a9f..f6ec85a02 100644
--- a/include/drm-uapi/xe_drm.h
+++ b/include/drm-uapi/xe_drm.h
@@ -2624,8 +2624,8 @@ enum drm_xe_ras_error_component {
  * been created with exec queues that use preempt fences).
  *
  * On return the rebind attempt has completed or a retriable error was
- * encountered.  Any non-retriable error is surfaced through the event
- * mechanism if the caller has subscribed to %DRM_XE_EVENT_MASK_VM_ERR.
+ * encountered.  Any non-retriable error is surfaced through the watch queue
+ * if the caller has subscribed via %DRM_IOCTL_XE_WATCH_QUEUE.
  * The IOCTL may return -EAGAIN if userptr memory needs to be repinned;
  * callers should retry in that case.
  */
@@ -2654,8 +2654,6 @@ struct drm_xe_vm_restart {
 
 /**
  * DOC: DRM_XE_WATCH_QUEUE
- *
- * Subscribe a notification pipe to receive device events for the calling
  * process's DRM file handle.  Events are scoped to the subscribing file:
  * only events that belong to that file (for example, VM error on a VM created
  * through the same file) are delivered, preventing information leaks between
diff --git a/tests/intel/xe_exec_compute_mode.c b/tests/intel/xe_exec_compute_mode.c
index 5bab971b0..71b6110e2 100644
--- a/tests/intel/xe_exec_compute_mode.c
+++ b/tests/intel/xe_exec_compute_mode.c
@@ -11,7 +11,9 @@
  * Functionality: compute test
  */
 
+#include <errno.h>
 #include <fcntl.h>
+#include <stdatomic.h>
 
 #include "igt.h"
 #include "lib/igt_syncobj.h"
@@ -40,11 +42,19 @@
 #define FREE_MAPPPING			(0x1 << 7)
 #define UNMAP_MAPPPING			(0x1 << 8)
 
+struct xe_watch_ctx {
+	struct xe_watch_event base;
+	int fd;
+	atomic_uint restart_count;
+};
+
 static void xe_event_fn(struct xe_watch_event *event)
 {
+	struct xe_watch_ctx *ctx = igt_container_of(event, ctx, base);
 	const struct watch_notification *notif = event->notif;
 	const struct drm_xe_watch_notification_vm_err *err_event =
 		igt_container_of(notif, err_event, base);
+	int err;
 
 	switch (notif->type) {
 	case WATCH_TYPE_META:
@@ -67,6 +77,16 @@ static void xe_event_fn(struct xe_watch_event *event)
 			igt_info("VM with id %u saw an error: %d\n",
 				 (unsigned int) err_event->vm_id,
 				 (int) err_event->error_code);
+			if (err_event->error_code == -ENOMEM ||
+			    err_event->error_code == -ENOSPC) {
+				err = __xe_vm_restart(ctx->fd, err_event->vm_id,
+					      err_event->timestamp_ns);
+				if (err && err != -ENOENT)
+					igt_warn("VM %u restart failed: %d\n",
+						 (unsigned int) err_event->vm_id, err);
+				else if (!err)
+					atomic_fetch_add(&ctx->restart_count, 1);
+			}
 			break;
 		default:
 			igt_warn("Unknown XE watch subtype %u\n",
@@ -176,7 +196,8 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 	igt_debug("%s running on: %s\n", __func__, xe_engine_class_string(eci->engine_class));
 	igt_assert_lte(n_exec_queues, MAX_N_EXECQUEUES);
 
-	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
+	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
+			  DRM_XE_VM_CREATE_FLAG_RESTARTABLE, 0);
 	bo_size = sizeof(*data) * n_execs;
 	bo_size = xe_bb_size(fd, bo_size);
 
@@ -401,7 +422,8 @@ static void lr_mode_workload(int fd)
 	uint32_t bo;
 	uint32_t ts_1, ts_2;
 
-	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
+	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE |
+			  DRM_XE_VM_CREATE_FLAG_RESTARTABLE, 0);
 	ahnd = intel_allocator_open(fd, 0, INTEL_ALLOCATOR_RELOC);
 	bo_size = xe_bb_size(fd, sizeof(*spin));
 	engine = xe_find_engine_by_class(fd, DRM_XE_ENGINE_CLASS_COPY);
@@ -481,14 +503,15 @@ int igt_main()
 		{ NULL },
 	};
 	int fd;
-	struct xe_watch_event watch_event = {
-		.ops = &event_ops,
+	struct xe_watch_ctx watch_ctx = {
+		.base.ops = &event_ops,
 	};
 	struct xe_watch_listener *listener;
 
 	igt_fixture() {
 		fd = drm_open_driver(DRIVER_XE);
-		listener = xe_watch_listener_create(fd, &watch_event);
+		watch_ctx.fd = fd;
+		listener = xe_watch_listener_create(fd, &watch_ctx.base);
 	}
 
 	for (const struct section *s = sections; s->name; s++) {
@@ -523,7 +546,9 @@ int igt_main()
 
 
 	igt_fixture() {
-		drm_close_driver(fd);
 		xe_watch_listener_destroy(listener);
+		igt_info("VM restarts performed: %u\n",
+			 atomic_load(&watch_ctx.restart_count));
+		drm_close_driver(fd);
 	}
 }
-- 
2.54.0


      parent reply	other threads:[~2026-06-12 11:08 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-12 11:06 [PATCH i-g-t 0/4] xe: watch queue event support and VM restart recovery Thomas Hellström
2026-06-12 11:06 ` [PATCH i-g-t 1/4] lib/xe: add xe_vm_restart ioctl helper Thomas Hellström
2026-06-12 14:38   ` Kamil Konieczny
2026-06-12 11:06 ` [PATCH i-g-t 2/4] lib/xe: add xe_watch listener for watch queue events Thomas Hellström
2026-06-12 11:06 ` [PATCH i-g-t 3/4] tests/intel/xe_exec_compute_mode: Add a listener for file events Thomas Hellström
2026-06-12 11:06 ` Thomas Hellström [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260612110619.103198-5-thomas.hellstrom@linux.intel.com \
    --to=thomas.hellstrom@linux.intel.com \
    --cc=igt-dev@lists.freedesktop.org \
    --cc=john.falkowski@intel.com \
    --cc=joonas.lahtinen@intel.com \
    --cc=maarten.lankhorst@linux.intel.com \
    --cc=matthew.brost@intel.com \
    --cc=michal.mrozek@intel.com \
    --cc=rodrigo.vivi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.