Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Matthew Brost <matthew.brost@intel.com>
To: intel-xe@lists.freedesktop.org, dri-devel@lists.freedesktop.org
Cc: kenneth.w.graunke@intel.com, lionel.g.landwerlin@intel.com,
	jose.souza@intel.com, simona.vetter@ffwll.ch,
	thomas.hellstrom@linux.intel.com, boris.brezillon@collabora.com,
	airlied@gmail.com, christian.koenig@amd.com,
	mihail.atanassov@arm.com, steven.price@arm.com,
	shashank.sharma@amd.com
Subject: [RFC PATCH 29/29] drm/xe: Add user fence TDR
Date: Mon, 18 Nov 2024 15:37:57 -0800	[thread overview]
Message-ID: <20241118233757.2374041-30-matthew.brost@intel.com> (raw)
In-Reply-To: <20241118233757.2374041-1-matthew.brost@intel.com>

We cannot let user fences exported as dma-fence run forever. Add a TDR
to protect against this. If the TDR fires the entire VM is killed as
dma-fences are not tied to an individual queue.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_vm.c       | 164 +++++++++++++++++++++++++++++--
 drivers/gpu/drm/xe/xe_vm_types.h |  22 +++++
 2 files changed, 179 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 5078aeea2bd8..8b475e76bfe0 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -30,6 +30,7 @@
 #include "xe_exec_queue.h"
 #include "xe_gt_pagefault.h"
 #include "xe_gt_tlb_invalidation.h"
+#include "xe_hw_fence.h"
 #include "xe_migrate.h"
 #include "xe_pat.h"
 #include "xe_pm.h"
@@ -336,11 +337,15 @@ void xe_vm_kill(struct xe_vm *vm, bool unlocked)
 	if (unlocked)
 		xe_vm_lock(vm, false);
 
-	vm->flags |= XE_VM_FLAG_BANNED;
-	trace_xe_vm_kill(vm);
+	/* Ban and kill only once: skip everything if the VM is already banned */
+	if (!(vm->flags & XE_VM_FLAG_BANNED)) {
+		vm->flags |= XE_VM_FLAG_BANNED;
+		trace_xe_vm_kill(vm);
 
-	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
-		q->ops->kill(q);
+		list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
+			q->ops->kill(q);
+
+		/* TODO: Unmap usermap doorbells */
+	}
 
 	if (unlocked)
 		xe_vm_unlock(vm);
@@ -1393,6 +1398,9 @@ static void xe_vm_free_scratch(struct xe_vm *vm)
 	}
 }
 
+static void userfence_tdr(struct work_struct *w);
+static void userfence_kill(struct work_struct *w);
+
 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
 {
 	struct drm_gem_object *vm_resv_obj;
@@ -1517,6 +1525,12 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
 		}
 	}
 
+	spin_lock_init(&vm->userfence.lock);
+	INIT_LIST_HEAD(&vm->userfence.pending_list);
+	vm->userfence.timeout = HZ * 5;
+	INIT_DELAYED_WORK(&vm->userfence.tdr, userfence_tdr);
+	INIT_WORK(&vm->userfence.kill_work, userfence_kill);
+
 	if (number_tiles > 1)
 		vm->composite_fence_ctx = dma_fence_context_alloc(1);
 
@@ -1562,6 +1576,9 @@ void xe_vm_close_and_put(struct xe_vm *vm)
 	xe_vm_close(vm);
 	flush_work(&vm->preempt.rebind_work);
 
+	flush_delayed_work(&vm->userfence.tdr);
+	flush_work(&vm->userfence.kill_work);
+
 	down_write(&vm->lock);
 	for_each_tile(tile, xe, id) {
 		if (vm->q[id])
@@ -3449,6 +3466,114 @@ static int check_semaphores(struct xe_vm *vm, struct xe_sync_entry *syncs,
 	return 0;
 }
 
+/**
+ * struct tdr_item - Tracking entry for one user fence watched by the TDR
+ * @fence: tracked dma-fence; a reference is taken in userfence_tdr_add() and
+ *	   dropped in userfence_fence_cb()
+ * @vm: VM the entry belongs to; a reference is taken in userfence_tdr_add()
+ *	and dropped in userfence_fence_cb()
+ * @link: link in vm->userfence.pending_list
+ * @cb: dma-fence callback which runs userfence_fence_cb() for this entry
+ * @deadline: get_jiffies_64() value at creation plus the VM's fence timeout
+ */
+struct tdr_item {
+	struct dma_fence *fence;
+	struct xe_vm *vm;
+	struct list_head link;
+	struct dma_fence_cb cb;
+	u64 deadline;
+};
+
+/**
+ * userfence_kill() - Worker which kills a VM after a user fence timeout
+ * @w: work_struct embedded in vm->userfence.kill_work
+ *
+ * Scheduled by userfence_tdr(), which runs in the dma-fence signalling path
+ * and therefore defers the kill to this worker.
+ */
+static void userfence_kill(struct work_struct *w)
+{
+	struct xe_vm *vm =
+		container_of(w, struct xe_vm, userfence.kill_work);
+
+	/* unlocked == true: xe_vm_kill() takes and releases the VM lock itself */
+	down_write(&vm->lock);
+	xe_vm_kill(vm, true);
+	up_write(&vm->lock);
+}
+
+/**
+ * userfence_tdr() - Timeout detection and recovery worker for user fences
+ * @w: work_struct embedded in vm->userfence.tdr
+ *
+ * Walks vm->userfence.pending_list with the device's user fence IRQ stopped.
+ * Every fence which has not signaled is logged and gets its error set to
+ * -ETIME, and vm->userfence.timeout is set to zero. If any fence timed out,
+ * killing the VM is handed off to vm->userfence.kill_work.
+ */
+static void userfence_tdr(struct work_struct *w)
+{
+	struct xe_vm *vm =
+		container_of(w, struct xe_vm, userfence.tdr.work);
+	struct tdr_item *tdr_item;
+	bool timeout = false, cookie = dma_fence_begin_signalling();
+
+	xe_hw_fence_irq_stop(&vm->xe->user_fence_irq);
+
+	spin_lock_irq(&vm->userfence.lock);
+	list_for_each_entry(tdr_item, &vm->userfence.pending_list, link) {
+		/*
+		 * NOTE(review): if the fence ops implement ->signaled,
+		 * dma_fence_is_signaled() may signal the fence and run
+		 * userfence_fence_cb(), which takes userfence.lock and edits
+		 * this list - confirm that cannot happen for these fences.
+		 */
+		if (!dma_fence_is_signaled(tdr_item->fence)) {
+			/* Newline-terminate so the log record is complete */
+			drm_notice(&vm->xe->drm,
+				   "Timedout usermap fence: seqno=%llu, deadline=%llu, jiffies=%llu\n",
+				   tdr_item->fence->seqno, tdr_item->deadline,
+				   get_jiffies_64());
+			dma_fence_set_error(tdr_item->fence, -ETIME);
+			timeout = true;
+			vm->userfence.timeout = 0;
+		}
+	}
+	spin_unlock_irq(&vm->userfence.lock);
+
+	xe_hw_fence_irq_start(&vm->xe->user_fence_irq);
+
+	/*
+	 * This is dma-fence signaling path so we cannot take the locks required
+	 * to kill a VM. Defer killing to a worker.
+	 */
+	if (timeout)
+		schedule_work(&vm->userfence.kill_work);
+
+	dma_fence_end_signalling(cookie);
+}
+
+/**
+ * userfence_fence_cb() - Stop tracking a user fence and re-arm the TDR
+ * @fence: the fence whose callback is running, or NULL when called directly
+ *	   from userfence_tdr_add() because no callback was installed
+ * @cb: callback embedded in the struct tdr_item being retired
+ *
+ * Removes the item from vm->userfence.pending_list, re-arms the TDR for the
+ * new head of the list (or cancels it when the list is empty), then drops the
+ * fence and VM references taken in userfence_tdr_add() and frees the item.
+ */
+static void userfence_fence_cb(struct dma_fence *fence,
+			       struct dma_fence_cb *cb)
+{
+	struct tdr_item *next, *tdr_item = container_of(cb, struct tdr_item, cb);
+	struct xe_vm *vm = tdr_item->vm;
+	struct xe_gt *gt = xe_device_get_gt(vm->xe, 0);
+	u64 now;
+
+	/*
+	 * A non-NULL fence means we run as a dma-fence callback; the plain
+	 * spin_lock() assumes interrupts are already disabled there. The
+	 * direct call from userfence_tdr_add() passes NULL and must disable
+	 * interrupts itself.
+	 */
+	if (fence)
+		spin_lock(&vm->userfence.lock);
+	else
+		spin_lock_irq(&vm->userfence.lock);
+
+	list_del(&tdr_item->link);
+	next = list_first_entry_or_null(&vm->userfence.pending_list,
+					typeof(*next), link);
+	if (next) {
+		/*
+		 * Clamp to zero if the next deadline has already passed; the
+		 * unsigned subtraction would otherwise wrap to a huge delay
+		 * and the TDR would effectively never fire.
+		 */
+		now = get_jiffies_64();
+		mod_delayed_work(gt->ordered_wq, &vm->userfence.tdr,
+				 time_after64(next->deadline, now) ?
+				 next->deadline - now : 0);
+	} else {
+		cancel_delayed_work(&vm->userfence.tdr);
+	}
+
+	if (fence)
+		spin_unlock(&vm->userfence.lock);
+	else
+		spin_unlock_irq(&vm->userfence.lock);
+
+	dma_fence_put(tdr_item->fence);
+	xe_vm_put(tdr_item->vm);
+	kfree(tdr_item);
+}
+
+/**
+ * userfence_tdr_add() - Start TDR tracking for an exported user fence
+ * @vm: VM the fence belongs to
+ * @tdr_item: caller-allocated tracking item; ownership passes to this function
+ * @fence: fence to track
+ *
+ * Takes a reference on both @fence and @vm, queues @tdr_item on
+ * vm->userfence.pending_list and arms the TDR if the list was empty. The
+ * item is released by userfence_fence_cb(), either when @fence signals or
+ * immediately if the callback could not be installed.
+ */
+static void userfence_tdr_add(struct xe_vm *vm, struct tdr_item *tdr_item,
+			      struct dma_fence *fence)
+{
+	struct xe_gt *gt = xe_device_get_gt(vm->xe, 0);
+	u32 timeout;
+	int ret;
+
+	tdr_item->fence = dma_fence_get(fence);
+	tdr_item->vm = xe_vm_get(vm);
+	INIT_LIST_HEAD(&tdr_item->link);
+
+	spin_lock_irq(&vm->userfence.lock);
+	/* userfence.timeout is protected by userfence.lock; sample it once */
+	timeout = vm->userfence.timeout;
+	tdr_item->deadline = timeout + get_jiffies_64();
+	list_add_tail(&tdr_item->link, &vm->userfence.pending_list);
+	if (list_is_singular(&vm->userfence.pending_list))
+		mod_delayed_work(gt->ordered_wq,
+				 &vm->userfence.tdr,
+				 timeout);
+	spin_unlock_irq(&vm->userfence.lock);
+
+	/*
+	 * Any failure (-ENOENT for an already signaled fence, or another
+	 * error) means the callback was not installed and will never run, so
+	 * retire the item by hand to avoid leaking it and its references.
+	 */
+	ret = dma_fence_add_callback(fence, &tdr_item->cb, userfence_fence_cb);
+	if (ret)
+		userfence_fence_cb(NULL, &tdr_item->cb);
+}
+
 int xe_vm_convert_fence_ioctl(struct drm_device *dev, void *data,
 			      struct drm_file *file)
 {
@@ -3459,6 +3584,7 @@ int xe_vm_convert_fence_ioctl(struct drm_device *dev, void *data,
 	struct drm_xe_semaphore __user *semaphores_user;
 	struct xe_sync_entry *syncs = NULL;
 	struct xe_vm *vm;
+	struct tdr_item **tdr_items = NULL;
 	int err = 0, i, num_syncs = 0;
 	bool done = false;
 	struct drm_exec exec;
@@ -3493,6 +3619,12 @@ int xe_vm_convert_fence_ioctl(struct drm_device *dev, void *data,
 		goto release_vm_lock;
 	}
 
+	tdr_items = kcalloc(args->num_syncs, sizeof(*tdr_items), GFP_KERNEL);
+	if (!tdr_items) {
+		err = -ENOMEM;
+		goto release_vm_lock;
+	}
+
 	syncs_user = u64_to_user_ptr(args->syncs);
 	semaphores_user = u64_to_user_ptr(args->semaphores);
 	for (i = 0; i < args->num_syncs; i++, num_syncs++) {
@@ -3505,6 +3637,15 @@ int xe_vm_convert_fence_ioctl(struct drm_device *dev, void *data,
 		if (err)
 			goto release_syncs;
 
+		if (sync->flags & DRM_XE_SYNC_FLAG_SIGNAL) {
+			tdr_items[i] = kmalloc(sizeof(*tdr_items[i]),
+					       GFP_KERNEL);
+			if (!tdr_items[i]) {
+				/*
+				 * err is still 0 from the successful parse
+				 * above, so set it or the ioctl would report
+				 * success. syncs[i] is not yet counted in
+				 * num_syncs and is cleaned up here.
+				 */
+				err = -ENOMEM;
+				xe_sync_entry_cleanup(&syncs[i]);
+				goto release_syncs;
+			}
+		}
+
 		err = xe_sync_semaphore_parse(xe, xef, semaphore_sync,
 					      &semaphores_user[i],
 					      sync->flags);
@@ -3591,6 +3732,10 @@ int xe_vm_convert_fence_ioctl(struct drm_device *dev, void *data,
 				&semaphore_sync->chain_fence->base;
 			semaphore_sync->chain_fence = NULL;
 
+			userfence_tdr_add(vm, tdr_items[i],
+					  semaphore_sync->fence);
+			tdr_items[i] = 0;
+
 			semaphore_sync->fence = NULL;   /* Ref owned by chain */
 		} else {
 			xe_sync_entry_signal(semaphore_sync, sync->fence);
@@ -3617,9 +3762,13 @@ int xe_vm_convert_fence_ioctl(struct drm_device *dev, void *data,
 out_unlock:
 	drm_exec_fini(&exec);
 release_syncs:
-	while (err != -EAGAIN && num_syncs--) {
-		xe_sync_entry_cleanup(&syncs[num_syncs]);
-		xe_sync_entry_cleanup(&syncs[args->num_syncs + num_syncs]);
+	if (err != -EAGAIN) {
+		for (i = 0; i < num_syncs; ++i)
+			kfree(tdr_items[i]);
+		while (num_syncs--) {
+			xe_sync_entry_cleanup(&syncs[num_syncs]);
+			xe_sync_entry_cleanup(&syncs[args->num_syncs + num_syncs]);
+		}
 	}
 release_vm_lock:
 	if (err == -EAGAIN)
@@ -3629,6 +3778,7 @@ int xe_vm_convert_fence_ioctl(struct drm_device *dev, void *data,
 	xe_vm_put(vm);
 	free_preempt_fences(&preempt_fences);
 	kfree(syncs);
+	kfree(tdr_items);
 
 	return err;
 }
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index c5cb83722706..49cac5716f72 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -260,6 +260,28 @@ struct xe_vm {
 		struct dma_fence *exported_fence;
 	} preempt;
 
+	/** @userfence: User fence state */
+	struct {
+		/**
+		 * @userfence.lock: protects userfence.pending_list and
+		 * userfence.timeout
+		 */
+		spinlock_t lock;
+		/**
+		 * @userfence.pending_list: pending fence list, protected by
+		 * userfence.lock
+		 */
+		struct list_head pending_list;
+		/** @userfence.tdr: fence TDR */
+		struct delayed_work tdr;
+		/**
+		 * @userfence.kill_work: worker which kills the VM, scheduled
+		 * by the TDR when a fence has timed out
+		 */
+		struct work_struct kill_work;
+		/**
+		 * @userfence.timeout: Fence timeout period in jiffies,
+		 * protected by userfence.lock
+		 */
+		u32 timeout;
+	} userfence;
+
 	/** @um: unified memory state */
 	struct {
 		/** @asid: address space ID, unique to each VM */
-- 
2.34.1


  parent reply	other threads:[~2024-11-18 23:37 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-11-18 23:37 [RFC PATCH 00/29] UMD direct submission in Xe Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 01/29] dma-fence: Add dma_fence_preempt base class Matthew Brost
2024-11-20 13:31   ` Christian König
2024-11-20 17:36     ` Matthew Brost
2024-11-21 10:04       ` Christian König
2024-11-21 18:41         ` Matthew Brost
2024-11-22 10:56           ` Christian König
2024-11-18 23:37 ` [RFC PATCH 02/29] dma-fence: Add dma_fence_user_fence Matthew Brost
2024-11-20 13:38   ` Christian König
2024-11-20 22:50     ` Matthew Brost
2024-11-21  9:31       ` Christian König
2024-11-22  2:35         ` Matthew Brost
2024-11-22 10:28           ` Christian König
2024-11-18 23:37 ` [RFC PATCH 03/29] drm/xe: Use dma_fence_preempt base class Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 04/29] drm/xe: Allocate doorbells for UMD exec queues Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 05/29] drm/xe: Add doorbell ID to snapshot capture Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 06/29] drm/xe: Break submission ring out into its own BO Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 07/29] drm/xe: Break indirect ring state " Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 08/29] drm/xe: Clear GGTT in xe_bo_restore_kernel Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 09/29] FIXME: drm/xe: Add pad to ring and indirect state Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 10/29] drm/xe: Enable indirect ring on media GT Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 11/29] drm/xe: Don't add pinned mappings to VM bulk move Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 12/29] drm/xe: Add exec queue post init extension processing Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 13/29] drm/xe/mmap: Add mmap support for PCI memory barrier Matthew Brost
2024-11-19 10:00   ` Christian König
2024-11-19 11:57     ` Joonas Lahtinen
2024-11-19 12:42       ` Mrozek, Michal
2024-12-18 12:59         ` Upadhyay, Tejas
2024-11-18 23:37 ` [RFC PATCH 14/29] drm/xe: Add support for mmapping doorbells to user space Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 15/29] drm/xe: Add support for mmapping submission ring and indirect ring state " Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 16/29] drm/xe/uapi: Define UMD exec queue mapping uAPI Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 17/29] drm/xe: Add usermap exec queue extension Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 18/29] drm/xe: Drop EXEC_QUEUE_FLAG_UMD_SUBMISSION flag Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 19/29] drm/xe: Do not allow usermap exec queues in exec IOCTL Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 20/29] drm/xe: Teach GuC backend to kill usermap queues Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 21/29] drm/xe: Enable preempt fences on " Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 22/29] drm/xe/uapi: Add uAPI to convert user semaphore to / from drm syncobj Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 23/29] drm/xe: Add user fence IRQ handler Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 24/29] drm/xe: Add xe_hw_fence_user_init Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 25/29] drm/xe: Add a message lock to the Xe GPU scheduler Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 26/29] drm/xe: Always wait on preempt fences in vma_check_userptr Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 27/29] drm/xe: Teach xe_sync layer about drm_xe_semaphore Matthew Brost
2024-11-18 23:37 ` [RFC PATCH 28/29] drm/xe: Add VM convert fence IOCTL Matthew Brost
2024-11-18 23:37 ` Matthew Brost [this message]
2024-11-18 23:55 ` ✓ CI.Patch_applied: success for UMD direct submission in Xe Patchwork
2024-11-18 23:56 ` ✗ CI.checkpatch: warning " Patchwork
2024-11-18 23:57 ` ✓ CI.KUnit: success " Patchwork
2024-11-19  0:15 ` ✓ CI.Build: " Patchwork
2024-11-19  0:17 ` ✗ CI.Hooks: failure " Patchwork
2024-11-19  0:19 ` ✓ CI.checksparse: success " Patchwork
2024-11-19  0:39 ` ✗ CI.BAT: failure " Patchwork
2024-11-19 11:44 ` ✗ CI.FULL: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241118233757.2374041-30-matthew.brost@intel.com \
    --to=matthew.brost@intel.com \
    --cc=airlied@gmail.com \
    --cc=boris.brezillon@collabora.com \
    --cc=christian.koenig@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=jose.souza@intel.com \
    --cc=kenneth.w.graunke@intel.com \
    --cc=lionel.g.landwerlin@intel.com \
    --cc=mihail.atanassov@arm.com \
    --cc=shashank.sharma@amd.com \
    --cc=simona.vetter@ffwll.ch \
    --cc=steven.price@arm.com \
    --cc=thomas.hellstrom@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox