Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Mika Kuoppala <mika.kuoppala@linux.intel.com>
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org, christian.koenig@amd.com,
	Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>,
	Mika Kuoppala <mika.kuoppala@linux.intel.com>
Subject: [PATCH 26/26] drm/xe/eudebug: Enable EU pagefault handling
Date: Mon,  9 Dec 2024 15:33:17 +0200	[thread overview]
Message-ID: <20241209133318.1806472-27-mika.kuoppala@linux.intel.com> (raw)
In-Reply-To: <20241209133318.1806472-1-mika.kuoppala@linux.intel.com>

From: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>

The XE2 (and PVC) HW has a limitation that a pagefault due to an invalid
access will halt the corresponding EUs. To solve this problem, enable
EU pagefault handling functionality, which allows pagefaulted EU threads
to be unhalted and the EU debugger to be informed about the attention
state of EU threads during execution.

If a pagefault occurs, send the DRM_XE_EUDEBUG_EVENT_PAGEFAULT event to
the client connected to the xe_eudebug after handling the pagefault.

The pagefault handling is a mechanism that allows a stalled EU thread to
enter SIP mode by installing a temporary null page to the page table entry
where the pagefault happened.

A brief description of the page fault handling mechanism flow between KMD
and the eu thread is as follows

(1) eu thread accesses unallocated address
(2) pagefault happens and eu thread stalls
(3) XE kmd sets a forced EU thread exception to allow the running EU threads
    to enter SIP mode (kmd sets the ForceException / ForceExternalHalt bit of
    the TD_CTL register).
    Non-stalled (non-pagefaulted) EU threads enter SIP mode.
(4) XE kmd installs a temporary null page to the pagetable entry of the
    address where the pagefault happened.
(5) XE kmd replies to the GuC with a pagefault-successful message
(6) stalled EU thread resumes, as the pagefault condition has been resolved
(7) resumed eu thread enters SIP mode due to force exception set by (3)

As this feature is designed to only work when eudebug is enabled, it should
have no impact on the regular recoverable pagefault code path.

Signed-off-by: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_gt_pagefault.c | 83 +++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index 5558342b8e07..4e2883e19018 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -13,6 +13,7 @@
 
 #include "abi/guc_actions_abi.h"
 #include "xe_bo.h"
+#include "xe_eudebug.h"
 #include "xe_gt.h"
 #include "xe_gt_tlb_invalidation.h"
 #include "xe_guc.h"
@@ -199,12 +200,16 @@ static struct xe_vm *asid_to_vm(struct xe_device *xe, u32 asid)
 	return vm;
 }
 
-static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
+static int handle_pagefault_start(struct xe_gt *gt, struct pagefault *pf,
+				  struct xe_vm **pf_vm,
+				  struct xe_eudebug_pagefault **eudebug_pf_out)
 {
-	struct xe_device *xe = gt_to_xe(gt);
+	struct xe_eudebug_pagefault *eudebug_pf;
 	struct xe_tile *tile = gt_to_tile(gt);
-	struct xe_vm *vm;
+	struct xe_device *xe = gt_to_xe(gt);
+	bool  destroy_eudebug_pf = false;
 	struct xe_vma *vma = NULL;
+	struct xe_vm *vm;
 	int err;
 
 	/* SW isn't expected to handle TRTT faults */
@@ -215,6 +220,10 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
 	if (IS_ERR(vm))
 		return PTR_ERR(vm);
 
+	eudebug_pf = xe_eudebug_pagefault_create(gt, vm, pf->page_addr,
+						 pf->fault_type, pf->fault_level,
+						 pf->access_type);
+
 	/*
 	 * TODO: Change to read lock? Using write lock for simplicity.
 	 */
@@ -227,8 +236,27 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
 
 	vma = xe_gt_pagefault_lookup_vma(vm, pf->page_addr);
 	if (!vma) {
-		err = -EINVAL;
-		goto unlock_vm;
+		if (eudebug_pf)
+			vma = xe_vm_create_null_vma(vm, pf->page_addr);
+
+		if (IS_ERR_OR_NULL(vma)) {
+			err = -EINVAL;
+			if (eudebug_pf)
+				destroy_eudebug_pf = true;
+
+			goto unlock_vm;
+		}
+	} else {
+		/*
+		 * When the eudebug_pagefault instance was created, there was
+		 * no vma containing the ppgtt address where the pagefault occurred,
+		 * but now that vm->lock has been reacquired, there is one.
+		 * While this context was not holding vm->lock, another
+		 * context allocated a vma covering the address where the
+		 * pagefault occurred.
+		 */
+		if (eudebug_pf)
+			destroy_eudebug_pf = true;
 	}
 
 	err = handle_vma_pagefault(tile, pf, vma);
@@ -237,11 +265,43 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
 	if (!err)
 		vm->usm.last_fault_vma = vma;
 	up_write(&vm->lock);
-	xe_vm_put(vm);
+
+	if (destroy_eudebug_pf) {
+		xe_eudebug_pagefault_destroy(gt, vm, eudebug_pf, false);
+		*eudebug_pf_out = NULL;
+	} else {
+		*eudebug_pf_out = eudebug_pf;
+	}
+
+	/* Keep the VM reference for the lifetime of the eudebug pagefault instance. */
+	if (!*eudebug_pf_out) {
+		xe_vm_put(vm);
+		*pf_vm = NULL;
+	} else {
+		*pf_vm = vm;
+	}
 
 	return err;
 }
 
+static void handle_pagefault_end(struct xe_gt *gt, struct xe_vm *vm,
+				 struct xe_eudebug_pagefault *eudebug_pf)
+{
+	/* If there is no eudebug_pagefault, there is nothing to do. */
+	if (!eudebug_pf)
+		return;
+
+	xe_eudebug_pagefault_process(gt, eudebug_pf);
+
+	/*
+	 * TODO: Remove VMA added to handle eudebug pagefault
+	 */
+
+	xe_eudebug_pagefault_destroy(gt, vm, eudebug_pf, true);
+
+	xe_vm_put(vm);
+}
+
 static int send_pagefault_reply(struct xe_guc *guc,
 				struct xe_guc_pagefault_reply *reply)
 {
@@ -367,7 +427,10 @@ static void pf_queue_work_func(struct work_struct *w)
 	threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);
 
 	while (get_pagefault(pf_queue, &pf)) {
-		ret = handle_pagefault(gt, &pf);
+		struct xe_eudebug_pagefault *eudebug_pf = NULL;
+		struct xe_vm *vm = NULL;
+
+		ret = handle_pagefault_start(gt, &pf, &vm, &eudebug_pf);
 		if (unlikely(ret)) {
 			print_pagefault(xe, &pf);
 			pf.fault_unsuccessful = 1;
@@ -385,7 +448,11 @@ static void pf_queue_work_func(struct work_struct *w)
 			FIELD_PREP(PFR_ENG_CLASS, pf.engine_class) |
 			FIELD_PREP(PFR_PDATA, pf.pdata);
 
-		send_pagefault_reply(&gt->uc.guc, &reply);
+		ret = send_pagefault_reply(&gt->uc.guc, &reply);
+		if (unlikely(ret))
+			drm_dbg(&xe->drm, "GuC Pagefault reply failed: %d\n", ret);
+
+		handle_pagefault_end(gt, vm, eudebug_pf);
 
 		if (time_after(jiffies, threshold) &&
 		    pf_queue->tail != pf_queue->head) {
-- 
2.43.0


  parent reply	other threads:[~2024-12-09 13:33 UTC|newest]

Thread overview: 62+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-09 13:32 [PATCH 00/26] Intel Xe GPU debug support (eudebug) v3 Mika Kuoppala
2024-12-09 13:32 ` [PATCH 01/26] ptrace: export ptrace_may_access Mika Kuoppala
2024-12-10  4:29   ` Christoph Hellwig
2024-12-12  9:16     ` Joonas Lahtinen
2024-12-09 13:32 ` [PATCH 02/26] drm/xe/eudebug: Introduce eudebug support Mika Kuoppala
2024-12-09 13:32 ` [PATCH 03/26] drm/xe/eudebug: Introduce discovery for resources Mika Kuoppala
2024-12-09 13:32 ` [PATCH 04/26] drm/xe/eudebug: Introduce exec_queue events Mika Kuoppala
2024-12-09 13:32 ` [PATCH 05/26] drm/xe/eudebug: Introduce exec queue placements event Mika Kuoppala
2024-12-09 13:32 ` [PATCH 06/26] drm/xe/eudebug: hw enablement for eudebug Mika Kuoppala
2024-12-09 13:32 ` [PATCH 07/26] drm/xe: Add EUDEBUG_ENABLE exec queue property Mika Kuoppala
2024-12-09 13:32 ` [PATCH 08/26] drm/xe/eudebug: Introduce per device attention scan worker Mika Kuoppala
2024-12-09 13:33 ` [PATCH 09/26] drm/xe/eudebug: Introduce EU control interface Mika Kuoppala
2024-12-09 13:33 ` [PATCH 10/26] drm/xe/eudebug: Add vm bind and vm bind ops Mika Kuoppala
2024-12-09 13:33 ` [PATCH 11/26] drm/xe/eudebug: Add UFENCE events with acks Mika Kuoppala
2024-12-09 13:33 ` [PATCH 12/26] drm/xe/eudebug: vm open/pread/pwrite Mika Kuoppala
2024-12-09 13:33 ` [PATCH 13/26] drm/xe: add system memory page iterator support to xe_res_cursor Mika Kuoppala
2024-12-09 13:33 ` [PATCH 14/26] drm/xe/eudebug: implement userptr_vma access Mika Kuoppala
2024-12-09 14:03   ` Christian König
2024-12-09 14:56     ` Joonas Lahtinen
2024-12-09 15:31     ` Simona Vetter
2024-12-09 15:42       ` Christian König
2024-12-09 15:45         ` Christian König
2024-12-10  9:33         ` Joonas Lahtinen
2024-12-10 10:00           ` Christian König
2024-12-10 11:57             ` Joonas Lahtinen
2024-12-10 14:03               ` Christian König
2024-12-11 12:59                 ` Joonas Lahtinen
2024-12-17 14:12                   ` Joonas Lahtinen
2024-12-20 12:47                     ` Mika Kuoppala
2024-12-10 11:17         ` Simona Vetter
2024-12-12  8:49       ` Thomas Hellström
2024-12-12 10:12         ` Simona Vetter
2024-12-13 19:39           ` Matthew Brost
2024-12-16 14:17   ` [PATCH 13/26] RFC drm/xe/eudebug: userptr vm pread/pwrite Mika Kuoppala
2024-12-20 11:31   ` Mika Kuoppala
2024-12-20 12:56     ` Christian König
2025-01-29  8:03       ` Joonas Lahtinen
2025-01-29 10:33         ` Christian König
2025-01-29 18:18           ` Joonas Lahtinen
2025-01-30 12:09             ` Christian König
2024-12-23 10:31     ` Thomas Hellström
2025-01-13 13:22       ` Mika Kuoppala
2025-01-13 13:32       ` [PATCH 13/27] mm: export access_remote_vm symbol for debugger use Mika Kuoppala
2025-01-13 13:32       ` [PATCH 14/27] drm/xe/eudebug: userptr vm access pread/pwrite Mika Kuoppala
2024-12-09 13:33 ` [PATCH 15/26] drm/xe: Debug metadata create/destroy ioctls Mika Kuoppala
2024-12-09 13:33 ` [PATCH 16/26] drm/xe: Attach debug metadata to vma Mika Kuoppala
2024-12-09 13:33 ` [PATCH 17/26] drm/xe/eudebug: Add debug metadata support for xe_eudebug Mika Kuoppala
2024-12-09 13:33 ` [PATCH 18/26] drm/xe/eudebug: Implement vm_bind_op discovery Mika Kuoppala
2024-12-09 13:33 ` [PATCH 19/26] drm/xe/eudebug: Dynamically toggle debugger functionality Mika Kuoppala
2024-12-09 13:33 ` [PATCH 20/26] drm/xe/eudebug_test: Introduce xe_eudebug wa kunit test Mika Kuoppala
2024-12-09 13:33 ` [PATCH 21/26] drm/xe/eudebug/ptl: Add support for extra attention register Mika Kuoppala
2024-12-09 13:33 ` [PATCH 22/26] drm/xe/eudebug/ptl: Add RCU_DEBUG_1 register support for xe3 Mika Kuoppala
2024-12-09 13:33 ` [PATCH 23/26] drm/xe/eudebug: Add read/count/compare helper for eu attention Mika Kuoppala
2024-12-09 13:33 ` [PATCH 24/26] drm/xe/eudebug: Introduce EU pagefault handling interface Mika Kuoppala
2024-12-09 13:33 ` [PATCH 25/26] drm/xe/vm: Support for adding null page VMA to VM on request Mika Kuoppala
2024-12-09 13:33 ` Mika Kuoppala [this message]
2024-12-09 14:37 ` ✓ CI.Patch_applied: success for Intel Xe GPU debug support (eudebug) v3 Patchwork
2024-12-09 14:38 ` ✗ CI.checkpatch: warning " Patchwork
2024-12-09 14:39 ` ✗ CI.KUnit: failure " Patchwork
2024-12-16 14:22 ` ✗ CI.Patch_applied: failure for Intel Xe GPU debug support (eudebug) v3 (rev2) Patchwork
2024-12-20 14:36 ` ✗ CI.Patch_applied: failure for Intel Xe GPU debug support (eudebug) v3 (rev3) Patchwork
2025-01-13 16:15 ` ✗ CI.Patch_applied: failure for Intel Xe GPU debug support (eudebug) v3 (rev4) Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241209133318.1806472-27-mika.kuoppala@linux.intel.com \
    --to=mika.kuoppala@linux.intel.com \
    --cc=christian.koenig@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=gwan-gyeong.mun@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox