AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Jonathan Kim <jonathan.kim@amd.com>
To: <amd-gfx@lists.freedesktop.org>, <dri-devel@lists.freedesktop.org>
Cc: Felix.Kuehling@amd.com, Jonathan.Kim@amd.com
Subject: [PATCH 17/32] drm/amdkfd: add raise exception event function
Date: Wed, 25 Jan 2023 14:53:46 -0500	[thread overview]
Message-ID: <20230125195401.4183544-18-jonathan.kim@amd.com> (raw)
In-Reply-To: <20230125195401.4183544-1-jonathan.kim@amd.com>

Exception events can be generated from interrupts or queue activity.

The raise event function will save the exception status of a queue, device
or process, then notify the debugger of the status change by writing to
a debugger-polled file descriptor that the debugger provides during
debug attach.

For memory violation exceptions, extra exception data will be saved.

The debugger will be able to query the saved exception states through a
query operation that will be provided by follow-up patches.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 91 +++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  7 ++
 3 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 659dfc7411fe..fcd064b13f6a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -38,6 +38,93 @@ void debug_event_write_work_handler(struct work_struct *work)
 	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
 }
 
+/* update process/device/queue exception status, write to descriptor
+ * only if the raised exception is set in exception_enable_mask.
+ */
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+			struct kfd_process *process, struct kfd_dev *dev,
+			unsigned int source_id, bool use_worker,
+			void *exception_data, size_t exception_data_size)
+{
+	struct process_queue_manager *pqm;
+	struct process_queue_node *pqn;
+	int i;
+	static const char write_data = '.';
+	loff_t pos = 0;
+	bool is_subscribed = true;
+
+	if (!(process && process->debug_trap_enabled))
+		return false;
+
+	mutex_lock(&process->event_mutex);
+
+	if (event_mask & KFD_EC_MASK_DEVICE) {
+		for (i = 0; i < process->n_pdds; i++) {
+			struct kfd_process_device *pdd = process->pdds[i];
+
+			if (pdd->dev != dev)
+				continue;
+
+			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
+
+			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+				if (!pdd->vm_fault_exc_data) {
+					pdd->vm_fault_exc_data = kmemdup(
+							exception_data,
+							exception_data_size,
+							GFP_KERNEL);
+					if (!pdd->vm_fault_exc_data)
+						pr_debug("Failed to allocate exception data memory");
+				} else {
+					pr_debug("Debugger exception data not saved\n");
+					print_hex_dump_bytes("exception data: ",
+							DUMP_PREFIX_OFFSET,
+							exception_data,
+							exception_data_size);
+				}
+			}
+			break;
+		}
+	} else if (event_mask & KFD_EC_MASK_PROCESS) {
+		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
+	} else {
+		pqm = &process->pqm;
+		list_for_each_entry(pqn, &pqm->queues,
+				process_queue_list) {
+			int target_id;
+
+			if (!pqn->q)
+				continue;
+
+			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
+					pqn->q->properties.queue_id :
+							pqn->q->doorbell_id;
+
+			if (pqn->q->device != dev || target_id != source_id)
+				continue;
+
+			pqn->q->properties.exception_status |= event_mask;
+			break;
+		}
+	}
+
+	if (process->exception_enable_mask & event_mask) {
+		if (use_worker)
+			schedule_work(&process->debug_event_workarea);
+		else
+			kernel_write(process->dbg_ev_file,
+					&write_data,
+					1,
+					&pos);
+	} else {
+		is_subscribed = false;
+	}
+
+	mutex_unlock(&process->event_mutex);
+
+	return is_subscribed;
+}
+
 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
 {
 	struct mqd_update_info minfo = {0};
@@ -88,7 +175,6 @@ static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
 	}
 
 	return r;
-}
 
 static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
 {
@@ -114,6 +200,9 @@ static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int
 {
 	int i, count = 0;
 
+	if (!unwind)
+		cancel_work_sync(&target->debug_event_workarea);
+
 	for (i = 0; i < target->n_pdds; i++) {
 		struct kfd_process_device *pdd = target->pdds[i];
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index f199698d8d60..2d5bc102f6b4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -28,6 +28,11 @@
 void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
 					uint32_t vmid,
 					bool stall);
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+			struct kfd_process *process, struct kfd_dev *dev,
+			unsigned int source_id, bool use_worker,
+			void *exception_data,
+			size_t exception_data_size);
 int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 			void __user *runtime_info,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 75521d96e937..e503bd94dda6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -495,6 +495,7 @@ struct queue_properties {
 	uint32_t ctl_stack_size;
 	uint64_t tba_addr;
 	uint64_t tma_addr;
+	uint64_t exception_status;
 };
 
 #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
@@ -786,6 +787,11 @@ struct kfd_process_device {
 	uint64_t page_in;
 	uint64_t page_out;
 
+	/* Exception code status*/
+	uint64_t exception_status;
+	void *vm_fault_exc_data;
+	size_t vm_fault_exc_data_size;
+
 	/* Tracks debug per-vmid request settings */
 	uint32_t spi_dbg_override;
 	uint32_t spi_dbg_launch_mode;
@@ -921,6 +927,7 @@ struct kfd_process {
 
 	/* Exception code enable mask and status */
 	uint64_t exception_enable_mask;
+	uint64_t exception_status;
 
 	/* shared virtual memory registered by this process */
 	struct svm_range_list svms;
-- 
2.25.1


  parent reply	other threads:[~2023-01-25 19:55 UTC|newest]

Thread overview: 68+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-25 19:53 [PATCH 00/32] Upstream of kernel support for AMDGPU ISA debugging Jonathan Kim
2023-01-25 19:53 ` [PATCH 01/32] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
2023-02-16 22:16   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 02/32] drm/amdkfd: display debug capabilities Jonathan Kim
2023-02-16 22:24   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 03/32] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
2023-02-16 23:44   ` Felix Kuehling
2023-03-23 19:12     ` Kim, Jonathan
2023-03-23 20:08       ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 04/32] drm/amdgpu: add kgd hw debug mode setting interface Jonathan Kim
2023-01-25 19:53 ` [PATCH 05/32] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
2023-02-16 22:39   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 06/32] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
2023-01-29  5:12   ` kernel test robot
2023-02-16 22:54   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 07/32] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
2023-01-29  6:34   ` kernel test robot
2023-02-16 23:01   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 08/32] drm/amdgpu: add gfx10 " Jonathan Kim
2023-01-29  7:55   ` kernel test robot
2023-02-16 23:11   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 09/32] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
2023-02-16 23:14   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 10/32] drm/amdgpu: add gfx11 " Jonathan Kim
2023-02-16 23:19   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 11/32] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
2023-03-20 19:19   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 12/32] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
2023-03-20 20:06   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 13/32] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
2023-03-20 20:16   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 14/32] drm/amdgpu: expose debug api for mes Jonathan Kim
2023-03-20 20:47   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11 Jonathan Kim
2023-03-20 21:49   ` Felix Kuehling
2023-03-23 13:50     ` Kim, Jonathan
2023-03-23 14:00       ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 16/32] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
2023-03-20 23:06   ` Felix Kuehling
2023-01-25 19:53 ` Jonathan Kim [this message]
2023-03-20 23:18   ` [PATCH 17/32] drm/amdkfd: add raise exception event function Felix Kuehling
2023-01-25 19:53 ` [PATCH 18/32] drm/amdkfd: add send exception operation Jonathan Kim
2023-03-20 23:26   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 19/32] drm/amdkfd: add runtime enable operation Jonathan Kim
2023-03-21  0:31   ` Felix Kuehling
2023-03-23 19:45     ` Kim, Jonathan
2023-01-25 19:53 ` [PATCH 20/32] drm/amdkfd: add debug trap enabled flag to tma Jonathan Kim
2023-01-25 19:53 ` [PATCH 21/32] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
2023-03-21 21:07   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 22/32] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
2023-01-25 19:53 ` [PATCH 23/32] drm/amdkfd: add debug wave launch override operation Jonathan Kim
2023-03-21 21:37   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 24/32] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
2023-03-21 21:42   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 25/32] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
2023-03-21 22:16   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 26/32] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
2023-03-22 21:38   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 27/32] drm/amdkfd: add debug set flags operation Jonathan Kim
2023-03-22 21:47   ` Felix Kuehling
2023-01-25 19:53 ` [PATCH 28/32] drm/amdkfd: add debug query event operation Jonathan Kim
2023-01-25 19:53 ` [PATCH 29/32] drm/amdkfd: add debug query exception info operation Jonathan Kim
2023-01-25 19:53 ` [PATCH 30/32] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
2023-03-22 21:52   ` Felix Kuehling
2023-01-25 19:54 ` [PATCH 31/32] drm/amdkfd: add debug device " Jonathan Kim
2023-03-22 21:54   ` Felix Kuehling
2023-01-25 19:54 ` [PATCH 32/32] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
2023-03-22 21:56   ` Felix Kuehling

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230125195401.4183544-18-jonathan.kim@amd.com \
    --to=jonathan.kim@amd.com \
    --cc=Felix.Kuehling@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=dri-devel@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox