From: Jonathan Kim <jonathan.kim@amd.com>
To: <amd-gfx@lists.freedesktop.org>,
<dri-devel@lists.freedesktop.org>, <Felix.Kuehling@amd.com>
Cc: Jinhuieric.Huang@amd.com
Subject: [PATCH 19/33] drm/amdkfd: add send exception operation
Date: Thu, 25 May 2023 13:27:31 -0400 [thread overview]
Message-ID: <20230525172745.702700-19-jonathan.kim@amd.com> (raw)
In-Reply-To: <20230525172745.702700-1-jonathan.kim@amd.com>
Add a debug operation that allows the debugger to send an exception
directly to the runtime through a payload address.
For memory violations, the runtime is instead notified through the
normal vmfault signal, which is passed the exception data that was
saved when the memory violation was raised to the debugger.
For runtime exceptions, this unblocks the runtime enable function,
which will be explained and implemented in a follow-up patch.
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
---
.../gpu/drm/amd/amdkfd/cik_event_interrupt.c | 4 +-
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 ++
drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 43 +++++++++++
drivers/gpu/drm/amd/amdkfd/kfd_debug.h | 6 ++
drivers/gpu/drm/amd/amdkfd/kfd_events.c | 3 +-
.../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 +-
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 7 +-
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 71 ++++++++++++++++++-
8 files changed, 135 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 4ebfff6b6c55..795382b55e0a 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -118,9 +118,9 @@ static void cik_event_interrupt_wq(struct kfd_node *dev,
return;
if (info.vmid == vmid)
- kfd_signal_vm_fault_event(dev, pasid, &info);
+ kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
else
- kfd_signal_vm_fault_event(dev, pasid, NULL);
+ kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index a5c457863048..ec5a85454192 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2833,6 +2833,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
r = kfd_dbg_trap_disable(target);
break;
case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+ r = kfd_dbg_send_exception_to_runtime(target,
+ args->send_runtime_event.gpu_id,
+ args->send_runtime_event.queue_id,
+ args->send_runtime_event.exception_mask);
+ break;
case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index dccb27fc764b..61098975bb0e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,49 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
return is_subscribed;
}
+/*
+ * Forward the debugger-requested exceptions in @error_reason to the runtime
+ * of process @p.
+ *
+ * EC_DEVICE_MEMORY_VIOLATION is delivered as a VM fault event on the device
+ * whose id equals @dev_id, using that device's vm_fault_exc_data.
+ * EC_PROCESS_RUNTIME releases p->runtime_enable_sema.  Any reason bits left
+ * over are handed to kfd_send_exception_to_runtime() for queue @queue_id.
+ *
+ * Returns -ENODEV when a memory violation is requested for a device id the
+ * process has no per-device data for; otherwise 0, or the result of
+ * kfd_send_exception_to_runtime() when remaining reasons were forwarded.
+ */
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+					unsigned int dev_id,
+					unsigned int queue_id,
+					uint64_t error_reason)
+{
+	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+		struct kfd_hsa_memory_exception_data *saved_fault;
+		struct kfd_process_device *target = NULL;
+		int idx = 0;
+
+		/* Pick the first per-device entry whose device id matches. */
+		while (!target && idx < p->n_pdds) {
+			if (p->pdds[idx]->dev->id == dev_id)
+				target = p->pdds[idx];
+			idx++;
+		}
+
+		if (!target)
+			return -ENODEV;
+
+		saved_fault = (struct kfd_hsa_memory_exception_data *)
+						target->vm_fault_exc_data;
+
+		/* Evict the process' queues, then raise the VM fault event. */
+		kfd_dqm_evict_pasid(target->dev->dqm, p->pasid);
+		kfd_signal_vm_fault_event(target->dev, p->pasid, NULL,
+					  saved_fault);
+		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
+	}
+
+	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
+		/*
+		 * The matching block is only expected to happen after the
+		 * debugger has received the runtime enable notice.
+		 */
+		up(&p->runtime_enable_sema);
+		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
+	}
+
+	/* Nothing left to deliver through the queue's error payload. */
+	if (!error_reason)
+		return 0;
+
+	return kfd_send_exception_to_runtime(p, queue_id, error_reason);
+}
+
+
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
struct mqd_update_info minfo = {0};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 66ee7b95d08a..2c6866bb8850 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -34,6 +34,12 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
void __user *runtime_info,
uint32_t *runtime_info_size);
+
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p, /* process whose runtime receives the exception */
+ unsigned int dev_id, /* compared against each pdd's dev->id for memory violations */
+ unsigned int queue_id, /* queue used for reasons forwarded to kfd_send_exception_to_runtime() */
+ uint64_t error_reason); /* mask of KFD_EC_MASK() reason bits; returns 0, -ENODEV, or the forwarded result */
+
static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev)
{
return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index c02e4e680237..7ff5c4e1b7e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1222,7 +1222,8 @@ void kfd_signal_hw_exception_event(u32 pasid)
}
void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
- struct kfd_vm_fault_info *info)
+ struct kfd_vm_fault_info *info,
+ struct kfd_hsa_memory_exception_data *data)
{
struct kfd_event *ev;
uint32_t id;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 861bccb1e9dc..8cf58be80f4e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -362,7 +362,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
kfd_smi_event_update_vmfault(dev, pasid);
kfd_dqm_evict_pasid(dev->dqm, pasid);
- kfd_signal_vm_fault_event(dev, pasid, &info);
+ kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index b18cd4bf76bf..58b82fa59584 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -979,6 +979,7 @@ struct kfd_process {
bool queues_paused;
/* Tracks runtime enable status */
+ struct semaphore runtime_enable_sema;
struct kfd_runtime_info runtime_info;
};
@@ -1447,7 +1448,8 @@ int kfd_get_num_events(struct kfd_process *p);
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
- struct kfd_vm_fault_info *info);
+ struct kfd_vm_fault_info *info,
+ struct kfd_hsa_memory_exception_data *data);
void kfd_signal_reset_event(struct kfd_node *dev);
@@ -1463,6 +1465,9 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0);
}
+int kfd_send_exception_to_runtime(struct kfd_process *p, /* process that owns the queue */
+ unsigned int queue_id, /* looked up in p->pqm by the work handler */
+ uint64_t error_reason); /* bits OR-ed into the queue's error payload; the implementation in kfd_process.c returns 0 */
bool kfd_is_locked(void);
/* Compute profile */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index fa19c1218748..8bfd0c91fb92 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1462,6 +1462,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
process->debugger_process = NULL;
process->exception_enable_mask = 0;
atomic_set(&process->debugged_process_count, 0);
+ sema_init(&process->runtime_enable_sema, 0);
process->pasid = kfd_pasid_alloc();
if (process->pasid == 0) {
@@ -2120,6 +2121,75 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
}
}
+struct send_exception_work_handler_workarea { /* arguments handed to send_exception_work_handler() */
+ struct work_struct work; /* embedded work item; the handler recovers this struct via container_of() */
+ struct kfd_process *p; /* process whose lead thread's mm the handler attaches to */
+ unsigned int queue_id; /* looked up in p->pqm with pqm_get_user_queue() */
+ uint64_t error_reason; /* bits OR-ed into the value at the queue's error payload address */
+};
+
+/*
+ * Work handler that records a debugger-sent exception for a user queue and
+ * signals the runtime's error event.
+ *
+ * The handler attaches the mm of the process' lead thread so the user
+ * pointers found in the queue's context save area header can be accessed.
+ * The reason bits from the work area are OR-ed into the 64-bit word that
+ * err_payload_addr points to, and the event named by err_event_id is set.
+ *
+ * If the mm is gone, the queue does not exist, or any user-space access
+ * faults, the handler stops without signalling the event, so the runtime is
+ * never woken for an exception that was not actually recorded.
+ */
+static void send_exception_work_handler(struct work_struct *work)
+{
+	struct send_exception_work_handler_workarea *workarea;
+	struct kfd_process *p;
+	struct queue *q;
+	struct mm_struct *mm;
+	struct kfd_context_save_area_header __user *csa_header;
+	uint64_t __user *err_payload_ptr;
+	uint64_t cur_err;
+	uint32_t ev_id;
+
+	workarea = container_of(work,
+				struct send_exception_work_handler_workarea,
+				work);
+	p = workarea->p;
+
+	mm = get_task_mm(p->lead_thread);
+	if (!mm)
+		return;
+
+	/* User accesses below must resolve against the target's mm. */
+	kthread_use_mm(mm);
+
+	q = pqm_get_user_queue(&p->pqm, workarea->queue_id);
+	if (!q)
+		goto out;
+
+	csa_header = (void __user *)q->properties.ctx_save_restore_area_address;
+
+	/*
+	 * get_user()/put_user() return non-zero when the access faults.
+	 * Continuing after a fault would operate on a zeroed pointer or
+	 * value and end up setting an unrelated event id, so bail out.
+	 */
+	if (get_user(err_payload_ptr,
+		     (uint64_t __user **)&csa_header->err_payload_addr))
+		goto out;
+
+	if (get_user(cur_err, err_payload_ptr))
+		goto out;
+
+	cur_err |= workarea->error_reason;
+
+	if (put_user(cur_err, err_payload_ptr))
+		goto out;
+
+	if (get_user(ev_id, &csa_header->err_event_id))
+		goto out;
+
+	kfd_set_event(p, ev_id);
+
+out:
+	kthread_unuse_mm(mm);
+	mmput(mm);
+}
+
+/*
+ * Run send_exception_work_handler() for queue @queue_id of process @p with
+ * the reason bits in @error_reason, and wait for it to complete.
+ *
+ * The request is an on-stack work item queued with schedule_work() and
+ * flushed before this function returns.  Always returns 0; the handler does
+ * not report a status back.
+ */
+int kfd_send_exception_to_runtime(struct kfd_process *p,
+				  unsigned int queue_id,
+				  uint64_t error_reason)
+{
+	struct send_exception_work_handler_workarea req;
+
+	req.p = p;
+	req.queue_id = queue_id;
+	req.error_reason = error_reason;
+	INIT_WORK_ONSTACK(&req.work, send_exception_work_handler);
+
+	/* req lives in this stack frame, so wait until the work has run. */
+	schedule_work(&req.work);
+	flush_work(&req.work);
+	destroy_work_on_stack(&req.work);
+
+	return 0;
+}
+
+
struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id)
{
int i;
@@ -2179,4 +2249,3 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
}
#endif
-
--
2.25.1
next prev parent reply other threads:[~2023-05-25 17:28 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-05-25 17:27 [PATCH 01/33] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
2023-05-25 17:27 ` [PATCH 02/33] drm/amdkfd: display debug capabilities Jonathan Kim
2023-05-30 19:20 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 03/33] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
2023-05-25 17:27 ` [PATCH 04/33] drm/amdgpu: add kgd hw debug mode setting interface Jonathan Kim
2023-05-25 17:27 ` [PATCH 05/33] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
2023-05-30 19:23 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 06/33] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
2023-05-25 17:27 ` [PATCH 07/33] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
2023-05-25 17:27 ` [PATCH 08/33] drm/amdkfd: fix kfd_suspend_all_processes Jonathan Kim
2023-05-25 17:27 ` [PATCH 09/33] drm/amdgpu: add gfx10 hw debug mode enable and disable calls Jonathan Kim
2023-05-25 17:27 ` [PATCH 10/33] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
2023-05-25 17:27 ` [PATCH 11/33] drm/amdgpu: add gfx11 " Jonathan Kim
2023-05-25 17:27 ` [PATCH 12/33] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
2023-05-30 19:28 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 13/33] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
2023-05-30 19:36 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 14/33] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
2023-05-30 19:55 ` Felix Kuehling
2023-05-30 19:58 ` Kim, Jonathan
2023-05-25 17:27 ` [PATCH 15/33] drm/amdgpu: expose debug api for mes Jonathan Kim
2023-05-25 17:27 ` [PATCH 16/33] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
2023-05-30 20:04 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 17/33] drm/amdkfd: apply trap workaround for gfx11 Jonathan Kim
2023-05-25 17:27 ` [PATCH 18/33] drm/amdkfd: add raise exception event function Jonathan Kim
2023-05-30 20:07 ` Felix Kuehling
2023-05-25 17:27 ` Jonathan Kim [this message]
2023-05-25 17:27 ` [PATCH 20/33] drm/amdkfd: add runtime enable operation Jonathan Kim
2023-05-30 20:11 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 21/33] drm/amdkfd: add debug trap enabled flag to tma Jonathan Kim
2023-05-25 17:27 ` [PATCH 22/33] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
2023-05-30 20:16 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 23/33] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
2023-05-25 17:27 ` [PATCH 24/33] drm/amdkfd: add debug wave launch override operation Jonathan Kim
2023-05-30 20:21 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 25/33] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
2023-05-30 20:22 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 26/33] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
2023-05-30 20:24 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 27/33] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
2023-05-30 20:26 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 28/33] drm/amdkfd: add debug set flags operation Jonathan Kim
2023-05-30 20:30 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 29/33] drm/amdkfd: add debug query event operation Jonathan Kim
2023-05-25 17:27 ` [PATCH 30/33] drm/amdkfd: add debug query exception info operation Jonathan Kim
2023-05-25 17:27 ` [PATCH 31/33] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
2023-05-25 17:27 ` [PATCH 32/33] drm/amdkfd: add debug device " Jonathan Kim
2023-05-30 20:31 ` Felix Kuehling
2023-05-25 17:27 ` [PATCH 33/33] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
2023-05-30 19:17 ` [PATCH 01/33] drm/amdkfd: add debug and runtime enable interface Felix Kuehling
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230525172745.702700-19-jonathan.kim@amd.com \
--to=jonathan.kim@amd.com \
--cc=Felix.Kuehling@amd.com \
--cc=Jinhuieric.Huang@amd.com \
--cc=amd-gfx@lists.freedesktop.org \
--cc=dri-devel@lists.freedesktop.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox