* [PATCH v4 04/10] drm/amdgpu: Create hqd info structure @ 2026-04-13 18:34 Amber Lin 2026-04-13 18:34 ` [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin 2026-04-13 20:22 ` [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Alex Deucher 0 siblings, 2 replies; 5+ messages in thread From: Amber Lin @ 2026-04-13 18:34 UTC (permalink / raw) To: amd-gfx, alexdeucher; +Cc: Amber Lin, Jonathan Kim Create hung_queue_hqd_info structure and fill in hung queue information passed by MES, including queue type, pipe id, and queue id. Suggested-by: Jonathan Kim <jonathan.kim@amd.com> Signed-off-by: Amber Lin <Amber.Lin@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 13 ++++--------- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index f1f8bbfc31e0..ae42fbaba34f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, { struct mes_detect_and_reset_queue_input input; u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id]; - int r, i; + int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i; if (!hung_db_num || !hung_db_array) return -EINVAL; @@ -481,18 +481,13 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, } } - if (r && !hung_db_num) { + if (r && !(*hung_db_num)) { dev_err(adev->dev, "Failed to detect and reset hung queues\n"); return r; } - /* - * TODO: return HQD info for MES scheduled user compute queue reset cases - * stored in hung_db_array hqd info offset to full array size - */ - - if (r) - dev_err(adev->dev, "failed to reset\n"); + for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++) + hung_db_array[i] = db_array[i]; return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index f80e3aca9c78..2e6ae9f84db0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -170,6 +170,19 @@ struct amdgpu_mes { uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES]; }; +struct amdgpu_mes_hung_queue_hqd_info { + union { + struct { + uint32_t queue_type: 3; // queue type + uint32_t pipe_index: 4; // pipe index + uint32_t queue_index: 8; // queue index + uint32_t reserved: 17; + }; + + uint32_t bit0_31; + }; +}; + struct amdgpu_mes_gang { int gang_id; int priority; -- 2.43.0 ^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES 2026-04-13 18:34 [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Amber Lin @ 2026-04-13 18:34 ` Amber Lin 2026-04-13 20:24 ` Alex Deucher 2026-04-13 20:22 ` [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Alex Deucher 1 sibling, 1 reply; 5+ messages in thread From: Amber Lin @ 2026-04-13 18:34 UTC (permalink / raw) To: amd-gfx, alexdeucher; +Cc: Amber Lin, Jonathan Kim When removing queues fails, KFD calls amdgpu_mes to detect and reset hung queues, then cleans up those hung queues in KFD. Suggested-by: Jonathan Kim <jonathan.kim@amd.com> Signed-off-by: Amber Lin <Amber.Lin@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 + drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++- .../drm/amd/amdkfd/kfd_device_queue_manager.h | 4 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + 5 files changed, 156 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 51ad6563ec73..d13bed68d50b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -787,6 +787,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev) amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0)); } +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev) +{ + return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) && + (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73); +} + /* Fix me -- node_id is used to identify the correct MES instances in the future */ static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 643b4f8d757a..44fa4d73bce8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ 
-548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes) } bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev); +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev); int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index ec8d7f4be840..2670741f3e53 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q, const uint32_t *restore_sdma_id); static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma); +static int resume_all_queues_mes(struct device_queue_manager *dqm); +static int suspend_all_queues_mes(struct device_queue_manager *dqm); +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, + uint32_t doorbell_offset); +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, return r; } -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd) +static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd, + bool is_for_reset, + bool flush_mes_queue) { struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; int r; struct mes_remove_queue_input queue_input; + /* queue was already removed during reset */ + if (q->properties.is_reset) + return 0; + if (!dqm->sched_running || dqm->sched_halt) return 0; if (!down_read_trylock(&adev->reset_domain->sem)) @@ -288,6 +300,7 @@ 
static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); queue_input.doorbell_offset = q->properties.doorbell_off; queue_input.gang_context_addr = q->gang_ctx_gpu_addr; + queue_input.remove_queue_after_reset = flush_mes_queue; queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1; amdgpu_mes_lock(&adev->mes); @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, amdgpu_mes_unlock(&adev->mes); up_read(&adev->reset_domain->sem); + if (is_for_reset) + return r; + if (r) { + if (!suspend_all_queues_mes(dqm)) + return resume_all_queues_mes(dqm); + dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n", q->properties.doorbell_off); dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, return r; } +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false); +} + static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) { struct device_process_node *cur; @@ -359,6 +384,92 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm) return retval; } +static int reset_queues_mes(struct device_queue_manager *dqm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset; + int num_hung = 0, r = 0, i, pipe, queue, queue_type; + uint32_t *hung_array = dqm->hung_db_array; + struct amdgpu_mes_hung_queue_hqd_info *hqd_info = dqm->hqd_info; + struct kfd_process_device *pdd; + struct queue *q; + + if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) { + r = -ENOTRECOVERABLE; + goto fail; + } + + /* reset should be used only in dqm locked queue reset */ + if (WARN_ON(dqm->detect_hang_count > 
0)) + return 0; + + if (!amdgpu_gpu_recovery) { + r = -ENOTRECOVERABLE; + goto fail; + } + + if (!hung_array || !hqd_info) { + r = -ENOMEM; + goto fail; + } + + memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info)); + + /* + * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called + * post suspend_all as reset & detect will return all hung queue types. + * + * Passed parameter is for targeting queues not scheduled by MES add_queue. + */ + r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE, + false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1); + + if (!num_hung || r) { + r = -ENOTRECOVERABLE; + goto fail; + } + + /* MES resets queue/pipe and cleans up internally */ + for (i = 0; i < num_hung; i++) { + hqd_info[i].bit0_31 = hung_array[i + hqd_info_size]; + pipe = hqd_info[i].pipe_index; + queue = hqd_info[i].queue_index; + queue_type = hqd_info[i].queue_type; + + if (queue_type != MES_QUEUE_TYPE_COMPUTE && + queue_type != MES_QUEUE_TYPE_SDMA) { + pr_warn("Unsupported hung queue reset type: %d\n", queue_type); + hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET; + continue; + } + + q = find_queue_by_doorbell_offset(dqm, hung_array[i]); + if (!q) { + r = -ENOTRECOVERABLE; + goto fail; + } + + pdd = kfd_get_process_device_data(q->device, q->process); + if (!pdd) { + r = -ENODEV; + goto fail; + } + + pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n", + hung_array[i], pipe, queue, queue_type); + /* Proceed remove_queue with reset=true */ + remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false); + set_queue_as_reset(dqm, q, &pdd->qpd); + } + + dqm->detect_hang_count = num_hung; + kfd_signal_reset_event(dqm->dev); + +fail: + dqm->detect_hang_count = 0; + return r; +} + static int suspend_all_queues_mes(struct device_queue_manager *dqm) { struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; @@ -371,6 +482,9 @@ static int suspend_all_queues_mes(struct device_queue_manager 
*dqm) up_read(&adev->reset_domain->sem); if (r) { + if (!reset_queues_mes(dqm)) + return 0; + dev_err(adev->dev, "failed to suspend gangs from MES\n"); dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); kfd_hws_hang(dqm); @@ -1821,6 +1935,9 @@ static int start_cpsch(struct device_queue_manager *dqm) { struct device *dev = dqm->dev->adev->dev; int retval, num_hw_queue_slots; + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int hung_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev); + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset; dqm_lock(dqm); @@ -1870,6 +1987,11 @@ static int start_cpsch(struct device_queue_manager *dqm) goto fail_detect_hang_buffer; } + dqm->hung_db_array = kzalloc(hung_array_size * sizeof(uint32_t), GFP_KERNEL); + dqm->hqd_info = kzalloc( + hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info), + GFP_KERNEL); + dqm_unlock(dqm); return 0; @@ -1910,6 +2032,9 @@ static int stop_cpsch(struct device_queue_manager *dqm) pm_uninit(&dqm->packet_mgr); kfree(dqm->detect_hang_info); dqm->detect_hang_info = NULL; + kfree(dqm->hung_db_array); + kfree(dqm->hqd_info); + dqm_unlock(dqm); return ret; @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q q->properties.queue_id, pdd->process->lead_thread->pid); pdd->has_reset_queue = true; + q->properties.is_reset = true; if (q->properties.is_active) { q->properties.is_active = false; decrement_queue_count(dqm, qpd, q); @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin return NULL; } +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset) +{ + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (doorbell_offset == 
q->properties.doorbell_off) + return q; + } + } + + return NULL; +} + static int reset_hung_queues(struct device_queue_manager *dqm) { int r = 0, reset_count = 0, i; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 3272328da11f..e6eca38cae4e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -32,7 +32,6 @@ #include "kfd_priv.h" #include "kfd_mqd_manager.h" - #define VMID_NUM 16 #define KFD_MES_PROCESS_QUANTUM 100000 @@ -285,6 +284,9 @@ struct device_queue_manager { struct dqm_detect_hang_info *detect_hang_info; size_t detect_hang_info_size; int detect_hang_count; + /* for per-queue reset with mes */ + uint32_t *hung_db_array; + struct amdgpu_mes_hung_queue_hqd_info *hqd_info; }; void device_queue_manager_init_cik( diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index e7a8f3e17872..7e0d4b83c2cf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -523,6 +523,7 @@ struct queue_properties { uint32_t pm4_target_xcc; bool is_dbg_wa; bool is_user_cu_masked; + bool is_reset; /* Not relevant for user mode queues in cp scheduling */ unsigned int vmid; /* Relevant only for sdma queues*/ -- 2.43.0 ^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES 2026-04-13 18:34 ` [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin @ 2026-04-13 20:24 ` Alex Deucher 0 siblings, 0 replies; 5+ messages in thread From: Alex Deucher @ 2026-04-13 20:24 UTC (permalink / raw) To: Amber Lin; +Cc: amd-gfx, Jonathan Kim On Mon, Apr 13, 2026 at 2:34 PM Amber Lin <Amber.Lin@amd.com> wrote: > > When removing queues fails, KFD calls amdgpu_mes to detect and reset > hung queues, then cleans up those hung queues in KFD. > > Suggested-by: Jonathan Kim <jonathan.kim@amd.com> > Signed-off-by: Amber Lin <Amber.Lin@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 + > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++- > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 4 +- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + > 5 files changed, 156 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > index 51ad6563ec73..d13bed68d50b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > @@ -787,6 +787,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev) > amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0)); > } > > +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev) > +{ > + return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) && > + (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73); > +} > + > /* Fix me -- node_id is used to identify the correct MES instances in the future */ > static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, > uint32_t node_id, bool enable) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > index 643b4f8d757a..44fa4d73bce8 100644 > --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > @@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes) > } > > bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev); > +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev); > > int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index ec8d7f4be840..2670741f3e53 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, > struct queue *q, const uint32_t *restore_sdma_id); > > static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma); > +static int resume_all_queues_mes(struct device_queue_manager *dqm); > +static int suspend_all_queues_mes(struct device_queue_manager *dqm); > +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, > + uint32_t doorbell_offset); > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, > + struct qcm_process_device *qpd); > > static inline > enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) > @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, > return r; > } > > -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, > - struct qcm_process_device *qpd) > +static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q, > + struct qcm_process_device *qpd, > + bool is_for_reset, > + bool flush_mes_queue) > { > struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > int r; > struct mes_remove_queue_input queue_input; > > + /* queue was already removed during reset 
*/ > + if (q->properties.is_reset) > + return 0; > + > if (!dqm->sched_running || dqm->sched_halt) > return 0; > if (!down_read_trylock(&adev->reset_domain->sem)) > @@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, > memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); > queue_input.doorbell_offset = q->properties.doorbell_off; > queue_input.gang_context_addr = q->gang_ctx_gpu_addr; > + queue_input.remove_queue_after_reset = flush_mes_queue; > queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1; > > amdgpu_mes_lock(&adev->mes); > @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, > amdgpu_mes_unlock(&adev->mes); > up_read(&adev->reset_domain->sem); > > + if (is_for_reset) > + return r; > + > if (r) { > + if (!suspend_all_queues_mes(dqm)) > + return resume_all_queues_mes(dqm); > + > dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n", > q->properties.doorbell_off); > dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); > @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, > return r; > } > > +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, > + struct qcm_process_device *qpd) > +{ > + return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false); > +} > + > static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) > { > struct device_process_node *cur; > @@ -359,6 +384,92 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm) > return retval; > } > > +static int reset_queues_mes(struct device_queue_manager *dqm) > +{ > + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset; > + int num_hung = 0, r = 0, i, pipe, queue, queue_type; > + uint32_t *hung_array = dqm->hung_db_array; > + struct amdgpu_mes_hung_queue_hqd_info 
*hqd_info = dqm->hqd_info; > + struct kfd_process_device *pdd; > + struct queue *q; > + > + if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) { > + r = -ENOTRECOVERABLE; > + goto fail; > + } > + > + /* reset should be used only in dqm locked queue reset */ > + if (WARN_ON(dqm->detect_hang_count > 0)) > + return 0; > + > + if (!amdgpu_gpu_recovery) { > + r = -ENOTRECOVERABLE; > + goto fail; > + } > + > + if (!hung_array || !hqd_info) { > + r = -ENOMEM; > + goto fail; > + } > + > + memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info)); > + > + /* > + * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called > + * post suspend_all as reset & detect will return all hung queue types. > + * > + * Passed parameter is for targeting queues not scheduled by MES add_queue. > + */ > + r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE, > + false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1); > + > + if (!num_hung || r) { > + r = -ENOTRECOVERABLE; > + goto fail; > + } > + > + /* MES resets queue/pipe and cleans up internally */ > + for (i = 0; i < num_hung; i++) { > + hqd_info[i].bit0_31 = hung_array[i + hqd_info_size]; > + pipe = hqd_info[i].pipe_index; > + queue = hqd_info[i].queue_index; > + queue_type = hqd_info[i].queue_type; > + > + if (queue_type != MES_QUEUE_TYPE_COMPUTE && > + queue_type != MES_QUEUE_TYPE_SDMA) { > + pr_warn("Unsupported hung queue reset type: %d\n", queue_type); > + hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET; > + continue; > + } > + > + q = find_queue_by_doorbell_offset(dqm, hung_array[i]); > + if (!q) { > + r = -ENOTRECOVERABLE; > + goto fail; > + } > + > + pdd = kfd_get_process_device_data(q->device, q->process); > + if (!pdd) { > + r = -ENODEV; > + goto fail; > + } > + > + pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n", > + hung_array[i], pipe, queue, queue_type); > + /* Proceed remove_queue with reset=true */ > + remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, 
true, false); > + set_queue_as_reset(dqm, q, &pdd->qpd); > + } > + > + dqm->detect_hang_count = num_hung; > + kfd_signal_reset_event(dqm->dev); > + > +fail: > + dqm->detect_hang_count = 0; > + return r; > +} > + > static int suspend_all_queues_mes(struct device_queue_manager *dqm) > { > struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > @@ -371,6 +482,9 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm) > up_read(&adev->reset_domain->sem); > > if (r) { > + if (!reset_queues_mes(dqm)) > + return 0; > + > dev_err(adev->dev, "failed to suspend gangs from MES\n"); > dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); > kfd_hws_hang(dqm); > @@ -1821,6 +1935,9 @@ static int start_cpsch(struct device_queue_manager *dqm) > { > struct device *dev = dqm->dev->adev->dev; > int retval, num_hw_queue_slots; > + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > + int hung_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev); > + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset; > > dqm_lock(dqm); > > @@ -1870,6 +1987,11 @@ static int start_cpsch(struct device_queue_manager *dqm) > goto fail_detect_hang_buffer; > } > > + dqm->hung_db_array = kzalloc(hung_array_size * sizeof(uint32_t), GFP_KERNEL); > + dqm->hqd_info = kzalloc( > + hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info), > + GFP_KERNEL); > + > dqm_unlock(dqm); > > return 0; > @@ -1910,6 +2032,9 @@ static int stop_cpsch(struct device_queue_manager *dqm) > pm_uninit(&dqm->packet_mgr); > kfree(dqm->detect_hang_info); > dqm->detect_hang_info = NULL; > + kfree(dqm->hung_db_array); > + kfree(dqm->hqd_info); > + > dqm_unlock(dqm); > > return ret; > @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q > q->properties.queue_id, pdd->process->lead_thread->pid); > > pdd->has_reset_queue = true; > + q->properties.is_reset = true; > if (q->properties.is_active) { > 
q->properties.is_active = false; > decrement_queue_count(dqm, qpd, q); > @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin > return NULL; > } > > +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset) > +{ > + struct device_process_node *cur; > + struct qcm_process_device *qpd; > + struct queue *q; > + > + list_for_each_entry(cur, &dqm->queues, list) { > + qpd = cur->qpd; > + list_for_each_entry(q, &qpd->queues_list, list) { > + if (doorbell_offset == q->properties.doorbell_off) > + return q; > + } > + } > + > + return NULL; > +} > + > static int reset_hung_queues(struct device_queue_manager *dqm) > { > int r = 0, reset_count = 0, i; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > index 3272328da11f..e6eca38cae4e 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > @@ -32,7 +32,6 @@ > #include "kfd_priv.h" > #include "kfd_mqd_manager.h" > > - > #define VMID_NUM 16 > > #define KFD_MES_PROCESS_QUANTUM 100000 > @@ -285,6 +284,9 @@ struct device_queue_manager { > struct dqm_detect_hang_info *detect_hang_info; > size_t detect_hang_info_size; > int detect_hang_count; > + /* for per-queue reset with mes */ > + uint32_t *hung_db_array; > + struct amdgpu_mes_hung_queue_hqd_info *hqd_info; > }; > > void device_queue_manager_init_cik( > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index e7a8f3e17872..7e0d4b83c2cf 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -523,6 +523,7 @@ struct queue_properties { > uint32_t pm4_target_xcc; > bool is_dbg_wa; > bool is_user_cu_masked; > + bool is_reset; > /* Not relevant for user mode queues in cp scheduling */ > unsigned int vmid; > /* Relevant only for sdma queues*/ > 
-- > 2.43.0 > ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v4 04/10] drm/amdgpu: Create hqd info structure 2026-04-13 18:34 [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Amber Lin 2026-04-13 18:34 ` [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin @ 2026-04-13 20:22 ` Alex Deucher 1 sibling, 0 replies; 5+ messages in thread From: Alex Deucher @ 2026-04-13 20:22 UTC (permalink / raw) To: Amber Lin; +Cc: amd-gfx, Jonathan Kim On Mon, Apr 13, 2026 at 2:34 PM Amber Lin <Amber.Lin@amd.com> wrote: > > Create hung_queue_hqd_info structure and fill in hung queue information > passed by MES, including queue type, pipe id, and queue id. > > Suggested-by: Jonathan Kim <jonathan.kim@amd.com> > Signed-off-by: Amber Lin <Amber.Lin@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 13 ++++--------- > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++ > 2 files changed, 17 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > index f1f8bbfc31e0..ae42fbaba34f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, > { > struct mes_detect_and_reset_queue_input input; > u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id]; > - int r, i; > + int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i; > > if (!hung_db_num || !hung_db_array) > return -EINVAL; > @@ -481,18 +481,13 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, > } > } > > - if (r && !hung_db_num) { > + if (r && !(*hung_db_num)) { > dev_err(adev->dev, "Failed to detect and reset hung queues\n"); > return r; > } > > - /* > - * TODO: return HQD info for MES scheduled user compute queue reset cases > - * stored in hung_db_array hqd info offset to full array size > - */ > - > - if (r) > - dev_err(adev->dev, "failed to 
reset\n"); > + for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++) > + hung_db_array[i] = db_array[i]; > > return r; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > index f80e3aca9c78..2e6ae9f84db0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > @@ -170,6 +170,19 @@ struct amdgpu_mes { > uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES]; > }; > > +struct amdgpu_mes_hung_queue_hqd_info { > + union { > + struct { > + uint32_t queue_type: 3; // queue type > + uint32_t pipe_index: 4; // pipe index > + uint32_t queue_index: 8; // queue index > + uint32_t reserved: 17; > + }; > + > + uint32_t bit0_31; > + }; > +}; > + > struct amdgpu_mes_gang { > int gang_id; > int priority; > -- > 2.43.0 > ^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v4 04/10] drm/amdgpu: Create hqd info structure @ 2026-04-01 19:15 Amber Lin 0 siblings, 0 replies; 5+ messages in thread From: Amber Lin @ 2026-04-01 19:15 UTC (permalink / raw) To: amd-gfx, alexdeucher; +Cc: Amber Lin, Jonathan Kim Create hung_queue_hqd_info structure and fill in hung queue information passed by MES, including queue type, pipe id, and queue id. Suggested-by: Jonathan Kim <jonathan.kim@amd.com> Signed-off-by: Amber Lin <Amber.Lin@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 20 ++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index f1f8bbfc31e0..47c989980824 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, { struct mes_detect_and_reset_queue_input input; u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id]; - int r, i; + int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i; if (!hung_db_num || !hung_db_array) return -EINVAL; @@ -466,8 +466,9 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes, &input); - if (r && detect_only) { - dev_err(adev->dev, "Failed to detect hung queues\n"); + if (r) { + dev_warn(adev->dev, "Failed to %s hung queues\n", + detect_only? 
"detect" : "reset"); return r; } @@ -481,18 +482,13 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, } } - if (r && !hung_db_num) { - dev_err(adev->dev, "Failed to detect and reset hung queues\n"); + if (!hung_db_num) { + dev_warn(adev->dev, "No hung queues info from MES\n"); return r; } - /* - * TODO: return HQD info for MES scheduled user compute queue reset cases - * stored in hung_db_array hqd info offset to full array size - */ - - if (r) - dev_err(adev->dev, "failed to reset\n"); + for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++) + hung_db_array[i] = db_array[i]; return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index f80e3aca9c78..2e6ae9f84db0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -170,6 +170,19 @@ struct amdgpu_mes { uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES]; }; +struct amdgpu_mes_hung_queue_hqd_info { + union { + struct { + uint32_t queue_type: 3; // queue type + uint32_t pipe_index: 4; // pipe index + uint32_t queue_index: 8; // queue index + uint32_t reserved: 17; + }; + + uint32_t bit0_31; + }; +}; + struct amdgpu_mes_gang { int gang_id; int priority; -- 2.43.0 ^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2026-04-13 20:24 UTC | newest] Thread overview: 5+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-04-13 18:34 [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Amber Lin 2026-04-13 18:34 ` [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin 2026-04-13 20:24 ` Alex Deucher 2026-04-13 20:22 ` [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Alex Deucher -- strict thread matches above, loose matches on Subject: below -- 2026-04-01 19:15 Amber Lin
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox