* [PATCH v4 04/10] drm/amdgpu: Create hqd info structure
@ 2026-04-01 19:15 Amber Lin
0 siblings, 0 replies; 5+ messages in thread
From: Amber Lin @ 2026-04-01 19:15 UTC (permalink / raw)
To: amd-gfx, alexdeucher; +Cc: Amber Lin, Jonathan Kim
Create hung_queue_hqd_info structure and fill in hung queue information
passed by MES, including queue type, pipe id, and queue id.
Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 20 ++++++++------------
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
2 files changed, 21 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index f1f8bbfc31e0..47c989980824 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
{
struct mes_detect_and_reset_queue_input input;
u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
- int r, i;
+ int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
if (!hung_db_num || !hung_db_array)
return -EINVAL;
@@ -466,8 +466,9 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
&input);
- if (r && detect_only) {
- dev_err(adev->dev, "Failed to detect hung queues\n");
+ if (r) {
+ dev_warn(adev->dev, "Failed to %s hung queues\n",
+ detect_only? "detect" : "reset");
return r;
}
@@ -481,18 +482,13 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
}
}
- if (r && !hung_db_num) {
- dev_err(adev->dev, "Failed to detect and reset hung queues\n");
+ if (!hung_db_num) {
+ dev_warn(adev->dev, "No hung queues info from MES\n");
return r;
}
- /*
- * TODO: return HQD info for MES scheduled user compute queue reset cases
- * stored in hung_db_array hqd info offset to full array size
- */
-
- if (r)
- dev_err(adev->dev, "failed to reset\n");
+ for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
+ hung_db_array[i] = db_array[i];
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index f80e3aca9c78..2e6ae9f84db0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -170,6 +170,19 @@ struct amdgpu_mes {
uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
};
+struct amdgpu_mes_hung_queue_hqd_info {
+ union {
+ struct {
+ uint32_t queue_type: 3; // queue type
+ uint32_t pipe_index: 4; // pipe index
+ uint32_t queue_index: 8; // queue index
+ uint32_t reserved: 17;
+ };
+
+ uint32_t bit0_31;
+ };
+};
+
struct amdgpu_mes_gang {
int gang_id;
int priority;
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v4 04/10] drm/amdgpu: Create hqd info structure
@ 2026-04-13 18:34 Amber Lin
2026-04-13 18:34 ` [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
2026-04-13 20:22 ` [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Alex Deucher
0 siblings, 2 replies; 5+ messages in thread
From: Amber Lin @ 2026-04-13 18:34 UTC (permalink / raw)
To: amd-gfx, alexdeucher; +Cc: Amber Lin, Jonathan Kim
Create hung_queue_hqd_info structure and fill in hung queue information
passed by MES, including queue type, pipe id, and queue id.
Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 13 ++++---------
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
2 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index f1f8bbfc31e0..ae42fbaba34f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
{
struct mes_detect_and_reset_queue_input input;
u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
- int r, i;
+ int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
if (!hung_db_num || !hung_db_array)
return -EINVAL;
@@ -481,18 +481,13 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
}
}
- if (r && !hung_db_num) {
+ if (r && !(*hung_db_num)) {
dev_err(adev->dev, "Failed to detect and reset hung queues\n");
return r;
}
- /*
- * TODO: return HQD info for MES scheduled user compute queue reset cases
- * stored in hung_db_array hqd info offset to full array size
- */
-
- if (r)
- dev_err(adev->dev, "failed to reset\n");
+ for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
+ hung_db_array[i] = db_array[i];
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index f80e3aca9c78..2e6ae9f84db0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -170,6 +170,19 @@ struct amdgpu_mes {
uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
};
+struct amdgpu_mes_hung_queue_hqd_info {
+ union {
+ struct {
+ uint32_t queue_type: 3; // queue type
+ uint32_t pipe_index: 4; // pipe index
+ uint32_t queue_index: 8; // queue index
+ uint32_t reserved: 17;
+ };
+
+ uint32_t bit0_31;
+ };
+};
+
struct amdgpu_mes_gang {
int gang_id;
int priority;
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES
2026-04-13 18:34 [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Amber Lin
@ 2026-04-13 18:34 ` Amber Lin
2026-04-13 20:24 ` Alex Deucher
2026-04-13 20:22 ` [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Alex Deucher
1 sibling, 1 reply; 5+ messages in thread
From: Amber Lin @ 2026-04-13 18:34 UTC (permalink / raw)
To: amd-gfx, alexdeucher; +Cc: Amber Lin, Jonathan Kim
When removing queues fails, KFD calls amdgpu_mes to detect and reset
hung queues, then cleans up those hung queues in KFD.
Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
.../drm/amd/amdkfd/kfd_device_queue_manager.h | 4 +-
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
5 files changed, 156 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 51ad6563ec73..d13bed68d50b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -787,6 +787,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
}
+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
+{
+ return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
+ (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
+}
+
/* Fix me -- node_id is used to identify the correct MES instances in the future */
static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
uint32_t node_id, bool enable)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 643b4f8d757a..44fa4d73bce8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
}
bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index ec8d7f4be840..2670741f3e53 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q, const uint32_t *restore_sdma_id);
static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
+static int resume_all_queues_mes(struct device_queue_manager *dqm);
+static int suspend_all_queues_mes(struct device_queue_manager *dqm);
+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
+ uint32_t doorbell_offset);
+static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
@@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
return r;
}
-static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
- struct qcm_process_device *qpd)
+static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd,
+ bool is_for_reset,
+ bool flush_mes_queue)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
int r;
struct mes_remove_queue_input queue_input;
+ /* queue was already removed during reset */
+ if (q->properties.is_reset)
+ return 0;
+
if (!dqm->sched_running || dqm->sched_halt)
return 0;
if (!down_read_trylock(&adev->reset_domain->sem))
@@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
queue_input.doorbell_offset = q->properties.doorbell_off;
queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
+ queue_input.remove_queue_after_reset = flush_mes_queue;
queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
amdgpu_mes_lock(&adev->mes);
@@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
amdgpu_mes_unlock(&adev->mes);
up_read(&adev->reset_domain->sem);
+ if (is_for_reset)
+ return r;
+
if (r) {
+ if (!suspend_all_queues_mes(dqm))
+ return resume_all_queues_mes(dqm);
+
dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
q->properties.doorbell_off);
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
@@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
return r;
}
+static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+{
+ return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
+}
+
static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
{
struct device_process_node *cur;
@@ -359,6 +384,92 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
return retval;
}
+static int reset_queues_mes(struct device_queue_manager *dqm)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+ int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
+ int num_hung = 0, r = 0, i, pipe, queue, queue_type;
+ uint32_t *hung_array = dqm->hung_db_array;
+ struct amdgpu_mes_hung_queue_hqd_info *hqd_info = dqm->hqd_info;
+ struct kfd_process_device *pdd;
+ struct queue *q;
+
+ if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ /* reset should be used only in dqm locked queue reset */
+ if (WARN_ON(dqm->detect_hang_count > 0))
+ return 0;
+
+ if (!amdgpu_gpu_recovery) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ if (!hung_array || !hqd_info) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info));
+
+ /*
+ * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
+ * post suspend_all as reset & detect will return all hung queue types.
+ *
+ * Passed parameter is for targeting queues not scheduled by MES add_queue.
+ */
+ r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
+ false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
+
+ if (!num_hung || r) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ /* MES resets queue/pipe and cleans up internally */
+ for (i = 0; i < num_hung; i++) {
+ hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
+ pipe = hqd_info[i].pipe_index;
+ queue = hqd_info[i].queue_index;
+ queue_type = hqd_info[i].queue_type;
+
+ if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
+ queue_type != MES_QUEUE_TYPE_SDMA) {
+ pr_warn("Unsupported hung queue reset type: %d\n", queue_type);
+ hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
+ continue;
+ }
+
+ q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
+ if (!q) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ pdd = kfd_get_process_device_data(q->device, q->process);
+ if (!pdd) {
+ r = -ENODEV;
+ goto fail;
+ }
+
+ pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
+ hung_array[i], pipe, queue, queue_type);
+ /* Proceed remove_queue with reset=true */
+ remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false);
+ set_queue_as_reset(dqm, q, &pdd->qpd);
+ }
+
+ dqm->detect_hang_count = num_hung;
+ kfd_signal_reset_event(dqm->dev);
+
+fail:
+ dqm->detect_hang_count = 0;
+ return r;
+}
+
static int suspend_all_queues_mes(struct device_queue_manager *dqm)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
@@ -371,6 +482,9 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
up_read(&adev->reset_domain->sem);
if (r) {
+ if (!reset_queues_mes(dqm))
+ return 0;
+
dev_err(adev->dev, "failed to suspend gangs from MES\n");
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
kfd_hws_hang(dqm);
@@ -1821,6 +1935,9 @@ static int start_cpsch(struct device_queue_manager *dqm)
{
struct device *dev = dqm->dev->adev->dev;
int retval, num_hw_queue_slots;
+ struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+ int hung_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
+ int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
dqm_lock(dqm);
@@ -1870,6 +1987,11 @@ static int start_cpsch(struct device_queue_manager *dqm)
goto fail_detect_hang_buffer;
}
+ dqm->hung_db_array = kzalloc(hung_array_size * sizeof(uint32_t), GFP_KERNEL);
+ dqm->hqd_info = kzalloc(
+ hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info),
+ GFP_KERNEL);
+
dqm_unlock(dqm);
return 0;
@@ -1910,6 +2032,9 @@ static int stop_cpsch(struct device_queue_manager *dqm)
pm_uninit(&dqm->packet_mgr);
kfree(dqm->detect_hang_info);
dqm->detect_hang_info = NULL;
+ kfree(dqm->hung_db_array);
+ kfree(dqm->hqd_info);
+
dqm_unlock(dqm);
return ret;
@@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
q->properties.queue_id, pdd->process->lead_thread->pid);
pdd->has_reset_queue = true;
+ q->properties.is_reset = true;
if (q->properties.is_active) {
q->properties.is_active = false;
decrement_queue_count(dqm, qpd, q);
@@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
return NULL;
}
+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset)
+{
+ struct device_process_node *cur;
+ struct qcm_process_device *qpd;
+ struct queue *q;
+
+ list_for_each_entry(cur, &dqm->queues, list) {
+ qpd = cur->qpd;
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if (doorbell_offset == q->properties.doorbell_off)
+ return q;
+ }
+ }
+
+ return NULL;
+}
+
static int reset_hung_queues(struct device_queue_manager *dqm)
{
int r = 0, reset_count = 0, i;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 3272328da11f..e6eca38cae4e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -32,7 +32,6 @@
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
-
#define VMID_NUM 16
#define KFD_MES_PROCESS_QUANTUM 100000
@@ -285,6 +284,9 @@ struct device_queue_manager {
struct dqm_detect_hang_info *detect_hang_info;
size_t detect_hang_info_size;
int detect_hang_count;
+ /* for per-queue reset with mes */
+ uint32_t *hung_db_array;
+ struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
};
void device_queue_manager_init_cik(
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e7a8f3e17872..7e0d4b83c2cf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -523,6 +523,7 @@ struct queue_properties {
uint32_t pm4_target_xcc;
bool is_dbg_wa;
bool is_user_cu_masked;
+ bool is_reset;
/* Not relevant for user mode queues in cp scheduling */
unsigned int vmid;
/* Relevant only for sdma queues*/
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v4 04/10] drm/amdgpu: Create hqd info structure
2026-04-13 18:34 [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Amber Lin
2026-04-13 18:34 ` [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
@ 2026-04-13 20:22 ` Alex Deucher
1 sibling, 0 replies; 5+ messages in thread
From: Alex Deucher @ 2026-04-13 20:22 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Jonathan Kim
On Mon, Apr 13, 2026 at 2:34 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> Create hung_queue_hqd_info structure and fill in hung queue information
> passed by MES, including queue type, pipe id, and queue id.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 13 ++++---------
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
> 2 files changed, 17 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index f1f8bbfc31e0..ae42fbaba34f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
> {
> struct mes_detect_and_reset_queue_input input;
> u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
> - int r, i;
> + int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
>
> if (!hung_db_num || !hung_db_array)
> return -EINVAL;
> @@ -481,18 +481,13 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
> }
> }
>
> - if (r && !hung_db_num) {
> + if (r && !(*hung_db_num)) {
> dev_err(adev->dev, "Failed to detect and reset hung queues\n");
> return r;
> }
>
> - /*
> - * TODO: return HQD info for MES scheduled user compute queue reset cases
> - * stored in hung_db_array hqd info offset to full array size
> - */
> -
> - if (r)
> - dev_err(adev->dev, "failed to reset\n");
> + for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
> + hung_db_array[i] = db_array[i];
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index f80e3aca9c78..2e6ae9f84db0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -170,6 +170,19 @@ struct amdgpu_mes {
> uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
> };
>
> +struct amdgpu_mes_hung_queue_hqd_info {
> + union {
> + struct {
> + uint32_t queue_type: 3; // queue type
> + uint32_t pipe_index: 4; // pipe index
> + uint32_t queue_index: 8; // queue index
> + uint32_t reserved: 17;
> + };
> +
> + uint32_t bit0_31;
> + };
> +};
> +
> struct amdgpu_mes_gang {
> int gang_id;
> int priority;
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES
2026-04-13 18:34 ` [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
@ 2026-04-13 20:24 ` Alex Deucher
0 siblings, 0 replies; 5+ messages in thread
From: Alex Deucher @ 2026-04-13 20:24 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Jonathan Kim
On Mon, Apr 13, 2026 at 2:34 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> When removing queues fails, KFD calls amdgpu_mes to detect and reset
> hung queues, then cleans up those hung queues in KFD.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
> .../drm/amd/amdkfd/kfd_device_queue_manager.h | 4 +-
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
> 5 files changed, 156 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 51ad6563ec73..d13bed68d50b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -787,6 +787,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
> amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
> }
>
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
> +{
> + return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
> + (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
> +}
> +
> /* Fix me -- node_id is used to identify the correct MES instances in the future */
> static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
> uint32_t node_id, bool enable)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 643b4f8d757a..44fa4d73bce8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
> }
>
> bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
>
> int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ec8d7f4be840..2670741f3e53 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q, const uint32_t *restore_sdma_id);
>
> static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
> +static int resume_all_queues_mes(struct device_queue_manager *dqm);
> +static int suspend_all_queues_mes(struct device_queue_manager *dqm);
> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
> + uint32_t doorbell_offset);
> +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
> + struct qcm_process_device *qpd);
>
> static inline
> enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
> @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> return r;
> }
>
> -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> - struct qcm_process_device *qpd)
> +static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
> + struct qcm_process_device *qpd,
> + bool is_for_reset,
> + bool flush_mes_queue)
> {
> struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> int r;
> struct mes_remove_queue_input queue_input;
>
> + /* queue was already removed during reset */
> + if (q->properties.is_reset)
> + return 0;
> +
> if (!dqm->sched_running || dqm->sched_halt)
> return 0;
> if (!down_read_trylock(&adev->reset_domain->sem))
> @@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
> queue_input.doorbell_offset = q->properties.doorbell_off;
> queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
> + queue_input.remove_queue_after_reset = flush_mes_queue;
> queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
>
> amdgpu_mes_lock(&adev->mes);
> @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> amdgpu_mes_unlock(&adev->mes);
> up_read(&adev->reset_domain->sem);
>
> + if (is_for_reset)
> + return r;
> +
> if (r) {
> + if (!suspend_all_queues_mes(dqm))
> + return resume_all_queues_mes(dqm);
> +
> dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
> q->properties.doorbell_off);
> dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
> @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> return r;
> }
>
> +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> + struct qcm_process_device *qpd)
> +{
> + return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
> +}
> +
> static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
> {
> struct device_process_node *cur;
> @@ -359,6 +384,92 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
> return retval;
> }
>
> +static int reset_queues_mes(struct device_queue_manager *dqm)
> +{
> + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
> + int num_hung = 0, r = 0, i, pipe, queue, queue_type;
> + uint32_t *hung_array = dqm->hung_db_array;
> + struct amdgpu_mes_hung_queue_hqd_info *hqd_info = dqm->hqd_info;
> + struct kfd_process_device *pdd;
> + struct queue *q;
> +
> + if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
> + r = -ENOTRECOVERABLE;
> + goto fail;
> + }
> +
> + /* reset should be used only in dqm locked queue reset */
> + if (WARN_ON(dqm->detect_hang_count > 0))
> + return 0;
> +
> + if (!amdgpu_gpu_recovery) {
> + r = -ENOTRECOVERABLE;
> + goto fail;
> + }
> +
> + if (!hung_array || !hqd_info) {
> + r = -ENOMEM;
> + goto fail;
> + }
> +
> + memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info));
> +
> + /*
> + * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
> + * post suspend_all as reset & detect will return all hung queue types.
> + *
> + * Passed parameter is for targeting queues not scheduled by MES add_queue.
> + */
> + r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
> + false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
> +
> + if (!num_hung || r) {
> + r = -ENOTRECOVERABLE;
> + goto fail;
> + }
> +
> + /* MES resets queue/pipe and cleans up internally */
> + for (i = 0; i < num_hung; i++) {
> + hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
> + pipe = hqd_info[i].pipe_index;
> + queue = hqd_info[i].queue_index;
> + queue_type = hqd_info[i].queue_type;
> +
> + if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
> + queue_type != MES_QUEUE_TYPE_SDMA) {
> + pr_warn("Unsupported hung queue reset type: %d\n", queue_type);
> + hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
> + continue;
> + }
> +
> + q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
> + if (!q) {
> + r = -ENOTRECOVERABLE;
> + goto fail;
> + }
> +
> + pdd = kfd_get_process_device_data(q->device, q->process);
> + if (!pdd) {
> + r = -ENODEV;
> + goto fail;
> + }
> +
> + pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
> + hung_array[i], pipe, queue, queue_type);
> + /* Proceed remove_queue with reset=true */
> + remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false);
> + set_queue_as_reset(dqm, q, &pdd->qpd);
> + }
> +
> + dqm->detect_hang_count = num_hung;
> + kfd_signal_reset_event(dqm->dev);
> +
> +fail:
> + dqm->detect_hang_count = 0;
> + return r;
> +}
> +
> static int suspend_all_queues_mes(struct device_queue_manager *dqm)
> {
> struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> @@ -371,6 +482,9 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
> up_read(&adev->reset_domain->sem);
>
> if (r) {
> + if (!reset_queues_mes(dqm))
> + return 0;
> +
> dev_err(adev->dev, "failed to suspend gangs from MES\n");
> dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
> kfd_hws_hang(dqm);
> @@ -1821,6 +1935,9 @@ static int start_cpsch(struct device_queue_manager *dqm)
> {
> struct device *dev = dqm->dev->adev->dev;
> int retval, num_hw_queue_slots;
> + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> + int hung_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
> + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
>
> dqm_lock(dqm);
>
> @@ -1870,6 +1987,11 @@ static int start_cpsch(struct device_queue_manager *dqm)
> goto fail_detect_hang_buffer;
> }
>
> + dqm->hung_db_array = kzalloc(hung_array_size * sizeof(uint32_t), GFP_KERNEL);
> + dqm->hqd_info = kzalloc(
> + hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info),
> + GFP_KERNEL);
> +
> dqm_unlock(dqm);
>
> return 0;
> @@ -1910,6 +2032,9 @@ static int stop_cpsch(struct device_queue_manager *dqm)
> pm_uninit(&dqm->packet_mgr);
> kfree(dqm->detect_hang_info);
> dqm->detect_hang_info = NULL;
> + kfree(dqm->hung_db_array);
> + kfree(dqm->hqd_info);
> +
> dqm_unlock(dqm);
>
> return ret;
> @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
> q->properties.queue_id, pdd->process->lead_thread->pid);
>
> pdd->has_reset_queue = true;
> + q->properties.is_reset = true;
> if (q->properties.is_active) {
> q->properties.is_active = false;
> decrement_queue_count(dqm, qpd, q);
> @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
> return NULL;
> }
>
> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset)
> +{
> + struct device_process_node *cur;
> + struct qcm_process_device *qpd;
> + struct queue *q;
> +
> + list_for_each_entry(cur, &dqm->queues, list) {
> + qpd = cur->qpd;
> + list_for_each_entry(q, &qpd->queues_list, list) {
> + if (doorbell_offset == q->properties.doorbell_off)
> + return q;
> + }
> + }
> +
> + return NULL;
> +}
> +
> static int reset_hung_queues(struct device_queue_manager *dqm)
> {
> int r = 0, reset_count = 0, i;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 3272328da11f..e6eca38cae4e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -32,7 +32,6 @@
> #include "kfd_priv.h"
> #include "kfd_mqd_manager.h"
>
> -
> #define VMID_NUM 16
>
> #define KFD_MES_PROCESS_QUANTUM 100000
> @@ -285,6 +284,9 @@ struct device_queue_manager {
> struct dqm_detect_hang_info *detect_hang_info;
> size_t detect_hang_info_size;
> int detect_hang_count;
> + /* for per-queue reset with mes */
> + uint32_t *hung_db_array;
> + struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
> };
>
> void device_queue_manager_init_cik(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index e7a8f3e17872..7e0d4b83c2cf 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -523,6 +523,7 @@ struct queue_properties {
> uint32_t pm4_target_xcc;
> bool is_dbg_wa;
> bool is_user_cu_masked;
> + bool is_reset;
> /* Not relevant for user mode queues in cp scheduling */
> unsigned int vmid;
> /* Relevant only for sdma queues*/
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2026-04-13 20:24 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-13 18:34 [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Amber Lin
2026-04-13 18:34 ` [PATCH v4 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
2026-04-13 20:24 ` Alex Deucher
2026-04-13 20:22 ` [PATCH v4 04/10] drm/amdgpu: Create hqd info structure Alex Deucher
-- strict thread matches above, loose matches on Subject: below --
2026-04-01 19:15 Amber Lin
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.