[PATCH v3 04/10] drm/amdgpu: Create hqd info structure

public inbox for amd-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed

* [PATCH v3 04/10] drm/amdgpu: Create hqd info structure
@ 2026-03-27 20:31 Amber Lin
  2026-03-27 20:31 ` [PATCH v3 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Amber Lin @ 2026-03-27 20:31 UTC (permalink / raw)
  To: amd-gfx, alexdeucher; +Cc: Amber Lin, Jonathan Kim

Create hung_queue_hqd_info structure and fill in hung queue information
passed by MES, including queue type, pipe id, and queue id.

Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 17 +++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index f1f8bbfc31e0..436a46ba1dfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
 {
 	struct mes_detect_and_reset_queue_input input;
 	u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
-	int r, i;
+	int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
 
 	if (!hung_db_num || !hung_db_array)
 		return -EINVAL;
@@ -471,6 +471,12 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
 		return r;
 	}
 
+	if (r && (queue_type != AMDGPU_RING_TYPE_COMPUTE)) {
+		dev_err(adev->dev, "MES resetting queue type %d is not supported\n",
+				queue_type);
+		return r;
+	}
+
 	*hung_db_num = 0;
 	/* MES passes hung queues' doorbell to driver */
 	for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
@@ -486,13 +492,8 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
 		return r;
 	}
 
-	/*
-	 * TODO: return HQD info for MES scheduled user compute queue reset cases
-	 * stored in hung_db_array hqd info offset to full array size
-	 */
-
-	if (r)
-		dev_err(adev->dev, "failed to reset\n");
+	for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
+		hung_db_array[i] = db_array[i];
 
 	return r;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index f80e3aca9c78..2e6ae9f84db0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -170,6 +170,19 @@ struct amdgpu_mes {
 	uint64_t            shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
 };
 
+struct amdgpu_mes_hung_queue_hqd_info {
+	union {
+		struct {
+			uint32_t queue_type: 3; // queue type
+			uint32_t pipe_index: 4; // pipe index
+			uint32_t queue_index: 8; // queue index
+			uint32_t reserved: 17;
+		};
+
+		uint32_t bit0_31;
+	};
+};
+
 struct amdgpu_mes_gang {
 	int 				gang_id;
 	int 				priority;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v3 09/10] drm/amdkfd: Reset queue/pipe in MES
  2026-03-27 20:31 [PATCH v3 04/10] drm/amdgpu: Create hqd info structure Amber Lin
@ 2026-03-27 20:31 ` Amber Lin
  2026-03-31 14:27   ` Amber Lin
  2026-03-31 16:23   ` Alex Deucher
  2026-03-31 14:25 ` [PATCH v3 04/10] drm/amdgpu: Create hqd info structure Amber Lin
  2026-03-31 16:20 ` Alex Deucher
  2 siblings, 2 replies; 7+ messages in thread
From: Amber Lin @ 2026-03-27 20:31 UTC (permalink / raw)
  To: amd-gfx, alexdeucher; +Cc: Amber Lin, Jonathan Kim

When removing queues fails, KFD calls amdgpu_mes to detect and reset
hung queues, then cleans up those hung queues in KFD.

Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       |   6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |   1 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |   4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   1 +
 5 files changed, 156 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index fb7fdf5d0973..75720d247b4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -793,6 +793,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
 		amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
 }
 
+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
+{
+	return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
+		(adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
+}
+
 /* Fix me -- node_id is used to identify the correct MES instances in the future */
 static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
 					    uint32_t node_id, bool enable)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 643b4f8d757a..44fa4d73bce8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
 }
 
 bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
 
 int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index ec8d7f4be840..2670741f3e53 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q, const uint32_t *restore_sdma_id);
 
 static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
+static int resume_all_queues_mes(struct device_queue_manager *dqm);
+static int suspend_all_queues_mes(struct device_queue_manager *dqm);
+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
+						   uint32_t doorbell_offset);
+static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
+			       struct qcm_process_device *qpd);
 
 static inline
 enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
@@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	return r;
 }
 
-static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
-			struct qcm_process_device *qpd)
+static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
+					    struct qcm_process_device *qpd,
+					    bool is_for_reset,
+					    bool flush_mes_queue)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
 	int r;
 	struct mes_remove_queue_input queue_input;
 
+	/* queue was already removed during reset */
+	if (q->properties.is_reset)
+		return 0;
+
 	if (!dqm->sched_running || dqm->sched_halt)
 		return 0;
 	if (!down_read_trylock(&adev->reset_domain->sem))
@@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
 	queue_input.doorbell_offset = q->properties.doorbell_off;
 	queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
+	queue_input.remove_queue_after_reset = flush_mes_queue;
 	queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
 
 	amdgpu_mes_lock(&adev->mes);
@@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	amdgpu_mes_unlock(&adev->mes);
 	up_read(&adev->reset_domain->sem);
 
+	if (is_for_reset)
+		return r;
+
 	if (r) {
+		if (!suspend_all_queues_mes(dqm))
+			return resume_all_queues_mes(dqm);
+
 		dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
 			q->properties.doorbell_off);
 		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
@@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	return r;
 }
 
+static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
+			    struct qcm_process_device *qpd)
+{
+	return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
+}
+
 static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
 {
 	struct device_process_node *cur;
@@ -359,6 +384,92 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
 	return retval;
 }
 
+static int reset_queues_mes(struct device_queue_manager *dqm)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+	int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
+	int num_hung = 0, r = 0, i, pipe, queue, queue_type;
+	uint32_t *hung_array = dqm->hung_db_array;
+	struct amdgpu_mes_hung_queue_hqd_info *hqd_info = dqm->hqd_info;
+	struct kfd_process_device *pdd;
+	struct queue *q;
+
+	if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
+		r = -ENOTRECOVERABLE;
+		goto fail;
+	}
+
+	/* reset should be used only in dqm locked queue reset */
+	if (WARN_ON(dqm->detect_hang_count > 0))
+		return 0;
+
+	if (!amdgpu_gpu_recovery) {
+		r = -ENOTRECOVERABLE;
+		goto fail;
+	}
+
+	if (!hung_array || !hqd_info) {
+		r = -ENOMEM;
+		goto fail;
+	}
+
+	memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info));
+
+	/*
+	 * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
+	 * post suspend_all as reset & detect will return all hung queue types.
+	 *
+	 * Passed parameter is for targeting queues not scheduled by MES add_queue.
+	 */
+	r =  amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
+		false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
+
+	if (!num_hung || r) {
+		r = -ENOTRECOVERABLE;
+		goto fail;
+	}
+
+	/* MES resets queue/pipe and cleans up internally */
+	for (i = 0; i < num_hung; i++) {
+		hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
+		pipe = hqd_info[i].pipe_index;
+		queue = hqd_info[i].queue_index;
+		queue_type = hqd_info[i].queue_type;
+
+		if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
+		    queue_type != MES_QUEUE_TYPE_SDMA) {
+			pr_warn("Unsupported hung queue reset type: %d\n", queue_type);
+			hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
+			continue;
+		}
+
+		q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
+		if (!q) {
+			r = -ENOTRECOVERABLE;
+			goto fail;
+		}
+
+		pdd = kfd_get_process_device_data(q->device, q->process);
+		if (!pdd) {
+			r = -ENODEV;
+			goto fail;
+		}
+
+		pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
+				hung_array[i], pipe, queue, queue_type);
+		/* Proceed remove_queue with reset=true */
+		remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false);
+		set_queue_as_reset(dqm, q, &pdd->qpd);
+	}
+
+	dqm->detect_hang_count = num_hung;
+	kfd_signal_reset_event(dqm->dev);
+
+fail:
+	dqm->detect_hang_count = 0;
+	return r;
+}
+
 static int suspend_all_queues_mes(struct device_queue_manager *dqm)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
@@ -371,6 +482,9 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
 	up_read(&adev->reset_domain->sem);
 
 	if (r) {
+		if (!reset_queues_mes(dqm))
+			return 0;
+
 		dev_err(adev->dev, "failed to suspend gangs from MES\n");
 		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
 		kfd_hws_hang(dqm);
@@ -1821,6 +1935,9 @@ static int start_cpsch(struct device_queue_manager *dqm)
 {
 	struct device *dev = dqm->dev->adev->dev;
 	int retval, num_hw_queue_slots;
+	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+	int hung_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
+	int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
 
 	dqm_lock(dqm);
 
@@ -1870,6 +1987,11 @@ static int start_cpsch(struct device_queue_manager *dqm)
 		goto fail_detect_hang_buffer;
 	}
 
+	dqm->hung_db_array = kzalloc(hung_array_size * sizeof(uint32_t), GFP_KERNEL);
+	dqm->hqd_info = kzalloc(
+		hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info),
+		GFP_KERNEL);
+
 	dqm_unlock(dqm);
 
 	return 0;
@@ -1910,6 +2032,9 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 		pm_uninit(&dqm->packet_mgr);
 	kfree(dqm->detect_hang_info);
 	dqm->detect_hang_info = NULL;
+	kfree(dqm->hung_db_array);
+	kfree(dqm->hqd_info);
+
 	dqm_unlock(dqm);
 
 	return ret;
@@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
 		q->properties.queue_id, pdd->process->lead_thread->pid);
 
 	pdd->has_reset_queue = true;
+	q->properties.is_reset = true;
 	if (q->properties.is_active) {
 		q->properties.is_active = false;
 		decrement_queue_count(dqm, qpd, q);
@@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
 	return NULL;
 }
 
+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset)
+{
+	struct device_process_node *cur;
+	struct qcm_process_device *qpd;
+	struct queue *q;
+
+	list_for_each_entry(cur, &dqm->queues, list) {
+		qpd = cur->qpd;
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			if (doorbell_offset == q->properties.doorbell_off)
+				return q;
+		}
+	}
+
+	return NULL;
+}
+
 static int reset_hung_queues(struct device_queue_manager *dqm)
 {
 	int r = 0, reset_count = 0, i;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 3272328da11f..e6eca38cae4e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -32,7 +32,6 @@
 #include "kfd_priv.h"
 #include "kfd_mqd_manager.h"
 
-
 #define VMID_NUM 16
 
 #define KFD_MES_PROCESS_QUANTUM		100000
@@ -285,6 +284,9 @@ struct device_queue_manager {
 	struct dqm_detect_hang_info *detect_hang_info;
 	size_t detect_hang_info_size;
 	int detect_hang_count;
+	/* for per-queue reset with mes */
+	uint32_t *hung_db_array;
+	struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
 };
 
 void device_queue_manager_init_cik(
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 27e4859e4ad7..6cb33f6d71e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -523,6 +523,7 @@ struct queue_properties {
 	uint32_t pm4_target_xcc;
 	bool is_dbg_wa;
 	bool is_user_cu_masked;
+	bool is_reset;
 	/* Not relevant for user mode queues in cp scheduling */
 	unsigned int vmid;
 	/* Relevant only for sdma queues*/
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v3 04/10] drm/amdgpu: Create hqd info structure
  2026-03-27 20:31 [PATCH v3 04/10] drm/amdgpu: Create hqd info structure Amber Lin
  2026-03-27 20:31 ` [PATCH v3 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
@ 2026-03-31 14:25 ` Amber Lin
  2026-03-31 16:20 ` Alex Deucher
  2 siblings, 0 replies; 7+ messages in thread
From: Amber Lin @ 2026-03-31 14:25 UTC (permalink / raw)
  To: amd-gfx, alexdeucher; +Cc: Jonathan Kim

ping

Regards,
Amber


On 3/27/26 16:31, Amber Lin wrote:
> Create hung_queue_hqd_info structure and fill in hung queue information
> passed by MES, including queue type, pipe id, and queue id.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 17 +++++++++--------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
>   2 files changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index f1f8bbfc31e0..436a46ba1dfa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>   {
>   	struct mes_detect_and_reset_queue_input input;
>   	u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
> -	int r, i;
> +	int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
>   
>   	if (!hung_db_num || !hung_db_array)
>   		return -EINVAL;
> @@ -471,6 +471,12 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>   		return r;
>   	}
>   
> +	if (r && (queue_type != AMDGPU_RING_TYPE_COMPUTE)) {
> +		dev_err(adev->dev, "MES resetting queue type %d is not supported\n",
> +				queue_type);
> +		return r;
> +	}
> +
>   	*hung_db_num = 0;
>   	/* MES passes hung queues' doorbell to driver */
>   	for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
> @@ -486,13 +492,8 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>   		return r;
>   	}
>   
> -	/*
> -	 * TODO: return HQD info for MES scheduled user compute queue reset cases
> -	 * stored in hung_db_array hqd info offset to full array size
> -	 */
> -
> -	if (r)
> -		dev_err(adev->dev, "failed to reset\n");
> +	for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
> +		hung_db_array[i] = db_array[i];
>   
>   	return r;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index f80e3aca9c78..2e6ae9f84db0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -170,6 +170,19 @@ struct amdgpu_mes {
>   	uint64_t            shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
>   };
>   
> +struct amdgpu_mes_hung_queue_hqd_info {
> +	union {
> +		struct {
> +			uint32_t queue_type: 3; // queue type
> +			uint32_t pipe_index: 4; // pipe index
> +			uint32_t queue_index: 8; // queue index
> +			uint32_t reserved: 17;
> +		};
> +
> +		uint32_t bit0_31;
> +	};
> +};
> +
>   struct amdgpu_mes_gang {
>   	int 				gang_id;
>   	int 				priority;


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3 09/10] drm/amdkfd: Reset queue/pipe in MES
  2026-03-27 20:31 ` [PATCH v3 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
@ 2026-03-31 14:27   ` Amber Lin
  2026-03-31 16:23   ` Alex Deucher
  1 sibling, 0 replies; 7+ messages in thread
From: Amber Lin @ 2026-03-31 14:27 UTC (permalink / raw)
  To: amd-gfx, alexdeucher; +Cc: Jonathan Kim

ping.

Please let me know if re-sending the whole series is preferred over
sending only the patches that are still pending review. Thanks.

Regards,
Amber


On 3/27/26 16:31, Amber Lin wrote:
> When removing queues fails, KFD calls amdgpu_mes to detect and reset
> hung queues, then cleans up those hung queues in KFD.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       |   6 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |   1 +
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |   4 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   1 +
>   5 files changed, 156 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index fb7fdf5d0973..75720d247b4a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -793,6 +793,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
>   		amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
>   }
>   
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
> +{
> +	return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
> +		(adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
> +}
> +
>   /* Fix me -- node_id is used to identify the correct MES instances in the future */
>   static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
>   					    uint32_t node_id, bool enable)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 643b4f8d757a..44fa4d73bce8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
>   }
>   
>   bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
>   
>   int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ec8d7f4be840..2670741f3e53 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   				struct queue *q, const uint32_t *restore_sdma_id);
>   
>   static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
> +static int resume_all_queues_mes(struct device_queue_manager *dqm);
> +static int suspend_all_queues_mes(struct device_queue_manager *dqm);
> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
> +						   uint32_t doorbell_offset);
> +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
> +			       struct qcm_process_device *qpd);
>   
>   static inline
>   enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
> @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	return r;
>   }
>   
> -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> -			struct qcm_process_device *qpd)
> +static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
> +					    struct qcm_process_device *qpd,
> +					    bool is_for_reset,
> +					    bool flush_mes_queue)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
>   	int r;
>   	struct mes_remove_queue_input queue_input;
>   
> +	/* queue was already removed during reset */
> +	if (q->properties.is_reset)
> +		return 0;
> +
>   	if (!dqm->sched_running || dqm->sched_halt)
>   		return 0;
>   	if (!down_read_trylock(&adev->reset_domain->sem))
> @@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
>   	queue_input.doorbell_offset = q->properties.doorbell_off;
>   	queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
> +	queue_input.remove_queue_after_reset = flush_mes_queue;
>   	queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
>   
>   	amdgpu_mes_lock(&adev->mes);
> @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	amdgpu_mes_unlock(&adev->mes);
>   	up_read(&adev->reset_domain->sem);
>   
> +	if (is_for_reset)
> +		return r;
> +
>   	if (r) {
> +		if (!suspend_all_queues_mes(dqm))
> +			return resume_all_queues_mes(dqm);
> +
>   		dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
>   			q->properties.doorbell_off);
>   		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
> @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	return r;
>   }
>   
> +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> +			    struct qcm_process_device *qpd)
> +{
> +	return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
> +}
> +
>   static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
>   {
>   	struct device_process_node *cur;
> @@ -359,6 +384,92 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
>   	return retval;
>   }
>   
> +static int reset_queues_mes(struct device_queue_manager *dqm)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> +	int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
> +	int num_hung = 0, r = 0, i, pipe, queue, queue_type;
> +	uint32_t *hung_array = dqm->hung_db_array;
> +	struct amdgpu_mes_hung_queue_hqd_info *hqd_info = dqm->hqd_info;
> +	struct kfd_process_device *pdd;
> +	struct queue *q;
> +
> +	if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
> +		r = -ENOTRECOVERABLE;
> +		goto fail;
> +	}
> +
> +	/* reset should be used only in dqm locked queue reset */
> +	if (WARN_ON(dqm->detect_hang_count > 0))
> +		return 0;
> +
> +	if (!amdgpu_gpu_recovery) {
> +		r = -ENOTRECOVERABLE;
> +		goto fail;
> +	}
> +
> +	if (!hung_array || !hqd_info) {
> +		r = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info));
> +
> +	/*
> +	 * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
> +	 * post suspend_all as reset & detect will return all hung queue types.
> +	 *
> +	 * Passed parameter is for targeting queues not scheduled by MES add_queue.
> +	 */
> +	r =  amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
> +		false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
> +
> +	if (!num_hung || r) {
> +		r = -ENOTRECOVERABLE;
> +		goto fail;
> +	}
> +
> +	/* MES resets queue/pipe and cleans up internally */
> +	for (i = 0; i < num_hung; i++) {
> +		hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
> +		pipe = hqd_info[i].pipe_index;
> +		queue = hqd_info[i].queue_index;
> +		queue_type = hqd_info[i].queue_type;
> +
> +		if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
> +		    queue_type != MES_QUEUE_TYPE_SDMA) {
> +			pr_warn("Unsupported hung queue reset type: %d\n", queue_type);
> +			hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
> +			continue;
> +		}
> +
> +		q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
> +		if (!q) {
> +			r = -ENOTRECOVERABLE;
> +			goto fail;
> +		}
> +
> +		pdd = kfd_get_process_device_data(q->device, q->process);
> +		if (!pdd) {
> +			r = -ENODEV;
> +			goto fail;
> +		}
> +
> +		pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
> +				hung_array[i], pipe, queue, queue_type);
> +		/* Proceed remove_queue with reset=true */
> +		remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false);
> +		set_queue_as_reset(dqm, q, &pdd->qpd);
> +	}
> +
> +	dqm->detect_hang_count = num_hung;
> +	kfd_signal_reset_event(dqm->dev);
> +
> +fail:
> +	dqm->detect_hang_count = 0;
> +	return r;
> +}
> +
>   static int suspend_all_queues_mes(struct device_queue_manager *dqm)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> @@ -371,6 +482,9 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
>   	up_read(&adev->reset_domain->sem);
>   
>   	if (r) {
> +		if (!reset_queues_mes(dqm))
> +			return 0;
> +
>   		dev_err(adev->dev, "failed to suspend gangs from MES\n");
>   		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
>   		kfd_hws_hang(dqm);
> @@ -1821,6 +1935,9 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   {
>   	struct device *dev = dqm->dev->adev->dev;
>   	int retval, num_hw_queue_slots;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> +	int hung_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
> +	int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
>   
>   	dqm_lock(dqm);
>   
> @@ -1870,6 +1987,11 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   		goto fail_detect_hang_buffer;
>   	}
>   
> +	dqm->hung_db_array = kzalloc(hung_array_size * sizeof(uint32_t), GFP_KERNEL);
> +	dqm->hqd_info = kzalloc(
> +		hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info),
> +		GFP_KERNEL);
> +
>   	dqm_unlock(dqm);
>   
>   	return 0;
> @@ -1910,6 +2032,9 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>   		pm_uninit(&dqm->packet_mgr);
>   	kfree(dqm->detect_hang_info);
>   	dqm->detect_hang_info = NULL;
> +	kfree(dqm->hung_db_array);
> +	kfree(dqm->hqd_info);
> +
>   	dqm_unlock(dqm);
>   
>   	return ret;
> @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
>   		q->properties.queue_id, pdd->process->lead_thread->pid);
>   
>   	pdd->has_reset_queue = true;
> +	q->properties.is_reset = true;
>   	if (q->properties.is_active) {
>   		q->properties.is_active = false;
>   		decrement_queue_count(dqm, qpd, q);
> @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
>   	return NULL;
>   }
>   
> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset)
> +{
> +	struct device_process_node *cur;
> +	struct qcm_process_device *qpd;
> +	struct queue *q;
> +
> +	list_for_each_entry(cur, &dqm->queues, list) {
> +		qpd = cur->qpd;
> +		list_for_each_entry(q, &qpd->queues_list, list) {
> +			if (doorbell_offset == q->properties.doorbell_off)
> +				return q;
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
>   static int reset_hung_queues(struct device_queue_manager *dqm)
>   {
>   	int r = 0, reset_count = 0, i;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 3272328da11f..e6eca38cae4e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -32,7 +32,6 @@
>   #include "kfd_priv.h"
>   #include "kfd_mqd_manager.h"
>   
> -
>   #define VMID_NUM 16
>   
>   #define KFD_MES_PROCESS_QUANTUM		100000
> @@ -285,6 +284,9 @@ struct device_queue_manager {
>   	struct dqm_detect_hang_info *detect_hang_info;
>   	size_t detect_hang_info_size;
>   	int detect_hang_count;
> +	/* for per-queue reset with mes */
> +	uint32_t *hung_db_array;
> +	struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
>   };
>   
>   void device_queue_manager_init_cik(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 27e4859e4ad7..6cb33f6d71e2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -523,6 +523,7 @@ struct queue_properties {
>   	uint32_t pm4_target_xcc;
>   	bool is_dbg_wa;
>   	bool is_user_cu_masked;
> +	bool is_reset;
>   	/* Not relevant for user mode queues in cp scheduling */
>   	unsigned int vmid;
>   	/* Relevant only for sdma queues*/


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3 04/10] drm/amdgpu: Create hqd info structure
  2026-03-27 20:31 [PATCH v3 04/10] drm/amdgpu: Create hqd info structure Amber Lin
  2026-03-27 20:31 ` [PATCH v3 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
  2026-03-31 14:25 ` [PATCH v3 04/10] drm/amdgpu: Create hqd info structure Amber Lin
@ 2026-03-31 16:20 ` Alex Deucher
  2026-04-01 19:20   ` Amber Lin
  2 siblings, 1 reply; 7+ messages in thread
From: Alex Deucher @ 2026-03-31 16:20 UTC (permalink / raw)
  To: Amber Lin; +Cc: amd-gfx, Jonathan Kim

On Fri, Mar 27, 2026 at 4:33 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> Create hung_queue_hqd_info structure and fill in hung queue information
> passed by MES, including queue type, pipe id, and queue id.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 17 +++++++++--------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
>  2 files changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index f1f8bbfc31e0..436a46ba1dfa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>  {
>         struct mes_detect_and_reset_queue_input input;
>         u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
> -       int r, i;
> +       int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
>
>         if (!hung_db_num || !hung_db_array)
>                 return -EINVAL;
> @@ -471,6 +471,12 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>                 return r;
>         }
>
> +       if (r && (queue_type != AMDGPU_RING_TYPE_COMPUTE)) {
> +               dev_err(adev->dev, "MES resetting queue type %d is not supported\n",
> +                               queue_type);
> +               return r;
> +       }

I think the message here is a bit confusing.  The MES can reset other
queue types; this is just the fallback case for when MES queue reset
has failed.  Also, does MES populate the doorbell array for all queue
types regardless of whether the reset was successful or not?  If so,
shouldn't we bail for non-compute queues after the doorbells are
populated?

Alex

> +
>         *hung_db_num = 0;
>         /* MES passes hung queues' doorbell to driver */
>         for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
> @@ -486,13 +492,8 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>                 return r;
>         }
>
> -       /*
> -        * TODO: return HQD info for MES scheduled user compute queue reset cases
> -        * stored in hung_db_array hqd info offset to full array size
> -        */
> -
> -       if (r)
> -               dev_err(adev->dev, "failed to reset\n");
> +       for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
> +               hung_db_array[i] = db_array[i];
>
>         return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index f80e3aca9c78..2e6ae9f84db0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -170,6 +170,19 @@ struct amdgpu_mes {
>         uint64_t            shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
>  };
>
> +struct amdgpu_mes_hung_queue_hqd_info {
> +       union {
> +               struct {
> +                       uint32_t queue_type: 3; // queue type
> +                       uint32_t pipe_index: 4; // pipe index
> +                       uint32_t queue_index: 8; // queue index
> +                       uint32_t reserved: 17;
> +               };
> +
> +               uint32_t bit0_31;
> +       };
> +};
> +
>  struct amdgpu_mes_gang {
>         int                             gang_id;
>         int                             priority;
> --
> 2.43.0
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3 09/10] drm/amdkfd: Reset queue/pipe in MES
  2026-03-27 20:31 ` [PATCH v3 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
  2026-03-31 14:27   ` Amber Lin
@ 2026-03-31 16:23   ` Alex Deucher
  1 sibling, 0 replies; 7+ messages in thread
From: Alex Deucher @ 2026-03-31 16:23 UTC (permalink / raw)
  To: Amber Lin; +Cc: amd-gfx, Jonathan Kim

On Fri, Mar 27, 2026 at 4:32 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> When removing queues fails, KFD calls amdgpu_mes to detect and reset
> hung queues, then cleans up those hung queues in KFD.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>

This will need greater coordination with KGD user queues on GPUs that
support it, but for now:

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       |   6 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |   1 +
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
>  .../drm/amd/amdkfd/kfd_device_queue_manager.h |   4 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   1 +
>  5 files changed, 156 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index fb7fdf5d0973..75720d247b4a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -793,6 +793,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
>                 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
>  }
>
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
> +{
> +       return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
> +               (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
> +}
> +
>  /* Fix me -- node_id is used to identify the correct MES instances in the future */
>  static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
>                                             uint32_t node_id, bool enable)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 643b4f8d757a..44fa4d73bce8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
>  }
>
>  bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
>
>  int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ec8d7f4be840..2670741f3e53 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>                                 struct queue *q, const uint32_t *restore_sdma_id);
>
>  static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
> +static int resume_all_queues_mes(struct device_queue_manager *dqm);
> +static int suspend_all_queues_mes(struct device_queue_manager *dqm);
> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
> +                                                  uint32_t doorbell_offset);
> +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
> +                              struct qcm_process_device *qpd);
>
>  static inline
>  enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
> @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>         return r;
>  }
>
> -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> -                       struct qcm_process_device *qpd)
> +static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
> +                                           struct qcm_process_device *qpd,
> +                                           bool is_for_reset,
> +                                           bool flush_mes_queue)
>  {
>         struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
>         int r;
>         struct mes_remove_queue_input queue_input;
>
> +       /* queue was already removed during reset */
> +       if (q->properties.is_reset)
> +               return 0;
> +
>         if (!dqm->sched_running || dqm->sched_halt)
>                 return 0;
>         if (!down_read_trylock(&adev->reset_domain->sem))
> @@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>         memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
>         queue_input.doorbell_offset = q->properties.doorbell_off;
>         queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
> +       queue_input.remove_queue_after_reset = flush_mes_queue;
>         queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
>
>         amdgpu_mes_lock(&adev->mes);
> @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>         amdgpu_mes_unlock(&adev->mes);
>         up_read(&adev->reset_domain->sem);
>
> +       if (is_for_reset)
> +               return r;
> +
>         if (r) {
> +               if (!suspend_all_queues_mes(dqm))
> +                       return resume_all_queues_mes(dqm);
> +
>                 dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
>                         q->properties.doorbell_off);
>                 dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
> @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>         return r;
>  }
>
> +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> +                           struct qcm_process_device *qpd)
> +{
> +       return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
> +}
> +
>  static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
>  {
>         struct device_process_node *cur;
> @@ -359,6 +384,92 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
>         return retval;
>  }
>
> +static int reset_queues_mes(struct device_queue_manager *dqm)
> +{
> +       struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> +       int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
> +       int num_hung = 0, r = 0, i, pipe, queue, queue_type;
> +       uint32_t *hung_array = dqm->hung_db_array;
> +       struct amdgpu_mes_hung_queue_hqd_info *hqd_info = dqm->hqd_info;
> +       struct kfd_process_device *pdd;
> +       struct queue *q;
> +
> +       if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
> +               r = -ENOTRECOVERABLE;
> +               goto fail;
> +       }
> +
> +       /* reset should be used only in dqm locked queue reset */
> +       if (WARN_ON(dqm->detect_hang_count > 0))
> +               return 0;
> +
> +       if (!amdgpu_gpu_recovery) {
> +               r = -ENOTRECOVERABLE;
> +               goto fail;
> +       }
> +
> +       if (!hung_array || !hqd_info) {
> +               r = -ENOMEM;
> +               goto fail;
> +       }
> +
> +       memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info));
> +
> +       /*
> +        * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
> +        * post suspend_all as reset & detect will return all hung queue types.
> +        *
> +        * Passed parameter is for targeting queues not scheduled by MES add_queue.
> +        */
> +       r =  amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
> +               false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
> +
> +       if (!num_hung || r) {
> +               r = -ENOTRECOVERABLE;
> +               goto fail;
> +       }
> +
> +       /* MES resets queue/pipe and cleans up internally */
> +       for (i = 0; i < num_hung; i++) {
> +               hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
> +               pipe = hqd_info[i].pipe_index;
> +               queue = hqd_info[i].queue_index;
> +               queue_type = hqd_info[i].queue_type;
> +
> +               if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
> +                   queue_type != MES_QUEUE_TYPE_SDMA) {
> +                       pr_warn("Unsupported hung queue reset type: %d\n", queue_type);
> +                       hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
> +                       continue;
> +               }
> +
> +               q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
> +               if (!q) {
> +                       r = -ENOTRECOVERABLE;
> +                       goto fail;
> +               }
> +
> +               pdd = kfd_get_process_device_data(q->device, q->process);
> +               if (!pdd) {
> +                       r = -ENODEV;
> +                       goto fail;
> +               }
> +
> +               pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
> +                               hung_array[i], pipe, queue, queue_type);
> +               /* Proceed remove_queue with reset=true */
> +               remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false);
> +               set_queue_as_reset(dqm, q, &pdd->qpd);
> +       }
> +
> +       dqm->detect_hang_count = num_hung;
> +       kfd_signal_reset_event(dqm->dev);
> +
> +fail:
> +       dqm->detect_hang_count = 0;
> +       return r;
> +}
> +
>  static int suspend_all_queues_mes(struct device_queue_manager *dqm)
>  {
>         struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> @@ -371,6 +482,9 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
>         up_read(&adev->reset_domain->sem);
>
>         if (r) {
> +               if (!reset_queues_mes(dqm))
> +                       return 0;
> +
>                 dev_err(adev->dev, "failed to suspend gangs from MES\n");
>                 dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
>                 kfd_hws_hang(dqm);
> @@ -1821,6 +1935,9 @@ static int start_cpsch(struct device_queue_manager *dqm)
>  {
>         struct device *dev = dqm->dev->adev->dev;
>         int retval, num_hw_queue_slots;
> +       struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> +       int hung_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
> +       int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
>
>         dqm_lock(dqm);
>
> @@ -1870,6 +1987,11 @@ static int start_cpsch(struct device_queue_manager *dqm)
>                 goto fail_detect_hang_buffer;
>         }
>
> +       dqm->hung_db_array = kzalloc(hung_array_size * sizeof(uint32_t), GFP_KERNEL);
> +       dqm->hqd_info = kzalloc(
> +               hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info),
> +               GFP_KERNEL);
> +
>         dqm_unlock(dqm);
>
>         return 0;
> @@ -1910,6 +2032,9 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>                 pm_uninit(&dqm->packet_mgr);
>         kfree(dqm->detect_hang_info);
>         dqm->detect_hang_info = NULL;
> +       kfree(dqm->hung_db_array);
> +       kfree(dqm->hqd_info);
> +
>         dqm_unlock(dqm);
>
>         return ret;
> @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
>                 q->properties.queue_id, pdd->process->lead_thread->pid);
>
>         pdd->has_reset_queue = true;
> +       q->properties.is_reset = true;
>         if (q->properties.is_active) {
>                 q->properties.is_active = false;
>                 decrement_queue_count(dqm, qpd, q);
> @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
>         return NULL;
>  }
>
> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset)
> +{
> +       struct device_process_node *cur;
> +       struct qcm_process_device *qpd;
> +       struct queue *q;
> +
> +       list_for_each_entry(cur, &dqm->queues, list) {
> +               qpd = cur->qpd;
> +               list_for_each_entry(q, &qpd->queues_list, list) {
> +                       if (doorbell_offset == q->properties.doorbell_off)
> +                               return q;
> +               }
> +       }
> +
> +       return NULL;
> +}
> +
>  static int reset_hung_queues(struct device_queue_manager *dqm)
>  {
>         int r = 0, reset_count = 0, i;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 3272328da11f..e6eca38cae4e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -32,7 +32,6 @@
>  #include "kfd_priv.h"
>  #include "kfd_mqd_manager.h"
>
> -
>  #define VMID_NUM 16
>
>  #define KFD_MES_PROCESS_QUANTUM                100000
> @@ -285,6 +284,9 @@ struct device_queue_manager {
>         struct dqm_detect_hang_info *detect_hang_info;
>         size_t detect_hang_info_size;
>         int detect_hang_count;
> +       /* for per-queue reset with mes */
> +       uint32_t *hung_db_array;
> +       struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
>  };
>
>  void device_queue_manager_init_cik(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 27e4859e4ad7..6cb33f6d71e2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -523,6 +523,7 @@ struct queue_properties {
>         uint32_t pm4_target_xcc;
>         bool is_dbg_wa;
>         bool is_user_cu_masked;
> +       bool is_reset;
>         /* Not relevant for user mode queues in cp scheduling */
>         unsigned int vmid;
>         /* Relevant only for sdma queues*/
> --
> 2.43.0
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3 04/10] drm/amdgpu: Create hqd info structure
  2026-03-31 16:20 ` Alex Deucher
@ 2026-04-01 19:20   ` Amber Lin
  0 siblings, 0 replies; 7+ messages in thread
From: Amber Lin @ 2026-04-01 19:20 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx, Jonathan Kim


On 3/31/26 12:20, Alex Deucher wrote:
> On Fri, Mar 27, 2026 at 4:33 PM Amber Lin <Amber.Lin@amd.com> wrote:
>> Create hung_queue_hqd_info structure and fill in hung queue information
>> passed by MES, including queue type, pipe id, and queue id.
>>
>> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 17 +++++++++--------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
>>   2 files changed, 22 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> index f1f8bbfc31e0..436a46ba1dfa 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>>   {
>>          struct mes_detect_and_reset_queue_input input;
>>          u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
>> -       int r, i;
>> +       int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
>>
>>          if (!hung_db_num || !hung_db_array)
>>                  return -EINVAL;
>> @@ -471,6 +471,12 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>>                  return r;
>>          }
>>
>> +       if (r && (queue_type != AMDGPU_RING_TYPE_COMPUTE)) {
>> +               dev_err(adev->dev, "MES resetting queue type %d is not supported\n",
>> +                               queue_type);
>> +               return r;
>> +       }
> I think the message here is a bit confusing.  The MES can reset other
> queue types, this is just the fall back case for when MES queue reset
> has failed.  Also, does MES populate the doorbell array for all queue
> types regardless of whether the reset was successful or not?  If so,
> shouldn't we bail for non-compute queues after the doorbells are
> populated?
>
> Alex
>
You're right. I was too focused on user compute queues and didn't notice
that this code is in amdgpu_mes and applies to all queue types. I just
sent v4 to correct it. Thank you for the review.

Amber
>> +
>>          *hung_db_num = 0;
>>          /* MES passes hung queues' doorbell to driver */
>>          for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
>> @@ -486,13 +492,8 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>>                  return r;
>>          }
>>
>> -       /*
>> -        * TODO: return HQD info for MES scheduled user compute queue reset cases
>> -        * stored in hung_db_array hqd info offset to full array size
>> -        */
>> -
>> -       if (r)
>> -               dev_err(adev->dev, "failed to reset\n");
>> +       for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
>> +               hung_db_array[i] = db_array[i];
>>
>>          return r;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> index f80e3aca9c78..2e6ae9f84db0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> @@ -170,6 +170,19 @@ struct amdgpu_mes {
>>          uint64_t            shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
>>   };
>>
>> +struct amdgpu_mes_hung_queue_hqd_info {
>> +       union {
>> +               struct {
>> +                       uint32_t queue_type: 3; // queue type
>> +                       uint32_t pipe_index: 4; // pipe index
>> +                       uint32_t queue_index: 8; // queue index
>> +                       uint32_t reserved: 17;
>> +               };
>> +
>> +               uint32_t bit0_31;
>> +       };
>> +};
>> +
>>   struct amdgpu_mes_gang {
>>          int                             gang_id;
>>          int                             priority;
>> --
>> 2.43.0
>>


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2026-04-01 19:20 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-03-27 20:31 [PATCH v3 04/10] drm/amdgpu: Create hqd info structure Amber Lin
2026-03-27 20:31 ` [PATCH v3 09/10] drm/amdkfd: Reset queue/pipe in MES Amber Lin
2026-03-31 14:27   ` Amber Lin
2026-03-31 16:23   ` Alex Deucher
2026-03-31 14:25 ` [PATCH v3 04/10] drm/amdgpu: Create hqd info structure Amber Lin
2026-03-31 16:20 ` Alex Deucher
2026-04-01 19:20   ` Amber Lin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox