* [PATCH 1/8] drm/amdgpu: Fix gfx_hqd_mask in mes 12.1
2026-03-20 20:02 [PATCH 0/8] Support compute queue/pipe reset on gfx 12.1 Amber Lin
@ 2026-03-20 20:02 ` Amber Lin
2026-03-23 19:03 ` Alex Deucher
2026-03-20 20:02 ` [PATCH 2/8] drm/amdgpu: Fixup boost mes detect hang array size Amber Lin
` (6 subsequent siblings)
7 siblings, 1 reply; 22+ messages in thread
From: Amber Lin @ 2026-03-20 20:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Amber Lin
Same as compute and sdma, gfx_hqd_mask has been initialized properly in
amdgpu_mes_init. set_hw_res should use those initializations accordingly.
Hardcoding it causes a mismatch between the driver and MES.
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 15 +++------------
1 file changed, 3 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
index 5dcc2c32644a..70d80c2aed52 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
@@ -633,17 +633,6 @@ static int mes_v12_1_set_hw_resources_1(struct amdgpu_mes *mes,
offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status));
}
-static void mes_v12_1_set_gfx_hqd_mask(union MESAPI_SET_HW_RESOURCES *pkt)
-{
- /*
- * GFX V12 has only one GFX pipe, but 8 queues in it.
- * GFX pipe 0 queue 0 is being used by Kernel queue.
- * Set GFX pipe 0 queue 1-7 for MES scheduling
- * mask = 1111 1110b
- */
- pkt->gfx_hqd_mask[0] = 0xFE;
-}
-
static int mes_v12_1_set_hw_resources(struct amdgpu_mes *mes,
int pipe, int xcc_id)
{
@@ -667,7 +656,9 @@ static int mes_v12_1_set_hw_resources(struct amdgpu_mes *mes,
mes_set_hw_res_pkt.compute_hqd_mask[i] =
mes->compute_hqd_mask[i];
- mes_v12_1_set_gfx_hqd_mask(&mes_set_hw_res_pkt);
+ for (i = 0; i < MAX_GFX_PIPES; i++)
+ mes_set_hw_res_pkt.gfx_hqd_mask[i] =
+ mes->gfx_hqd_mask[i];
for (i = 0; i < MAX_SDMA_PIPES; i++)
mes_set_hw_res_pkt.sdma_hqd_mask[i] =
--
2.43.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCH 1/8] drm/amdgpu: Fix gfx_hqd_mask in mes 12.1
2026-03-20 20:02 ` [PATCH 1/8] drm/amdgpu: Fix gfx_hqd_mask in mes 12.1 Amber Lin
@ 2026-03-23 19:03 ` Alex Deucher
0 siblings, 0 replies; 22+ messages in thread
From: Alex Deucher @ 2026-03-23 19:03 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang
On Fri, Mar 20, 2026 at 4:02 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> Same as compute and sdma, gfx_hqd_mask has been initialized properly in
> amdgpu_mes_init. set_hw_res should use those initialization accordingly.
> Hardcoding it causes the mismatch between driver and MES.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 15 +++------------
> 1 file changed, 3 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> index 5dcc2c32644a..70d80c2aed52 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> @@ -633,17 +633,6 @@ static int mes_v12_1_set_hw_resources_1(struct amdgpu_mes *mes,
> offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status));
> }
>
> -static void mes_v12_1_set_gfx_hqd_mask(union MESAPI_SET_HW_RESOURCES *pkt)
> -{
> - /*
> - * GFX V12 has only one GFX pipe, but 8 queues in it.
> - * GFX pipe 0 queue 0 is being used by Kernel queue.
> - * Set GFX pipe 0 queue 1-7 for MES scheduling
> - * mask = 1111 1110b
> - */
> - pkt->gfx_hqd_mask[0] = 0xFE;
> -}
> -
> static int mes_v12_1_set_hw_resources(struct amdgpu_mes *mes,
> int pipe, int xcc_id)
> {
> @@ -667,7 +656,9 @@ static int mes_v12_1_set_hw_resources(struct amdgpu_mes *mes,
> mes_set_hw_res_pkt.compute_hqd_mask[i] =
> mes->compute_hqd_mask[i];
>
> - mes_v12_1_set_gfx_hqd_mask(&mes_set_hw_res_pkt);
> + for (i = 0; i < MAX_GFX_PIPES; i++)
> + mes_set_hw_res_pkt.gfx_hqd_mask[i] =
> + mes->gfx_hqd_mask[i];
>
> for (i = 0; i < MAX_SDMA_PIPES; i++)
> mes_set_hw_res_pkt.sdma_hqd_mask[i] =
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 2/8] drm/amdgpu: Fixup boost mes detect hang array size
2026-03-20 20:02 [PATCH 0/8] Support compute queue/pipe reset on gfx 12.1 Amber Lin
2026-03-20 20:02 ` [PATCH 1/8] drm/amdgpu: Fix gfx_hqd_mask in mes 12.1 Amber Lin
@ 2026-03-20 20:02 ` Amber Lin
2026-03-23 19:04 ` Alex Deucher
2026-03-20 20:02 ` [PATCH 3/8] drm/amdgpu: Fixup detect and reset Amber Lin
` (5 subsequent siblings)
7 siblings, 1 reply; 22+ messages in thread
From: Amber Lin @ 2026-03-20 20:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Amber Lin, Jonathan Kim
When allocating the hung queues memory, we need to take the number of
queues into account for the worst-case hang.
Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 33 +++++++++++++++++++------
1 file changed, 26 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 0d4c77c1b4b5..b68bf4a9cb40 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -103,7 +103,7 @@ static inline u32 amdgpu_mes_get_hqd_mask(u32 num_pipe,
int amdgpu_mes_init(struct amdgpu_device *adev)
{
- int i, r, num_pipes;
+ int i, r, num_pipes, num_queues = 0;
u32 total_vmid_mask, reserved_vmid_mask;
int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
u32 gfx_hqd_mask = amdgpu_mes_get_hqd_mask(adev->gfx.me.num_pipe_per_me,
@@ -159,7 +159,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
adev->mes.compute_hqd_mask[i] = compute_hqd_mask;
}
- num_pipes = adev->sdma.num_instances;
+ num_pipes = adev->sdma.num_inst_per_xcc;
if (num_pipes > AMDGPU_MES_MAX_SDMA_PIPES)
dev_warn(adev->dev, "more SDMA pipes than supported by MES! (%d vs %d)\n",
num_pipes, AMDGPU_MES_MAX_SDMA_PIPES);
@@ -216,8 +216,27 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
if (r)
goto error_doorbell;
+ if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
+ /* When queue/pipe reset is done in MES instead of in the
+ * driver, MES passes hung queues information to the driver in
+ * hung_queue_hqd_info. Calculate required space to store this
+ * information.
+ */
+ for (i = 0; i < AMDGPU_MES_MAX_GFX_PIPES; i++)
+ num_queues += hweight32(adev->mes.gfx_hqd_mask[i]);
+
+ for (i = 0; i < AMDGPU_MES_MAX_COMPUTE_PIPES; i++)
+ num_queues += hweight32(adev->mes.compute_hqd_mask[i]);
+
+ for (i = 0; i < AMDGPU_MES_MAX_SDMA_PIPES; i++)
+ num_queues += hweight32(adev->mes.sdma_hqd_mask[i]) * num_xcc;
+
+ adev->mes.hung_queue_hqd_info_offset = num_queues;
+ adev->mes.hung_queue_db_array_size = num_queues * 2;
+ }
+
if (adev->mes.hung_queue_db_array_size) {
- for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
+ for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) {
r = amdgpu_bo_create_kernel(adev,
adev->mes.hung_queue_db_array_size * sizeof(u32),
PAGE_SIZE,
@@ -264,10 +283,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
&adev->mes.event_log_cpu_addr);
for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
- amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
- &adev->mes.hung_queue_db_array_gpu_addr[i],
- &adev->mes.hung_queue_db_array_cpu_addr[i]);
-
+ if (adev->mes.hung_queue_db_array_gpu_obj[i])
+ amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
+ &adev->mes.hung_queue_db_array_gpu_addr[i],
+ &adev->mes.hung_queue_db_array_cpu_addr[i]);
if (adev->mes.sch_ctx_ptr[i])
amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs[i]);
if (adev->mes.query_status_fence_ptr[i])
--
2.43.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCH 2/8] drm/amdgpu: Fixup boost mes detect hang array size
2026-03-20 20:02 ` [PATCH 2/8] drm/amdgpu: Fixup boost mes detect hang array size Amber Lin
@ 2026-03-23 19:04 ` Alex Deucher
2026-03-23 19:15 ` Amber Lin
0 siblings, 1 reply; 22+ messages in thread
From: Alex Deucher @ 2026-03-23 19:04 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Jonathan Kim
On Fri, Mar 20, 2026 at 4:09 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> When allocate the hung queues memory, we need to take the number of
> queues into account for the worst hang case.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 33 +++++++++++++++++++------
> 1 file changed, 26 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 0d4c77c1b4b5..b68bf4a9cb40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -103,7 +103,7 @@ static inline u32 amdgpu_mes_get_hqd_mask(u32 num_pipe,
>
> int amdgpu_mes_init(struct amdgpu_device *adev)
> {
> - int i, r, num_pipes;
> + int i, r, num_pipes, num_queues = 0;
> u32 total_vmid_mask, reserved_vmid_mask;
> int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
> u32 gfx_hqd_mask = amdgpu_mes_get_hqd_mask(adev->gfx.me.num_pipe_per_me,
> @@ -159,7 +159,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
> adev->mes.compute_hqd_mask[i] = compute_hqd_mask;
> }
>
> - num_pipes = adev->sdma.num_instances;
> + num_pipes = adev->sdma.num_inst_per_xcc;
> if (num_pipes > AMDGPU_MES_MAX_SDMA_PIPES)
> dev_warn(adev->dev, "more SDMA pipes than supported by MES! (%d vs %d)\n",
> num_pipes, AMDGPU_MES_MAX_SDMA_PIPES);
> @@ -216,8 +216,27 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
> if (r)
> goto error_doorbell;
>
> + if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
Is this 12.0 and higher or 12.1 and higher?
Alex
> + /* When queue/pipe reset is done in MES instead of in the
> + * driver, MES passes hung queues information to the driver in
> + * hung_queue_hqd_info. Calculate required space to store this
> + * information.
> + */
> + for (i = 0; i < AMDGPU_MES_MAX_GFX_PIPES; i++)
> + num_queues += hweight32(adev->mes.gfx_hqd_mask[i]);
> +
> + for (i = 0; i < AMDGPU_MES_MAX_COMPUTE_PIPES; i++)
> + num_queues += hweight32(adev->mes.compute_hqd_mask[i]);
> +
> + for (i = 0; i < AMDGPU_MES_MAX_SDMA_PIPES; i++)
> + num_queues += hweight32(adev->mes.sdma_hqd_mask[i]) * num_xcc;
> +
> + adev->mes.hung_queue_hqd_info_offset = num_queues;
> + adev->mes.hung_queue_db_array_size = num_queues * 2;
> + }
> +
> if (adev->mes.hung_queue_db_array_size) {
> - for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
> + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) {
> r = amdgpu_bo_create_kernel(adev,
> adev->mes.hung_queue_db_array_size * sizeof(u32),
> PAGE_SIZE,
> @@ -264,10 +283,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
> &adev->mes.event_log_cpu_addr);
>
> for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
> - amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
> - &adev->mes.hung_queue_db_array_gpu_addr[i],
> - &adev->mes.hung_queue_db_array_cpu_addr[i]);
> -
> + if (adev->mes.hung_queue_db_array_gpu_obj[i])
> + amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
> + &adev->mes.hung_queue_db_array_gpu_addr[i],
> + &adev->mes.hung_queue_db_array_cpu_addr[i]);
> if (adev->mes.sch_ctx_ptr[i])
> amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs[i]);
> if (adev->mes.query_status_fence_ptr[i])
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 2/8] drm/amdgpu: Fixup boost mes detect hang array size
2026-03-23 19:04 ` Alex Deucher
@ 2026-03-23 19:15 ` Amber Lin
0 siblings, 0 replies; 22+ messages in thread
From: Amber Lin @ 2026-03-23 19:15 UTC (permalink / raw)
To: Alex Deucher
Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Jonathan Kim
On 3/23/26 15:04, Alex Deucher wrote:
> On Fri, Mar 20, 2026 at 4:09 PM Amber Lin <Amber.Lin@amd.com> wrote:
>> When allocate the hung queues memory, we need to take the number of
>> queues into account for the worst hang case.
>>
>> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 33 +++++++++++++++++++------
>> 1 file changed, 26 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> index 0d4c77c1b4b5..b68bf4a9cb40 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> @@ -103,7 +103,7 @@ static inline u32 amdgpu_mes_get_hqd_mask(u32 num_pipe,
>>
>> int amdgpu_mes_init(struct amdgpu_device *adev)
>> {
>> - int i, r, num_pipes;
>> + int i, r, num_pipes, num_queues = 0;
>> u32 total_vmid_mask, reserved_vmid_mask;
>> int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
>> u32 gfx_hqd_mask = amdgpu_mes_get_hqd_mask(adev->gfx.me.num_pipe_per_me,
>> @@ -159,7 +159,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>> adev->mes.compute_hqd_mask[i] = compute_hqd_mask;
>> }
>>
>> - num_pipes = adev->sdma.num_instances;
>> + num_pipes = adev->sdma.num_inst_per_xcc;
>> if (num_pipes > AMDGPU_MES_MAX_SDMA_PIPES)
>> dev_warn(adev->dev, "more SDMA pipes than supported by MES! (%d vs %d)\n",
>> num_pipes, AMDGPU_MES_MAX_SDMA_PIPES);
>> @@ -216,8 +216,27 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>> if (r)
>> goto error_doorbell;
>>
>> + if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
> Is this 12.0 and higher or 12.1 and higher?
>
> Alex
Thank you for the catch. Yes, it should be 12.1 for now, until MES 12
support is available too. I'll fix it in v2.
Amber
>
>> + /* When queue/pipe reset is done in MES instead of in the
>> + * driver, MES passes hung queues information to the driver in
>> + * hung_queue_hqd_info. Calculate required space to store this
>> + * information.
>> + */
>> + for (i = 0; i < AMDGPU_MES_MAX_GFX_PIPES; i++)
>> + num_queues += hweight32(adev->mes.gfx_hqd_mask[i]);
>> +
>> + for (i = 0; i < AMDGPU_MES_MAX_COMPUTE_PIPES; i++)
>> + num_queues += hweight32(adev->mes.compute_hqd_mask[i]);
>> +
>> + for (i = 0; i < AMDGPU_MES_MAX_SDMA_PIPES; i++)
>> + num_queues += hweight32(adev->mes.sdma_hqd_mask[i]) * num_xcc;
>> +
>> + adev->mes.hung_queue_hqd_info_offset = num_queues;
>> + adev->mes.hung_queue_db_array_size = num_queues * 2;
>> + }
>> +
>> if (adev->mes.hung_queue_db_array_size) {
>> - for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
>> + for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) {
>> r = amdgpu_bo_create_kernel(adev,
>> adev->mes.hung_queue_db_array_size * sizeof(u32),
>> PAGE_SIZE,
>> @@ -264,10 +283,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>> &adev->mes.event_log_cpu_addr);
>>
>> for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
>> - amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
>> - &adev->mes.hung_queue_db_array_gpu_addr[i],
>> - &adev->mes.hung_queue_db_array_cpu_addr[i]);
>> -
>> + if (adev->mes.hung_queue_db_array_gpu_obj[i])
>> + amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
>> + &adev->mes.hung_queue_db_array_gpu_addr[i],
>> + &adev->mes.hung_queue_db_array_cpu_addr[i]);
>> if (adev->mes.sch_ctx_ptr[i])
>> amdgpu_device_wb_free(adev, adev->mes.sch_ctx_offs[i]);
>> if (adev->mes.query_status_fence_ptr[i])
>> --
>> 2.43.0
>>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 3/8] drm/amdgpu: Fixup detect and reset
2026-03-20 20:02 [PATCH 0/8] Support compute queue/pipe reset on gfx 12.1 Amber Lin
2026-03-20 20:02 ` [PATCH 1/8] drm/amdgpu: Fix gfx_hqd_mask in mes 12.1 Amber Lin
2026-03-20 20:02 ` [PATCH 2/8] drm/amdgpu: Fixup boost mes detect hang array size Amber Lin
@ 2026-03-20 20:02 ` Amber Lin
2026-03-23 19:07 ` Alex Deucher
2026-03-20 20:02 ` [PATCH 4/8] drm/amdgpu: Create hqd info structure Amber Lin
` (4 subsequent siblings)
7 siblings, 1 reply; 22+ messages in thread
From: Amber Lin @ 2026-03-20 20:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Amber Lin, Jonathan Kim
Identify hung queues by comparing doorbells shown in hqd_info from MES
with doorbells stored in the driver to find matching queues.
Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 38 ++++++++++++++++---------
1 file changed, 25 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index b68bf4a9cb40..bea509f6b3ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -465,23 +465,35 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
&input);
- if (r) {
- dev_err(adev->dev, "failed to detect and reset\n");
- } else {
- *hung_db_num = 0;
- for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
- if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
- hung_db_array[i] = db_array[i];
- *hung_db_num += 1;
- }
+
+ if (r && detect_only) {
+ dev_err(adev->dev, "Failed to detect hung queues\n");
+ return r;
+ }
+
+ *hung_db_num = 0;
+ /* MES passes hung queues' doorbell to driver */
+ for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
+ /* Finding hung queues where db_array[i] is a valid doorbell */
+ if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
+ hung_db_array[i] = db_array[i];
+ *hung_db_num += 1;
}
+ }
- /*
- * TODO: return HQD info for MES scheduled user compute queue reset cases
- * stored in hung_db_array hqd info offset to full array size
- */
+ if (r && !hung_db_num) {
+ dev_err(adev->dev, "Failed to detect and reset hung queues\n");
+ return r;
}
+ /*
+ * TODO: return HQD info for MES scheduled user compute queue reset cases
+ * stored in hung_db_array hqd info offset to full array size
+ */
+
+ if (r)
+ dev_err(adev->dev, "failed to reset\n");
+
return r;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCH 3/8] drm/amdgpu: Fixup detect and reset
2026-03-20 20:02 ` [PATCH 3/8] drm/amdgpu: Fixup detect and reset Amber Lin
@ 2026-03-23 19:07 ` Alex Deucher
0 siblings, 0 replies; 22+ messages in thread
From: Alex Deucher @ 2026-03-23 19:07 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Jonathan Kim
On Fri, Mar 20, 2026 at 4:02 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> Identify hung queues by comparing doorbells shown in hqd_info from MES
> with doorbells stored in the driver to find matching queues.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 38 ++++++++++++++++---------
> 1 file changed, 25 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index b68bf4a9cb40..bea509f6b3ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -465,23 +465,35 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>
> r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
> &input);
> - if (r) {
> - dev_err(adev->dev, "failed to detect and reset\n");
> - } else {
> - *hung_db_num = 0;
> - for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
> - if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
> - hung_db_array[i] = db_array[i];
> - *hung_db_num += 1;
> - }
> +
> + if (r && detect_only) {
> + dev_err(adev->dev, "Failed to detect hung queues\n");
> + return r;
> + }
> +
> + *hung_db_num = 0;
> + /* MES passes hung queues' doorbell to driver */
> + for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
> + /* Finding hung queues where db_array[i] is a valid doorbell */
> + if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
> + hung_db_array[i] = db_array[i];
> + *hung_db_num += 1;
> }
> + }
>
> - /*
> - * TODO: return HQD info for MES scheduled user compute queue reset cases
> - * stored in hung_db_array hqd info offset to full array size
> - */
> + if (r && !hung_db_num) {
> + dev_err(adev->dev, "Failed to detect and reset hung queues\n");
> + return r;
> }
>
> + /*
> + * TODO: return HQD info for MES scheduled user compute queue reset cases
> + * stored in hung_db_array hqd info offset to full array size
> + */
> +
> + if (r)
> + dev_err(adev->dev, "failed to reset\n");
> +
> return r;
> }
>
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 4/8] drm/amdgpu: Create hqd info structure
2026-03-20 20:02 [PATCH 0/8] Support compute queue/pipe reset on gfx 12.1 Amber Lin
` (2 preceding siblings ...)
2026-03-20 20:02 ` [PATCH 3/8] drm/amdgpu: Fixup detect and reset Amber Lin
@ 2026-03-20 20:02 ` Amber Lin
2026-03-23 19:01 ` Alex Deucher
2026-03-20 20:02 ` [PATCH 5/8] drm/amdgpu: Missing multi-XCC support in MES Amber Lin
` (3 subsequent siblings)
7 siblings, 1 reply; 22+ messages in thread
From: Amber Lin @ 2026-03-20 20:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Amber Lin, Jonathan Kim
Create the hung_queue_hqd_info structure and fill in the hung queues
information passed by MES, including queue type, pipe ID, and queue ID.
Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 13 ++++++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
2 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index bea509f6b3ff..710bca87c32b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
{
struct mes_detect_and_reset_queue_input input;
u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
- int r, i;
+ int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
if (!hung_db_num || !hung_db_array)
return -EINVAL;
@@ -486,10 +486,13 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
return r;
}
- /*
- * TODO: return HQD info for MES scheduled user compute queue reset cases
- * stored in hung_db_array hqd info offset to full array size
- */
+ if (queue_type != AMDGPU_RING_TYPE_COMPUTE) {
+ dev_warn(adev->dev, "Unsupported queue type: %d\n", queue_type);
+ return r;
+ }
+
+ for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
+ hung_db_array[i] = db_array[i];
if (r)
dev_err(adev->dev, "failed to reset\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index f80e3aca9c78..2e6ae9f84db0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -170,6 +170,19 @@ struct amdgpu_mes {
uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
};
+struct amdgpu_mes_hung_queue_hqd_info {
+ union {
+ struct {
+ uint32_t queue_type: 3; // queue type
+ uint32_t pipe_index: 4; // pipe index
+ uint32_t queue_index: 8; // queue index
+ uint32_t reserved: 17;
+ };
+
+ uint32_t bit0_31;
+ };
+};
+
struct amdgpu_mes_gang {
int gang_id;
int priority;
--
2.43.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCH 4/8] drm/amdgpu: Create hqd info structure
2026-03-20 20:02 ` [PATCH 4/8] drm/amdgpu: Create hqd info structure Amber Lin
@ 2026-03-23 19:01 ` Alex Deucher
2026-03-23 19:11 ` Amber Lin
0 siblings, 1 reply; 22+ messages in thread
From: Alex Deucher @ 2026-03-23 19:01 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Jonathan Kim
On Fri, Mar 20, 2026 at 4:09 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> Create hung_queue_hqd_info structure and fill in hung queses information
> passed by MES, including queue type, pipe id, and queue id.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 13 ++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
> 2 files changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index bea509f6b3ff..710bca87c32b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
> {
> struct mes_detect_and_reset_queue_input input;
> u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
> - int r, i;
> + int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
>
> if (!hung_db_num || !hung_db_array)
> return -EINVAL;
> @@ -486,10 +486,13 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
> return r;
> }
>
> - /*
> - * TODO: return HQD info for MES scheduled user compute queue reset cases
> - * stored in hung_db_array hqd info offset to full array size
> - */
> + if (queue_type != AMDGPU_RING_TYPE_COMPUTE) {
> + dev_warn(adev->dev, "Unsupported queue type: %d\n", queue_type);
This function will get called for non-compute queues. We shouldn't warn here.
Alex
> + return r;
> + }
> +
> + for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
> + hung_db_array[i] = db_array[i];
>
> if (r)
> dev_err(adev->dev, "failed to reset\n");
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index f80e3aca9c78..2e6ae9f84db0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -170,6 +170,19 @@ struct amdgpu_mes {
> uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
> };
>
> +struct amdgpu_mes_hung_queue_hqd_info {
> + union {
> + struct {
> + uint32_t queue_type: 3; // queue type
> + uint32_t pipe_index: 4; // pipe index
> + uint32_t queue_index: 8; // queue index
> + uint32_t reserved: 17;
> + };
> +
> + uint32_t bit0_31;
> + };
> +};
> +
> struct amdgpu_mes_gang {
> int gang_id;
> int priority;
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 4/8] drm/amdgpu: Create hqd info structure
2026-03-23 19:01 ` Alex Deucher
@ 2026-03-23 19:11 ` Amber Lin
0 siblings, 0 replies; 22+ messages in thread
From: Amber Lin @ 2026-03-23 19:11 UTC (permalink / raw)
To: Alex Deucher
Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Jonathan Kim
On 3/23/26 15:01, Alex Deucher wrote:
> On Fri, Mar 20, 2026 at 4:09 PM Amber Lin <Amber.Lin@amd.com> wrote:
>> Create hung_queue_hqd_info structure and fill in hung queses information
>> passed by MES, including queue type, pipe id, and queue id.
>>
>> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 13 ++++++++-----
>> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
>> 2 files changed, 21 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> index bea509f6b3ff..710bca87c32b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>> {
>> struct mes_detect_and_reset_queue_input input;
>> u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
>> - int r, i;
>> + int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
>>
>> if (!hung_db_num || !hung_db_array)
>> return -EINVAL;
>> @@ -486,10 +486,13 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>> return r;
>> }
>>
>> - /*
>> - * TODO: return HQD info for MES scheduled user compute queue reset cases
>> - * stored in hung_db_array hqd info offset to full array size
>> - */
>> + if (queue_type != AMDGPU_RING_TYPE_COMPUTE) {
>> + dev_warn(adev->dev, "Unsupported queue type: %d\n", queue_type);
> This function will get called for non-compute queues. We shouldn't warn here.
>
> Alex
Right, I only considered the caller being KFD and ignored that it can
potentially be gfx too. I'll remove this warning.
Amber
>> + return r;
>> + }
>> +
>> + for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
>> + hung_db_array[i] = db_array[i];
>>
>> if (r)
>> dev_err(adev->dev, "failed to reset\n");
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> index f80e3aca9c78..2e6ae9f84db0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> @@ -170,6 +170,19 @@ struct amdgpu_mes {
>> uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
>> };
>>
>> +struct amdgpu_mes_hung_queue_hqd_info {
>> + union {
>> + struct {
>> + uint32_t queue_type: 3; // queue type
>> + uint32_t pipe_index: 4; // pipe index
>> + uint32_t queue_index: 8; // queue index
>> + uint32_t reserved: 17;
>> + };
>> +
>> + uint32_t bit0_31;
>> + };
>> +};
>> +
>> struct amdgpu_mes_gang {
>> int gang_id;
>> int priority;
>> --
>> 2.43.0
>>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 5/8] drm/amdgpu: Missing multi-XCC support in MES
2026-03-20 20:02 [PATCH 0/8] Support compute queue/pipe reset on gfx 12.1 Amber Lin
` (3 preceding siblings ...)
2026-03-20 20:02 ` [PATCH 4/8] drm/amdgpu: Create hqd info structure Amber Lin
@ 2026-03-20 20:02 ` Amber Lin
2026-03-23 19:10 ` Alex Deucher
2026-03-20 20:02 ` [PATCH 6/8] drm/amdgpu: Enable suspend/resume gang in mes 12.1 Amber Lin
` (2 subsequent siblings)
7 siblings, 1 reply; 22+ messages in thread
From: Amber Lin @ 2026-03-20 20:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Amber Lin
In a multi-XCC GPU, pass the master XCC's ID to amdgpu_mes_suspend,
amdgpu_mes_resume, and detect_and_reset_hung_queues so the command will be
sent to the matching master MES when the compute partition mode is not
SPX.
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 7 +++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 9 +++++----
drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +-
drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 2 +-
drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 14 +-------------
.../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ++--
6 files changed, 15 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 710bca87c32b..4f44b933e373 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -300,7 +300,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
mutex_destroy(&adev->mes.mutex_hidden);
}
-int amdgpu_mes_suspend(struct amdgpu_device *adev)
+int amdgpu_mes_suspend(struct amdgpu_device *adev, uint32_t xcc_id)
{
struct mes_suspend_gang_input input;
int r;
@@ -310,6 +310,7 @@ int amdgpu_mes_suspend(struct amdgpu_device *adev)
memset(&input, 0x0, sizeof(struct mes_suspend_gang_input));
input.suspend_all_gangs = 1;
+ input.xcc_id = xcc_id;
/*
* Avoid taking any other locks under MES lock to avoid circular
@@ -324,7 +325,7 @@ int amdgpu_mes_suspend(struct amdgpu_device *adev)
return r;
}
-int amdgpu_mes_resume(struct amdgpu_device *adev)
+int amdgpu_mes_resume(struct amdgpu_device *adev, uint32_t xcc_id)
{
struct mes_resume_gang_input input;
int r;
@@ -334,6 +335,7 @@ int amdgpu_mes_resume(struct amdgpu_device *adev)
memset(&input, 0x0, sizeof(struct mes_resume_gang_input));
input.resume_all_gangs = 1;
+ input.xcc_id = xcc_id;
/*
* Avoid taking any other locks under MES lock to avoid circular
@@ -462,6 +464,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
adev->mes.hung_queue_db_array_size * sizeof(u32));
input.queue_type = queue_type;
input.detect_only = detect_only;
+ input.xcc_id = xcc_id;
r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
&input);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 2e6ae9f84db0..643b4f8d757a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -325,8 +325,9 @@ struct mes_reset_queue_input {
};
struct mes_detect_and_reset_queue_input {
- uint32_t queue_type;
- bool detect_only;
+ uint32_t queue_type;
+ bool detect_only;
+ uint32_t xcc_id;
};
struct mes_inv_tlbs_pasid_input {
@@ -442,8 +443,8 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe);
int amdgpu_mes_init(struct amdgpu_device *adev);
void amdgpu_mes_fini(struct amdgpu_device *adev);
-int amdgpu_mes_suspend(struct amdgpu_device *adev);
-int amdgpu_mes_resume(struct amdgpu_device *adev);
+int amdgpu_mes_suspend(struct amdgpu_device *adev, uint32_t xcc_id);
+int amdgpu_mes_resume(struct amdgpu_device *adev, uint32_t xcc_id);
int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
struct amdgpu_ring *ring, uint32_t xcc_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 78d1f3eb522e..35734d34763a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -5200,7 +5200,7 @@ static int gfx_v11_0_post_soft_reset(struct amdgpu_ip_block *ip_block)
/**
* GFX soft reset will impact MES, need resume MES when do GFX soft reset
*/
- return amdgpu_mes_resume(adev);
+ return amdgpu_mes_resume(adev, 0);
}
static uint64_t gfx_v11_0_get_gpu_clock_counter(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 9508709abd49..d02a84711394 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -266,7 +266,7 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
if (found_hung_queue) {
/* Resume scheduling after hang recovery */
- r = amdgpu_mes_resume(adev);
+ r = amdgpu_mes_resume(adev, input.xcc_id);
}
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
index 70d80c2aed52..4b279259b9d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
@@ -1888,24 +1888,12 @@ static int mes_v12_1_hw_fini(struct amdgpu_ip_block *ip_block)
static int mes_v12_1_suspend(struct amdgpu_ip_block *ip_block)
{
- int r;
-
- r = amdgpu_mes_suspend(ip_block->adev);
- if (r)
- return r;
-
return mes_v12_1_hw_fini(ip_block);
}
static int mes_v12_1_resume(struct amdgpu_ip_block *ip_block)
{
- int r;
-
- r = mes_v12_1_hw_init(ip_block);
- if (r)
- return r;
-
- return amdgpu_mes_resume(ip_block->adev);
+ return mes_v12_1_hw_init(ip_block);
}
static int mes_v12_1_early_init(struct amdgpu_ip_block *ip_block)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 18bc5ba25f8f..ec8d7f4be840 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -367,7 +367,7 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
if (!down_read_trylock(&adev->reset_domain->sem))
return -EIO;
- r = amdgpu_mes_suspend(adev);
+ r = amdgpu_mes_suspend(adev, ffs(dqm->dev->xcc_mask) - 1);
up_read(&adev->reset_domain->sem);
if (r) {
@@ -387,7 +387,7 @@ static int resume_all_queues_mes(struct device_queue_manager *dqm)
if (!down_read_trylock(&adev->reset_domain->sem))
return -EIO;
- r = amdgpu_mes_resume(adev);
+ r = amdgpu_mes_resume(adev, ffs(dqm->dev->xcc_mask) - 1);
up_read(&adev->reset_domain->sem);
if (r) {
--
2.43.0
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [PATCH 5/8] drm/amdgpu: Missing multi-XCC support in MES
2026-03-20 20:02 ` [PATCH 5/8] drm/amdgpu: Missing multi-XCC support in MES Amber Lin
@ 2026-03-23 19:10 ` Alex Deucher
2026-03-23 19:19 ` Amber Lin
0 siblings, 1 reply; 22+ messages in thread
From: Alex Deucher @ 2026-03-23 19:10 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang
On Fri, Mar 20, 2026 at 5:19 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> In a multi-XCC GPU, pass the master XCC's ID to amdgpu_mes_suspend,
> amdgpu_mes_resume, and detect_and_reset_hung_queues so the command will be
> sent to the matching master MES when the compute partition mode is not
> SPX.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 7 +++++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 9 +++++----
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 14 +-------------
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ++--
> 6 files changed, 15 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 710bca87c32b..4f44b933e373 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -300,7 +300,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
> mutex_destroy(&adev->mes.mutex_hidden);
> }
>
> -int amdgpu_mes_suspend(struct amdgpu_device *adev)
> +int amdgpu_mes_suspend(struct amdgpu_device *adev, uint32_t xcc_id)
> {
> struct mes_suspend_gang_input input;
> int r;
> @@ -310,6 +310,7 @@ int amdgpu_mes_suspend(struct amdgpu_device *adev)
>
> memset(&input, 0x0, sizeof(struct mes_suspend_gang_input));
> input.suspend_all_gangs = 1;
> + input.xcc_id = xcc_id;
>
> /*
> * Avoid taking any other locks under MES lock to avoid circular
> @@ -324,7 +325,7 @@ int amdgpu_mes_suspend(struct amdgpu_device *adev)
> return r;
> }
>
> -int amdgpu_mes_resume(struct amdgpu_device *adev)
> +int amdgpu_mes_resume(struct amdgpu_device *adev, uint32_t xcc_id)
> {
> struct mes_resume_gang_input input;
> int r;
> @@ -334,6 +335,7 @@ int amdgpu_mes_resume(struct amdgpu_device *adev)
>
> memset(&input, 0x0, sizeof(struct mes_resume_gang_input));
> input.resume_all_gangs = 1;
> + input.xcc_id = xcc_id;
>
> /*
> * Avoid taking any other locks under MES lock to avoid circular
> @@ -462,6 +464,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
> adev->mes.hung_queue_db_array_size * sizeof(u32));
> input.queue_type = queue_type;
> input.detect_only = detect_only;
> + input.xcc_id = xcc_id;
>
> r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
> &input);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 2e6ae9f84db0..643b4f8d757a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -325,8 +325,9 @@ struct mes_reset_queue_input {
> };
>
> struct mes_detect_and_reset_queue_input {
> - uint32_t queue_type;
> - bool detect_only;
> + uint32_t queue_type;
> + bool detect_only;
> + uint32_t xcc_id;
> };
>
> struct mes_inv_tlbs_pasid_input {
> @@ -442,8 +443,8 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe);
> int amdgpu_mes_init(struct amdgpu_device *adev);
> void amdgpu_mes_fini(struct amdgpu_device *adev);
>
> -int amdgpu_mes_suspend(struct amdgpu_device *adev);
> -int amdgpu_mes_resume(struct amdgpu_device *adev);
> +int amdgpu_mes_suspend(struct amdgpu_device *adev, uint32_t xcc_id);
> +int amdgpu_mes_resume(struct amdgpu_device *adev, uint32_t xcc_id);
>
> int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
> struct amdgpu_ring *ring, uint32_t xcc_id);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 78d1f3eb522e..35734d34763a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -5200,7 +5200,7 @@ static int gfx_v11_0_post_soft_reset(struct amdgpu_ip_block *ip_block)
> /**
> * GFX soft reset will impact MES, need resume MES when do GFX soft reset
> */
> - return amdgpu_mes_resume(adev);
> + return amdgpu_mes_resume(adev, 0);
> }
>
> static uint64_t gfx_v11_0_get_gpu_clock_counter(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index 9508709abd49..d02a84711394 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -266,7 +266,7 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
>
> if (found_hung_queue) {
> /* Resume scheduling after hang recovery */
> - r = amdgpu_mes_resume(adev);
> + r = amdgpu_mes_resume(adev, input.xcc_id);
> }
>
> return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> index 70d80c2aed52..4b279259b9d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> @@ -1888,24 +1888,12 @@ static int mes_v12_1_hw_fini(struct amdgpu_ip_block *ip_block)
>
> static int mes_v12_1_suspend(struct amdgpu_ip_block *ip_block)
> {
> - int r;
> -
> - r = amdgpu_mes_suspend(ip_block->adev);
> - if (r)
> - return r;
> -
> return mes_v12_1_hw_fini(ip_block);
> }
>
> static int mes_v12_1_resume(struct amdgpu_ip_block *ip_block)
> {
> - int r;
> -
> - r = mes_v12_1_hw_init(ip_block);
> - if (r)
> - return r;
> -
> - return amdgpu_mes_resume(ip_block->adev);
> + return mes_v12_1_hw_init(ip_block);
These changes seem unrelated. Should these hunks be a separate patch?
Alex
> }
>
> static int mes_v12_1_early_init(struct amdgpu_ip_block *ip_block)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 18bc5ba25f8f..ec8d7f4be840 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -367,7 +367,7 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
> if (!down_read_trylock(&adev->reset_domain->sem))
> return -EIO;
>
> - r = amdgpu_mes_suspend(adev);
> + r = amdgpu_mes_suspend(adev, ffs(dqm->dev->xcc_mask) - 1);
> up_read(&adev->reset_domain->sem);
>
> if (r) {
> @@ -387,7 +387,7 @@ static int resume_all_queues_mes(struct device_queue_manager *dqm)
> if (!down_read_trylock(&adev->reset_domain->sem))
> return -EIO;
>
> - r = amdgpu_mes_resume(adev);
> + r = amdgpu_mes_resume(adev, ffs(dqm->dev->xcc_mask) - 1);
> up_read(&adev->reset_domain->sem);
>
> if (r) {
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 5/8] drm/amdgpu: Missing multi-XCC support in MES
2026-03-23 19:10 ` Alex Deucher
@ 2026-03-23 19:19 ` Amber Lin
0 siblings, 0 replies; 22+ messages in thread
From: Amber Lin @ 2026-03-23 19:19 UTC (permalink / raw)
To: Alex Deucher; +Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang
On 3/23/26 15:10, Alex Deucher wrote:
> On Fri, Mar 20, 2026 at 5:19 PM Amber Lin <Amber.Lin@amd.com> wrote:
>> In a multi-XCC GPU, pass the master XCC's ID to amdgpu_mes_suspend,
>> amdgpu_mes_resume, and detect_and_reset_hung_queues so the command will be
>> sent to the matching master MES when the compute partition mode is not
>> SPX.
>>
>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 7 +++++--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 9 +++++----
>> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +-
>> drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 2 +-
>> drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 14 +-------------
>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ++--
>> 6 files changed, 15 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> index 710bca87c32b..4f44b933e373 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> @@ -300,7 +300,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>> mutex_destroy(&adev->mes.mutex_hidden);
>> }
>>
>> -int amdgpu_mes_suspend(struct amdgpu_device *adev)
>> +int amdgpu_mes_suspend(struct amdgpu_device *adev, uint32_t xcc_id)
>> {
>> struct mes_suspend_gang_input input;
>> int r;
>> @@ -310,6 +310,7 @@ int amdgpu_mes_suspend(struct amdgpu_device *adev)
>>
>> memset(&input, 0x0, sizeof(struct mes_suspend_gang_input));
>> input.suspend_all_gangs = 1;
>> + input.xcc_id = xcc_id;
>>
>> /*
>> * Avoid taking any other locks under MES lock to avoid circular
>> @@ -324,7 +325,7 @@ int amdgpu_mes_suspend(struct amdgpu_device *adev)
>> return r;
>> }
>>
>> -int amdgpu_mes_resume(struct amdgpu_device *adev)
>> +int amdgpu_mes_resume(struct amdgpu_device *adev, uint32_t xcc_id)
>> {
>> struct mes_resume_gang_input input;
>> int r;
>> @@ -334,6 +335,7 @@ int amdgpu_mes_resume(struct amdgpu_device *adev)
>>
>> memset(&input, 0x0, sizeof(struct mes_resume_gang_input));
>> input.resume_all_gangs = 1;
>> + input.xcc_id = xcc_id;
>>
>> /*
>> * Avoid taking any other locks under MES lock to avoid circular
>> @@ -462,6 +464,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>> adev->mes.hung_queue_db_array_size * sizeof(u32));
>> input.queue_type = queue_type;
>> input.detect_only = detect_only;
>> + input.xcc_id = xcc_id;
>>
>> r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
>> &input);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> index 2e6ae9f84db0..643b4f8d757a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> @@ -325,8 +325,9 @@ struct mes_reset_queue_input {
>> };
>>
>> struct mes_detect_and_reset_queue_input {
>> - uint32_t queue_type;
>> - bool detect_only;
>> + uint32_t queue_type;
>> + bool detect_only;
>> + uint32_t xcc_id;
>> };
>>
>> struct mes_inv_tlbs_pasid_input {
>> @@ -442,8 +443,8 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe);
>> int amdgpu_mes_init(struct amdgpu_device *adev);
>> void amdgpu_mes_fini(struct amdgpu_device *adev);
>>
>> -int amdgpu_mes_suspend(struct amdgpu_device *adev);
>> -int amdgpu_mes_resume(struct amdgpu_device *adev);
>> +int amdgpu_mes_suspend(struct amdgpu_device *adev, uint32_t xcc_id);
>> +int amdgpu_mes_resume(struct amdgpu_device *adev, uint32_t xcc_id);
>>
>> int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
>> struct amdgpu_ring *ring, uint32_t xcc_id);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> index 78d1f3eb522e..35734d34763a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> @@ -5200,7 +5200,7 @@ static int gfx_v11_0_post_soft_reset(struct amdgpu_ip_block *ip_block)
>> /**
>> * GFX soft reset will impact MES, need resume MES when do GFX soft reset
>> */
>> - return amdgpu_mes_resume(adev);
>> + return amdgpu_mes_resume(adev, 0);
>> }
>>
>> static uint64_t gfx_v11_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> index 9508709abd49..d02a84711394 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> @@ -266,7 +266,7 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
>>
>> if (found_hung_queue) {
>> /* Resume scheduling after hang recovery */
>> - r = amdgpu_mes_resume(adev);
>> + r = amdgpu_mes_resume(adev, input.xcc_id);
>> }
>>
>> return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
>> index 70d80c2aed52..4b279259b9d3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
>> @@ -1888,24 +1888,12 @@ static int mes_v12_1_hw_fini(struct amdgpu_ip_block *ip_block)
>>
>> static int mes_v12_1_suspend(struct amdgpu_ip_block *ip_block)
>> {
>> - int r;
>> -
>> - r = amdgpu_mes_suspend(ip_block->adev);
>> - if (r)
>> - return r;
>> -
>> return mes_v12_1_hw_fini(ip_block);
>> }
>>
>> static int mes_v12_1_resume(struct amdgpu_ip_block *ip_block)
>> {
>> - int r;
>> -
>> - r = mes_v12_1_hw_init(ip_block);
>> - if (r)
>> - return r;
>> -
>> - return amdgpu_mes_resume(ip_block->adev);
>> + return mes_v12_1_hw_init(ip_block);
> These changes seem unrelated. Should these hunks be a separate patch?
>
> Alex
Yes, I'll separate them into another patch.
>> }
>>
>> static int mes_v12_1_early_init(struct amdgpu_ip_block *ip_block)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index 18bc5ba25f8f..ec8d7f4be840 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -367,7 +367,7 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
>> if (!down_read_trylock(&adev->reset_domain->sem))
>> return -EIO;
>>
>> - r = amdgpu_mes_suspend(adev);
>> + r = amdgpu_mes_suspend(adev, ffs(dqm->dev->xcc_mask) - 1);
>> up_read(&adev->reset_domain->sem);
>>
>> if (r) {
>> @@ -387,7 +387,7 @@ static int resume_all_queues_mes(struct device_queue_manager *dqm)
>> if (!down_read_trylock(&adev->reset_domain->sem))
>> return -EIO;
>>
>> - r = amdgpu_mes_resume(adev);
>> + r = amdgpu_mes_resume(adev, ffs(dqm->dev->xcc_mask) - 1);
>> up_read(&adev->reset_domain->sem);
>>
>> if (r) {
>> --
>> 2.43.0
>>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 6/8] drm/amdgpu: Enable suspend/resume gang in mes 12.1
2026-03-20 20:02 [PATCH 0/8] Support compute queue/pipe reset on gfx 12.1 Amber Lin
` (4 preceding siblings ...)
2026-03-20 20:02 ` [PATCH 5/8] drm/amdgpu: Missing multi-XCC support in MES Amber Lin
@ 2026-03-20 20:02 ` Amber Lin
2026-03-23 19:11 ` Alex Deucher
2026-03-20 20:02 ` [PATCH 7/8] drm/amdkfd: Add detect+reset hangs to GC 12.1 Amber Lin
2026-03-20 20:02 ` [PATCH 8/8] drm/amdkfd: Reset queue/pipe in MES Amber Lin
7 siblings, 1 reply; 22+ messages in thread
From: Amber Lin @ 2026-03-20 20:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Amber Lin
Implement mes_v12_1_suspend_gang and mes_v12_1_resume_gang, which were previously stubs returning 0, by building and submitting the corresponding MESAPI__SUSPEND and MESAPI__RESUME packets to the master MES.
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 34 ++++++++++++++++++++++++--
1 file changed, 32 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
index 4b279259b9d3..7aea3a50e712 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
@@ -468,13 +468,43 @@ static int mes_v12_1_unmap_legacy_queue(struct amdgpu_mes *mes,
static int mes_v12_1_suspend_gang(struct amdgpu_mes *mes,
struct mes_suspend_gang_input *input)
{
- return 0;
+ union MESAPI__SUSPEND mes_suspend_gang_pkt;
+
+ memset(&mes_suspend_gang_pkt, 0, sizeof(mes_suspend_gang_pkt));
+
+ mes_suspend_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
+ mes_suspend_gang_pkt.header.opcode = MES_SCH_API_SUSPEND;
+ mes_suspend_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
+
+ mes_suspend_gang_pkt.suspend_all_gangs = input->suspend_all_gangs;
+ mes_suspend_gang_pkt.gang_context_addr = input->gang_context_addr;
+ mes_suspend_gang_pkt.suspend_fence_addr = input->suspend_fence_addr;
+ mes_suspend_gang_pkt.suspend_fence_value = input->suspend_fence_value;
+
+ /* Suspend gang is handled by master MES */
+ return mes_v12_1_submit_pkt_and_poll_completion(mes, input->xcc_id, AMDGPU_MES_SCHED_PIPE,
+ &mes_suspend_gang_pkt, sizeof(mes_suspend_gang_pkt),
+ offsetof(union MESAPI__SUSPEND, api_status));
}
static int mes_v12_1_resume_gang(struct amdgpu_mes *mes,
struct mes_resume_gang_input *input)
{
- return 0;
+ union MESAPI__RESUME mes_resume_gang_pkt;
+
+ memset(&mes_resume_gang_pkt, 0, sizeof(mes_resume_gang_pkt));
+
+ mes_resume_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
+ mes_resume_gang_pkt.header.opcode = MES_SCH_API_RESUME;
+ mes_resume_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
+
+ mes_resume_gang_pkt.resume_all_gangs = input->resume_all_gangs;
+ mes_resume_gang_pkt.gang_context_addr = input->gang_context_addr;
+
+ /* Resume gang is handled by master MES */
+ return mes_v12_1_submit_pkt_and_poll_completion(mes, input->xcc_id, AMDGPU_MES_SCHED_PIPE,
+ &mes_resume_gang_pkt, sizeof(mes_resume_gang_pkt),
+ offsetof(union MESAPI__RESUME, api_status));
}
static int mes_v12_1_query_sched_status(struct amdgpu_mes *mes,
--
2.43.0
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [PATCH 6/8] drm/amdgpu: Enable suspend/resume gang in mes 12.1
2026-03-20 20:02 ` [PATCH 6/8] drm/amdgpu: Enable suspend/resume gang in mes 12.1 Amber Lin
@ 2026-03-23 19:11 ` Alex Deucher
0 siblings, 0 replies; 22+ messages in thread
From: Alex Deucher @ 2026-03-23 19:11 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang
On Fri, Mar 20, 2026 at 4:02 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> This patch adds code to mes_v12_1_suspend_gang and mes_v12_1_resume_gang.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 34 ++++++++++++++++++++++++--
> 1 file changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> index 4b279259b9d3..7aea3a50e712 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> @@ -468,13 +468,43 @@ static int mes_v12_1_unmap_legacy_queue(struct amdgpu_mes *mes,
> static int mes_v12_1_suspend_gang(struct amdgpu_mes *mes,
> struct mes_suspend_gang_input *input)
> {
> - return 0;
> + union MESAPI__SUSPEND mes_suspend_gang_pkt;
> +
> + memset(&mes_suspend_gang_pkt, 0, sizeof(mes_suspend_gang_pkt));
> +
> + mes_suspend_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
> + mes_suspend_gang_pkt.header.opcode = MES_SCH_API_SUSPEND;
> + mes_suspend_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> + mes_suspend_gang_pkt.suspend_all_gangs = input->suspend_all_gangs;
> + mes_suspend_gang_pkt.gang_context_addr = input->gang_context_addr;
> + mes_suspend_gang_pkt.suspend_fence_addr = input->suspend_fence_addr;
> + mes_suspend_gang_pkt.suspend_fence_value = input->suspend_fence_value;
> +
> + /* Suspend gang is handled by master MES */
> + return mes_v12_1_submit_pkt_and_poll_completion(mes, input->xcc_id, AMDGPU_MES_SCHED_PIPE,
> + &mes_suspend_gang_pkt, sizeof(mes_suspend_gang_pkt),
> + offsetof(union MESAPI__SUSPEND, api_status));
> }
>
> static int mes_v12_1_resume_gang(struct amdgpu_mes *mes,
> struct mes_resume_gang_input *input)
> {
> - return 0;
> + union MESAPI__RESUME mes_resume_gang_pkt;
> +
> + memset(&mes_resume_gang_pkt, 0, sizeof(mes_resume_gang_pkt));
> +
> + mes_resume_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
> + mes_resume_gang_pkt.header.opcode = MES_SCH_API_RESUME;
> + mes_resume_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> + mes_resume_gang_pkt.resume_all_gangs = input->resume_all_gangs;
> + mes_resume_gang_pkt.gang_context_addr = input->gang_context_addr;
> +
> + /* Resume gang is handled by master MES */
> + return mes_v12_1_submit_pkt_and_poll_completion(mes, input->xcc_id, AMDGPU_MES_SCHED_PIPE,
> + &mes_resume_gang_pkt, sizeof(mes_resume_gang_pkt),
> + offsetof(union MESAPI__RESUME, api_status));
> }
>
> static int mes_v12_1_query_sched_status(struct amdgpu_mes *mes,
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 7/8] drm/amdkfd: Add detect+reset hangs to GC 12.1
2026-03-20 20:02 [PATCH 0/8] Support compute queue/pipe reset on gfx 12.1 Amber Lin
` (5 preceding siblings ...)
2026-03-20 20:02 ` [PATCH 6/8] drm/amdgpu: Enable suspend/resume gang in mes 12.1 Amber Lin
@ 2026-03-20 20:02 ` Amber Lin
2026-03-23 19:12 ` Alex Deucher
2026-03-20 20:02 ` [PATCH 8/8] drm/amdkfd: Reset queue/pipe in MES Amber Lin
7 siblings, 1 reply; 22+ messages in thread
From: Amber Lin @ 2026-03-20 20:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Amber Lin
Add detect_and_reset_hung_queues support for user mode compute queues on GC 12.1.
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 35 +++++++++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
index 7aea3a50e712..ac9e26b8bb52 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
@@ -46,6 +46,8 @@ static int mes_v12_1_kiq_hw_fini(struct amdgpu_device *adev, uint32_t xcc_id);
static int mes_v12_1_self_test(struct amdgpu_device *adev, int xcc_id);
#define MES_EOP_SIZE 2048
+#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd info */
+#define MES12_HUNG_HQD_INFO_OFFSET 4
#define regCP_HQD_IB_CONTROL_MES_12_1_DEFAULT 0x100000
#define XCC_MID_MASK 0x41000000
@@ -229,7 +231,7 @@ static int mes_v12_1_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
xcc_id, pipe, x_pkt->header.opcode);
r = amdgpu_fence_wait_polling(ring, seq, timeout);
- if (r < 1 || !*status_ptr) {
+ if (r < 1 || !lower_32_bits(*status_ptr)) {
if (misc_op_str)
dev_err(adev->dev,
"MES(%d, %d) failed to respond to msg=%s (%s)\n",
@@ -858,6 +860,33 @@ static int mes_v12_1_reset_legacy_queue(struct amdgpu_mes *mes,
}
#endif
+static int mes_v12_1_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
+ struct mes_detect_and_reset_queue_input *input)
+{
+ union MESAPI__RESET mes_reset_queue_pkt;
+
+ memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
+
+ mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
+ mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET;
+ mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
+
+ mes_reset_queue_pkt.queue_type =
+ convert_to_mes_queue_type(input->queue_type);
+ mes_reset_queue_pkt.doorbell_offset_addr =
+ mes->hung_queue_db_array_gpu_addr[0];
+
+ if (input->detect_only)
+ mes_reset_queue_pkt.hang_detect_only = 1;
+ else
+ mes_reset_queue_pkt.hang_detect_then_reset = 1;
+
+ return mes_v12_1_submit_pkt_and_poll_completion(mes,
+ input->xcc_id, AMDGPU_MES_SCHED_PIPE,
+ &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
+ offsetof(union MESAPI__RESET, api_status));
+}
+
static int mes_v12_inv_tlb_convert_hub_id(uint8_t id)
{
/*
@@ -915,6 +944,7 @@ static const struct amdgpu_mes_funcs mes_v12_1_funcs = {
.resume_gang = mes_v12_1_resume_gang,
.misc_op = mes_v12_1_misc_op,
.reset_hw_queue = mes_v12_1_reset_hw_queue,
+ .detect_and_reset_hung_queues = mes_v12_1_detect_and_reset_hung_queues,
.invalidate_tlbs_pasid = mes_v12_1_inv_tlbs_pasid,
};
@@ -1931,6 +1961,9 @@ static int mes_v12_1_early_init(struct amdgpu_ip_block *ip_block)
struct amdgpu_device *adev = ip_block->adev;
int pipe, r;
+ adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
+ adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET;
+
for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
r = amdgpu_mes_init_microcode(adev, pipe);
if (r)
--
2.43.0
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [PATCH 7/8] drm/amdkfd: Add detect+reset hangs to GC 12.1
2026-03-20 20:02 ` [PATCH 7/8] drm/amdkfd: Add detect+reset hangs to GC 12.1 Amber Lin
@ 2026-03-23 19:12 ` Alex Deucher
0 siblings, 0 replies; 22+ messages in thread
From: Alex Deucher @ 2026-03-23 19:12 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang
On Fri, Mar 20, 2026 at 4:09 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> Add detect_and_reset_hung_queues to user mode compute queues on GC 12.1.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 35 +++++++++++++++++++++++++-
> 1 file changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> index 7aea3a50e712..ac9e26b8bb52 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
> @@ -46,6 +46,8 @@ static int mes_v12_1_kiq_hw_fini(struct amdgpu_device *adev, uint32_t xcc_id);
> static int mes_v12_1_self_test(struct amdgpu_device *adev, int xcc_id);
>
> #define MES_EOP_SIZE 2048
> +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd info */
> +#define MES12_HUNG_HQD_INFO_OFFSET 4
>
> #define regCP_HQD_IB_CONTROL_MES_12_1_DEFAULT 0x100000
> #define XCC_MID_MASK 0x41000000
> @@ -229,7 +231,7 @@ static int mes_v12_1_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> xcc_id, pipe, x_pkt->header.opcode);
>
> r = amdgpu_fence_wait_polling(ring, seq, timeout);
> - if (r < 1 || !*status_ptr) {
> + if (r < 1 || !lower_32_bits(*status_ptr)) {
> if (misc_op_str)
> dev_err(adev->dev,
> "MES(%d, %d) failed to respond to msg=%s (%s)\n",
> @@ -858,6 +860,33 @@ static int mes_v12_1_reset_legacy_queue(struct amdgpu_mes *mes,
> }
> #endif
>
> +static int mes_v12_1_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> + struct mes_detect_and_reset_queue_input *input)
> +{
> + union MESAPI__RESET mes_reset_queue_pkt;
> +
> + memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
> +
> + mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
> + mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET;
> + mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> + mes_reset_queue_pkt.queue_type =
> + convert_to_mes_queue_type(input->queue_type);
> + mes_reset_queue_pkt.doorbell_offset_addr =
> + mes->hung_queue_db_array_gpu_addr[0];
> +
> + if (input->detect_only)
> + mes_reset_queue_pkt.hang_detect_only = 1;
> + else
> + mes_reset_queue_pkt.hang_detect_then_reset = 1;
> +
> + return mes_v12_1_submit_pkt_and_poll_completion(mes,
> + input->xcc_id, AMDGPU_MES_SCHED_PIPE,
> + &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
> + offsetof(union MESAPI__RESET, api_status));
> +}
> +
> static int mes_v12_inv_tlb_convert_hub_id(uint8_t id)
> {
> /*
> @@ -915,6 +944,7 @@ static const struct amdgpu_mes_funcs mes_v12_1_funcs = {
> .resume_gang = mes_v12_1_resume_gang,
> .misc_op = mes_v12_1_misc_op,
> .reset_hw_queue = mes_v12_1_reset_hw_queue,
> + .detect_and_reset_hung_queues = mes_v12_1_detect_and_reset_hung_queues,
> .invalidate_tlbs_pasid = mes_v12_1_inv_tlbs_pasid,
> };
>
> @@ -1931,6 +1961,9 @@ static int mes_v12_1_early_init(struct amdgpu_ip_block *ip_block)
> struct amdgpu_device *adev = ip_block->adev;
> int pipe, r;
>
> + adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
> + adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET;
> +
> for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
> r = amdgpu_mes_init_microcode(adev, pipe);
> if (r)
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 8/8] drm/amdkfd: Reset queue/pipe in MES
2026-03-20 20:02 [PATCH 0/8] Support compute queue/pipe reset on gfx 12.1 Amber Lin
` (6 preceding siblings ...)
2026-03-20 20:02 ` [PATCH 7/8] drm/amdkfd: Add detect+reset hangs to GC 12.1 Amber Lin
@ 2026-03-20 20:02 ` Amber Lin
2026-03-23 19:21 ` Alex Deucher
7 siblings, 1 reply; 22+ messages in thread
From: Amber Lin @ 2026-03-20 20:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Amber Lin, Jonathan Kim
When removing queues fails, KFD calls amdgpu_mes to detect and reset
hung queues, then cleans up those hung queues in KFD.
Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1 +
5 files changed, 154 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 4f44b933e373..fd6b40d9da58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -795,6 +795,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
}
+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
+{
+ return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
+ (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
+}
+
/* Fix me -- node_id is used to identify the correct MES instances in the future */
static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
uint32_t node_id, bool enable)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 643b4f8d757a..44fa4d73bce8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
}
bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index ec8d7f4be840..1c9c350bfffe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q, const uint32_t *restore_sdma_id);
static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
+static int resume_all_queues_mes(struct device_queue_manager *dqm);
+static int suspend_all_queues_mes(struct device_queue_manager *dqm);
+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
+ uint32_t doorbell_offset);
+static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
@@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
return r;
}
-static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
- struct qcm_process_device *qpd)
+static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd,
+ bool is_for_reset,
+ bool flush_mes_queue)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
int r;
struct mes_remove_queue_input queue_input;
+ /* queue was already removed during reset */
+ if (q->properties.is_reset)
+ return 0;
+
if (!dqm->sched_running || dqm->sched_halt)
return 0;
if (!down_read_trylock(&adev->reset_domain->sem))
@@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
queue_input.doorbell_offset = q->properties.doorbell_off;
queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
+ queue_input.remove_queue_after_reset = flush_mes_queue;
queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
amdgpu_mes_lock(&adev->mes);
@@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
amdgpu_mes_unlock(&adev->mes);
up_read(&adev->reset_domain->sem);
+ if (is_for_reset)
+ return r;
+
if (r) {
+ if (!suspend_all_queues_mes(dqm))
+ return resume_all_queues_mes(dqm);
+
dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
q->properties.doorbell_off);
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
@@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
return r;
}
+static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+{
+ return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
+}
+
static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
{
struct device_process_node *cur;
@@ -359,6 +384,103 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
return retval;
}
+static int reset_queues_mes(struct device_queue_manager *dqm)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+ struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
+ int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
+ int num_hung = 0, r = 0, i, pipe, queue, queue_type;
+ uint32_t *hung_array;
+ struct kfd_process_device *pdd;
+ struct queue *q;
+
+ if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ /* reset should be used only in dqm locked queue reset */
+ if (WARN_ON(dqm->detect_hang_count > 0))
+ return 0;
+
+ if (!amdgpu_gpu_recovery) {
+ r = -ENOTRECOVERABLE;
+ goto fail;
+ }
+
+ hung_array = kzalloc(adev->mes.hung_queue_db_array_size * sizeof(uint32_t), GFP_KERNEL);
+ if (!hung_array) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ hqd_info = kzalloc(hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info), GFP_KERNEL);
+ if (!hqd_info) {
+ r = -ENOMEM;
+ goto free_hung_array;
+ }
+
+ memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info));
+
+ /*
+ * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
+ * post suspend_all as reset & detect will return all hung queue types.
+ *
+ * Passed parameter is for targeting queues not scheduled by MES add_queue.
+ */
+ r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
+ false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
+
+ if (!num_hung || r) {
+ r = -ENOTRECOVERABLE;
+ goto free_hqd_info;
+ }
+
+ /* MES reset resets queue/pipe and cleans up internally */
+ for (i = 0; i < num_hung; i++) {
+ hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
+ pipe = hqd_info[i].pipe_index;
+ queue = hqd_info[i].queue_index;
+ queue_type = hqd_info[i].queue_type;
+
+ if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
+ queue_type != MES_QUEUE_TYPE_SDMA) {
+ pr_warn("Unsupported hung queue reset type: %d\n", queue_type);
+ hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
+ continue;
+ }
+
+ q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
+ if (!q) {
+ r = -ENOTRECOVERABLE;
+ goto free_hqd_info;
+ }
+
+ pdd = kfd_get_process_device_data(q->device, q->process);
+ if (!pdd) {
+ r = -ENODEV;
+ goto free_hqd_info;
+ }
+
+ pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
+ hung_array[i], pipe, queue, queue_type);
+ /* Proceed remove_queue with reset=true */
+ remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false);
+ set_queue_as_reset(dqm, q, &pdd->qpd);
+ }
+
+ dqm->detect_hang_count = num_hung;
+ kfd_signal_reset_event(dqm->dev);
+
+free_hqd_info:
+ kfree(hqd_info);
+free_hung_array:
+ kfree(hung_array);
+fail:
+ dqm->detect_hang_count = 0;
+ return r;
+}
+
static int suspend_all_queues_mes(struct device_queue_manager *dqm)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
@@ -371,6 +493,9 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
up_read(&adev->reset_domain->sem);
if (r) {
+ if (!reset_queues_mes(dqm))
+ return 0;
+
dev_err(adev->dev, "failed to suspend gangs from MES\n");
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
kfd_hws_hang(dqm);
@@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
q->properties.queue_id, pdd->process->lead_thread->pid);
pdd->has_reset_queue = true;
+ q->properties.is_reset = true;
if (q->properties.is_active) {
q->properties.is_active = false;
decrement_queue_count(dqm, qpd, q);
@@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
return NULL;
}
+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset)
+{
+ struct device_process_node *cur;
+ struct qcm_process_device *qpd;
+ struct queue *q;
+
+ list_for_each_entry(cur, &dqm->queues, list) {
+ qpd = cur->qpd;
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if (doorbell_offset == q->properties.doorbell_off)
+ return q;
+ }
+ }
+
+ return NULL;
+}
+
static int reset_hung_queues(struct device_queue_manager *dqm)
{
int r = 0, reset_count = 0, i;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 27e4859e4ad7..6cb33f6d71e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -523,6 +523,7 @@ struct queue_properties {
uint32_t pm4_target_xcc;
bool is_dbg_wa;
bool is_user_cu_masked;
+ bool is_reset;
/* Not relevant for user mode queues in cp scheduling */
unsigned int vmid;
/* Relevant only for sdma queues*/
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 1ccd4514d3ee..4c52819aef9e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -2027,6 +2027,7 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0)) {
dev->node_props.capability |=
HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
+ dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED;
dev->node_props.capability2 |=
HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [PATCH 8/8] drm/amdkfd: Reset queue/pipe in MES
2026-03-20 20:02 ` [PATCH 8/8] drm/amdkfd: Reset queue/pipe in MES Amber Lin
@ 2026-03-23 19:21 ` Alex Deucher
2026-03-23 19:42 ` Amber Lin
0 siblings, 1 reply; 22+ messages in thread
From: Alex Deucher @ 2026-03-23 19:21 UTC (permalink / raw)
To: Amber Lin; +Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Jonathan Kim
On Fri, Mar 20, 2026 at 4:19 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> When removing queues fails, KFD calls amdgpu_mes to detect and reset
> hung queues, then cleans up those hung queues in KFD.
>
> Suggested-by: Jonathan Kim <jonathan.kim@amd.com>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
> drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1 +
> 5 files changed, 154 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 4f44b933e373..fd6b40d9da58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -795,6 +795,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
> amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
> }
>
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
> +{
> + return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
> + (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
> +}
> +
> /* Fix me -- node_id is used to identify the correct MES instances in the future */
> static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
> uint32_t node_id, bool enable)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 643b4f8d757a..44fa4d73bce8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
> }
>
> bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
>
> int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ec8d7f4be840..1c9c350bfffe 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q, const uint32_t *restore_sdma_id);
>
> static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
> +static int resume_all_queues_mes(struct device_queue_manager *dqm);
> +static int suspend_all_queues_mes(struct device_queue_manager *dqm);
> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
> + uint32_t doorbell_offset);
> +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
> + struct qcm_process_device *qpd);
>
> static inline
> enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
> @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> return r;
> }
>
> -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> - struct qcm_process_device *qpd)
> +static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
> + struct qcm_process_device *qpd,
> + bool is_for_reset,
> + bool flush_mes_queue)
> {
> struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> int r;
> struct mes_remove_queue_input queue_input;
>
> + /* queue was already removed during reset */
> + if (q->properties.is_reset)
> + return 0;
> +
> if (!dqm->sched_running || dqm->sched_halt)
> return 0;
> if (!down_read_trylock(&adev->reset_domain->sem))
> @@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
> queue_input.doorbell_offset = q->properties.doorbell_off;
> queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
> + queue_input.remove_queue_after_reset = flush_mes_queue;
> queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
>
> amdgpu_mes_lock(&adev->mes);
> @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> amdgpu_mes_unlock(&adev->mes);
> up_read(&adev->reset_domain->sem);
>
> + if (is_for_reset)
> + return r;
> +
> if (r) {
> + if (!suspend_all_queues_mes(dqm))
> + return resume_all_queues_mes(dqm);
> +
> dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
> q->properties.doorbell_off);
> dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
> @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> return r;
> }
>
> +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> + struct qcm_process_device *qpd)
> +{
> + return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
> +}
> +
> static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
> {
> struct device_process_node *cur;
> @@ -359,6 +384,103 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
> return retval;
> }
>
> +static int reset_queues_mes(struct device_queue_manager *dqm)
> +{
> + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> + struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
> + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
> + int num_hung = 0, r = 0, i, pipe, queue, queue_type;
> + uint32_t *hung_array;
> + struct kfd_process_device *pdd;
> + struct queue *q;
> +
> + if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
> + r = -ENOTRECOVERABLE;
> + goto fail;
> + }
> +
> + /* reset should be used only in dqm locked queue reset */
> + if (WARN_ON(dqm->detect_hang_count > 0))
> + return 0;
> +
> + if (!amdgpu_gpu_recovery) {
> + r = -ENOTRECOVERABLE;
> + goto fail;
> + }
> +
> + hung_array = kzalloc(adev->mes.hung_queue_db_array_size * sizeof(uint32_t), GFP_KERNEL);
> + if (!hung_array) {
> + r = -ENOMEM;
> + goto fail;
> + }
> +
> + hqd_info = kzalloc(hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info), GFP_KERNEL);
> + if (!hqd_info) {
> + r = -ENOMEM;
> + goto free_hung_array;
> + }
> +
> + memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info));
> +
> + /*
> + * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
> + * post suspend_all as reset & detect will return all hung queue types.
> + *
> + * Passed parameter is for targeting queues not scheduled by MES add_queue.
> + */
> + r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
> + false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
> +
> + if (!num_hung || r) {
> + r = -ENOTRECOVERABLE;
> + goto free_hqd_info;
> + }
> +
> + /* MES reset resets queue/pipe and cleans up internally */
> + for (i = 0; i < num_hung; i++) {
> + hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
> + pipe = hqd_info[i].pipe_index;
> + queue = hqd_info[i].queue_index;
> + queue_type = hqd_info[i].queue_type;
> +
> + if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
> + queue_type != MES_QUEUE_TYPE_SDMA) {
> + pr_warn("Unsupported hung queue reset type: %d\n", queue_type);
> + hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
> + continue;
> + }
> +
> + q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
> + if (!q) {
> + r = -ENOTRECOVERABLE;
> + goto free_hqd_info;
> + }
> +
> + pdd = kfd_get_process_device_data(q->device, q->process);
> + if (!pdd) {
> + r = -ENODEV;
> + goto free_hqd_info;
> + }
> +
> + pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
> + hung_array[i], pipe, queue, queue_type);
> + /* Proceed remove_queue with reset=true */
> + remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false);
> + set_queue_as_reset(dqm, q, &pdd->qpd);
> + }
> +
> + dqm->detect_hang_count = num_hung;
> + kfd_signal_reset_event(dqm->dev);
> +
> +free_hqd_info:
> + kfree(hqd_info);
> +free_hung_array:
> + kfree(hung_array);
> +fail:
> + dqm->detect_hang_count = 0;
> + return r;
> +}
> +
> static int suspend_all_queues_mes(struct device_queue_manager *dqm)
> {
> struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> @@ -371,6 +493,9 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
> up_read(&adev->reset_domain->sem);
>
> if (r) {
> + if (!reset_queues_mes(dqm))
> + return 0;
> +
> dev_err(adev->dev, "failed to suspend gangs from MES\n");
> dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
> kfd_hws_hang(dqm);
> @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
> q->properties.queue_id, pdd->process->lead_thread->pid);
>
> pdd->has_reset_queue = true;
> + q->properties.is_reset = true;
> if (q->properties.is_active) {
> q->properties.is_active = false;
> decrement_queue_count(dqm, qpd, q);
> @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
> return NULL;
> }
>
> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset)
> +{
> + struct device_process_node *cur;
> + struct qcm_process_device *qpd;
> + struct queue *q;
> +
> + list_for_each_entry(cur, &dqm->queues, list) {
> + qpd = cur->qpd;
> + list_for_each_entry(q, &qpd->queues_list, list) {
> + if (doorbell_offset == q->properties.doorbell_off)
> + return q;
> + }
> + }
> +
> + return NULL;
> +}
> +
> static int reset_hung_queues(struct device_queue_manager *dqm)
> {
> int r = 0, reset_count = 0, i;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 27e4859e4ad7..6cb33f6d71e2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -523,6 +523,7 @@ struct queue_properties {
> uint32_t pm4_target_xcc;
> bool is_dbg_wa;
> bool is_user_cu_masked;
> + bool is_reset;
> /* Not relevant for user mode queues in cp scheduling */
> unsigned int vmid;
> /* Relevant only for sdma queues*/
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 1ccd4514d3ee..4c52819aef9e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -2027,6 +2027,7 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
> if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0)) {
> dev->node_props.capability |=
> HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
> + dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED;
Should this hunk be a separate patch? Isn't this already supported on
existing parts using MES?
Alex
> dev->node_props.capability2 |=
> HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED;
> }
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [PATCH 8/8] drm/amdkfd: Reset queue/pipe in MES
2026-03-23 19:21 ` Alex Deucher
@ 2026-03-23 19:42 ` Amber Lin
0 siblings, 0 replies; 22+ messages in thread
From: Amber Lin @ 2026-03-23 19:42 UTC (permalink / raw)
To: Alex Deucher
Cc: amd-gfx, Shaoyun.Liu, Michael.Chen, Jesse.Zhang, Jonathan Kim
[-- Attachment #1: Type: text/plain, Size: 14057 bytes --]
On 3/23/26 15:21, Alex Deucher wrote:
> On Fri, Mar 20, 2026 at 4:19 PM Amber Lin<Amber.Lin@amd.com> wrote:
>> When removing queues fails, KFD calls amdgpu_mes to detect and reset
>> hung queues, then cleans up those hung queues in KFD.
>>
>> Suggested-by: Jonathan Kim<jonathan.kim@amd.com>
>> Signed-off-by: Amber Lin<Amber.Lin@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
>> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
>> drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1 +
>> 5 files changed, 154 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> index 4f44b933e373..fd6b40d9da58 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>> @@ -795,6 +795,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
>> amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
>> }
>>
>> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
>> +{
>> + return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
>> + (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
>> +}
>> +
>> /* Fix me -- node_id is used to identify the correct MES instances in the future */
>> static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
>> uint32_t node_id, bool enable)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> index 643b4f8d757a..44fa4d73bce8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>> @@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
>> }
>>
>> bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
>> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
>>
>> int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index ec8d7f4be840..1c9c350bfffe 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> struct queue *q, const uint32_t *restore_sdma_id);
>>
>> static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
>> +static int resume_all_queues_mes(struct device_queue_manager *dqm);
>> +static int suspend_all_queues_mes(struct device_queue_manager *dqm);
>> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm,
>> + uint32_t doorbell_offset);
>> +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
>> + struct qcm_process_device *qpd);
>>
>> static inline
>> enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
>> @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>> return r;
>> }
>>
>> -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>> - struct qcm_process_device *qpd)
>> +static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, struct queue *q,
>> + struct qcm_process_device *qpd,
>> + bool is_for_reset,
>> + bool flush_mes_queue)
>> {
>> struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
>> int r;
>> struct mes_remove_queue_input queue_input;
>>
>> + /* queue was already removed during reset */
>> + if (q->properties.is_reset)
>> + return 0;
>> +
>> if (!dqm->sched_running || dqm->sched_halt)
>> return 0;
>> if (!down_read_trylock(&adev->reset_domain->sem))
>> @@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>> memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
>> queue_input.doorbell_offset = q->properties.doorbell_off;
>> queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
>> + queue_input.remove_queue_after_reset = flush_mes_queue;
>> queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
>>
>> amdgpu_mes_lock(&adev->mes);
>> @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>> amdgpu_mes_unlock(&adev->mes);
>> up_read(&adev->reset_domain->sem);
>>
>> + if (is_for_reset)
>> + return r;
>> +
>> if (r) {
>> + if (!suspend_all_queues_mes(dqm))
>> + return resume_all_queues_mes(dqm);
>> +
>> dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
>> q->properties.doorbell_off);
>> dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
>> @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>> return r;
>> }
>>
>> +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>> + struct qcm_process_device *qpd)
>> +{
>> + return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
>> +}
>> +
>> static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
>> {
>> struct device_process_node *cur;
>> @@ -359,6 +384,103 @@ static int add_all_kfd_queues_mes(struct device_queue_manager *dqm)
>> return retval;
>> }
>>
>> +static int reset_queues_mes(struct device_queue_manager *dqm)
>> +{
>> + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
>> + struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
>> + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
>> + int num_hung = 0, r = 0, i, pipe, queue, queue_type;
>> + uint32_t *hung_array;
>> + struct kfd_process_device *pdd;
>> + struct queue *q;
>> +
>> + if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
>> + r = -ENOTRECOVERABLE;
>> + goto fail;
>> + }
>> +
>> + /* reset should be used only in dqm locked queue reset */
>> + if (WARN_ON(dqm->detect_hang_count > 0))
>> + return 0;
>> +
>> + if (!amdgpu_gpu_recovery) {
>> + r = -ENOTRECOVERABLE;
>> + goto fail;
>> + }
>> +
>> + hung_array = kzalloc(adev->mes.hung_queue_db_array_size * sizeof(uint32_t), GFP_KERNEL);
>> + if (!hung_array) {
>> + r = -ENOMEM;
>> + goto fail;
>> + }
>> +
>> + hqd_info = kzalloc(hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info), GFP_KERNEL);
>> + if (!hqd_info) {
>> + r = -ENOMEM;
>> + goto free_hung_array;
>> + }
>> +
>> + memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info));
>> +
>> + /*
>> + * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
>> + * post suspend_all as reset & detect will return all hung queue types.
>> + *
>> + * Passed parameter is for targeting queues not scheduled by MES add_queue.
>> + */
>> + r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
>> + false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
>> +
>> + if (!num_hung || r) {
>> + r = -ENOTRECOVERABLE;
>> + goto free_hqd_info;
>> + }
>> +
>> + /* MES reset resets queue/pipe and cleans up internally */
>> + for (i = 0; i < num_hung; i++) {
>> + hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
>> + pipe = hqd_info[i].pipe_index;
>> + queue = hqd_info[i].queue_index;
>> + queue_type = hqd_info[i].queue_type;
>> +
>> + if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
>> + queue_type != MES_QUEUE_TYPE_SDMA) {
>> + pr_warn("Unsupported hung queue reset type: %d\n", queue_type);
>> + hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
>> + continue;
>> + }
>> +
>> + q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
>> + if (!q) {
>> + r = -ENOTRECOVERABLE;
>> + goto free_hqd_info;
>> + }
>> +
>> + pdd = kfd_get_process_device_data(q->device, q->process);
>> + if (!pdd) {
>> + r = -ENODEV;
>> + goto free_hqd_info;
>> + }
>> +
>> + pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
>> + hung_array[i], pipe, queue, queue_type);
>> + /* Proceed remove_queue with reset=true */
>> + remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false);
>> + set_queue_as_reset(dqm, q, &pdd->qpd);
>> + }
>> +
>> + dqm->detect_hang_count = num_hung;
>> + kfd_signal_reset_event(dqm->dev);
>> +
>> +free_hqd_info:
>> + kfree(hqd_info);
>> +free_hung_array:
>> + kfree(hung_array);
>> +fail:
>> + dqm->detect_hang_count = 0;
>> + return r;
>> +}
>> +
>> static int suspend_all_queues_mes(struct device_queue_manager *dqm)
>> {
>> struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
>> @@ -371,6 +493,9 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm)
>> up_read(&adev->reset_domain->sem);
>>
>> if (r) {
>> + if (!reset_queues_mes(dqm))
>> + return 0;
>> +
>> dev_err(adev->dev, "failed to suspend gangs from MES\n");
>> dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
>> kfd_hws_hang(dqm);
>> @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
>> q->properties.queue_id, pdd->process->lead_thread->pid);
>>
>> pdd->has_reset_queue = true;
>> + q->properties.is_reset = true;
>> if (q->properties.is_active) {
>> q->properties.is_active = false;
>> decrement_queue_count(dqm, qpd, q);
>> @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
>> return NULL;
>> }
>>
>> +static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager *dqm, uint32_t doorbell_offset)
>> +{
>> + struct device_process_node *cur;
>> + struct qcm_process_device *qpd;
>> + struct queue *q;
>> +
>> + list_for_each_entry(cur, &dqm->queues, list) {
>> + qpd = cur->qpd;
>> + list_for_each_entry(q, &qpd->queues_list, list) {
>> + if (doorbell_offset == q->properties.doorbell_off)
>> + return q;
>> + }
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> static int reset_hung_queues(struct device_queue_manager *dqm)
>> {
>> int r = 0, reset_count = 0, i;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 27e4859e4ad7..6cb33f6d71e2 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -523,6 +523,7 @@ struct queue_properties {
>> uint32_t pm4_target_xcc;
>> bool is_dbg_wa;
>> bool is_user_cu_masked;
>> + bool is_reset;
>> /* Not relevant for user mode queues in cp scheduling */
>> unsigned int vmid;
>> /* Relevant only for sdma queues*/
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>> index 1ccd4514d3ee..4c52819aef9e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>> @@ -2027,6 +2027,7 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
>> if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0)) {
>> dev->node_props.capability |=
>> HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
>> + dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED;
> Should this hunk be a separate patch? Isn't this already supported on
> existing parts using MES?
>
> Alex
Compute queue/pipe reset is currently only supported on gfx 9, which is handled in
if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) {
.....
if (!amdgpu_sriov_vf(dev->gpu->adev))
dev->node_props.capability |=
HSA_CAP_PER_QUEUE_RESET_SUPPORTED;
} else {
.....
==> and this is where I added the enablement for
if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0)) {
.....
}
But yes, I should split this part of the KFD enablement into a separate
patch. I'll do that in v2.
Amber
>> dev->node_props.capability2 |=
>> HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED;
>> }
>> --
>> 2.43.0
>>
[-- Attachment #2: Type: text/html, Size: 15047 bytes --]
^ permalink raw reply [flat|nested] 22+ messages in thread