* [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue
@ 2025-08-04 8:38 Jesse.Zhang
2025-08-04 8:38 ` [v6 02/13] drm/amdgpu/mes11: implement detect and reset callback Jesse.Zhang
` (11 more replies)
0 siblings, 12 replies; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Alex Deucher
From: Alex Deucher <alexander.deucher@amd.com>
Helper function to detect and reset hung queues. MES will
return an array of doorbell indices identifying the queues that
are hung and, optionally, were reset.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 62 +++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 19 ++++++++
2 files changed, 81 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 135598502c8d..64c5cac9ad5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -191,6 +191,20 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
if (r)
goto error_doorbell;
+ if (adev->mes.hung_queue_db_array_size) {
+ r = amdgpu_bo_create_kernel(adev,
+ adev->mes.hung_queue_db_array_size * sizeof(u32),
+ PAGE_SIZE,
+ AMDGPU_GEM_DOMAIN_GTT,
+ &adev->mes.hung_queue_db_array_gpu_obj,
+ &adev->mes.hung_queue_db_array_gpu_addr,
+ &adev->mes.hung_queue_db_array_cpu_addr);
+ if (r) {
+ dev_warn(adev->dev, "failed to create MES hung db array buffer (%d)", r);
+ goto error_doorbell;
+ }
+ }
+
return 0;
error_doorbell:
@@ -216,6 +230,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
{
int i;
+ amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj,
+ &adev->mes.hung_queue_db_array_gpu_addr,
+ &adev->mes.hung_queue_db_array_cpu_addr);
+
amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
&adev->mes.event_log_gpu_addr,
&adev->mes.event_log_cpu_addr);
@@ -366,6 +384,50 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
return r;
}
+int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev)
+{
+ return adev->mes.hung_queue_db_array_size;
+}
+
+int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
+ int queue_type,
+ bool detect_only,
+ unsigned int *hung_db_num,
+ u32 *hung_db_array)
+
+{
+ struct mes_detect_and_reset_queue_input input;
+ u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr;
+ int r, i;
+
+ if (!hung_db_num || !hung_db_array)
+ return -EINVAL;
+
+ if ((queue_type != AMDGPU_RING_TYPE_GFX) &&
+ (queue_type != AMDGPU_RING_TYPE_COMPUTE) &&
+ (queue_type != AMDGPU_RING_TYPE_SDMA))
+ return -EINVAL;
+
+ input.queue_type = queue_type;
+ input.detect_only = detect_only;
+
+ r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
+ &input);
+ if (r) {
+ dev_err(adev->dev, "failed to detect and reset\n");
+ } else {
+ *hung_db_num = 0;
+ for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
+ if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
+ hung_db_array[i] = db_array[i];
+ *hung_db_num += 1;
+ }
+ }
+ }
+
+ return r;
+}
+
uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
{
struct mes_misc_op_input op_input;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index c0d2c195fe2e..2c4568951edb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -41,6 +41,7 @@
#define AMDGPU_MES_API_VERSION_MASK 0x00fff000
#define AMDGPU_MES_FEAT_VERSION_MASK 0xff000000
#define AMDGPU_MES_MSCRATCH_SIZE 0x40000
+#define AMDGPU_MES_INVALID_DB_OFFSET 0xffffffff
enum amdgpu_mes_priority_level {
AMDGPU_MES_PRIORITY_LEVEL_LOW = 0,
@@ -147,6 +148,10 @@ struct amdgpu_mes {
uint64_t resource_1_gpu_addr[AMDGPU_MAX_MES_PIPES];
void *resource_1_addr[AMDGPU_MAX_MES_PIPES];
+ int hung_queue_db_array_size;
+ struct amdgpu_bo *hung_queue_db_array_gpu_obj;
+ uint64_t hung_queue_db_array_gpu_addr;
+ void *hung_queue_db_array_cpu_addr;
};
struct amdgpu_mes_gang {
@@ -280,6 +285,11 @@ struct mes_reset_queue_input {
bool is_kq;
};
+struct mes_detect_and_reset_queue_input {
+ uint32_t queue_type;
+ bool detect_only;
+};
+
enum mes_misc_opcode {
MES_MISC_OP_WRITE_REG,
MES_MISC_OP_READ_REG,
@@ -367,6 +377,8 @@ struct amdgpu_mes_funcs {
int (*reset_hw_queue)(struct amdgpu_mes *mes,
struct mes_reset_queue_input *input);
+ int (*detect_and_reset_hung_queues)(struct amdgpu_mes *mes,
+ struct mes_detect_and_reset_queue_input *input);
};
#define amdgpu_mes_kiq_hw_init(adev) (adev)->mes.kiq_hw_init((adev))
@@ -390,6 +402,13 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
unsigned int vmid,
bool use_mmio);
+int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev);
+int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
+ int queue_type,
+ bool detect_only,
+ unsigned int *hung_db_num,
+ u32 *hung_db_array);
+
uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg);
int amdgpu_mes_wreg(struct amdgpu_device *adev,
uint32_t reg, uint32_t val);
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 02/13] drm/amdgpu/mes11: implement detect and reset callback
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 8:38 ` [v6 03/13] drm/amdgpu/mes12: " Jesse.Zhang
` (10 subsequent siblings)
11 siblings, 0 replies; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Alex Deucher
From: Alex Deucher <alexander.deucher@amd.com>
Implement support for the hung queue detect and reset
functionality.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 31 ++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 28eb846280dd..ed6a7f8af544 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -66,6 +66,8 @@ static int mes_v11_0_kiq_hw_fini(struct amdgpu_device *adev);
#define GFX_MES_DRAM_SIZE 0x80000
#define MES11_HW_RESOURCE_1_SIZE (128 * AMDGPU_GPU_PAGE_SIZE)
+#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 4
+
static void mes_v11_0_ring_set_wptr(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
@@ -783,6 +785,32 @@ static int mes_v11_0_reset_hw_queue(struct amdgpu_mes *mes,
offsetof(union MESAPI__RESET, api_status));
}
+static int mes_v11_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
+ struct mes_detect_and_reset_queue_input *input)
+{
+ union MESAPI__RESET mes_reset_queue_pkt;
+
+ memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
+
+ mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
+ mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET;
+ mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
+
+ mes_reset_queue_pkt.queue_type =
+ convert_to_mes_queue_type(input->queue_type);
+ mes_reset_queue_pkt.doorbell_offset_addr =
+ mes->hung_queue_db_array_gpu_addr;
+
+ if (input->detect_only)
+ mes_reset_queue_pkt.hang_detect_only = 1;
+ else
+ mes_reset_queue_pkt.hang_detect_then_reset = 1;
+
+ return mes_v11_0_submit_pkt_and_poll_completion(mes,
+ &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
+ offsetof(union MESAPI__RESET, api_status));
+}
+
static const struct amdgpu_mes_funcs mes_v11_0_funcs = {
.add_hw_queue = mes_v11_0_add_hw_queue,
.remove_hw_queue = mes_v11_0_remove_hw_queue,
@@ -792,6 +820,7 @@ static const struct amdgpu_mes_funcs mes_v11_0_funcs = {
.resume_gang = mes_v11_0_resume_gang,
.misc_op = mes_v11_0_misc_op,
.reset_hw_queue = mes_v11_0_reset_hw_queue,
+ .detect_and_reset_hung_queues = mes_v11_0_detect_and_reset_hung_queues,
};
static int mes_v11_0_allocate_ucode_buffer(struct amdgpu_device *adev,
@@ -1684,6 +1713,8 @@ static int mes_v11_0_early_init(struct amdgpu_ip_block *ip_block)
struct amdgpu_device *adev = ip_block->adev;
int pipe, r;
+ adev->mes.hung_queue_db_array_size =
+ MES11_HUNG_DB_OFFSET_ARRAY_SIZE;
for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE)
continue;
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 03/13] drm/amdgpu/mes12: implement detect and reset callback
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
2025-08-04 8:38 ` [v6 02/13] drm/amdgpu/mes11: implement detect and reset callback Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 17:00 ` Alex Deucher
2025-08-04 8:38 ` [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX Jesse.Zhang
` (9 subsequent siblings)
11 siblings, 1 reply; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Alex Deucher
From: Alex Deucher <alexander.deucher@amd.com>
Implement support for the hung queue detect and reset
functionality.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 37 ++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 6b222630f3fa..29d38aa1897e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -47,6 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev);
#define MES_EOP_SIZE 2048
+#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4
+
static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
@@ -879,6 +881,38 @@ static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes,
offsetof(union MESAPI__RESET, api_status));
}
+static int mes_v12_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
+ struct mes_detect_and_reset_queue_input *input)
+{
+ union MESAPI__RESET mes_reset_queue_pkt;
+ int pipe;
+
+ memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
+
+ mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
+ mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET;
+ mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
+
+ mes_reset_queue_pkt.queue_type =
+ convert_to_mes_queue_type(input->queue_type);
+ mes_reset_queue_pkt.doorbell_offset_addr =
+ mes->hung_queue_db_array_gpu_addr;
+
+ if (input->detect_only)
+ mes_reset_queue_pkt.hang_detect_only = 1;
+ else
+ mes_reset_queue_pkt.hang_detect_then_reset = 1;
+
+ if (mes->adev->enable_uni_mes)
+ pipe = AMDGPU_MES_KIQ_PIPE;
+ else
+ pipe = AMDGPU_MES_SCHED_PIPE;
+
+ return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
+ &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
+ offsetof(union MESAPI__RESET, api_status));
+}
+
static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
.add_hw_queue = mes_v12_0_add_hw_queue,
.remove_hw_queue = mes_v12_0_remove_hw_queue,
@@ -888,6 +922,7 @@ static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
.resume_gang = mes_v12_0_resume_gang,
.misc_op = mes_v12_0_misc_op,
.reset_hw_queue = mes_v12_0_reset_hw_queue,
+ .detect_and_reset_hung_queues = mes_v12_0_detect_and_reset_hung_queues,
};
static int mes_v12_0_allocate_ucode_buffer(struct amdgpu_device *adev,
@@ -1793,6 +1828,8 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block)
struct amdgpu_device *adev = ip_block->adev;
int pipe, r;
+ adev->mes.hung_queue_db_array_size =
+ MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
r = amdgpu_mes_init_microcode(adev, pipe);
if (r)
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
2025-08-04 8:38 ` [v6 02/13] drm/amdgpu/mes11: implement detect and reset callback Jesse.Zhang
2025-08-04 8:38 ` [v6 03/13] drm/amdgpu/mes12: " Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 17:03 ` Alex Deucher
2025-08-04 17:04 ` Alex Deucher
2025-08-04 8:38 ` [v6 05/13] drm/amdgpu: Implement active VMID detection in MES12 " Jesse.Zhang
` (8 subsequent siblings)
11 siblings, 2 replies; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang, kyle-hai.chau
Implement active VMID detection in the MES11 queue reset functionality for GFX queues. The changes include:
1. Added detection of active VMIDs by reading CP_CNTX_STAT and CP_VMID
registers to properly identify contexts that need resetting
2. Implemented fallback to HPD status method when no active VMIDs are
found, checking both pipe 0 and pipe 1 queues
3. Extended the MES reset packet with:
- active_vmids bitmap
- connected_queue_index for pipe 0
- connected_queue_index_p1 for pipe 1
Suggested-by: kyle-hai.chau <kyle-hai.chau@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 51 +++++++++++++++++++
drivers/gpu/drm/amd/include/mes_v11_api_def.h | 13 ++++-
2 files changed, 63 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index ed6a7f8af544..1422bc59cd40 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -789,6 +789,12 @@ static int mes_v11_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
struct mes_detect_and_reset_queue_input *input)
{
union MESAPI__RESET mes_reset_queue_pkt;
+ struct amdgpu_device *adev = mes->adev;
+ uint32_t active_vmids = 0;
+ uint32_t connected_queue_index = 0;
+ uint32_t queue_status = 0;
+ uint32_t connected_queue_index_p1 = 0;
+ uint32_t queue_status_p1 = 0;
memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
@@ -801,6 +807,51 @@ static int mes_v11_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
mes_reset_queue_pkt.doorbell_offset_addr =
mes->hung_queue_db_array_gpu_addr;
+ /* Add VMID detection for GFX queues */
+ if (input->queue_type == AMDGPU_RING_TYPE_GFX) {
+ uint32_t cp_cntx_stat = RREG32_SOC15(GC, 0, regCP_CNTX_STAT);
+ uint32_t cp_vmid, grbm_gfx_cntl;
+
+ /* Check active contexts in CP_CNTX_STAT */
+ for (uint32_t i = 0; i < 8; i++) {
+ if ((cp_cntx_stat >> (0x14 + i)) & 0x1) {
+ grbm_gfx_cntl = (i << 11);
+ WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, grbm_gfx_cntl);
+ cp_vmid = RREG32_SOC15(GC, 0, regCP_VMID);
+ active_vmids |= (1 << cp_vmid);
+ }
+ }
+
+ /* Fallback to HPD status if no active VMIDs found */
+ if (active_vmids == 0) {
+ uint32_t hpd_status;
+
+ /* Pipe 0 */
+ WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, 0);
+ hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
+ queue_status = hpd_status & 0x1F;
+ connected_queue_index = (hpd_status & 0xE0) >> 5;
+
+ /* Pipe 1 */
+ WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, (1 << 6));
+ hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
+ queue_status_p1 = hpd_status & 0x1F;
+ connected_queue_index_p1 = (hpd_status & 0xE0) >> 5;
+ }
+
+ mes_reset_queue_pkt.active_vmids = active_vmids;
+ if (active_vmids == 0) {
+ if (queue_status != 0) {
+ mes_reset_queue_pkt.use_connected_queue_index = 1;
+ mes_reset_queue_pkt.connected_queue_index = connected_queue_index;
+ }
+ if (queue_status_p1 != 0) {
+ mes_reset_queue_pkt.use_connected_queue_index_p1 = 1;
+ mes_reset_queue_pkt.connected_queue_index_p1 = connected_queue_index_p1;
+ }
+ }
+ }
+
if (input->detect_only)
mes_reset_queue_pkt.hang_detect_only = 1;
else
diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
index 15680c3f4970..62ad4f0337eb 100644
--- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
+++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
@@ -460,7 +460,11 @@ union MESAPI__RESET {
uint32_t hang_detect_only : 1;
/* Rest HP and LP kernel queues not managed by MES */
uint32_t reset_legacy_gfx : 1;
- uint32_t reserved : 28;
+ /* Fallback to use connected queue index when CP_CNTX_STAT method fails (gfx pipe 0) */
+ uint32_t use_connected_queue_index : 1;
+ /* For gfx pipe 1 */
+ uint32_t use_connected_queue_index_p1 : 1;
+ uint32_t reserved : 26;
};
uint64_t gang_context_addr;
@@ -488,6 +492,13 @@ union MESAPI__RESET {
uint64_t wptr_addr_hp;
struct MES_API_STATUS api_status;
+ uint32_t active_vmids;
+ uint64_t timestamp;
+
+ uint32_t gang_context_array_index;
+
+ uint32_t connected_queue_index;
+ uint32_t connected_queue_index_p1;
};
uint32_t max_dwords_in_api[API_FRAME_SIZE_IN_DWORDS];
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 05/13] drm/amdgpu: Implement active VMID detection in MES12 queue reset for GFX
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
` (2 preceding siblings ...)
2025-08-04 8:38 ` [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 17:05 ` Alex Deucher
2025-08-04 8:38 ` [v6 06/13] drm/amdgpu/userq: add a detect and reset callback Jesse.Zhang
` (7 subsequent siblings)
11 siblings, 1 reply; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang, kyle-hai.chau
Implement active VMID detection in the MES12 queue reset functionality for GFX queues. The changes include:
1. Added detection of active VMIDs by reading CP_CNTX_STAT and CP_VMID
registers to properly identify contexts that need resetting
2. Implemented fallback to HPD status method when no active VMIDs are
found, checking both pipe 0 and pipe 1 queues
3. Extended the MES reset packet with:
- active_vmids bitmap
- connected_queue_index for pipe 0
- connected_queue_index_p1 for pipe 1
Suggested-by: kyle-hai.chau <kyle-hai.chau@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 51 ++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 29d38aa1897e..579720695e9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -885,6 +885,12 @@ static int mes_v12_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
struct mes_detect_and_reset_queue_input *input)
{
union MESAPI__RESET mes_reset_queue_pkt;
+ struct amdgpu_device *adev = mes->adev;
+ uint32_t active_vmids = 0;
+ uint32_t connected_queue_index = 0;
+ uint32_t queue_status = 0;
+ uint32_t connected_queue_index_p1 = 0;
+ uint32_t queue_status_p1 = 0;
int pipe;
memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
@@ -898,6 +904,51 @@ static int mes_v12_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
mes_reset_queue_pkt.doorbell_offset_addr =
mes->hung_queue_db_array_gpu_addr;
+ /* Add VMID detection for GFX queues */
+ if (input->queue_type == AMDGPU_RING_TYPE_GFX) {
+ uint32_t cp_cntx_stat = RREG32_SOC15(GC, 0, regCP_CNTX_STAT);
+ uint32_t cp_vmid, grbm_gfx_cntl;
+
+ /* Check active contexts in CP_CNTX_STAT */
+ for (uint32_t i = 0; i < 8; i++) {
+ if ((cp_cntx_stat >> (0x14 + i)) & 0x1) {
+ grbm_gfx_cntl = (i << 11);
+ WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, grbm_gfx_cntl);
+ cp_vmid = RREG32_SOC15(GC, 0, regCP_CP_VMID);
+ active_vmids |= (1 << cp_vmid);
+ }
+ }
+
+ /* Fallback to HPD status if no active VMIDs found */
+ if (active_vmids == 0) {
+ uint32_t hpd_status;
+
+ /* Pipe 0 */
+ WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, 0);
+ hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
+ queue_status = hpd_status & 0x1F;
+ connected_queue_index = (hpd_status & 0xE0) >> 5;
+
+ /* Pipe 1 */
+ WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, (1 << 6));
+ hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
+ queue_status_p1 = hpd_status & 0x1F;
+ connected_queue_index_p1 = (hpd_status & 0xE0) >> 5;
+ }
+
+ mes_reset_queue_pkt.active_vmids = active_vmids;
+ if (active_vmids == 0) {
+ if (queue_status != 0) {
+ mes_reset_queue_pkt.use_connected_queue_index = 1;
+ mes_reset_queue_pkt.connected_queue_index = connected_queue_index;
+ }
+ if (queue_status_p1 != 0) {
+ mes_reset_queue_pkt.use_connected_queue_index_p1 = 1;
+ mes_reset_queue_pkt.connected_queue_index_p1 = connected_queue_index_p1;
+ }
+ }
+ }
+
if (input->detect_only)
mes_reset_queue_pkt.hang_detect_only = 1;
else
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 06/13] drm/amdgpu/userq: add a detect and reset callback
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
` (3 preceding siblings ...)
2025-08-04 8:38 ` [v6 05/13] drm/amdgpu: Implement active VMID detection in MES12 " Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 8:38 ` [v6 07/13] drm/amd: Add preempt and restore callbacks to userq funcs Jesse.Zhang
` (6 subsequent siblings)
11 siblings, 0 replies; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang, Alex Deucher
Add a detect and reset callback and add the implementation
for mes. The callback will detect all hung queues of a
particular ip type (e.g., GFX or compute or SDMA) and
reset them.
v2: increase reset counter and set fence force completion
v3: Removed userq_mutex from mes_userq_detect_and_reset since the caller already holds it
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 3 ++
drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 49 ++++++++++++++++++++++
2 files changed, 52 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index ec040c2fd6c9..0335ff03f65f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -77,6 +77,9 @@ struct amdgpu_userq_funcs {
struct amdgpu_usermode_queue *queue);
int (*map)(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue);
+ int (*detect_and_reset)(struct amdgpu_device *adev,
+ int queue_type);
+
};
/* Usermode queues for gfx */
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index d6f50b13e2ba..a871bac71e1e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -21,6 +21,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
+#include <drm/drm_drv.h>
#include "amdgpu.h"
#include "amdgpu_gfx.h"
#include "mes_userqueue.h"
@@ -198,6 +199,53 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
return 0;
}
+static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
+ int queue_type)
+{
+ int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
+ struct mes_detect_and_reset_queue_input input;
+ struct amdgpu_usermode_queue *queue;
+ struct amdgpu_userq_mgr *uqm, *tmp;
+ unsigned int hung_db_num = 0;
+ int queue_id, r, i;
+ u32 db_array[4];
+
+ if (db_array_size > 4) {
+ dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
+ db_array_size);
+ return -EINVAL;
+ }
+
+ memset(&input, 0x0, sizeof(struct mes_detect_and_reset_queue_input));
+
+ input.queue_type = queue_type;
+
+ amdgpu_mes_lock(&adev->mes);
+ r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false,
+ &hung_db_num, db_array);
+ amdgpu_mes_unlock(&adev->mes);
+ if (r) {
+ dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
+ } else if (hung_db_num) {
+ list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+ idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+ if (queue->queue_type == queue_type) {
+ for (i = 0; i < hung_db_num; i++) {
+ if (queue->doorbell_index == db_array[i]) {
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ atomic_inc(&adev->gpu_reset_counter);
+ amdgpu_userq_fence_driver_force_completion(queue);
+ drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return r;
+}
+
static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
struct drm_amdgpu_userq_in *args_in,
struct amdgpu_usermode_queue *queue)
@@ -352,4 +400,5 @@ const struct amdgpu_userq_funcs userq_mes_funcs = {
.mqd_destroy = mes_userq_mqd_destroy,
.unmap = mes_userq_unmap,
.map = mes_userq_map,
+ .detect_and_reset = mes_userq_detect_and_reset,
};
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 07/13] drm/amd: Add preempt and restore callbacks to userq funcs
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
` (4 preceding siblings ...)
2025-08-04 8:38 ` [v6 06/13] drm/amdgpu/userq: add a detect and reset callback Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 8:38 ` [v6 08/13] drm/amdgpu: adjust MES API used for suspend and resume Jesse.Zhang
` (5 subsequent siblings)
11 siblings, 0 replies; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Alex Deucher
From: Alex Deucher <alexander.deucher@amd.com>
Add two new function pointers to struct amdgpu_userq_funcs:
- preempt: To handle preemption of user mode queues
- restore: To restore preempted user mode queues
These callbacks will allow the driver to properly manage queue
preemption and restoration when needed, such as during context
switching or priority changes.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 0335ff03f65f..68e46d01bed2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -77,6 +77,10 @@ struct amdgpu_userq_funcs {
struct amdgpu_usermode_queue *queue);
int (*map)(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue);
+ int (*preempt)(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue);
+ int (*restore)(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue);
int (*detect_and_reset)(struct amdgpu_device *adev,
int queue_type);
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 08/13] drm/amdgpu: adjust MES API used for suspend and resume
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
` (5 preceding siblings ...)
2025-08-04 8:38 ` [v6 07/13] drm/amd: Add preempt and restore callbacks to userq funcs Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 17:16 ` Alex Deucher
2025-08-04 8:38 ` [v6 09/13] drm/amd/amdgpu: Implement MES suspend/resume gang functionality for v12 Jesse.Zhang
` (4 subsequent siblings)
11 siblings, 1 reply; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang, Alex Deucher
Use the suspend and resume API rather than the remove queue
and add queue API. The former just preempts the queue,
while the latter removes it from the scheduler completely.
There is no need to do that; we only need preemption
in this case.
V2: replace queue_active with queue state
v3: set the suspend_fence_addr
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 51 ++++++++++++++++++++++
1 file changed, 51 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index a871bac71e1e..8934d7113d58 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -395,10 +395,61 @@ mes_userq_mqd_destroy(struct amdgpu_userq_mgr *uq_mgr,
amdgpu_userq_destroy_object(uq_mgr, &queue->mqd);
}
+static int mes_userq_preempt(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ struct mes_suspend_gang_input queue_input;
+ struct amdgpu_userq_obj *ctx = &queue->fw_obj;
+ int r;
+
+ if (queue->state != AMDGPU_USERQ_STATE_MAPPED)
+ return 0;
+ if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)
+ return 0;
+
+ memset(&queue_input, 0x0, sizeof(struct mes_suspend_gang_input));
+ queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
+ queue_input.suspend_fence_addr = queue->fence_drv->gpu_addr;
+
+ amdgpu_mes_lock(&adev->mes);
+ r = adev->mes.funcs->suspend_gang(&adev->mes, &queue_input);
+ amdgpu_mes_unlock(&adev->mes);
+ if (r)
+ dev_err(adev->dev, "Failed to suspend queue, err (%d)\n", r);
+ return r;
+}
+
+static int mes_userq_restore(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ struct mes_resume_gang_input queue_input;
+ struct amdgpu_userq_obj *ctx = &queue->fw_obj;
+ int r;
+
+ if (queue->state == AMDGPU_USERQ_STATE_HUNG)
+ return -EINVAL;
+ if (queue->state != AMDGPU_USERQ_STATE_PREEMPTED)
+ return 0;
+
+ memset(&queue_input, 0x0, sizeof(struct mes_resume_gang_input));
+ queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
+
+ amdgpu_mes_lock(&adev->mes);
+ r = adev->mes.funcs->resume_gang(&adev->mes, &queue_input);
+ amdgpu_mes_unlock(&adev->mes);
+ if (r)
+ dev_err(adev->dev, "Failed to resume queue, err (%d)\n", r);
+ return r;
+ }
+
const struct amdgpu_userq_funcs userq_mes_funcs = {
.mqd_create = mes_userq_mqd_create,
.mqd_destroy = mes_userq_mqd_destroy,
.unmap = mes_userq_unmap,
.map = mes_userq_map,
+ .preempt = mes_userq_preempt,
+ .restore = mes_userq_restore,
.detect_and_reset = mes_userq_detect_and_reset,
};
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 09/13] drm/amd/amdgpu: Implement MES suspend/resume gang functionality for v12
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
` (6 preceding siblings ...)
2025-08-04 8:38 ` [v6 08/13] drm/amdgpu: adjust MES API used for suspend and resume Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 17:19 ` Alex Deucher
2025-08-04 8:38 ` [v6 10/13] drm/amdgpu: add user queue reset source Jesse.Zhang
` (3 subsequent siblings)
11 siblings, 1 reply; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang
This commit implements the actual MES (Micro Engine Scheduler) suspend
and resume gang operations for version 12 hardware. Previously these
functions were just stubs returning success.
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 44 ++++++++++++++++++++++++--
1 file changed, 42 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 579720695e9e..9c86dfdef1bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -569,13 +569,53 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes,
static int mes_v12_0_suspend_gang(struct amdgpu_mes *mes,
struct mes_suspend_gang_input *input)
{
- return 0;
+ union MESAPI__SUSPEND mes_suspend_gang_pkt;
+ int pipe;
+
+ memset(&mes_suspend_gang_pkt, 0, sizeof(mes_suspend_gang_pkt));
+
+ mes_suspend_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
+ mes_suspend_gang_pkt.header.opcode = MES_SCH_API_SUSPEND;
+ mes_suspend_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
+
+ mes_suspend_gang_pkt.suspend_all_gangs = input->suspend_all_gangs;
+ mes_suspend_gang_pkt.gang_context_addr = input->gang_context_addr;
+ mes_suspend_gang_pkt.suspend_fence_addr = input->suspend_fence_addr;
+ mes_suspend_gang_pkt.suspend_fence_value = input->suspend_fence_value;
+
+ if (mes->adev->enable_uni_mes)
+ pipe = AMDGPU_MES_KIQ_PIPE;
+ else
+ pipe = AMDGPU_MES_SCHED_PIPE;
+
+ return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
+ &mes_suspend_gang_pkt, sizeof(mes_suspend_gang_pkt),
+ offsetof(union MESAPI__SUSPEND, api_status));
}
static int mes_v12_0_resume_gang(struct amdgpu_mes *mes,
struct mes_resume_gang_input *input)
{
- return 0;
+ union MESAPI__RESUME mes_resume_gang_pkt;
+ int pipe;
+
+ memset(&mes_resume_gang_pkt, 0, sizeof(mes_resume_gang_pkt));
+
+ mes_resume_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
+ mes_resume_gang_pkt.header.opcode = MES_SCH_API_RESUME;
+ mes_resume_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
+
+ mes_resume_gang_pkt.resume_all_gangs = input->resume_all_gangs;
+ mes_resume_gang_pkt.gang_context_addr = input->gang_context_addr;
+
+ if (mes->adev->enable_uni_mes)
+ pipe = AMDGPU_MES_KIQ_PIPE;
+ else
+ pipe = AMDGPU_MES_SCHED_PIPE;
+
+ return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
+ &mes_resume_gang_pkt, sizeof(mes_resume_gang_pkt),
+ offsetof(union MESAPI__RESUME, api_status));
}
static int mes_v12_0_query_sched_status(struct amdgpu_mes *mes, int pipe)
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 10/13] drm/amdgpu: add user queue reset source
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
` (7 preceding siblings ...)
2025-08-04 8:38 ` [v6 09/13] drm/amd/amdgpu: Implement MES suspend/resume gang functionality for v12 Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 8:38 ` [v6 11/13] drm/amdgpu/userq: add force completion helpers Jesse.Zhang
` (2 subsequent siblings)
11 siblings, 0 replies; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Alex Deucher
From: Alex Deucher <alexander.deucher@amd.com>
Track resets from user queues.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 +
2 files changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index dabfbdf6f1ce..28c4ad62f50e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -340,6 +340,9 @@ void amdgpu_reset_get_desc(struct amdgpu_reset_context *rst_ctxt, char *buf,
case AMDGPU_RESET_SRC_USER:
strscpy(buf, "user trigger", len);
break;
+ case AMDGPU_RESET_SRC_USERQ:
+ strscpy(buf, "user queue trigger", len);
+ break;
default:
strscpy(buf, "unknown", len);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 4d9b9701139b..ebcea44dd743 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -43,6 +43,7 @@ enum AMDGPU_RESET_SRCS {
AMDGPU_RESET_SRC_MES,
AMDGPU_RESET_SRC_HWS,
AMDGPU_RESET_SRC_USER,
+ AMDGPU_RESET_SRC_USERQ,
};
struct amdgpu_reset_context {
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 11/13] drm/amdgpu/userq: add force completion helpers
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
` (8 preceding siblings ...)
2025-08-04 8:38 ` [v6 10/13] drm/amdgpu: add user queue reset source Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 8:38 ` [v6 12/13] drm/amdgpu: Add GPU reset handling for user mode queues Jesse.Zhang
2025-08-04 8:38 ` [v6 13/13] drm/amdgpu: Implement queue preemption using suspend/resume API Jesse.Zhang
11 siblings, 0 replies; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Alex Deucher
From: Alex Deucher <alexander.deucher@amd.com>
Add support for forcing completion of userq fences.
This is needed for userq resets and asic resets so that we
can set the error on the fence and force completion.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
.../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 42 +++++++++++++++++++
.../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 1 +
2 files changed, 43 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index a86616c6deef..01688bbf3f56 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -67,6 +67,14 @@ static u64 amdgpu_userq_fence_read(struct amdgpu_userq_fence_driver *fence_drv)
return le64_to_cpu(*fence_drv->cpu_addr);
}
+static void
+amdgpu_userq_fence_write(struct amdgpu_userq_fence_driver *fence_drv,
+ u64 seq)
+{
+ if (fence_drv->cpu_addr)
+ *fence_drv->cpu_addr = cpu_to_le64(seq);
+}
+
int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev,
struct amdgpu_usermode_queue *userq)
{
@@ -409,6 +417,40 @@ static void amdgpu_userq_fence_cleanup(struct dma_fence *fence)
dma_fence_put(fence);
}
+static void
+amdgpu_userq_fence_driver_set_error(struct amdgpu_userq_fence *fence,
+ int error)
+{
+ struct amdgpu_userq_fence_driver *fence_drv = fence->fence_drv;
+ unsigned long flags;
+ struct dma_fence *f;
+
+ spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
+
+ f = rcu_dereference_protected(&fence->base,
+ lockdep_is_held(&fence_drv->fence_list_lock));
+ if (f && !dma_fence_is_signaled_locked(f))
+ dma_fence_set_error(f, error);
+ spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
+}
+
+void
+amdgpu_userq_fence_driver_force_completion(struct amdgpu_usermode_queue *userq)
+{
+ struct dma_fence *f = userq->last_fence;
+
+ if (f) {
+ struct amdgpu_userq_fence *fence = to_amdgpu_userq_fence(f);
+ struct amdgpu_userq_fence_driver *fence_drv = fence->fence_drv;
+ u64 wptr = fence->base.seqno;
+
+ amdgpu_userq_fence_driver_set_error(fence, -ECANCELED);
+ amdgpu_userq_fence_write(fence_drv, wptr);
+ amdgpu_userq_fence_driver_process(fence_drv);
+
+ }
+}
+
int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
struct drm_file *filp)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h
index 97a125ab8a78..d76add2afc77 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h
@@ -67,6 +67,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev,
struct amdgpu_usermode_queue *userq);
void amdgpu_userq_fence_driver_free(struct amdgpu_usermode_queue *userq);
void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv);
+void amdgpu_userq_fence_driver_force_completion(struct amdgpu_usermode_queue *userq);
void amdgpu_userq_fence_driver_destroy(struct kref *ref);
int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
struct drm_file *filp);
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 12/13] drm/amdgpu: Add GPU reset handling for user mode queues
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
` (9 preceding siblings ...)
2025-08-04 8:38 ` [v6 11/13] drm/amdgpu/userq: add force completion helpers Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 8:38 ` [v6 13/13] drm/amdgpu: Implement queue preemption using suspend/resume API Jesse.Zhang
11 siblings, 0 replies; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang, Alex Deucher
This patch introduces GPU reset support for userq.
It adds detection and recovery logic for gfx, compute, and SDMA queues when errors occur during queue operations (e.g., map/unmap failures).
Key changes include:
- New function `amdgpu_userq_detect_and_reset_queues()` to scan active queues and attempt recovery via queue-specific handlers.
- Integration of userq reset logic into suspend/resume, eviction, and reset paths.
- New pre-reset and post-reset hooks (`amdgpu_userq_pre_reset()` and `amdgpu_userq_post_reset()`) for cleaning up and reinitializing user queues during full GPU reset.
- A new work_struct (`userq_reset_work`) is introduced and used in `amdgpu_userq_gpu_reset()` to trigger GPU recovery using the standard GPU reset framework.
- Improved robustness and logging during user queue unmap/map operations.
This enhances user queue resiliency and ensures graceful recovery in case of individual queue failure without immediately requiring full GPU reset.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 +
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 200 +++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 +
4 files changed, 203 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index ef3af170dda4..9db05cdc7304 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1302,6 +1302,7 @@ struct amdgpu_device {
struct list_head userq_mgr_list;
struct mutex userq_mutex;
bool userq_halt_for_enforce_isolation;
+ struct work_struct userq_reset_work;
};
static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3757634613c3..1dc88b0055dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4475,6 +4475,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+ INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
adev->gfx.gfx_off_req_count = 1;
adev->gfx.gfx_off_residency = 0;
@@ -5880,6 +5881,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
if (r)
goto out;
+ r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
+ if (r)
+ goto out;
+
drm_client_dev_resume(adev_to_drm(tmp_adev), false);
/*
@@ -6102,6 +6107,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
if (!amdgpu_sriov_vf(adev))
cancel_work(&adev->reset_work);
#endif
+ cancel_work(&adev->userq_reset_work);
if (adev->kfd.dev)
cancel_work(&adev->kfd.reset_work);
@@ -6232,6 +6238,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev);
+ amdgpu_userq_pre_reset(tmp_adev);
+
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index aac0de86f3e8..0c91302162fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -26,7 +26,10 @@
#include <drm/drm_exec.h>
#include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
+
#include "amdgpu.h"
+#include "amdgpu_reset.h"
#include "amdgpu_vm.h"
#include "amdgpu_userq.h"
#include "amdgpu_userq_fence.h"
@@ -44,6 +47,92 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
return userq_ip_mask;
}
+static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+
+ if (amdgpu_device_should_recover_gpu(adev)) {
+ amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->userq_reset_work);
+ /* Wait for the reset job to complete */
+ flush_work(&adev->userq_reset_work);
+ }
+}
+
+static int
+amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const struct amdgpu_userq_funcs *userq_gfx_funcs =
+ adev->userq_funcs[AMDGPU_RING_TYPE_GFX];
+ const struct amdgpu_userq_funcs *userq_compute_funcs =
+ adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE];
+ const struct amdgpu_userq_funcs *userq_sdma_funcs =
+ adev->userq_funcs[AMDGPU_RING_TYPE_SDMA];
+
+ bool has_gfx = false, has_compute = false, has_sdma = false;
+ struct amdgpu_usermode_queue *userq;
+ bool gpu_reset = false;
+ int gpu_suspend, id, r = 0;
+
+ if (idr_is_empty(&uq_mgr->userq_idr))
+ return false;
+
+ /* Detect which types of queues are present */
+ idr_for_each_entry(&uq_mgr->userq_idr, userq, id) {
+ switch (userq->queue_type) {
+ case AMDGPU_RING_TYPE_GFX:
+ has_gfx = true;
+ break;
+ case AMDGPU_RING_TYPE_COMPUTE:
+ has_compute = true;
+ break;
+ case AMDGPU_RING_TYPE_SDMA:
+ has_sdma = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+ dev_err(adev->dev, "userq reset disabled by debug mask\n");
+ } else if (amdgpu_gpu_recovery) {
+ if (has_compute && userq_compute_funcs->detect_and_reset) {
+ gpu_suspend = amdgpu_mes_suspend(adev);
+ r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
+ if (r) {
+ gpu_reset = true;
+ goto gpu_reset;
+ }
+ }
+
+ if (has_gfx && userq_gfx_funcs->detect_and_reset) {
+ r = userq_gfx_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_GFX);
+ if (r) {
+ gpu_reset = true;
+ goto gpu_reset;
+ }
+ }
+
+ if (has_sdma && userq_sdma_funcs && userq_sdma_funcs->detect_and_reset) {
+ r = userq_sdma_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_SDMA);
+ if (r) {
+ gpu_reset = true;
+ goto gpu_reset;
+ }
+ }
+ }
+
+gpu_reset:
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
+ if ((!gpu_suspend) && has_compute)
+ amdgpu_mes_resume(adev);
+
+ return r;
+}
+
static int
amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue)
@@ -51,15 +140,22 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
r = userq_funcs->unmap(uq_mgr, queue);
- if (r)
+ if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
- else
+ gpu_reset = true;
+ } else {
queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+ }
}
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
return r;
}
@@ -70,16 +166,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
r = userq_funcs->map(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
+ gpu_reset = true;
} else {
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
return r;
}
@@ -318,6 +420,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
amdgpu_bo_unreserve(queue->db_obj.obj);
}
amdgpu_bo_unref(&queue->db_obj.obj);
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
r = amdgpu_userq_unmap_helper(uq_mgr, queue);
amdgpu_userq_cleanup(uq_mgr, queue, queue_id);
mutex_unlock(&uq_mgr->userq_mutex);
@@ -691,6 +794,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
int queue_id;
int ret = 0, r;
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
/* Try to unmap all the queues in this process ctx */
idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
r = amdgpu_userq_unmap_helper(uq_mgr, queue);
@@ -703,6 +807,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
return ret;
}
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+ struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+ userq_reset_work);
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ reset_context.src = AMDGPU_RESET_SRC_USERQ;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
static int
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
@@ -729,22 +850,19 @@ void
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence)
{
- int ret;
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+ struct amdgpu_device *adev = uq_mgr->adev;
+ int ret;
/* Wait for any pending userqueue fence work to finish */
ret = amdgpu_userq_wait_for_signal(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
ret = amdgpu_userq_evict_all(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Failed to evict userqueue\n");
/* Signal current eviction fence */
amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -785,6 +903,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
mutex_lock(&adev->userq_mutex);
mutex_lock(&userq_mgr->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(userq_mgr);
idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
amdgpu_userq_unmap_helper(userq_mgr, queue);
@@ -818,6 +937,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
cancel_delayed_work_sync(&uqm->resume_work);
mutex_lock(&uqm->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(uqm);
idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
r = amdgpu_userq_unmap_helper(uqm, queue);
if (r)
@@ -874,6 +994,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
cancel_delayed_work_sync(&uqm->resume_work);
mutex_lock(&uqm->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(uqm);
idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
(queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
@@ -922,3 +1043,60 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
mutex_unlock(&adev->userq_mutex);
return ret;
}
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+ const struct amdgpu_userq_funcs *userq_funcs;
+ struct amdgpu_usermode_queue *queue;
+ struct amdgpu_userq_mgr *uqm, *tmp;
+ int queue_id;
+
+ list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+ cancel_delayed_work_sync(&uqm->resume_work);
+ idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+ if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ amdgpu_userq_wait_for_last_fence(uqm, queue);
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+ userq_funcs->unmap(uqm, queue);
+ /* just mark all queues as hung at this point.
+ * if unmap succeeds, we could map again
+ * in amdgpu_userq_post_reset() if vram is not lost
+ */
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ amdgpu_userq_fence_driver_force_completion(queue);
+ }
+ }
+ }
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+ /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
+ * at this point, we should be able to map it again
+ * and continue if vram is not lost.
+ */
+ struct amdgpu_userq_mgr *uqm;
+ struct amdgpu_usermode_queue *queue;
+ const struct amdgpu_userq_funcs *userq_funcs;
+ int queue_id, r = 0;
+
+ list_for_each_entry(uqm, &adev->userq_mgr_list, list) {
+ idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+ if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+
+ r = userq_funcs->map(uqm, queue); // Re-map queue
+ if (r) {
+ dev_err(adev->dev, "Failed to remap queue %d\n", queue_id);
+ continue;
+ }
+ queue->state = AMDGPU_USERQ_STATE_MAPPED;
+ }
+ }
+
+ /* Restart resume work after reset */
+ //queue_delayed_work(system_wq, &uqm->resume_work, msecs_to_jiffies(100));
+ }
+
+ return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 68e46d01bed2..c136d7e7a763 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -139,4 +139,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
u32 idx);
+void amdgpu_userq_reset_work(struct work_struct *work);
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+
#endif
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* [v6 13/13] drm/amdgpu: Implement queue preemption using suspend/resume API
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
` (10 preceding siblings ...)
2025-08-04 8:38 ` [v6 12/13] drm/amdgpu: Add GPU reset handling for user mode queues Jesse.Zhang
@ 2025-08-04 8:38 ` Jesse.Zhang
2025-08-04 17:22 ` Alex Deucher
11 siblings, 1 reply; 22+ messages in thread
From: Jesse.Zhang @ 2025-08-04 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse.Zhang, Alex Deucher
Replace the queue remove/add approach with suspend/resume semantics
for user queue preemption. This change:
1. Maintains queue scheduling registration while only preempting execution
- Previously used remove_queue/add_queue would fully deregister queues
- New suspend/resume approach keeps scheduler state while preempting
2. Introduces proper preemption helpers:
- amdgpu_userqueue_preempt_helper(): Suspends queue execution
- Transitions MAPPED→PREEMPTED state on success
- Marks as HUNG and triggers reset on failure
- amdgpu_userqueue_restore_helper(): Resumes queue execution
- Transitions PREEMPTED→MAPPED state on success
- Triggers GPU reset on failure
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 64 +++++++++++++++++++----
1 file changed, 53 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 0c91302162fa..3a8da1f47159 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -72,7 +72,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
bool has_gfx = false, has_compute = false, has_sdma = false;
struct amdgpu_usermode_queue *userq;
bool gpu_reset = false;
- int gpu_suspend, id, r = 0;
+ int id, r = 0;
if (idr_is_empty(&uq_mgr->userq_idr))
return false;
@@ -98,7 +98,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
dev_err(adev->dev, "userq reset disabled by debug mask\n");
} else if (amdgpu_gpu_recovery) {
if (has_compute && userq_compute_funcs->detect_and_reset) {
- gpu_suspend = amdgpu_mes_suspend(adev);
r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
if (r) {
gpu_reset = true;
@@ -127,9 +126,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
if (gpu_reset)
amdgpu_userq_gpu_reset(adev);
- if ((!gpu_suspend) && has_compute)
- amdgpu_mes_resume(adev);
-
return r;
}
@@ -143,7 +139,8 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
bool gpu_reset = false;
int r = 0;
- if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
+ (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
r = userq_funcs->unmap(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
@@ -185,6 +182,54 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
return r;
}
+static int
+amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const struct amdgpu_userq_funcs *userq_funcs =
+ adev->userq_funcs[queue->queue_type];
+ int r = 0;
+
+ if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ r = userq_funcs->preempt(uq_mgr, queue);
+ if (r) {
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ } else {
+ queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
+ }
+ }
+
+ return r;
+}
+
+static int
+amdgpu_userqueue_restore_helper(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const struct amdgpu_userq_funcs *userq_funcs =
+ adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
+ int r = 0;
+
+ if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
+ r = userq_funcs->restore(uq_mgr, queue);
+ if (r) {
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ gpu_reset = true;
+ } else {
+ queue->state = AMDGPU_USERQ_STATE_MAPPED;
+ }
+ }
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
+ return r;
+}
+
static void
amdgpu_userq_wait_for_last_fence(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue)
@@ -639,7 +684,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
/* Resume all the queues for this process */
idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
- r = amdgpu_userq_map_helper(uq_mgr, queue);
+ r = amdgpu_userqueue_restore_helper(uq_mgr, queue);
if (r)
ret = r;
}
@@ -794,10 +839,9 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
int queue_id;
int ret = 0, r;
- amdgpu_userq_detect_and_reset_queues(uq_mgr);
/* Try to unmap all the queues in this process ctx */
idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
- r = amdgpu_userq_unmap_helper(uq_mgr, queue);
+ r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
if (r)
ret = r;
}
@@ -900,7 +944,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
uint32_t queue_id;
cancel_delayed_work_sync(&userq_mgr->resume_work);
-
mutex_lock(&adev->userq_mutex);
mutex_lock(&userq_mgr->userq_mutex);
amdgpu_userq_detect_and_reset_queues(userq_mgr);
@@ -909,7 +952,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
amdgpu_userq_unmap_helper(userq_mgr, queue);
amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
}
-
list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
if (uqm == userq_mgr) {
list_del(&uqm->list);
--
2.49.0
^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [v6 03/13] drm/amdgpu/mes12: implement detect and reset callback
2025-08-04 8:38 ` [v6 03/13] drm/amdgpu/mes12: " Jesse.Zhang
@ 2025-08-04 17:00 ` Alex Deucher
2025-08-05 6:13 ` Zhang, Jesse(Jie)
0 siblings, 1 reply; 22+ messages in thread
From: Alex Deucher @ 2025-08-04 17:00 UTC (permalink / raw)
To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig
On Mon, Aug 4, 2025 at 4:53 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> From: Alex Deucher <alexander.deucher@amd.com>
>
> Implement support for the hung queue detect and reset
> functionality.
>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 37 ++++++++++++++++++++++++++
> 1 file changed, 37 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 6b222630f3fa..29d38aa1897e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -47,6 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev);
>
> #define MES_EOP_SIZE 2048
>
> +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4
> +
> static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring)
> {
> struct amdgpu_device *adev = ring->adev;
> @@ -879,6 +881,38 @@ static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes,
> offsetof(union MESAPI__RESET, api_status));
> }
>
> +static int mes_v12_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> + struct mes_detect_and_reset_queue_input *input)
> +{
> + union MESAPI__RESET mes_reset_queue_pkt;
> + int pipe;
> +
> + memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
> +
> + mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
> + mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET;
> + mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> + mes_reset_queue_pkt.queue_type =
> + convert_to_mes_queue_type(input->queue_type);
> + mes_reset_queue_pkt.doorbell_offset_addr =
> + mes->hung_queue_db_array_gpu_addr;
> +
> + if (input->detect_only)
> + mes_reset_queue_pkt.hang_detect_only = 1;
> + else
> + mes_reset_queue_pkt.hang_detect_then_reset = 1;
> +
> + if (mes->adev->enable_uni_mes)
> + pipe = AMDGPU_MES_KIQ_PIPE;
> + else
> + pipe = AMDGPU_MES_SCHED_PIPE;
I think this should probably always be AMDGPU_MES_SCHED_PIPE. Setting
this may fix the issues you were seeing on gfx12.
Alex
> +
> + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> + &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
> + offsetof(union MESAPI__RESET, api_status));
> +}
> +
> static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
> .add_hw_queue = mes_v12_0_add_hw_queue,
> .remove_hw_queue = mes_v12_0_remove_hw_queue,
> @@ -888,6 +922,7 @@ static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
> .resume_gang = mes_v12_0_resume_gang,
> .misc_op = mes_v12_0_misc_op,
> .reset_hw_queue = mes_v12_0_reset_hw_queue,
> + .detect_and_reset_hung_queues = mes_v12_0_detect_and_reset_hung_queues,
> };
>
> static int mes_v12_0_allocate_ucode_buffer(struct amdgpu_device *adev,
> @@ -1793,6 +1828,8 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block)
> struct amdgpu_device *adev = ip_block->adev;
> int pipe, r;
>
> + adev->mes.hung_queue_db_array_size =
> + MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
> for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
> r = amdgpu_mes_init_microcode(adev, pipe);
> if (r)
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX
2025-08-04 8:38 ` [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX Jesse.Zhang
@ 2025-08-04 17:03 ` Alex Deucher
2025-08-05 6:52 ` Zhang, Jesse(Jie)
2025-08-04 17:04 ` Alex Deucher
1 sibling, 1 reply; 22+ messages in thread
From: Alex Deucher @ 2025-08-04 17:03 UTC (permalink / raw)
To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig, kyle-hai.chau
On Mon, Aug 4, 2025 at 4:48 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> MES queue reset functionality for GFX queues. The changes include:
>
> 1. Added detection of active VMIDs by reading CP_CNTX_STAT and CP_VMID
> registers to properly identify contexts that need resetting
>
> 2. Implemented fallback to HPD status method when no active VMIDs are
> found, checking both pipe 0 and pipe 1 queues
>
> 3. Extended the MES reset packet with:
> - active_vmids bitmap
> - connected_queue_index for pipe 0
> - connected_queue_index_p1 for pipe 1
>
Do we still need this if we switch over to suspending queues before we
run detect and reset?
Alex
> Suggested-by: kyle-hai.chau <kyle-hai.chau@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 51 +++++++++++++++++++
> drivers/gpu/drm/amd/include/mes_v11_api_def.h | 13 ++++-
> 2 files changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index ed6a7f8af544..1422bc59cd40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -789,6 +789,12 @@ static int mes_v11_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> struct mes_detect_and_reset_queue_input *input)
> {
> union MESAPI__RESET mes_reset_queue_pkt;
> + struct amdgpu_device *adev = mes->adev;
> + uint32_t active_vmids = 0;
> + uint32_t connected_queue_index = 0;
> + uint32_t queue_status = 0;
> + uint32_t connected_queue_index_p1 = 0;
> + uint32_t queue_status_p1 = 0;
>
> memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
>
> @@ -801,6 +807,51 @@ static int mes_v11_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> mes_reset_queue_pkt.doorbell_offset_addr =
> mes->hung_queue_db_array_gpu_addr;
>
> + /* Add VMID detection for GFX queues */
> + if (input->queue_type == AMDGPU_RING_TYPE_GFX) {
> + uint32_t cp_cntx_stat = RREG32_SOC15(GC, 0, regCP_CNTX_STAT);
> + uint32_t cp_vmid, grbm_gfx_cntl;
> +
> + /* Check active contexts in CP_CNTX_STAT */
> + for (uint32_t i = 0; i < 8; i++) {
> + if ((cp_cntx_stat >> (0x14 + i)) & 0x1) {
> + grbm_gfx_cntl = (i << 11);
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, grbm_gfx_cntl);
> + cp_vmid = RREG32_SOC15(GC, 0, regCP_VMID);
> + active_vmids |= (1 << cp_vmid);
> + }
> + }
> +
> + /* Fallback to HPD status if no active VMIDs found */
> + if (active_vmids == 0) {
> + uint32_t hpd_status;
> +
> + /* Pipe 0 */
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, 0);
> + hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
> + queue_status = hpd_status & 0x1F;
> + connected_queue_index = (hpd_status & 0xE0) >> 5;
> +
> + /* Pipe 1 */
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, (1 << 6));
> + hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
> + queue_status_p1 = hpd_status & 0x1F;
> + connected_queue_index_p1 = (hpd_status & 0xE0) >> 5;
> + }
> +
> + mes_reset_queue_pkt.active_vmids = active_vmids;
> + if (active_vmids == 0) {
> + if (queue_status != 0) {
> + mes_reset_queue_pkt.use_connected_queue_index = 1;
> + mes_reset_queue_pkt.connected_queue_index = connected_queue_index;
> + }
> + if (queue_status_p1 != 0) {
> + mes_reset_queue_pkt.use_connected_queue_index_p1 = 1;
> + mes_reset_queue_pkt.connected_queue_index_p1 = connected_queue_index_p1;
> + }
> + }
> + }
> +
> if (input->detect_only)
> mes_reset_queue_pkt.hang_detect_only = 1;
> else
> diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> index 15680c3f4970..62ad4f0337eb 100644
> --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> @@ -460,7 +460,11 @@ union MESAPI__RESET {
> uint32_t hang_detect_only : 1;
> /* Rest HP and LP kernel queues not managed by MES */
> uint32_t reset_legacy_gfx : 1;
> - uint32_t reserved : 28;
> + /* Fallback to use conneceted queue index when CP_CNTX_STAT method fails (gfx pipe 0) */
> + uint32_t use_connected_queue_index : 1;
> + /* For gfx pipe 1 */
> + uint32_t use_connected_queue_index_p1 : 1;
> + uint32_t reserved : 26;
> };
>
> uint64_t gang_context_addr;
> @@ -488,6 +492,13 @@ union MESAPI__RESET {
> uint64_t wptr_addr_hp;
>
> struct MES_API_STATUS api_status;
> + uint32_t active_vmids;
> + uint64_t timestamp;
> +
> + uint32_t gang_context_array_index;
> +
> + uint32_t connected_queue_index;
> + uint32_t connected_queue_index_p1;
> };
>
> uint32_t max_dwords_in_api[API_FRAME_SIZE_IN_DWORDS];
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX
2025-08-04 8:38 ` [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX Jesse.Zhang
2025-08-04 17:03 ` Alex Deucher
@ 2025-08-04 17:04 ` Alex Deucher
1 sibling, 0 replies; 22+ messages in thread
From: Alex Deucher @ 2025-08-04 17:04 UTC (permalink / raw)
To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig, kyle-hai.chau
On Mon, Aug 4, 2025 at 4:48 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> MES queue reset functionality for GFX queues. The changes include:
>
> 1. Added detection of active VMIDs by reading CP_CNTX_STAT and CP_VMID
> registers to properly identify contexts that need resetting
>
> 2. Implemented fallback to HPD status method when no active VMIDs are
> found, checking both pipe 0 and pipe 1 queues
>
> 3. Extended the MES reset packet with:
> - active_vmids bitmap
> - connected_queue_index for pipe 0
> - connected_queue_index_p1 for pipe 1
>
> Suggested-by: kyle-hai.chau <kyle-hai.chau@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 51 +++++++++++++++++++
> drivers/gpu/drm/amd/include/mes_v11_api_def.h | 13 ++++-
> 2 files changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index ed6a7f8af544..1422bc59cd40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -789,6 +789,12 @@ static int mes_v11_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> struct mes_detect_and_reset_queue_input *input)
> {
> union MESAPI__RESET mes_reset_queue_pkt;
> + struct amdgpu_device *adev = mes->adev;
> + uint32_t active_vmids = 0;
> + uint32_t connected_queue_index = 0;
> + uint32_t queue_status = 0;
> + uint32_t connected_queue_index_p1 = 0;
> + uint32_t queue_status_p1 = 0;
>
> memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
>
> @@ -801,6 +807,51 @@ static int mes_v11_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> mes_reset_queue_pkt.doorbell_offset_addr =
> mes->hung_queue_db_array_gpu_addr;
>
> + /* Add VMID detection for GFX queues */
> + if (input->queue_type == AMDGPU_RING_TYPE_GFX) {
> + uint32_t cp_cntx_stat = RREG32_SOC15(GC, 0, regCP_CNTX_STAT);
> + uint32_t cp_vmid, grbm_gfx_cntl;
> +
> + /* Check active contexts in CP_CNTX_STAT */
> + for (uint32_t i = 0; i < 8; i++) {
> + if ((cp_cntx_stat >> (0x14 + i)) & 0x1) {
> + grbm_gfx_cntl = (i << 11);
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, grbm_gfx_cntl);
> + cp_vmid = RREG32_SOC15(GC, 0, regCP_VMID);
> + active_vmids |= (1 << cp_vmid);
> + }
> + }
> +
> + /* Fallback to HPD status if no active VMIDs found */
> + if (active_vmids == 0) {
> + uint32_t hpd_status;
> +
> + /* Pipe 0 */
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, 0);
> + hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
> + queue_status = hpd_status & 0x1F;
> + connected_queue_index = (hpd_status & 0xE0) >> 5;
> +
> + /* Pipe 1 */
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, (1 << 6));
> + hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
> + queue_status_p1 = hpd_status & 0x1F;
> + connected_queue_index_p1 = (hpd_status & 0xE0) >> 5;
> + }
> +
> + mes_reset_queue_pkt.active_vmids = active_vmids;
> + if (active_vmids == 0) {
> + if (queue_status != 0) {
> + mes_reset_queue_pkt.use_connected_queue_index = 1;
> + mes_reset_queue_pkt.connected_queue_index = connected_queue_index;
> + }
> + if (queue_status_p1 != 0) {
> + mes_reset_queue_pkt.use_connected_queue_index_p1 = 1;
> + mes_reset_queue_pkt.connected_queue_index_p1 = connected_queue_index_p1;
> + }
> + }
> + }
> +
> if (input->detect_only)
> mes_reset_queue_pkt.hang_detect_only = 1;
> else
> diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> index 15680c3f4970..62ad4f0337eb 100644
> --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> @@ -460,7 +460,11 @@ union MESAPI__RESET {
> uint32_t hang_detect_only : 1;
> /* Rest HP and LP kernel queues not managed by MES */
> uint32_t reset_legacy_gfx : 1;
> - uint32_t reserved : 28;
> + /* Fallback to use conneceted queue index when CP_CNTX_STAT method fails (gfx pipe 0) */
> + uint32_t use_connected_queue_index : 1;
> + /* For gfx pipe 1 */
> + uint32_t use_connected_queue_index_p1 : 1;
> + uint32_t reserved : 26;
> };
>
> uint64_t gang_context_addr;
> @@ -488,6 +492,13 @@ union MESAPI__RESET {
> uint64_t wptr_addr_hp;
>
> struct MES_API_STATUS api_status;
> + uint32_t active_vmids;
> + uint64_t timestamp;
> +
> + uint32_t gang_context_array_index;
> +
> + uint32_t connected_queue_index;
> + uint32_t connected_queue_index_p1;
We need a FW check so we know which version added this new API.
Alex
> };
>
> uint32_t max_dwords_in_api[API_FRAME_SIZE_IN_DWORDS];
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [v6 05/13] drm/amdgpu: Implement active VMID detection in MES12 queue reset for GFX
2025-08-04 8:38 ` [v6 05/13] drm/amdgpu: Implement active VMID detection in MES12 " Jesse.Zhang
@ 2025-08-04 17:05 ` Alex Deucher
0 siblings, 0 replies; 22+ messages in thread
From: Alex Deucher @ 2025-08-04 17:05 UTC (permalink / raw)
To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig, kyle-hai.chau
On Mon, Aug 4, 2025 at 4:48 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> MES queue reset functionality for GFX queues. The changes include:
>
> 1. Added detection of active VMIDs by reading CP_CNTX_STAT and CP_VMID
> registers to properly identify contexts that need resetting
>
> 2. Implemented fallback to HPD status method when no active VMIDs are
> found, checking both pipe 0 and pipe 1 queues
>
> 3. Extended the MES reset packet with:
> - active_vmids bitmap
> - connected_queue_index for pipe 0
> - connected_queue_index_p1 for pipe 1
>
Same comment as the gfx11 versions of these patches.
Alex
> Suggested-by: kyle-hai.chau <kyle-hai.chau@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 51 ++++++++++++++++++++++++++
> 1 file changed, 51 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 29d38aa1897e..579720695e9e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -885,6 +885,12 @@ static int mes_v12_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> struct mes_detect_and_reset_queue_input *input)
> {
> union MESAPI__RESET mes_reset_queue_pkt;
> + struct amdgpu_device *adev = mes->adev;
> + uint32_t active_vmids = 0;
> + uint32_t connected_queue_index = 0;
> + uint32_t queue_status = 0;
> + uint32_t connected_queue_index_p1 = 0;
> + uint32_t queue_status_p1 = 0;
> int pipe;
>
> memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
> @@ -898,6 +904,51 @@ static int mes_v12_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> mes_reset_queue_pkt.doorbell_offset_addr =
> mes->hung_queue_db_array_gpu_addr;
>
> + /* Add VMID detection for GFX queues */
> + if (input->queue_type == AMDGPU_RING_TYPE_GFX) {
> + uint32_t cp_cntx_stat = RREG32_SOC15(GC, 0, regCP_CNTX_STAT);
> + uint32_t cp_vmid, grbm_gfx_cntl;
> +
> + /* Check active contexts in CP_CNTX_STAT */
> + for (uint32_t i = 0; i < 8; i++) {
> + if ((cp_cntx_stat >> (0x14 + i)) & 0x1) {
> + grbm_gfx_cntl = (i << 11);
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, grbm_gfx_cntl);
> + cp_vmid = RREG32_SOC15(GC, 0, regCP_CP_VMID);
> + active_vmids |= (1 << cp_vmid);
> + }
> + }
> +
> + /* Fallback to HPD status if no active VMIDs found */
> + if (active_vmids == 0) {
> + uint32_t hpd_status;
> +
> + /* Pipe 0 */
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, 0);
> + hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
> + queue_status = hpd_status & 0x1F;
> + connected_queue_index = (hpd_status & 0xE0) >> 5;
> +
> + /* Pipe 1 */
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, (1 << 6));
> + hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
> + queue_status_p1 = hpd_status & 0x1F;
> + connected_queue_index_p1 = (hpd_status & 0xE0) >> 5;
> + }
> +
> + mes_reset_queue_pkt.active_vmids = active_vmids;
> + if (active_vmids == 0) {
> + if (queue_status != 0) {
> + mes_reset_queue_pkt.use_connected_queue_index = 1;
> + mes_reset_queue_pkt.connected_queue_index = connected_queue_index;
> + }
> + if (queue_status_p1 != 0) {
> + mes_reset_queue_pkt.use_connected_queue_index_p1 = 1;
> + mes_reset_queue_pkt.connected_queue_index_p1 = connected_queue_index_p1;
> + }
> + }
> + }
> +
> if (input->detect_only)
> mes_reset_queue_pkt.hang_detect_only = 1;
> else
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [v6 08/13] drm/amdgpu: adjust MES API used for suspend and resume
2025-08-04 8:38 ` [v6 08/13] drm/amdgpu: adjust MES API used for suspend and resume Jesse.Zhang
@ 2025-08-04 17:16 ` Alex Deucher
0 siblings, 0 replies; 22+ messages in thread
From: Alex Deucher @ 2025-08-04 17:16 UTC (permalink / raw)
To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig
On Mon, Aug 4, 2025 at 4:41 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> Use the suspend and resume API rather than remove queue
> and add queue API. The former just preempts the queue
> while the latter remove it from the scheduler completely.
> There is no need to do that, we only need preemption
> in this case.
>
> V2: replace queue_active with queue state
> v3: set the suspend_fence_addr
>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 51 ++++++++++++++++++++++
> 1 file changed, 51 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index a871bac71e1e..8934d7113d58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -395,10 +395,61 @@ mes_userq_mqd_destroy(struct amdgpu_userq_mgr *uq_mgr,
> amdgpu_userq_destroy_object(uq_mgr, &queue->mqd);
> }
>
> +static int mes_userq_preempt(struct amdgpu_userq_mgr *uq_mgr,
> + struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + struct mes_suspend_gang_input queue_input;
> + struct amdgpu_userq_obj *ctx = &queue->fw_obj;
> + int r;
> +
> + if (queue->state != AMDGPU_USERQ_STATE_MAPPED)
> + return 0;
> + if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)
> + return 0;
> +
> + memset(&queue_input, 0x0, sizeof(struct mes_suspend_gang_input));
> + queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
> + queue_input.suspend_fence_addr = queue->fence_drv->gpu_addr;
You need to allocate a separate buffer for this. You should also set
the fence value we are waiting for. The driver then needs to wait on
this fence location to make sure the suspend is complete. You can use
a wb allocation similar to the status stuff in
mes_v11_0_submit_pkt_and_poll_completion().
I think you also need to specify the doorbell offset for the queue you
want to target since this function is per queue.
Alex
> +
> + amdgpu_mes_lock(&adev->mes);
> + r = adev->mes.funcs->suspend_gang(&adev->mes, &queue_input);
> + amdgpu_mes_unlock(&adev->mes);
> + if (r)
> + dev_err(adev->dev, "Failed to suspend queue, err (%d)\n", r);
> + return r;
> +}
> +
> +static int mes_userq_restore(struct amdgpu_userq_mgr *uq_mgr,
> + struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + struct mes_resume_gang_input queue_input;
> + struct amdgpu_userq_obj *ctx = &queue->fw_obj;
> + int r;
> +
> + if (queue->state == AMDGPU_USERQ_STATE_HUNG)
> + return -EINVAL;
> + if (queue->state != AMDGPU_USERQ_STATE_PREEMPTED)
> + return 0;
> +
> + memset(&queue_input, 0x0, sizeof(struct mes_resume_gang_input));
> + queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
> +
> + amdgpu_mes_lock(&adev->mes);
> + r = adev->mes.funcs->resume_gang(&adev->mes, &queue_input);
> + amdgpu_mes_unlock(&adev->mes);
> + if (r)
> + dev_err(adev->dev, "Failed to resume queue, err (%d)\n", r);
> + return r;
> + }
> +
> const struct amdgpu_userq_funcs userq_mes_funcs = {
> .mqd_create = mes_userq_mqd_create,
> .mqd_destroy = mes_userq_mqd_destroy,
> .unmap = mes_userq_unmap,
> .map = mes_userq_map,
> + .preempt = mes_userq_preempt,
> + .restore = mes_userq_restore,
> .detect_and_reset = mes_userq_detect_and_reset,
> };
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [v6 09/13] drm/amd/amdgpu: Implement MES suspend/resume gang functionality for v12
2025-08-04 8:38 ` [v6 09/13] drm/amd/amdgpu: Implement MES suspend/resume gang functionality for v12 Jesse.Zhang
@ 2025-08-04 17:19 ` Alex Deucher
0 siblings, 0 replies; 22+ messages in thread
From: Alex Deucher @ 2025-08-04 17:19 UTC (permalink / raw)
To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig
On Mon, Aug 4, 2025 at 4:48 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> This commit implements the actual MES (Micro Engine Scheduler) suspend
> and resume gang operations for version 12 hardware. Previously these
> functions were just stubs returning success.
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 44 ++++++++++++++++++++++++--
> 1 file changed, 42 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 579720695e9e..9c86dfdef1bb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -569,13 +569,53 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes,
> static int mes_v12_0_suspend_gang(struct amdgpu_mes *mes,
> struct mes_suspend_gang_input *input)
> {
> - return 0;
> + union MESAPI__SUSPEND mes_suspend_gang_pkt;
> + int pipe;
> +
> + memset(&mes_suspend_gang_pkt, 0, sizeof(mes_suspend_gang_pkt));
> +
> + mes_suspend_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
> + mes_suspend_gang_pkt.header.opcode = MES_SCH_API_SUSPEND;
> + mes_suspend_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> + mes_suspend_gang_pkt.suspend_all_gangs = input->suspend_all_gangs;
> + mes_suspend_gang_pkt.gang_context_addr = input->gang_context_addr;
> + mes_suspend_gang_pkt.suspend_fence_addr = input->suspend_fence_addr;
> + mes_suspend_gang_pkt.suspend_fence_value = input->suspend_fence_value;
> +
> + if (mes->adev->enable_uni_mes)
> + pipe = AMDGPU_MES_KIQ_PIPE;
> + else
> + pipe = AMDGPU_MES_SCHED_PIPE;
I think this should probably be AMDGPU_MES_SCHED_PIPE always.
> +
> + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> + &mes_suspend_gang_pkt, sizeof(mes_suspend_gang_pkt),
> + offsetof(union MESAPI__SUSPEND, api_status));
> }
>
> static int mes_v12_0_resume_gang(struct amdgpu_mes *mes,
> struct mes_resume_gang_input *input)
> {
> - return 0;
> + union MESAPI__RESUME mes_resume_gang_pkt;
> + int pipe;
> +
> + memset(&mes_resume_gang_pkt, 0, sizeof(mes_resume_gang_pkt));
> +
> + mes_resume_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
> + mes_resume_gang_pkt.header.opcode = MES_SCH_API_RESUME;
> + mes_resume_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> + mes_resume_gang_pkt.resume_all_gangs = input->resume_all_gangs;
> + mes_resume_gang_pkt.gang_context_addr = input->gang_context_addr;
> +
> + if (mes->adev->enable_uni_mes)
> + pipe = AMDGPU_MES_KIQ_PIPE;
> + else
> + pipe = AMDGPU_MES_SCHED_PIPE;
I think this should probably be AMDGPU_MES_SCHED_PIPE always.
> +
> + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> + &mes_resume_gang_pkt, sizeof(mes_resume_gang_pkt),
> + offsetof(union MESAPI__RESUME, api_status));
> }
>
> static int mes_v12_0_query_sched_status(struct amdgpu_mes *mes, int pipe)
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [v6 13/13] drm/amdgpu: Implement queue preemption using suspend/resume API
2025-08-04 8:38 ` [v6 13/13] drm/amdgpu: Implement queue preemption using suspend/resume API Jesse.Zhang
@ 2025-08-04 17:22 ` Alex Deucher
0 siblings, 0 replies; 22+ messages in thread
From: Alex Deucher @ 2025-08-04 17:22 UTC (permalink / raw)
To: Jesse.Zhang; +Cc: amd-gfx, Alexander.Deucher, Christian Koenig
On Mon, Aug 4, 2025 at 4:41 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> Replace the queue remove/add approach with suspend/resume semantics
> for user queue preemption. This change:
>
> 1. Maintains queue scheduling registration while only preempting execution
> - Previously used remove_queue/add_queue would fully deregister queues
> - New suspend/resume approach keeps scheduler state while preempting
>
> 2. Introduces proper preemption helpers:
> - amdgpu_userqueue_preempt_helper(): Suspends queue execution
> - Transitions MAPPED→UNMAPPED state on success
> - Marks as HUNG and triggers reset on failure
> - amdgpu_userqueue_restore_helper(): Resumes queue execution
> - Transitions UNMAPPED→MAPPED state on success
> - Triggers GPU reset on failure
I would move the preempt/restore patches to the start of the series.
Use preempt/restore for all of the cases where we need to preempt the
queues and only use map/unmap for device init/fini and system
suspend/resume.
Alex
>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 64 +++++++++++++++++++----
> 1 file changed, 53 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 0c91302162fa..3a8da1f47159 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -72,7 +72,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> bool has_gfx = false, has_compute = false, has_sdma = false;
> struct amdgpu_usermode_queue *userq;
> bool gpu_reset = false;
> - int gpu_suspend, id, r = 0;
> + int id, r = 0;
>
> if (idr_is_empty(&uq_mgr->userq_idr))
> return false;
> @@ -98,7 +98,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> dev_err(adev->dev, "userq reset disabled by debug mask\n");
> } else if (amdgpu_gpu_recovery) {
> if (has_compute && userq_compute_funcs->detect_and_reset) {
> - gpu_suspend = amdgpu_mes_suspend(adev);
> r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
> if (r) {
> gpu_reset = true;
> @@ -127,9 +126,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> if (gpu_reset)
> amdgpu_userq_gpu_reset(adev);
>
> - if ((!gpu_suspend) && has_compute)
> - amdgpu_mes_resume(adev);
> -
> return r;
> }
>
> @@ -143,7 +139,8 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
> bool gpu_reset = false;
> int r = 0;
>
> - if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> + if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
> + (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
> r = userq_funcs->unmap(uq_mgr, queue);
> if (r) {
> queue->state = AMDGPU_USERQ_STATE_HUNG;
> @@ -185,6 +182,54 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
> return r;
> }
>
> +static int
> +amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
> + struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + const struct amdgpu_userq_funcs *userq_funcs =
> + adev->userq_funcs[queue->queue_type];
> + int r = 0;
> +
> + if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> + r = userq_funcs->preempt(uq_mgr, queue);
> + if (r) {
> + amdgpu_userq_detect_and_reset_queues(uq_mgr);
> + queue->state = AMDGPU_USERQ_STATE_HUNG;
> + } else {
> + queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
> + }
> + }
> +
> + return r;
> +}
> +
> +static int
> +amdgpu_userqueue_restore_helper(struct amdgpu_userq_mgr *uq_mgr,
> + struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + const struct amdgpu_userq_funcs *userq_funcs =
> + adev->userq_funcs[queue->queue_type];
> + bool gpu_reset = false;
> + int r = 0;
> +
> + if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
> + r = userq_funcs->restore(uq_mgr, queue);
> + if (r) {
> + queue->state = AMDGPU_USERQ_STATE_HUNG;
> + gpu_reset = true;
> + } else {
> + queue->state = AMDGPU_USERQ_STATE_MAPPED;
> + }
> + }
> +
> + if (gpu_reset)
> + amdgpu_userq_gpu_reset(adev);
> +
> + return r;
> +}
> +
> static void
> amdgpu_userq_wait_for_last_fence(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_usermode_queue *queue)
> @@ -639,7 +684,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
>
> /* Resume all the queues for this process */
> idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> - r = amdgpu_userq_map_helper(uq_mgr, queue);
> + r = amdgpu_userqueue_restore_helper(uq_mgr, queue);
> if (r)
> ret = r;
> }
> @@ -794,10 +839,9 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
> int queue_id;
> int ret = 0, r;
>
> - amdgpu_userq_detect_and_reset_queues(uq_mgr);
> /* Try to unmap all the queues in this process ctx */
> idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> - r = amdgpu_userq_unmap_helper(uq_mgr, queue);
> + r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
> if (r)
> ret = r;
> }
> @@ -900,7 +944,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
> uint32_t queue_id;
>
> cancel_delayed_work_sync(&userq_mgr->resume_work);
> -
> mutex_lock(&adev->userq_mutex);
> mutex_lock(&userq_mgr->userq_mutex);
> amdgpu_userq_detect_and_reset_queues(userq_mgr);
> @@ -909,7 +952,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
> amdgpu_userq_unmap_helper(userq_mgr, queue);
> amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
> }
> -
> list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> if (uqm == userq_mgr) {
> list_del(&uqm->list);
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* RE: [v6 03/13] drm/amdgpu/mes12: implement detect and reset callback
2025-08-04 17:00 ` Alex Deucher
@ 2025-08-05 6:13 ` Zhang, Jesse(Jie)
0 siblings, 0 replies; 22+ messages in thread
From: Zhang, Jesse(Jie) @ 2025-08-05 6:13 UTC (permalink / raw)
To: Alex Deucher
Cc: amd-gfx@lists.freedesktop.org, Deucher, Alexander,
Koenig, Christian
[AMD Official Use Only - AMD Internal Distribution Only]
-----Original Message-----
From: Alex Deucher <alexdeucher@gmail.com>
Sent: Tuesday, August 5, 2025 1:01 AM
To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [v6 03/13] drm/amdgpu/mes12: implement detect and reset callback
On Mon, Aug 4, 2025 at 4:53 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> From: Alex Deucher <alexander.deucher@amd.com>
>
> Implement support for the hung queue detect and reset functionality.
>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 37
> ++++++++++++++++++++++++++
> 1 file changed, 37 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 6b222630f3fa..29d38aa1897e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -47,6 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct
> amdgpu_device *adev);
>
> #define MES_EOP_SIZE 2048
>
> +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4
> +
> static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring) {
> struct amdgpu_device *adev = ring->adev; @@ -879,6 +881,38 @@
> static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes,
> offsetof(union MESAPI__RESET, api_status)); }
>
> +static int mes_v12_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> + struct
> +mes_detect_and_reset_queue_input *input) {
> + union MESAPI__RESET mes_reset_queue_pkt;
> + int pipe;
> +
> + memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
> +
> + mes_reset_queue_pkt.header.type = MES_API_TYPE_SCHEDULER;
> + mes_reset_queue_pkt.header.opcode = MES_SCH_API_RESET;
> + mes_reset_queue_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
> +
> + mes_reset_queue_pkt.queue_type =
> + convert_to_mes_queue_type(input->queue_type);
> + mes_reset_queue_pkt.doorbell_offset_addr =
> + mes->hung_queue_db_array_gpu_addr;
> +
> + if (input->detect_only)
> + mes_reset_queue_pkt.hang_detect_only = 1;
> + else
> + mes_reset_queue_pkt.hang_detect_then_reset = 1;
> +
> + if (mes->adev->enable_uni_mes)
> + pipe = AMDGPU_MES_KIQ_PIPE;
> + else
> + pipe = AMDGPU_MES_SCHED_PIPE;
I think this should probably always be AMDGPU_MES_SCHED_PIPE. Setting this may fix the issues you were seeing on gfx12.
Thanks, Alex. This change fixed the issues with the GFX12 compute queue.
Thanks
Jesse
Alex
> +
> + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> + &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
> + offsetof(union MESAPI__RESET, api_status)); }
> +
> static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
> .add_hw_queue = mes_v12_0_add_hw_queue,
> .remove_hw_queue = mes_v12_0_remove_hw_queue, @@ -888,6 +922,7
> @@ static const struct amdgpu_mes_funcs mes_v12_0_funcs = {
> .resume_gang = mes_v12_0_resume_gang,
> .misc_op = mes_v12_0_misc_op,
> .reset_hw_queue = mes_v12_0_reset_hw_queue,
> + .detect_and_reset_hung_queues =
> + mes_v12_0_detect_and_reset_hung_queues,
> };
>
> static int mes_v12_0_allocate_ucode_buffer(struct amdgpu_device
> *adev, @@ -1793,6 +1828,8 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block)
> struct amdgpu_device *adev = ip_block->adev;
> int pipe, r;
>
> + adev->mes.hung_queue_db_array_size =
> + MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
> for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
> r = amdgpu_mes_init_microcode(adev, pipe);
> if (r)
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* RE: [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX
2025-08-04 17:03 ` Alex Deucher
@ 2025-08-05 6:52 ` Zhang, Jesse(Jie)
0 siblings, 0 replies; 22+ messages in thread
From: Zhang, Jesse(Jie) @ 2025-08-05 6:52 UTC (permalink / raw)
To: Alex Deucher
Cc: amd-gfx@lists.freedesktop.org, Deucher, Alexander,
Koenig, Christian, Chau, Kyle-hai
[AMD Official Use Only - AMD Internal Distribution Only]
-----Original Message-----
From: Alex Deucher <alexdeucher@gmail.com>
Sent: Tuesday, August 5, 2025 1:03 AM
To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Chau, Kyle-hai <Kyle-hai.Chau@amd.com>
Subject: Re: [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX
On Mon, Aug 4, 2025 at 4:48 AM Jesse.Zhang <Jesse.Zhang@amd.com> wrote:
>
> MES queue reset functionality for GFX queues. The changes include:
>
> 1. Added detection of active VMIDs by reading CP_CNTX_STAT and CP_VMID
> registers to properly identify contexts that need resetting
>
> 2. Implemented fallback to HPD status method when no active VMIDs are
> found, checking both pipe 0 and pipe 1 queues
>
> 3. Extended the MES reset packet with:
> - active_vmids bitmap
> - connected_queue_index for pipe 0
> - connected_queue_index_p1 for pipe 1
>
Do we still need this if we switch over to suspending queues before we run detect and reset?
With that, we don't need it. Will drop it.
Thanks
Jesse
Alex
> Suggested-by: kyle-hai.chau <kyle-hai.chau@amd.com>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 51 +++++++++++++++++++
> drivers/gpu/drm/amd/include/mes_v11_api_def.h | 13 ++++-
> 2 files changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index ed6a7f8af544..1422bc59cd40 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -789,6 +789,12 @@ static int mes_v11_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> struct
> mes_detect_and_reset_queue_input *input) {
> union MESAPI__RESET mes_reset_queue_pkt;
> + struct amdgpu_device *adev = mes->adev;
> + uint32_t active_vmids = 0;
> + uint32_t connected_queue_index = 0;
> + uint32_t queue_status = 0;
> + uint32_t connected_queue_index_p1 = 0;
> + uint32_t queue_status_p1 = 0;
>
> memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
>
> @@ -801,6 +807,51 @@ static int mes_v11_0_detect_and_reset_hung_queues(struct amdgpu_mes *mes,
> mes_reset_queue_pkt.doorbell_offset_addr =
> mes->hung_queue_db_array_gpu_addr;
>
> + /* Add VMID detection for GFX queues */
> + if (input->queue_type == AMDGPU_RING_TYPE_GFX) {
> + uint32_t cp_cntx_stat = RREG32_SOC15(GC, 0, regCP_CNTX_STAT);
> + uint32_t cp_vmid, grbm_gfx_cntl;
> +
> + /* Check active contexts in CP_CNTX_STAT */
> + for (uint32_t i = 0; i < 8; i++) {
> + if ((cp_cntx_stat >> (0x14 + i)) & 0x1) {
> + grbm_gfx_cntl = (i << 11);
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, grbm_gfx_cntl);
> + cp_vmid = RREG32_SOC15(GC, 0, regCP_VMID);
> + active_vmids |= (1 << cp_vmid);
> + }
> + }
> +
> + /* Fallback to HPD status if no active VMIDs found */
> + if (active_vmids == 0) {
> + uint32_t hpd_status;
> +
> + /* Pipe 0 */
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, 0);
> + hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
> + queue_status = hpd_status & 0x1F;
> + connected_queue_index = (hpd_status & 0xE0) >>
> + 5;
> +
> + /* Pipe 1 */
> + WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, (1 << 6));
> + hpd_status = RREG32_SOC15(GC, 0, regCP_GFX_HPD_STATUS0);
> + queue_status_p1 = hpd_status & 0x1F;
> + connected_queue_index_p1 = (hpd_status & 0xE0) >> 5;
> + }
> +
> + mes_reset_queue_pkt.active_vmids = active_vmids;
> + if (active_vmids == 0) {
> + if (queue_status != 0) {
> + mes_reset_queue_pkt.use_connected_queue_index = 1;
> + mes_reset_queue_pkt.connected_queue_index = connected_queue_index;
> + }
> + if (queue_status_p1 != 0) {
> + mes_reset_queue_pkt.use_connected_queue_index_p1 = 1;
> + mes_reset_queue_pkt.connected_queue_index_p1 = connected_queue_index_p1;
> + }
> + }
> + }
> +
> if (input->detect_only)
> mes_reset_queue_pkt.hang_detect_only = 1;
> else
> diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> index 15680c3f4970..62ad4f0337eb 100644
> --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> @@ -460,7 +460,11 @@ union MESAPI__RESET {
> uint32_t hang_detect_only : 1;
> /* Rest HP and LP kernel queues not managed by MES */
> uint32_t reset_legacy_gfx : 1;
> - uint32_t reserved : 28;
> + /* Fallback to use conneceted queue index when CP_CNTX_STAT method fails (gfx pipe 0) */
> + uint32_t use_connected_queue_index : 1;
> + /* For gfx pipe 1 */
> + uint32_t use_connected_queue_index_p1 : 1;
> + uint32_t reserved : 26;
> };
>
> uint64_t gang_context_addr;
> @@ -488,6 +492,13 @@ union MESAPI__RESET {
> uint64_t wptr_addr_hp;
>
> struct MES_API_STATUS api_status;
> + uint32_t active_vmids;
> + uint64_t timestamp;
> +
> + uint32_t gang_context_array_index;
> +
> + uint32_t connected_queue_index;
> + uint32_t connected_queue_index_p1;
> };
>
> uint32_t max_dwords_in_api[API_FRAME_SIZE_IN_DWORDS];
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 22+ messages in thread
end of thread, other threads:[~2025-08-05 6:52 UTC | newest]
Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-04 8:38 [v6 01/13] drm/amdgpu/mes: add front end for detect and reset hung queue Jesse.Zhang
2025-08-04 8:38 ` [v6 02/13] drm/amdgpu/mes11: implement detect and reset callback Jesse.Zhang
2025-08-04 8:38 ` [v6 03/13] drm/amdgpu/mes12: " Jesse.Zhang
2025-08-04 17:00 ` Alex Deucher
2025-08-05 6:13 ` Zhang, Jesse(Jie)
2025-08-04 8:38 ` [v6 04/13] drm/amdgpu: Implement active VMID detection in MES11 queue reset for GFX Jesse.Zhang
2025-08-04 17:03 ` Alex Deucher
2025-08-05 6:52 ` Zhang, Jesse(Jie)
2025-08-04 17:04 ` Alex Deucher
2025-08-04 8:38 ` [v6 05/13] drm/amdgpu: Implement active VMID detection in MES12 " Jesse.Zhang
2025-08-04 17:05 ` Alex Deucher
2025-08-04 8:38 ` [v6 06/13] drm/amdgpu/userq: add a detect and reset callback Jesse.Zhang
2025-08-04 8:38 ` [v6 07/13] drm/amd: Add preempt and restore callbacks to userq funcs Jesse.Zhang
2025-08-04 8:38 ` [v6 08/13] drm/amdgpu: adjust MES API used for suspend and resume Jesse.Zhang
2025-08-04 17:16 ` Alex Deucher
2025-08-04 8:38 ` [v6 09/13] drm/amd/amdgpu: Implement MES suspend/resume gang functionality for v12 Jesse.Zhang
2025-08-04 17:19 ` Alex Deucher
2025-08-04 8:38 ` [v6 10/13] drm/amdgpu: add user queue reset source Jesse.Zhang
2025-08-04 8:38 ` [v6 11/13] drm/amdgpu/userq: add force completion helpers Jesse.Zhang
2025-08-04 8:38 ` [v6 12/13] drm/amdgpu: Add GPU reset handling for user mode queues Jesse.Zhang
2025-08-04 8:38 ` [v6 13/13] drm/amdgpu: Implement queue preemption using suspend/resume API Jesse.Zhang
2025-08-04 17:22 ` Alex Deucher
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).