From: Jesse Zhang <Jesse.Zhang@amd.com>
To: <amd-gfx@lists.freedesktop.org>
Cc: <Alexander.Deucher@amd.com>,
Christian Koenig <christian.koenig@amd.com>,
Jesse Zhang <Jesse.Zhang@amd.com>,
Manu Rastogi <manu.rastogi@amd.com>,
"Alex Deucher" <alexander.deucher@amd.com>,
Jesse Zhang <jesse.zhang@amd.com>
Subject: [PATCH v3 2/8] drm/amdgpu/gfx11: Refactor compute pipe reset and add HQD cleanup
Date: Tue, 14 Apr 2026 16:58:49 +0800 [thread overview]
Message-ID: <20260414085926.3171086-2-Jesse.Zhang@amd.com> (raw)
In-Reply-To: <20260414085926.3171086-1-Jesse.Zhang@amd.com>
Refactor gfx_v11_0_reset_compute_pipe() to accept explicit me, pipe, and
queue parameters instead of deriving them from the ring structure. This
enables the function to be used in generic pipe reset flows.
Introduce gfx_v11_0_clear_hqds_on_mec_pipe() to properly clear
CP_HQD_ACTIVE and CP_HQD_DEQUEUE_REQUEST for all queues on a given MEC
pipe while the pipe reset is asserted, ensuring the HQDs are torn down
correctly before deasserting reset.
Switch the KCQ reset path to use the common MEC pipe reset helper
amdgpu_gfx_mec_pipe_reset_run(), which coordinates the reset sequence
including KFD suspend/resume to avoid conflicts with user mode queues.
Suggested-by: Manu Rastogi <manu.rastogi@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 177 +++++++++++++++----------
1 file changed, 109 insertions(+), 68 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index ae39b9e1f7d6..e29e8e620699 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -6906,11 +6906,39 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring,
return amdgpu_ring_reset_helper_end(ring, timedout_fence);
}
-static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
+/*
+ * With MEC pipe reset asserted, clear CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST for
+ * every queue on (me, pipe). HQDs must be torn down while pipe reset stays
+ * asserted; only then clear the pipe reset bit.
+ * Caller must hold adev->srbm_mutex.
+ */
+static void gfx_v11_0_clear_hqds_on_mec_pipe(struct amdgpu_device *adev, u32 me,
+ u32 pipe)
{
+ unsigned int q;
+ int j;
- struct amdgpu_device *adev = ring->adev;
- uint32_t reset_pipe = 0, clean_pipe = 0;
+ for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) {
+ soc21_grbm_select(adev, me, pipe, q, 0);
+ /* Force the HQD inactive; if it reads back active, fall back to a dequeue request. */
+ WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, 0);
+ if (RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1) {
+ WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 1);
+ for (j = 0; j < adev->usec_timeout; j++) {
+ if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
+ break;
+ udelay(1);
+ }
+ }
+
+ WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0);
+ }
+}
+
+static int gfx_v11_0_reset_compute_pipe(struct amdgpu_device *adev,
+ u32 me, u32 pipe, u32 queue)
+{
+ uint32_t reset_val, clean_val;
int r;
if (!gfx_v11_pipe_reset_support(adev))
@@ -6918,109 +6946,115 @@ static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
gfx_v11_0_set_safe_mode(adev, 0);
mutex_lock(&adev->srbm_mutex);
- soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-
- reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
- clean_pipe = reset_pipe;
+ soc21_grbm_select(adev, me, pipe, queue, 0);
if (adev->gfx.rs64_enable) {
+ reset_val = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
+ clean_val = reset_val;
- switch (ring->pipe) {
+ switch (pipe) {
case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE0_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE0_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE0_RESET, 0);
break;
case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE1_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE1_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE1_RESET, 0);
break;
case 2:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE2_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE2_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE2_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE2_RESET, 0);
break;
case 3:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE3_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE3_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE3_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE3_RESET, 0);
break;
default:
break;
}
- WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_pipe);
- WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_pipe);
+ WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_val);
+ gfx_v11_0_clear_hqds_on_mec_pipe(adev, me, pipe);
+ soc21_grbm_select(adev, me, pipe, queue, 0);
+ WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_val);
r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) -
RS64_FW_UC_START_ADDR_LO;
} else {
- if (ring->me == 1) {
- switch (ring->pipe) {
+ reset_val = RREG32_SOC15(GC, 0, regCP_MEC_CNTL);
+ clean_val = reset_val;
+
+ if (me == 1) {
+ switch (pipe) {
case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE0_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE0_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE0_RESET, 0);
break;
case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE1_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE1_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE1_RESET, 0);
break;
case 2:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE2_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE2_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE2_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE2_RESET, 0);
break;
case 3:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE3_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE3_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE3_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE3_RESET, 0);
break;
default:
break;
}
/* mec1 fw pc: CP_MEC1_INSTR_PNTR */
} else {
- switch (ring->pipe) {
+ switch (pipe) {
case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE0_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE0_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE0_RESET, 0);
break;
case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE1_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE1_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE1_RESET, 0);
break;
case 2:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE2_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE2_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE2_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE2_RESET, 0);
break;
case 3:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE3_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE3_RESET, 0);
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE3_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE3_RESET, 0);
break;
default:
break;
}
/* mec2 fw pc: CP:CP_MEC2_INSTR_PNTR */
}
- WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_pipe);
- WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_pipe);
+ WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_val);
+ gfx_v11_0_clear_hqds_on_mec_pipe(adev, me, pipe);
+ soc21_grbm_select(adev, me, pipe, queue, 0);
+ WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_val);
r = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_MEC1_INSTR_PNTR));
}
@@ -7028,8 +7062,8 @@ static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
mutex_unlock(&adev->srbm_mutex);
gfx_v11_0_unset_safe_mode(adev, 0);
- dev_info(adev->dev, "The ring %s pipe resets to MEC FW start PC: %s\n", ring->name,
- r == 0 ? "successfully" : "failed");
+ dev_dbg(adev->dev, "MEC pipe me%u pipe%u queue%u resets to MEC FW start PC: %s\n",
+ me, pipe, queue, r == 0 ? "successfully" : "failed");
/*FIXME:Sometimes driver can't cache the MEC firmware start PC correctly, so the pipe
* reset status relies on the compute ring test result.
*/
@@ -7048,9 +7082,16 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring,
r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true, 0);
if (r) {
dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe reset\n", r);
- r = gfx_v11_0_reset_compute_pipe(ring);
- if (r)
- return r;
+
+ amdgpu_amdkfd_suspend(adev, true);
+ r = amdgpu_gfx_mec_pipe_reset_run(adev,
+ ring->xcc_id, ring->me, ring->pipe,
+ ring->queue, timedout_fence,
+ gfx_v11_0_reset_compute_pipe,
+ NULL,
+ gfx_v11_0_kcq_init_queue);
+ amdgpu_amdkfd_resume(adev, true);
+ return r;
}
r = gfx_v11_0_kcq_init_queue(ring, true);
--
2.49.0
next prev parent reply other threads:[~2026-04-14 9:00 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-14 8:58 [PATCH v3 1/8] drm/amdgpu: add coordinated MEC pipe reset for GFX compute queues Jesse Zhang
2026-04-14 8:58 ` Jesse Zhang [this message]
2026-04-14 8:58 ` [PATCH v3 3/8] drm/amdgpu/gfx11: Fall back to pipe reset if per-queue reset ring test fails Jesse Zhang
2026-04-14 8:58 ` [PATCH v3 4/8] drm/amdgpu/gfx11: enable per-pipe reset support for compute queues Jesse Zhang
2026-04-14 8:58 ` [PATCH v3 5/8] drm/amdgpu/gfx12: Refactor compute pipe reset and add HQD cleanup Jesse Zhang
2026-04-14 8:58 ` [PATCH v3 6/8] drm/amdgpu/gfx12: Fall back to pipe reset if per-queue reset ring test fails Jesse Zhang
2026-04-14 8:58 ` [PATCH v3 7/8] drm/amdgpu/gfx12: enable per-pipe reset support for compute queues Jesse Zhang
2026-04-14 8:58 ` [PATCH v3 8/8] drm/amdgpu/gfx_v12_0: set gfx.rs64_enable from PFP header on GFX12 Jesse Zhang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260414085926.3171086-2-Jesse.Zhang@amd.com \
--to=jesse.zhang@amd.com \
--cc=Alexander.Deucher@amd.com \
--cc=amd-gfx@lists.freedesktop.org \
--cc=christian.koenig@amd.com \
--cc=manu.rastogi@amd.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox