* [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
@ 2025-07-30 7:43 Xiang Liu
0 siblings, 0 replies; 4+ messages in thread
From: Xiang Liu @ 2025-07-30 7:43 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, Xiang Liu
Avoid logging GFX poison consumption errors when a fatal error occurs.
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 3835f2592914..e9ba546c36ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -163,6 +163,10 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
bank.smu_err_type = type;
+ if (type == ACA_SMU_TYPE_UE &&
+ ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]))
+ continue;
+
aca_smu_bank_dump(adev, i, count, &bank, qctx);
ret = aca_banks_add_bank(banks, &bank);
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
@ 2025-07-30 9:24 Xiang Liu
2025-07-30 10:46 ` Zhou1, Tao
0 siblings, 1 reply; 4+ messages in thread
From: Xiang Liu @ 2025-07-30 9:24 UTC (permalink / raw)
To: amd-gfx; +Cc: Hawking.Zhang, Xiang Liu
Avoid logging GFX poison consumption errors when a fatal error occurs.
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 47 ++++++++++++++-----------
1 file changed, 26 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 3835f2592914..59dbb9257096 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -125,6 +125,27 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st
RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged by the scrubber\n");
}
+static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type)
+{
+
+ struct aca_hwip *hwip;
+ int hwid, mcatype;
+ u64 ipid;
+
+ if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
+ return false;
+
+ hwip = &aca_hwid_mcatypes[type];
+ if (!hwip->hwid)
+ return false;
+
+ ipid = bank->regs[ACA_REG_IDX_IPID];
+ hwid = ACA_REG__IPID__HARDWAREID(ipid);
+ mcatype = ACA_REG__IPID__MCATYPE(ipid);
+
+ return hwip->hwid == hwid && hwip->mcatype == mcatype;
+}
+
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
int start, int count,
struct aca_banks *banks, struct ras_query_context *qctx)
@@ -163,6 +184,11 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
bank.smu_err_type = type;
+ if (type == ACA_SMU_TYPE_UE &&
+ ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) &&
+ !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC))
+ continue;
+
aca_smu_bank_dump(adev, i, count, &bank, qctx);
ret = aca_banks_add_bank(banks, &bank);
@@ -173,27 +199,6 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
return 0;
}
-static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type)
-{
-
- struct aca_hwip *hwip;
- int hwid, mcatype;
- u64 ipid;
-
- if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
- return false;
-
- hwip = &aca_hwid_mcatypes[type];
- if (!hwip->hwid)
- return false;
-
- ipid = bank->regs[ACA_REG_IDX_IPID];
- hwid = ACA_REG__IPID__HARDWAREID(ipid);
- mcatype = ACA_REG__IPID__MCATYPE(ipid);
-
- return hwip->hwid == hwid && hwip->mcatype == mcatype;
-}
-
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
{
const struct aca_bank_ops *bank_ops = handle->bank_ops;
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* RE: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
2025-07-30 9:24 Xiang Liu
@ 2025-07-30 10:46 ` Zhou1, Tao
2025-07-30 14:36 ` Liu, Xiang(Dean)
0 siblings, 1 reply; 4+ messages in thread
From: Zhou1, Tao @ 2025-07-30 10:46 UTC (permalink / raw)
To: Liu, Xiang(Dean), amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking, Liu, Xiang(Dean)
[AMD Official Use Only - AMD Internal Distribution Only]
It would be better to add a comment for the added condition check; with this resolved, the patch is:
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Xiang Liu
> Sent: Wednesday, July 30, 2025 5:25 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Xiang(Dean)
> <Xiang.Liu@amd.com>
> Subject: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
>
> Avoid GFX poison consumption errors logged when fatal error occurs.
>
> Signed-off-by: Xiang Liu <xiang.liu@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 47 ++++++++++++++-----------
> 1 file changed, 26 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> index 3835f2592914..59dbb9257096 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> @@ -125,6 +125,27 @@ static void aca_smu_bank_dump(struct amdgpu_device
> *adev, int idx, int total, st
> RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged
> by the scrubber\n"); }
>
> +static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> +aca_hwip_type type) {
> +
> + struct aca_hwip *hwip;
> + int hwid, mcatype;
> + u64 ipid;
> +
> + if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> + return false;
> +
> + hwip = &aca_hwid_mcatypes[type];
> + if (!hwip->hwid)
> + return false;
> +
> + ipid = bank->regs[ACA_REG_IDX_IPID];
> + hwid = ACA_REG__IPID__HARDWAREID(ipid);
> + mcatype = ACA_REG__IPID__MCATYPE(ipid);
> +
> + return hwip->hwid == hwid && hwip->mcatype == mcatype; }
> +
> static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_type type,
> int start, int count,
> struct aca_banks *banks, struct
> ras_query_context *qctx) @@ -163,6 +184,11 @@ static int
> aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
>
> bank.smu_err_type = type;
>
> + if (type == ACA_SMU_TYPE_UE &&
> +
> ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) &&
> + !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC))
> + continue;
> +
> aca_smu_bank_dump(adev, i, count, &bank, qctx);
>
> ret = aca_banks_add_bank(banks, &bank); @@ -173,27 +199,6 @@
> static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_
> return 0;
> }
>
> -static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> aca_hwip_type type) -{
> -
> - struct aca_hwip *hwip;
> - int hwid, mcatype;
> - u64 ipid;
> -
> - if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> - return false;
> -
> - hwip = &aca_hwid_mcatypes[type];
> - if (!hwip->hwid)
> - return false;
> -
> - ipid = bank->regs[ACA_REG_IDX_IPID];
> - hwid = ACA_REG__IPID__HARDWAREID(ipid);
> - mcatype = ACA_REG__IPID__MCATYPE(ipid);
> -
> - return hwip->hwid == hwid && hwip->mcatype == mcatype;
> -}
> -
> static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
> enum aca_smu_type type) {
> const struct aca_bank_ops *bank_ops = handle->bank_ops;
> --
> 2.34.1
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
2025-07-30 10:46 ` Zhou1, Tao
@ 2025-07-30 14:36 ` Liu, Xiang(Dean)
0 siblings, 0 replies; 4+ messages in thread
From: Liu, Xiang(Dean) @ 2025-07-30 14:36 UTC (permalink / raw)
To: Zhou1, Tao, amd-gfx@lists.freedesktop.org; +Cc: Zhang, Hawking
[-- Attachment #1: Type: text/plain, Size: 4040 bytes --]
[AMD Official Use Only - AMD Internal Distribution Only]
Thanks, will do.
Best Regards,
Liu, Xiang
________________________________
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, July 30, 2025 6:46 PM
To: Liu, Xiang(Dean) <Xiang.Liu@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Xiang(Dean) <Xiang.Liu@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
[AMD Official Use Only - AMD Internal Distribution Only]
It would be better to add a comment for the added condition check; with this resolved, the patch is:
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Xiang Liu
> Sent: Wednesday, July 30, 2025 5:25 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Xiang(Dean)
> <Xiang.Liu@amd.com>
> Subject: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
>
> Avoid GFX poison consumption errors logged when fatal error occurs.
>
> Signed-off-by: Xiang Liu <xiang.liu@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 47 ++++++++++++++-----------
> 1 file changed, 26 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> index 3835f2592914..59dbb9257096 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> @@ -125,6 +125,27 @@ static void aca_smu_bank_dump(struct amdgpu_device
> *adev, int idx, int total, st
> RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged
> by the scrubber\n"); }
>
> +static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> +aca_hwip_type type) {
> +
> + struct aca_hwip *hwip;
> + int hwid, mcatype;
> + u64 ipid;
> +
> + if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> + return false;
> +
> + hwip = &aca_hwid_mcatypes[type];
> + if (!hwip->hwid)
> + return false;
> +
> + ipid = bank->regs[ACA_REG_IDX_IPID];
> + hwid = ACA_REG__IPID__HARDWAREID(ipid);
> + mcatype = ACA_REG__IPID__MCATYPE(ipid);
> +
> + return hwip->hwid == hwid && hwip->mcatype == mcatype; }
> +
> static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_type type,
> int start, int count,
> struct aca_banks *banks, struct
> ras_query_context *qctx) @@ -163,6 +184,11 @@ static int
> aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
>
> bank.smu_err_type = type;
>
> + if (type == ACA_SMU_TYPE_UE &&
> +
> ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) &&
> + !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC))
> + continue;
> +
> aca_smu_bank_dump(adev, i, count, &bank, qctx);
>
> ret = aca_banks_add_bank(banks, &bank); @@ -173,27 +199,6 @@
> static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_
> return 0;
> }
>
> -static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> aca_hwip_type type) -{
> -
> - struct aca_hwip *hwip;
> - int hwid, mcatype;
> - u64 ipid;
> -
> - if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> - return false;
> -
> - hwip = &aca_hwid_mcatypes[type];
> - if (!hwip->hwid)
> - return false;
> -
> - ipid = bank->regs[ACA_REG_IDX_IPID];
> - hwid = ACA_REG__IPID__HARDWAREID(ipid);
> - mcatype = ACA_REG__IPID__MCATYPE(ipid);
> -
> - return hwip->hwid == hwid && hwip->mcatype == mcatype;
> -}
> -
> static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
> enum aca_smu_type type) {
> const struct aca_bank_ops *bank_ops = handle->bank_ops;
> --
> 2.34.1
[-- Attachment #2: Type: text/html, Size: 8069 bytes --]
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2025-07-30 14:36 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-07-30 7:43 [PATCH] drm/amdgpu: Skip poison aca bank from UE channel Xiang Liu
-- strict thread matches above, loose matches on Subject: below --
2025-07-30 9:24 Xiang Liu
2025-07-30 10:46 ` Zhou1, Tao
2025-07-30 14:36 ` Liu, Xiang(Dean)
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).