amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
@ 2025-07-30  7:43 Xiang Liu
  0 siblings, 0 replies; 4+ messages in thread
From: Xiang Liu @ 2025-07-30  7:43 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Xiang Liu

Avoid logging GFX poison consumption errors when a fatal error occurs.

Signed-off-by: Xiang Liu <xiang.liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 3835f2592914..e9ba546c36ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -163,6 +163,10 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
 
 		bank.smu_err_type = type;
 
+		if (type == ACA_SMU_TYPE_UE &&
+		    ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]))
+			continue;
+
 		aca_smu_bank_dump(adev, i, count, &bank, qctx);
 
 		ret = aca_banks_add_bank(banks, &bank);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
@ 2025-07-30  9:24 Xiang Liu
  2025-07-30 10:46 ` Zhou1, Tao
  0 siblings, 1 reply; 4+ messages in thread
From: Xiang Liu @ 2025-07-30  9:24 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Xiang Liu

Avoid logging GFX poison consumption errors when a fatal error occurs.

Signed-off-by: Xiang Liu <xiang.liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 47 ++++++++++++++-----------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 3835f2592914..59dbb9257096 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -125,6 +125,27 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st
 		RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged by the scrubber\n");
 }
 
+static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type)
+{
+
+	struct aca_hwip *hwip;
+	int hwid, mcatype;
+	u64 ipid;
+
+	if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
+		return false;
+
+	hwip = &aca_hwid_mcatypes[type];
+	if (!hwip->hwid)
+		return false;
+
+	ipid = bank->regs[ACA_REG_IDX_IPID];
+	hwid = ACA_REG__IPID__HARDWAREID(ipid);
+	mcatype = ACA_REG__IPID__MCATYPE(ipid);
+
+	return hwip->hwid == hwid && hwip->mcatype == mcatype;
+}
+
 static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
 				       int start, int count,
 				       struct aca_banks *banks, struct ras_query_context *qctx)
@@ -163,6 +184,11 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
 
 		bank.smu_err_type = type;
 
+		if (type == ACA_SMU_TYPE_UE &&
+		    ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) &&
+		    !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC))
+			continue;
+
 		aca_smu_bank_dump(adev, i, count, &bank, qctx);
 
 		ret = aca_banks_add_bank(banks, &bank);
@@ -173,27 +199,6 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
 	return 0;
 }
 
-static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type)
-{
-
-	struct aca_hwip *hwip;
-	int hwid, mcatype;
-	u64 ipid;
-
-	if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
-		return false;
-
-	hwip = &aca_hwid_mcatypes[type];
-	if (!hwip->hwid)
-		return false;
-
-	ipid = bank->regs[ACA_REG_IDX_IPID];
-	hwid = ACA_REG__IPID__HARDWAREID(ipid);
-	mcatype = ACA_REG__IPID__MCATYPE(ipid);
-
-	return hwip->hwid == hwid && hwip->mcatype == mcatype;
-}
-
 static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
 {
 	const struct aca_bank_ops *bank_ops = handle->bank_ops;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* RE: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
  2025-07-30  9:24 Xiang Liu
@ 2025-07-30 10:46 ` Zhou1, Tao
  2025-07-30 14:36   ` Liu, Xiang(Dean)
  0 siblings, 1 reply; 4+ messages in thread
From: Zhou1, Tao @ 2025-07-30 10:46 UTC (permalink / raw)
  To: Liu, Xiang(Dean), amd-gfx@lists.freedesktop.org
  Cc: Zhang, Hawking, Liu, Xiang(Dean)

[AMD Official Use Only - AMD Internal Distribution Only]

It would be better to add a comment for the added condition check. With this resolved, the patch is:

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Xiang Liu
> Sent: Wednesday, July 30, 2025 5:25 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Xiang(Dean)
> <Xiang.Liu@amd.com>
> Subject: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
>
> Avoid GFX poison consumption errors logged when fatal error occurs.
>
> Signed-off-by: Xiang Liu <xiang.liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 47 ++++++++++++++-----------
>  1 file changed, 26 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> index 3835f2592914..59dbb9257096 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> @@ -125,6 +125,27 @@ static void aca_smu_bank_dump(struct amdgpu_device
> *adev, int idx, int total, st
>               RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged
> by the scrubber\n");  }
>
> +static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> +aca_hwip_type type) {
> +
> +     struct aca_hwip *hwip;
> +     int hwid, mcatype;
> +     u64 ipid;
> +
> +     if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> +             return false;
> +
> +     hwip = &aca_hwid_mcatypes[type];
> +     if (!hwip->hwid)
> +             return false;
> +
> +     ipid = bank->regs[ACA_REG_IDX_IPID];
> +     hwid = ACA_REG__IPID__HARDWAREID(ipid);
> +     mcatype = ACA_REG__IPID__MCATYPE(ipid);
> +
> +     return hwip->hwid == hwid && hwip->mcatype == mcatype; }
> +
>  static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_type type,
>                                      int start, int count,
>                                      struct aca_banks *banks, struct
> ras_query_context *qctx) @@ -163,6 +184,11 @@ static int
> aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
>
>               bank.smu_err_type = type;
>
> +             if (type == ACA_SMU_TYPE_UE &&
> +
> ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) &&
> +                 !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC))
> +                     continue;
> +
>               aca_smu_bank_dump(adev, i, count, &bank, qctx);
>
>               ret = aca_banks_add_bank(banks, &bank); @@ -173,27 +199,6 @@
> static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_
>       return 0;
>  }
>
> -static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> aca_hwip_type type) -{
> -
> -     struct aca_hwip *hwip;
> -     int hwid, mcatype;
> -     u64 ipid;
> -
> -     if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> -             return false;
> -
> -     hwip = &aca_hwid_mcatypes[type];
> -     if (!hwip->hwid)
> -             return false;
> -
> -     ipid = bank->regs[ACA_REG_IDX_IPID];
> -     hwid = ACA_REG__IPID__HARDWAREID(ipid);
> -     mcatype = ACA_REG__IPID__MCATYPE(ipid);
> -
> -     return hwip->hwid == hwid && hwip->mcatype == mcatype;
> -}
> -
>  static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
> enum aca_smu_type type)  {
>       const struct aca_bank_ops *bank_ops = handle->bank_ops;
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
  2025-07-30 10:46 ` Zhou1, Tao
@ 2025-07-30 14:36   ` Liu, Xiang(Dean)
  0 siblings, 0 replies; 4+ messages in thread
From: Liu, Xiang(Dean) @ 2025-07-30 14:36 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx@lists.freedesktop.org; +Cc: Zhang, Hawking

[-- Attachment #1: Type: text/plain, Size: 4040 bytes --]

[AMD Official Use Only - AMD Internal Distribution Only]

Thanks, will do.

Best Regards,

Liu, Xiang

________________________________
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, July 30, 2025 6:46 PM
To: Liu, Xiang(Dean) <Xiang.Liu@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Xiang(Dean) <Xiang.Liu@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel

[AMD Official Use Only - AMD Internal Distribution Only]

It would be better to add a comment for the added condition check. With this resolved, the patch is:

Reviewed-by: Tao Zhou <tao.zhou1@amd.com>

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Xiang Liu
> Sent: Wednesday, July 30, 2025 5:25 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Xiang(Dean)
> <Xiang.Liu@amd.com>
> Subject: [PATCH] drm/amdgpu: Skip poison aca bank from UE channel
>
> Avoid GFX poison consumption errors logged when fatal error occurs.
>
> Signed-off-by: Xiang Liu <xiang.liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 47 ++++++++++++++-----------
>  1 file changed, 26 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> index 3835f2592914..59dbb9257096 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
> @@ -125,6 +125,27 @@ static void aca_smu_bank_dump(struct amdgpu_device
> *adev, int idx, int total, st
>               RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged
> by the scrubber\n");  }
>
> +static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> +aca_hwip_type type) {
> +
> +     struct aca_hwip *hwip;
> +     int hwid, mcatype;
> +     u64 ipid;
> +
> +     if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> +             return false;
> +
> +     hwip = &aca_hwid_mcatypes[type];
> +     if (!hwip->hwid)
> +             return false;
> +
> +     ipid = bank->regs[ACA_REG_IDX_IPID];
> +     hwid = ACA_REG__IPID__HARDWAREID(ipid);
> +     mcatype = ACA_REG__IPID__MCATYPE(ipid);
> +
> +     return hwip->hwid == hwid && hwip->mcatype == mcatype; }
> +
>  static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_type type,
>                                      int start, int count,
>                                      struct aca_banks *banks, struct
> ras_query_context *qctx) @@ -163,6 +184,11 @@ static int
> aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
>
>               bank.smu_err_type = type;
>
> +             if (type == ACA_SMU_TYPE_UE &&
> +
> ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) &&
> +                 !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC))
> +                     continue;
> +
>               aca_smu_bank_dump(adev, i, count, &bank, qctx);
>
>               ret = aca_banks_add_bank(banks, &bank); @@ -173,27 +199,6 @@
> static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum
> aca_smu_
>       return 0;
>  }
>
> -static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum
> aca_hwip_type type) -{
> -
> -     struct aca_hwip *hwip;
> -     int hwid, mcatype;
> -     u64 ipid;
> -
> -     if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
> -             return false;
> -
> -     hwip = &aca_hwid_mcatypes[type];
> -     if (!hwip->hwid)
> -             return false;
> -
> -     ipid = bank->regs[ACA_REG_IDX_IPID];
> -     hwid = ACA_REG__IPID__HARDWAREID(ipid);
> -     mcatype = ACA_REG__IPID__MCATYPE(ipid);
> -
> -     return hwip->hwid == hwid && hwip->mcatype == mcatype;
> -}
> -
>  static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
> enum aca_smu_type type)  {
>       const struct aca_bank_ops *bank_ops = handle->bank_ops;
> --
> 2.34.1


[-- Attachment #2: Type: text/html, Size: 8069 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-07-30 14:36 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2025-07-30  7:43 [PATCH] drm/amdgpu: Skip poison aca bank from UE channel Xiang Liu
  -- strict thread matches above, loose matches on Subject: below --
2025-07-30  9:24 Xiang Liu
2025-07-30 10:46 ` Zhou1, Tao
2025-07-30 14:36   ` Liu, Xiang(Dean)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).