From: Ce Sun <cesun102@amd.com>
To: <amd-gfx@lists.freedesktop.org>
Cc: <tao.zhou1@amd.com>, <Stanley.Yang@amd.com>,
<Hawking.Zhang@amd.com>, <kevinyang.wang@amd.com>,
<YiPeng.Chai@amd.com>, Ce Sun <cesun102@amd.com>
Subject: [PATCH 4/4 v3] drm/amdgpu: Correct the loss of aca bank reg info
Date: Fri, 15 Aug 2025 12:11:58 +0800 [thread overview]
Message-ID: <20250815041158.301031-1-cesun102@amd.com> (raw)
Poll the ACA bank count to ensure that valid ACA bank
register info can be obtained.
v2: add a corresponding delay before sending the message to the SMU to
query MCA bank info. (Stanley)
v3: fix the case where the polling loop cannot exit. (Thomas)
Signed-off-by: Ce Sun <cesun102@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 65 +++++++++++++------------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +-
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 12 +++--
3 files changed, 44 insertions(+), 37 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 31850a47a41f..9ccc1fbca14f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 50
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
@@ -131,6 +131,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
#define BYPASS_ALLOCATED_ADDRESS 0x0
#define BYPASS_INITIALIZATION_ADDRESS 0x1
+#define MAX_BANK_COUNT 12
+
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -3306,8 +3308,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
mutex_init(&ecc_log->lock);
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
- ecc_log->de_queried_count = 0;
- ecc_log->prev_de_queried_count = 0;
+ ecc_log->consumption_de_count = 0;
+ ecc_log->creation_de_count = 0;
}
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -3326,8 +3328,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
mutex_unlock(&ecc_log->lock);
mutex_destroy(&ecc_log->lock);
- ecc_log->de_queried_count = 0;
- ecc_log->prev_de_queried_count = 0;
+ ecc_log->consumption_de_count = 0;
+ ecc_log->creation_de_count = 0;
}
#endif
@@ -3381,49 +3383,48 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
uint32_t poison_creation_count)
{
int ret = 0;
- struct ras_ecc_log_info *ecc_log;
struct ras_query_if info;
- uint32_t timeout = 0;
+ struct ras_ecc_log_info *ecc_log;
+ uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- uint64_t de_queried_count;
- uint32_t new_detect_count, total_detect_count;
- uint32_t need_query_count = poison_creation_count;
+ uint64_t creation_de_count = 0;
+ uint64_t consumption_de_count = 0;
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+ uint64_t bank_count = 0;
+ uint64_t total_bank_count = 0;
memset(&info, 0, sizeof(info));
info.head.block = AMDGPU_RAS_BLOCK__UMC;
-
ecc_log = &ras->umc_ecc_log;
- total_detect_count = 0;
+
do {
ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
if (ret)
return ret;
-
- de_queried_count = ecc_log->de_queried_count;
- if (de_queried_count > ecc_log->prev_de_queried_count) {
- new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
- ecc_log->prev_de_queried_count = de_queried_count;
- timeout = 0;
+ creation_de_count = ecc_log->creation_de_count;
+ consumption_de_count = ecc_log->consumption_de_count;
+
+ bank_count = amdgpu_aca_get_bank_count(adev);
+ if (bank_count) {
+ total_bank_count += bank_count;
+ amdgpu_aca_clear_bank_count(adev);
+ timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
} else {
- new_detect_count = 0;
+ --timeout;
+ msleep(20);
}
- if (new_detect_count) {
- total_detect_count += new_detect_count;
- } else {
- if (!timeout && need_query_count)
- timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+ if (creation_de_count && consumption_de_count)
+ break;
- if (timeout) {
- if (!--timeout)
- break;
- msleep(1);
- }
- }
- } while (total_detect_count < need_query_count);
+ if (total_bank_count >= MAX_BANK_COUNT)
+ break;
+ } while (timeout);
+
+ ecc_log->creation_de_count = 0;
+ ecc_log->consumption_de_count = 0;
- if (total_detect_count)
+ if (consumption_de_count)
schedule_delayed_work(&ras->page_retirement_dwork, 0);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6265dac0e1c0..b4eb427409ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -492,8 +492,8 @@ struct ras_ecc_err {
struct ras_ecc_log_info {
struct mutex lock;
struct radix_tree_root de_page_tree;
- uint64_t de_queried_count;
- uint64_t prev_de_queried_count;
+ uint64_t consumption_de_count;
+ uint64_t creation_de_count;
};
struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index e590cbdd8de9..11b99095efd3 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -536,8 +536,14 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
- if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0))
/* Only creation/consumption deferred errors can reach here.
* MCA_UMC_HWID_V12_0/MCA_UMC_MCATYPE_V12_0 is the hwid/mcatype of
* the consumption deferred error; any other bank is counted as a
* creation deferred error.
*/
+ if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) {
+ con->umc_ecc_log.creation_de_count++;
return 0;
+ }
if (!status)
return 0;
@@ -582,7 +588,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
if (ret) {
if (ret == -EEXIST)
- con->umc_ecc_log.de_queried_count++;
+ con->umc_ecc_log.consumption_de_count++;
else
dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
@@ -590,7 +596,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
return ret;
}
- con->umc_ecc_log.de_queried_count++;
+ con->umc_ecc_log.consumption_de_count++;
memset(page_pfn, 0, sizeof(page_pfn));
count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
--
2.34.1
next reply other threads:[~2025-08-15 4:12 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-08-15 4:11 Ce Sun [this message]
2025-08-15 8:57 ` [PATCH 4/4 v3] drm/amdgpu: Correct the loss of aca bank reg info Wang, Yang(Kevin)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250815041158.301031-1-cesun102@amd.com \
--to=cesun102@amd.com \
--cc=Hawking.Zhang@amd.com \
--cc=Stanley.Yang@amd.com \
--cc=YiPeng.Chai@amd.com \
--cc=amd-gfx@lists.freedesktop.org \
--cc=kevinyang.wang@amd.com \
--cc=tao.zhou1@amd.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).