amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 3/3 v2] drm/amdgpu: Correct the loss of aca bank reg info
@ 2025-08-13 10:35 Ce Sun
  2025-08-14  3:05 ` Chai, Thomas
  0 siblings, 1 reply; 2+ messages in thread
From: Ce Sun @ 2025-08-13 10:35 UTC (permalink / raw)
  To: amd-gfx
  Cc: tao.zhou1, Stanley.Yang, Hawking.Zhang, kevinyang.wang,
	YiPeng.Chai, Ce Sun

Poll the ACA bank count to ensure that valid ACA bank
register info can be obtained.

v2: add a corresponding delay before sending the message to the SMU to
query MCA bank info. (Stanley)

Signed-off-by: Ce Sun <cesun102@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 --
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  |  7 +---
 4 files changed, 14 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 92c2370831b3..2beaf30ccb96 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -877,7 +877,7 @@ size_t amdgpu_aca_get_bank_count(struct amdgpu_device *adev)
 
 void amdgpu_aca_clear_bank_count(struct amdgpu_device *adev)
 {
-	atomic64_set(&aca->bank_count, 0);
+	atomic64_set(&adev->aca.bank_count, 0);
 }
 #if defined(CONFIG_DEBUG_FS)
 static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 185b9e538f98..23f583492bfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3306,8 +3306,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
 	mutex_init(&ecc_log->lock);
 
 	INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
-	ecc_log->de_queried_count = 0;
-	ecc_log->prev_de_queried_count = 0;
 }
 
 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -3326,8 +3324,6 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
 	mutex_unlock(&ecc_log->lock);
 
 	mutex_destroy(&ecc_log->lock);
-	ecc_log->de_queried_count = 0;
-	ecc_log->prev_de_queried_count = 0;
 }
 #endif
 
@@ -3381,49 +3377,33 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
 				uint32_t poison_creation_count)
 {
 	int ret = 0;
-	struct ras_ecc_log_info *ecc_log;
 	struct ras_query_if info;
-	uint32_t timeout = 0;
+	uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-	uint64_t de_queried_count;
-	uint32_t new_detect_count, total_detect_count;
-	uint32_t need_query_count = poison_creation_count;
 	enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+	uint64_t prev_de_queried_count = 0;
+	uint64_t bank_count = 0;
 
 	memset(&info, 0, sizeof(info));
 	info.head.block = AMDGPU_RAS_BLOCK__UMC;
 
-	ecc_log = &ras->umc_ecc_log;
-	total_detect_count = 0;
 	do {
 		ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
 		if (ret)
 			return ret;
 
-		de_queried_count = ecc_log->de_queried_count;
-		if (de_queried_count > ecc_log->prev_de_queried_count) {
-			new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
-			ecc_log->prev_de_queried_count = de_queried_count;
-			timeout = 0;
+		bank_count = amdgpu_aca_get_bank_count(adev);
+		if (bank_count) {
+			prev_de_queried_count = bank_count;
+			amdgpu_aca_clear_bank_count(adev);
+			timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
 		} else {
-			new_detect_count = 0;
-		}
-
-		if (new_detect_count) {
-			total_detect_count += new_detect_count;
-		} else {
-			if (!timeout && need_query_count)
-				timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
-
-			if (timeout) {
-				if (!--timeout)
-					break;
-				msleep(1);
-			}
+			--timeout;
+			msleep(1);
 		}
-	} while (total_detect_count < need_query_count);
+	} while (timeout);
 
-	if (total_detect_count)
+	if (prev_de_queried_count)
 		schedule_delayed_work(&ras->page_retirement_dwork, 0);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7f10a7402160..df93791eb645 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -492,8 +492,6 @@ struct ras_ecc_err {
 struct ras_ecc_log_info {
 	struct mutex lock;
 	struct radix_tree_root de_page_tree;
-	uint64_t	de_queried_count;
-	uint64_t	prev_de_queried_count;
 };
 
 struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index e590cbdd8de9..b3bdcf70df2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -581,17 +581,12 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
 
 	ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
 	if (ret) {
-		if (ret == -EEXIST)
-			con->umc_ecc_log.de_queried_count++;
-		else
+		if (ret != -EEXIST)
 			dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
-
 		kfree(ecc_err);
 		return ret;
 	}
 
-	con->umc_ecc_log.de_queried_count++;
-
 	memset(page_pfn, 0, sizeof(page_pfn));
 	count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
 				pa_addr,
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* RE: [PATCH 3/3 v2] drm/amdgpu: Correct the loss of aca bank reg info
  2025-08-13 10:35 [PATCH 3/3 v2] drm/amdgpu: Correct the loss of aca bank reg info Ce Sun
@ 2025-08-14  3:05 ` Chai, Thomas
  0 siblings, 0 replies; 2+ messages in thread
From: Chai, Thomas @ 2025-08-14  3:05 UTC (permalink / raw)
  To: Sun, Ce(Overlord), amd-gfx@lists.freedesktop.org
  Cc: Zhou1, Tao, Yang, Stanley, Zhang, Hawking, Wang, Yang(Kevin)

[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: Sun, Ce(Overlord) <Ce.Sun@amd.com>
Sent: Wednesday, August 13, 2025 6:36 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Sun, Ce(Overlord) <Ce.Sun@amd.com>
Subject: [PATCH 3/3 v2] drm/amdgpu: Correct the loss of aca bank reg info

By polling, poll ACA bank count to ensure that valid ACA bank reg info can be obtained

v2: add corresponding delay before send msg to SMU to query mca bank info.
(Stanley)

Signed-off-by: Ce Sun <cesun102@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c |  2 +-  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++------------------  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 --  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  |  7 +---
 4 files changed, 14 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 92c2370831b3..2beaf30ccb96 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -877,7 +877,7 @@ size_t amdgpu_aca_get_bank_count(struct amdgpu_device *adev)

 void amdgpu_aca_clear_bank_count(struct amdgpu_device *adev)  {
-       atomic64_set(&aca->bank_count, 0);
+       atomic64_set(&adev->aca.bank_count, 0);
 }
 #if defined(CONFIG_DEBUG_FS)
 static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 185b9e538f98..23f583492bfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3306,8 +3306,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
        mutex_init(&ecc_log->lock);

        INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
 }

 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3326,8 +3324,6 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        mutex_unlock(&ecc_log->lock);

        mutex_destroy(&ecc_log->lock);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
 }
 #endif

@@ -3381,49 +3377,33 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
                                uint32_t poison_creation_count)
 {
        int ret = 0;
-       struct ras_ecc_log_info *ecc_log;
        struct ras_query_if info;
-       uint32_t timeout = 0;
+       uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-       uint64_t de_queried_count;
-       uint32_t new_detect_count, total_detect_count;
-       uint32_t need_query_count = poison_creation_count;
        enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+       uint64_t prev_de_queried_count = 0;
+       uint64_t bank_count = 0;

        memset(&info, 0, sizeof(info));
        info.head.block = AMDGPU_RAS_BLOCK__UMC;

-       ecc_log = &ras->umc_ecc_log;
-       total_detect_count = 0;
        do {
                ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
                if (ret)
                        return ret;

-               de_queried_count = ecc_log->de_queried_count;
-               if (de_queried_count > ecc_log->prev_de_queried_count) {
-                       new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
-                       ecc_log->prev_de_queried_count = de_queried_count;
-                       timeout = 0;
+               bank_count = amdgpu_aca_get_bank_count(adev);
+               if (bank_count) {
+                       prev_de_queried_count = bank_count;
+                       amdgpu_aca_clear_bank_count(adev);
+                       timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
                } else {
-                       new_detect_count = 0;
-               }
-
-               if (new_detect_count) {
-                       total_detect_count += new_detect_count;
-               } else {
-                       if (!timeout && need_query_count)
-                               timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
-
-                       if (timeout) {
-                               if (!--timeout)
-                                       break;
-                               msleep(1);
-                       }
+                       --timeout;
+                       msleep(1);
                }
-       } while (total_detect_count < need_query_count);
+       } while (timeout);

[Thomas] As discussed offline, this code can cause a system hang under stress testing.

-       if (total_detect_count)
+       if (prev_de_queried_count)
                schedule_delayed_work(&ras->page_retirement_dwork, 0);

        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7f10a7402160..df93791eb645 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -492,8 +492,6 @@ struct ras_ecc_err {  struct ras_ecc_log_info {
        struct mutex lock;
        struct radix_tree_root de_page_tree;
-       uint64_t        de_queried_count;
-       uint64_t        prev_de_queried_count;
 };

 struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index e590cbdd8de9..b3bdcf70df2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -581,17 +581,12 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,

        ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
        if (ret) {
-               if (ret == -EEXIST)
-                       con->umc_ecc_log.de_queried_count++;
-               else
+               if (ret != -EEXIST)
                        dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
-
                kfree(ecc_err);
                return ret;
        }

-       con->umc_ecc_log.de_queried_count++;
-
        memset(page_pfn, 0, sizeof(page_pfn));
        count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
                                pa_addr,
--
2.34.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2025-08-14  3:05 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2025-08-13 10:35 [PATCH 3/3 v2] drm/amdgpu: Correct the loss of aca bank reg info Ce Sun
2025-08-14  3:05 ` Chai, Thomas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).