* [PATCH 1/5] drm/amdgpu: add condition check for waking up thread
@ 2024-06-13 2:25 YiPeng Chai
2024-06-13 2:25 ` [PATCH 2/5] drm/amdgpu: add threshold to interrupt waiting for DE data to be ready YiPeng Chai
` (4 more replies)
0 siblings, 5 replies; 8+ messages in thread
From: YiPeng Chai @ 2024-06-13 2:25 UTC (permalink / raw)
To: amd-gfx
Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
Stanley.Yang, YiPeng Chai
1. Do not add messages to the fifo while the GPU is in reset
or RAS recovery.
2. Wake up the thread only after the message has been
successfully saved to the fifo.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++++------
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 18 +++++++++++-------
2 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d0dcd3d37e6d..ed260966363f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2093,12 +2093,16 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
- amdgpu_ras_put_poison_req(obj->adev,
- AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
-
- atomic_inc(&con->page_retirement_req_cnt);
-
- wake_up(&con->page_retirement_wq);
+ if (!amdgpu_in_reset(obj->adev) && !atomic_read(&con->in_recovery)) {
+ int ret;
+
+ ret = amdgpu_ras_put_poison_req(obj->adev,
+ AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
+ if (!ret) {
+ atomic_inc(&con->page_retirement_req_cnt);
+ wake_up(&con->page_retirement_wq);
+ }
+ }
}
#endif
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1dbe69eabb9a..94181ae85886 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -293,16 +293,20 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
amdgpu_ras_error_data_fini(&err_data);
} else {
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
#ifdef HAVE_KFIFO_PUT_NON_POINTER
- amdgpu_ras_put_poison_req(adev,
- block, pasid, pasid_fn, data, reset);
-#endif
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- atomic_inc(&con->page_retirement_req_cnt);
+ if (!amdgpu_in_reset(adev) && !atomic_read(&con->in_recovery)) {
+ int ret;
- wake_up(&con->page_retirement_wq);
+ ret = amdgpu_ras_put_poison_req(adev,
+ block, pasid, pasid_fn, data, reset);
+ if (!ret) {
+ atomic_inc(&con->page_retirement_req_cnt);
+ wake_up(&con->page_retirement_wq);
+ }
+ }
+#endif
}
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
--
2.34.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 2/5] drm/amdgpu: add threshold to interrupt waiting for DE data to be ready
2024-06-13 2:25 [PATCH 1/5] drm/amdgpu: add condition check for waking up thread YiPeng Chai
@ 2024-06-13 2:25 ` YiPeng Chai
2024-06-13 2:25 ` [PATCH 3/5] drm/amdgpu: clear all messages reset flags in fifo before gpu reset YiPeng Chai
` (3 subsequent siblings)
4 siblings, 0 replies; 8+ messages in thread
From: YiPeng Chai @ 2024-06-13 2:25 UTC (permalink / raw)
To: amd-gfx
Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
Stanley.Yang, YiPeng Chai
If the number of messages pending in the fifo exceeds the
threshold, stop waiting for the DE data to become ready and
continue processing.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 +++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +++-
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ed260966363f..1e6e06009577 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
+#define RAS_POISON_FIFO_MSG_PENDING_THRESHOLD (AMDGPU_RAS_POISON_FIFO_SIZE/4)
+
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2832,6 +2834,7 @@ static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
struct ras_query_if info;
uint32_t timeout = timeout_ms;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ bool trigger_threshold = false;
memset(&info, 0, sizeof(info));
info.head.block = AMDGPU_RAS_BLOCK__UMC;
@@ -2845,6 +2848,12 @@ static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
return;
}
+ if (atomic_read(&ras->page_retirement_req_cnt) >
+ RAS_POISON_FIFO_MSG_PENDING_THRESHOLD) {
+ trigger_threshold = true;
+ break;
+ }
+
if (timeout && !ecc_log->de_updated) {
msleep(1);
timeout--;
@@ -2856,6 +2865,10 @@ static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
return;
}
+ if (trigger_threshold)
+ dev_dbg(adev->dev, "Waiting for deferred data %d ms, pending msg:%d\n",
+ timeout_ms - timeout, atomic_read(&ras->page_retirement_req_cnt));
+
if (!ret)
schedule_delayed_work(&ras->page_retirement_dwork, 0);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index e70c45712ddb..103436bb650e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -36,6 +36,8 @@
struct amdgpu_iv_entry;
+#define AMDGPU_RAS_POISON_FIFO_SIZE 128
+
#define AMDGPU_RAS_GPU_ERR_MEM_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 0, 0)
#define AMDGPU_RAS_GPU_ERR_FW_LOAD(x) AMDGPU_GET_REG_FIELD(x, 1, 1)
#define AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 2, 2)
@@ -532,7 +534,7 @@ struct amdgpu_ras {
struct mutex page_retirement_lock;
atomic_t page_retirement_req_cnt;
struct mutex page_rsv_lock;
- DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
+ DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, AMDGPU_RAS_POISON_FIFO_SIZE);
struct ras_ecc_log_info umc_ecc_log;
struct delayed_work page_retirement_dwork;
--
2.34.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 3/5] drm/amdgpu: clear all messages reset flags in fifo before gpu reset
2024-06-13 2:25 [PATCH 1/5] drm/amdgpu: add condition check for waking up thread YiPeng Chai
2024-06-13 2:25 ` [PATCH 2/5] drm/amdgpu: add threshold to interrupt waiting for DE data to be ready YiPeng Chai
@ 2024-06-13 2:25 ` YiPeng Chai
2024-06-13 2:25 ` [PATCH 4/5] drm/amdgpu: wait for gpu to complete reset YiPeng Chai
` (2 subsequent siblings)
4 siblings, 0 replies; 8+ messages in thread
From: YiPeng Chai @ 2024-06-13 2:25 UTC (permalink / raw)
To: amd-gfx
Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
Stanley.Yang, YiPeng Chai
To avoid resetting the gpu repeatedly, clear all
message reset flags in the fifo before the first
gpu reset.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++++++++++-
1 file changed, 58 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1e6e06009577..7dfb2e548d70 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2768,6 +2768,49 @@ static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
return kfifo_get(&con->poison_fifo, poison_msg);
}
+
+static void amdgpu_ras_clear_poison_fifo_msg_reset_flag(struct amdgpu_device *adev,
+ uint32_t *cached_reset)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_poison_msg msg;
+ uint32_t cached_msg_count;
+ uint32_t reset = 0;
+ int i, ret;
+
+ cached_msg_count = kfifo_len(&con->poison_fifo);
+
+ for (i = 0; i < cached_msg_count; i++) {
+ ret = kfifo_get(&con->poison_fifo, &msg);
+ if (!ret)
+ continue;
+
+ if (msg.block != AMDGPU_RAS_BLOCK__UMC) {
+ reset |= msg.reset;
+
+ /* Clear reset flag */
+ msg.reset = 0;
+ }
+
+ /* add message back to fifo */
+ ret = kfifo_put(&con->poison_fifo, msg);
+ if (!ret)
+ dev_info(adev->dev, "Poison fifo drop message!\n");
+ }
+ *cached_reset = reset;
+}
+
+static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_poison_msg msg;
+ int ret;
+
+ do {
+ ret = kfifo_get(&con->poison_fifo, &msg);
+ } while (ret);
+
+}
#endif
#ifdef HAVE_RADIX_TREE_ITER_DELETE
@@ -2886,9 +2929,23 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
poison_msg->pasid_fn(adev, pasid, poison_msg->data);
if (reset) {
+ uint32_t fifo_cached_reset = 0;
+
flush_delayed_work(&con->page_retirement_dwork);
- con->gpu_reset_flags |= reset;
+ amdgpu_ras_clear_poison_fifo_msg_reset_flag(adev, &fifo_cached_reset);
+
+ reset |= fifo_cached_reset;
+
+ if (reset & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
+ con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+ amdgpu_ras_clear_poison_fifo(adev);
+ } else if (reset & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+ con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ } else {
+ con->gpu_reset_flags |= reset;
+ }
+
amdgpu_ras_reset_gpu(adev);
}
--
2.34.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 4/5] drm/amdgpu: wait for gpu to complete reset
2024-06-13 2:25 [PATCH 1/5] drm/amdgpu: add condition check for waking up thread YiPeng Chai
2024-06-13 2:25 ` [PATCH 2/5] drm/amdgpu: add threshold to interrupt waiting for DE data to be ready YiPeng Chai
2024-06-13 2:25 ` [PATCH 3/5] drm/amdgpu: clear all messages reset flags in fifo before gpu reset YiPeng Chai
@ 2024-06-13 2:25 ` YiPeng Chai
2024-06-13 10:19 ` Christian König
2024-06-13 2:25 ` [PATCH 5/5] drm/amdgpu: add gpu reset check before page retirement thread runs YiPeng Chai
2024-06-13 8:11 ` [PATCH 1/5] drm/amdgpu: add condition check for waking up thread Lazar, Lijo
4 siblings, 1 reply; 8+ messages in thread
From: YiPeng Chai @ 2024-06-13 2:25 UTC (permalink / raw)
To: amd-gfx
Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
Stanley.Yang, YiPeng Chai
Add a completion so that the poison consumption handler can wait
for the GPU reset to complete.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 ++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
2 files changed, 13 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7dfb2e548d70..341c9bd0d1a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
+#define MAX_GPU_RESET_COMPLETION_TIME 120000 //ms
+
#define RAS_POISON_FIFO_MSG_PENDING_THRESHOLD (AMDGPU_RAS_POISON_FIFO_SIZE/4)
enum amdgpu_ras_retire_page_reservation {
@@ -2526,6 +2528,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
atomic_set(&hive->ras_recovery, 0);
amdgpu_put_xgmi_hive(hive);
}
+
+ complete(&ras->gpu_reset_completion);
}
/* alloc/realloc bps array */
@@ -2946,7 +2950,14 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
con->gpu_reset_flags |= reset;
}
+ reinit_completion(&con->gpu_reset_completion);
+
amdgpu_ras_reset_gpu(adev);
+
+ if (!wait_for_completion_timeout(&con->gpu_reset_completion,
+ msecs_to_jiffies(MAX_GPU_RESET_COMPLETION_TIME)))
+ dev_err(adev->dev, "Waiting for GPU to complete reset timeout! reset:0x%x\n",
+ reset);
}
return 0;
@@ -3072,6 +3083,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
}
}
+ init_completion(&con->gpu_reset_completion);
mutex_init(&con->page_rsv_lock);
INIT_KFIFO(con->poison_fifo);
mutex_init(&con->page_retirement_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 103436bb650e..d5ddd0ca5de1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -537,6 +537,7 @@ struct amdgpu_ras {
DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, AMDGPU_RAS_POISON_FIFO_SIZE);
struct ras_ecc_log_info umc_ecc_log;
struct delayed_work page_retirement_dwork;
+ struct completion gpu_reset_completion;
/* Fatal error detected flag */
atomic_t fed;
--
2.34.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 5/5] drm/amdgpu: add gpu reset check before page retirement thread runs
2024-06-13 2:25 [PATCH 1/5] drm/amdgpu: add condition check for waking up thread YiPeng Chai
` (2 preceding siblings ...)
2024-06-13 2:25 ` [PATCH 4/5] drm/amdgpu: wait for gpu to complete reset YiPeng Chai
@ 2024-06-13 2:25 ` YiPeng Chai
2024-06-13 10:21 ` Christian König
2024-06-13 8:11 ` [PATCH 1/5] drm/amdgpu: add condition check for waking up thread Lazar, Lijo
4 siblings, 1 reply; 8+ messages in thread
From: YiPeng Chai @ 2024-06-13 2:25 UTC (permalink / raw)
To: amd-gfx
Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
Stanley.Yang, YiPeng Chai
If gpu is recovering, clear all message reset flags
in fifo and wait for gpu to complete recovery.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 341c9bd0d1a4..bf4f8d439ebe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2982,6 +2982,18 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_dec(&con->page_retirement_req_cnt);
+ reinit_completion(&con->gpu_reset_completion);
+
+ if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery)) {
+ uint32_t reset;
+
+ amdgpu_ras_clear_poison_fifo_msg_reset_flag(adev, &reset);
+
+ if (!wait_for_completion_timeout(&con->gpu_reset_completion,
+ msecs_to_jiffies(MAX_GPU_RESET_COMPLETION_TIME)))
+ dev_err(adev->dev, "Waiting for GPU to complete reset timeout!\n");
+ }
+
#ifdef HAVE_KFIFO_PUT_NON_POINTER
if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
continue;
--
2.34.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 1/5] drm/amdgpu: add condition check for waking up thread
2024-06-13 2:25 [PATCH 1/5] drm/amdgpu: add condition check for waking up thread YiPeng Chai
` (3 preceding siblings ...)
2024-06-13 2:25 ` [PATCH 5/5] drm/amdgpu: add gpu reset check before page retirement thread runs YiPeng Chai
@ 2024-06-13 8:11 ` Lazar, Lijo
4 siblings, 0 replies; 8+ messages in thread
From: Lazar, Lijo @ 2024-06-13 8:11 UTC (permalink / raw)
To: YiPeng Chai, amd-gfx
Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
Stanley.Yang
On 6/13/2024 7:55 AM, YiPeng Chai wrote:
> 1. Cannot add messages to fifo in gpu reset mode.
> 2. Only when the message is successfully saved to the
> fifo, the thread can be awakened.
>
I think the fifo should still cache the poison requests while in reset.
The page retirement thread may try to acquire the read side of the reset
lock and wait if any reset is in progress.
Thanks
Lijo
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++++------
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 18 +++++++++++-------
> 2 files changed, 21 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d0dcd3d37e6d..ed260966363f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2093,12 +2093,16 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
> if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
> struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
>
> - amdgpu_ras_put_poison_req(obj->adev,
> - AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
> -
> - atomic_inc(&con->page_retirement_req_cnt);
> -
> - wake_up(&con->page_retirement_wq);
> + if (!amdgpu_in_reset(obj->adev) && !atomic_read(&con->in_recovery)) {
> + int ret;
> +
> + ret = amdgpu_ras_put_poison_req(obj->adev,
> + AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
> + if (!ret) {
> + atomic_inc(&con->page_retirement_req_cnt);
> + wake_up(&con->page_retirement_wq);
> + }
> + }
> }
> #endif
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 1dbe69eabb9a..94181ae85886 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -293,16 +293,20 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
>
> amdgpu_ras_error_data_fini(&err_data);
> } else {
> - struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> -
> #ifdef HAVE_KFIFO_PUT_NON_POINTER
> - amdgpu_ras_put_poison_req(adev,
> - block, pasid, pasid_fn, data, reset);
> -#endif
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>
> - atomic_inc(&con->page_retirement_req_cnt);
> + if (!amdgpu_in_reset(adev) && !atomic_read(&con->in_recovery)) {
> + int ret;
>
> - wake_up(&con->page_retirement_wq);
> + ret = amdgpu_ras_put_poison_req(adev,
> + block, pasid, pasid_fn, data, reset);
> + if (!ret) {
> + atomic_inc(&con->page_retirement_req_cnt);
> + wake_up(&con->page_retirement_wq);
> + }
> + }
> +#endif
> }
> } else {
> if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 4/5] drm/amdgpu: wait for gpu to complete reset
2024-06-13 2:25 ` [PATCH 4/5] drm/amdgpu: wait for gpu to complete reset YiPeng Chai
@ 2024-06-13 10:19 ` Christian König
0 siblings, 0 replies; 8+ messages in thread
From: Christian König @ 2024-06-13 10:19 UTC (permalink / raw)
To: YiPeng Chai, amd-gfx
Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
Stanley.Yang
Am 13.06.24 um 04:25 schrieb YiPeng Chai:
> Add completion to wait for gpu to complete reset.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 ++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
> 2 files changed, 13 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 7dfb2e548d70..341c9bd0d1a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
>
> #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
>
> +#define MAX_GPU_RESET_COMPLETION_TIME 120000 //ms
> +
> #define RAS_POISON_FIFO_MSG_PENDING_THRESHOLD (AMDGPU_RAS_POISON_FIFO_SIZE/4)
>
> enum amdgpu_ras_retire_page_reservation {
> @@ -2526,6 +2528,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
> atomic_set(&hive->ras_recovery, 0);
> amdgpu_put_xgmi_hive(hive);
> }
> +
> + complete(&ras->gpu_reset_completion);
> }
>
> /* alloc/realloc bps array */
> @@ -2946,7 +2950,14 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
> con->gpu_reset_flags |= reset;
> }
>
> + reinit_completion(&con->gpu_reset_completion);
> +
> amdgpu_ras_reset_gpu(adev);
> +
> + if (!wait_for_completion_timeout(&con->gpu_reset_completion,
> + msecs_to_jiffies(MAX_GPU_RESET_COMPLETION_TIME)))
> + dev_err(adev->dev, "Waiting for GPU to complete reset timeout! reset:0x%x\n",
> + reset);
Are there any locks taken here which the GPU reset might need as well?
Regards,
Christian.
> }
>
> return 0;
> @@ -3072,6 +3083,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> }
> }
>
> + init_completion(&con->gpu_reset_completion);
> mutex_init(&con->page_rsv_lock);
> INIT_KFIFO(con->poison_fifo);
> mutex_init(&con->page_retirement_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 103436bb650e..d5ddd0ca5de1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -537,6 +537,7 @@ struct amdgpu_ras {
> DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, AMDGPU_RAS_POISON_FIFO_SIZE);
> struct ras_ecc_log_info umc_ecc_log;
> struct delayed_work page_retirement_dwork;
> + struct completion gpu_reset_completion;
>
> /* Fatal error detected flag */
> atomic_t fed;
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 5/5] drm/amdgpu: add gpu reset check before page retirement thread runs
2024-06-13 2:25 ` [PATCH 5/5] drm/amdgpu: add gpu reset check before page retirement thread runs YiPeng Chai
@ 2024-06-13 10:21 ` Christian König
0 siblings, 0 replies; 8+ messages in thread
From: Christian König @ 2024-06-13 10:21 UTC (permalink / raw)
To: YiPeng Chai, amd-gfx
Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
Stanley.Yang
Am 13.06.24 um 04:25 schrieb YiPeng Chai:
> If gpu is recovering, clear all message reset flags
> in fifo and wait for gpu to complete recovery.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 ++++++++++++
> 1 file changed, 12 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 341c9bd0d1a4..bf4f8d439ebe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2982,6 +2982,18 @@ static int amdgpu_ras_page_retirement_thread(void *param)
>
> atomic_dec(&con->page_retirement_req_cnt);
>
> + reinit_completion(&con->gpu_reset_completion);
> +
> + if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery)) {
It's illegal to call amdgpu_in_reset() from outside of the hw-specific
backends.
When you want to make the code mutually exclusive with GPU resets, you need
to grab the reset lock.
Regards,
Christian.
> + uint32_t reset;
> +
> + amdgpu_ras_clear_poison_fifo_msg_reset_flag(adev, &reset);
> +
> + if (!wait_for_completion_timeout(&con->gpu_reset_completion,
> + msecs_to_jiffies(MAX_GPU_RESET_COMPLETION_TIME)))
> + dev_err(adev->dev, "Waiting for GPU to complete reset timeout!\n");
> + }
> +
> #ifdef HAVE_KFIFO_PUT_NON_POINTER
> if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
> continue;
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2024-06-13 10:21 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2024-06-13 2:25 [PATCH 1/5] drm/amdgpu: add condition check for waking up thread YiPeng Chai
2024-06-13 2:25 ` [PATCH 2/5] drm/amdgpu: add threshold to interrupt waiting for DE data to be ready YiPeng Chai
2024-06-13 2:25 ` [PATCH 3/5] drm/amdgpu: clear all messages reset flags in fifo before gpu reset YiPeng Chai
2024-06-13 2:25 ` [PATCH 4/5] drm/amdgpu: wait for gpu to complete reset YiPeng Chai
2024-06-13 10:19 ` Christian König
2024-06-13 2:25 ` [PATCH 5/5] drm/amdgpu: add gpu reset check before page retirement thread runs YiPeng Chai
2024-06-13 10:21 ` Christian König
2024-06-13 8:11 ` [PATCH 1/5] drm/amdgpu: add condition check for waking up thread Lazar, Lijo
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox