* [PATCH 1/2] drm/amdgpu: add reset sources in gpu reset context
@ 2024-06-03 18:12 Eric Huang
2024-06-03 18:12 ` [PATCH 2/2] drm/amdkfd: add reset cause in gpu pre-reset smi event Eric Huang
0 siblings, 1 reply; 4+ messages in thread
From: Eric Huang @ 2024-06-03 18:12 UTC (permalink / raw)
To: amd-gfx; +Cc: Lijo.Lazar, Harish.Kasiviswanathan, Eric Huang
The reset source (i.e. the reset cause) is useful information to keep
in the reset context; it will be used by the events API.
Suggested-by: Lijo Lazar <Lijo.Lazar@amd.com>
Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 34 +++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 13 +++++++++
2 files changed, 47 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index bfdde772b7ee..f07f0fb9f827 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -160,3 +160,37 @@ void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
atomic_set(&reset_domain->in_gpu_reset, 0);
up_write(&reset_domain->sem);
}
+
+void amdgpu_reset_get_desc(struct amdgpu_reset_context *rst_ctxt, char *buf,
+ size_t len)
+{
+ struct amdgpu_ring *ring;
+
+ if (!buf || !len)
+ return;
+
+ switch (rst_ctxt->src) {
+ case AMDGPU_RESET_SRC_JOB:
+ if (rst_ctxt->job) {
+ ring = amdgpu_job_ring(rst_ctxt->job);
+ snprintf(buf, len, "job hang on ring:%s", ring->name);
+ } else {
+ strscpy(buf, "job hang", len);
+ }
+ break;
+ case AMDGPU_RESET_SRC_RAS:
+ strscpy(buf, "RAS error", len);
+ break;
+ case AMDGPU_RESET_SRC_MES:
+ strscpy(buf, "MES hang", len);
+ break;
+ case AMDGPU_RESET_SRC_HWS:
+ strscpy(buf, "HWS hang", len);
+ break;
+ case AMDGPU_RESET_SRC_USER:
+ strscpy(buf, "user trigger", len);
+ break;
+ default:
+ strscpy(buf, "unknown", len);
+ }
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 5a9cc043b858..9de8e4157a4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -36,6 +36,15 @@ enum AMDGPU_RESET_FLAGS {
AMDGPU_HOST_FLR = 3,
};
+enum AMDGPU_RESET_SRCS {
+ AMDGPU_RESET_SRC_UNKNOWN,
+ AMDGPU_RESET_SRC_JOB,
+ AMDGPU_RESET_SRC_RAS,
+ AMDGPU_RESET_SRC_MES,
+ AMDGPU_RESET_SRC_HWS,
+ AMDGPU_RESET_SRC_USER,
+};
+
struct amdgpu_reset_context {
enum amd_reset_method method;
struct amdgpu_device *reset_req_dev;
@@ -43,6 +52,7 @@ struct amdgpu_reset_context {
struct amdgpu_hive_info *hive;
struct list_head *reset_device_list;
unsigned long flags;
+ enum AMDGPU_RESET_SRCS src;
};
struct amdgpu_reset_handler {
@@ -130,6 +140,9 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+void amdgpu_reset_get_desc(struct amdgpu_reset_context *rst_ctxt, char *buf,
+ size_t len);
+
#define for_each_handler(i, handler, reset_ctl) \
for (i = 0; (i < AMDGPU_RESET_MAX_HANDLERS) && \
(handler = (*reset_ctl->reset_handlers)[i]); \
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/2] drm/amdkfd: add reset cause in gpu pre-reset smi event
2024-06-03 18:12 [PATCH 1/2] drm/amdgpu: add reset sources in gpu reset context Eric Huang
@ 2024-06-03 18:12 ` Eric Huang
2024-06-04 7:26 ` Lazar, Lijo
0 siblings, 1 reply; 4+ messages in thread
From: Eric Huang @ 2024-06-03 18:12 UTC (permalink / raw)
To: amd-gfx; +Cc: Lijo.Lazar, Harish.Kasiviswanathan, Eric Huang
reset cause is requested by customer as additional
info for gpu reset smi event.
v2: integrate reset sources as suggested by Lijo Lazar
Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 ++++---
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 16 ++++++++++++++--
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 5 ++++-
6 files changed, 33 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index e3738d417245..eb601b41d9d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
+ reset_context.src = adev->enable_mes ?
+ AMDGPU_RESET_SRC_MES :
+ AMDGPU_RESET_SRC_HWS;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1de021ebdd46..7e945a4790bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
};
struct amdgpu_device;
+struct amdgpu_reset_context;
enum kfd_mem_attachment_type {
KFD_MEM_ATT_SHARED, /* Share kgd_mem->bo or another attachment's */
@@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
-int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
+int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
+ struct amdgpu_reset_context *reset_context);
int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
@@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
void kgd2kfd_device_exit(struct kfd_dev *kfd);
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
-int kgd2kfd_pre_reset(struct kfd_dev *kfd);
+int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+ struct amdgpu_reset_context *reset_context);
int kgd2kfd_post_reset(struct kfd_dev *kfd);
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
@@ -459,7 +462,8 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
return 0;
}
-static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+ struct amdgpu_reset_context *reset_context)
{
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6711836054f9..4096cb3e937e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5775,7 +5775,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
- amdgpu_amdkfd_pre_reset(tmp_adev);
+ amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
/*
* Mark these ASICs to be reseted as untracked first
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index fba9b9a258a5..52be4e340fb1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -924,7 +924,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
kfree(kfd);
}
-int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+ struct amdgpu_reset_context *reset_context)
{
struct kfd_node *node;
int i;
@@ -934,7 +935,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
for (i = 0; i < kfd->num_nodes; i++) {
node = kfd->nodes[i];
- kfd_smi_event_update_gpu_reset(node, false);
+ kfd_smi_event_update_gpu_reset(node, false, reset_context);
node->dqm->ops.pre_reset(node->dqm);
}
@@ -974,7 +975,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
for (i = 0; i < kfd->num_nodes; i++) {
node = kfd->nodes[i];
atomic_set(&node->sram_ecc_flag, 0);
- kfd_smi_event_update_gpu_reset(node, true);
+ kfd_smi_event_update_gpu_reset(node, true, NULL);
}
return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 06ac835190f9..ea6a8e43bd5b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -29,6 +29,7 @@
#include "amdgpu_vm.h"
#include "kfd_priv.h"
#include "kfd_smi_events.h"
+#include "amdgpu_reset.h"
struct kfd_smi_client {
struct list_head list;
@@ -215,9 +216,11 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev,
add_event_to_kfifo(pid, dev, event, fifo_in, len);
}
-void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
+void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
+ struct amdgpu_reset_context *reset_context)
{
unsigned int event;
+ char reset_cause[64];
if (post_reset) {
event = KFD_SMI_EVENT_GPU_POST_RESET;
@@ -225,7 +228,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
event = KFD_SMI_EVENT_GPU_PRE_RESET;
++(dev->reset_seq_num);
}
- kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
+
+ memset(reset_cause, 0, sizeof(reset_cause));
+
+ if (reset_context)
+ amdgpu_reset_get_desc(reset_context, reset_cause,
+ sizeof(reset_cause));
+
+ kfd_smi_event_add(0, dev, event, "%x %s\n",
+ dev->reset_seq_num,
+ reset_cause);
}
void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index fa95c2dfd587..85010b8307f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -24,11 +24,14 @@
#ifndef KFD_SMI_EVENTS_H_INCLUDED
#define KFD_SMI_EVENTS_H_INCLUDED
+struct amdgpu_reset_context;
+
int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd);
void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid);
void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
uint64_t throttle_bitmask);
-void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset);
+void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
+ struct amdgpu_reset_context *reset_context);
void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
unsigned long address, bool write_fault,
ktime_t ts);
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH 2/2] drm/amdkfd: add reset cause in gpu pre-reset smi event
2024-06-03 18:12 ` [PATCH 2/2] drm/amdkfd: add reset cause in gpu pre-reset smi event Eric Huang
@ 2024-06-04 7:26 ` Lazar, Lijo
2024-06-04 13:55 ` Eric Huang
0 siblings, 1 reply; 4+ messages in thread
From: Lazar, Lijo @ 2024-06-04 7:26 UTC (permalink / raw)
To: Eric Huang, amd-gfx; +Cc: Harish.Kasiviswanathan
On 6/3/2024 11:42 PM, Eric Huang wrote:
> reset cause is requested by customer as additional
> info for gpu reset smi event.
>
> v2: integerate reset sources suggested by Lijo Lazar
>
> Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
This series is
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
I think SMI needs to get all reset cause descriptions. Are you planning
to fill reset source at other places also?
Thanks,
Lijo
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +++++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 ++++---
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 16 ++++++++++++++--
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 5 ++++-
> 6 files changed, 33 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index e3738d417245..eb601b41d9d5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
>
> reset_context.method = AMD_RESET_METHOD_NONE;
> reset_context.reset_req_dev = adev;
> + reset_context.src = adev->enable_mes ?
> + AMDGPU_RESET_SRC_MES :
> + AMDGPU_RESET_SRC_HWS;
> clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>
> amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 1de021ebdd46..7e945a4790bb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
> };
>
> struct amdgpu_device;
> +struct amdgpu_reset_context;
>
> enum kfd_mem_attachment_type {
> KFD_MEM_ATT_SHARED, /* Share kgd_mem->bo or another attachment's */
> @@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
>
> bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
>
> -int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
> + struct amdgpu_reset_context *reset_context);
>
> int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
>
> @@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
> void kgd2kfd_device_exit(struct kfd_dev *kfd);
> void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
> int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
> -int kgd2kfd_pre_reset(struct kfd_dev *kfd);
> +int kgd2kfd_pre_reset(struct kfd_dev *kfd,
> + struct amdgpu_reset_context *reset_context);
> int kgd2kfd_post_reset(struct kfd_dev *kfd);
> void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
> void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
> @@ -459,7 +462,8 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
> return 0;
> }
>
> -static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
> +static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd,
> + struct amdgpu_reset_context *reset_context)
> {
> return 0;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 6711836054f9..4096cb3e937e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5775,7 +5775,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>
> cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
>
> - amdgpu_amdkfd_pre_reset(tmp_adev);
> + amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
>
> /*
> * Mark these ASICs to be reseted as untracked first
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index fba9b9a258a5..52be4e340fb1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -924,7 +924,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
> kfree(kfd);
> }
>
> -int kgd2kfd_pre_reset(struct kfd_dev *kfd)
> +int kgd2kfd_pre_reset(struct kfd_dev *kfd,
> + struct amdgpu_reset_context *reset_context)
> {
> struct kfd_node *node;
> int i;
> @@ -934,7 +935,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>
> for (i = 0; i < kfd->num_nodes; i++) {
> node = kfd->nodes[i];
> - kfd_smi_event_update_gpu_reset(node, false);
> + kfd_smi_event_update_gpu_reset(node, false, reset_context);
> node->dqm->ops.pre_reset(node->dqm);
> }
>
> @@ -974,7 +975,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
> for (i = 0; i < kfd->num_nodes; i++) {
> node = kfd->nodes[i];
> atomic_set(&node->sram_ecc_flag, 0);
> - kfd_smi_event_update_gpu_reset(node, true);
> + kfd_smi_event_update_gpu_reset(node, true, NULL);
> }
>
> return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 06ac835190f9..ea6a8e43bd5b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -29,6 +29,7 @@
> #include "amdgpu_vm.h"
> #include "kfd_priv.h"
> #include "kfd_smi_events.h"
> +#include "amdgpu_reset.h"
>
> struct kfd_smi_client {
> struct list_head list;
> @@ -215,9 +216,11 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev,
> add_event_to_kfifo(pid, dev, event, fifo_in, len);
> }
>
> -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
> +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
> + struct amdgpu_reset_context *reset_context)
> {
> unsigned int event;
> + char reset_cause[64];
>
> if (post_reset) {
> event = KFD_SMI_EVENT_GPU_POST_RESET;
> @@ -225,7 +228,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
> event = KFD_SMI_EVENT_GPU_PRE_RESET;
> ++(dev->reset_seq_num);
> }
> - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
> +
> + memset(reset_cause, 0, sizeof(reset_cause));
> +
> + if (reset_context)
> + amdgpu_reset_get_desc(reset_context, reset_cause,
> + sizeof(reset_cause));
> +
> + kfd_smi_event_add(0, dev, event, "%x %s\n",
> + dev->reset_seq_num,
> + reset_cause);
> }
>
> void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index fa95c2dfd587..85010b8307f8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -24,11 +24,14 @@
> #ifndef KFD_SMI_EVENTS_H_INCLUDED
> #define KFD_SMI_EVENTS_H_INCLUDED
>
> +struct amdgpu_reset_context;
> +
> int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd);
> void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid);
> void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
> uint64_t throttle_bitmask);
> -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset);
> +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
> + struct amdgpu_reset_context *reset_context);
> void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
> unsigned long address, bool write_fault,
> ktime_t ts);
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 2/2] drm/amdkfd: add reset cause in gpu pre-reset smi event
2024-06-04 7:26 ` Lazar, Lijo
@ 2024-06-04 13:55 ` Eric Huang
0 siblings, 0 replies; 4+ messages in thread
From: Eric Huang @ 2024-06-04 13:55 UTC (permalink / raw)
To: Lazar, Lijo, amd-gfx; +Cc: Harish.Kasiviswanathan
Thanks for your review, Lijo. I will send a patch that fills in the reset
source in the other places.
Regards,
Eric
On 2024-06-04 03:26, Lazar, Lijo wrote:
>
> On 6/3/2024 11:42 PM, Eric Huang wrote:
>> reset cause is requested by customer as additional
>> info for gpu reset smi event.
>>
>> v2: integerate reset sources suggested by Lijo Lazar
>>
>> Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
> This series is
> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
>
> I think SMI needs to get all reset cause descriptions. Are you planning
> to fill reset source at other places also?
>
> Thanks,
> Lijo
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 +++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +++++++---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
>> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 ++++---
>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 16 ++++++++++++++--
>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 5 ++++-
>> 6 files changed, 33 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index e3738d417245..eb601b41d9d5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
>>
>> reset_context.method = AMD_RESET_METHOD_NONE;
>> reset_context.reset_req_dev = adev;
>> + reset_context.src = adev->enable_mes ?
>> + AMDGPU_RESET_SRC_MES :
>> + AMDGPU_RESET_SRC_HWS;
>> clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>>
>> amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 1de021ebdd46..7e945a4790bb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
>> };
>>
>> struct amdgpu_device;
>> +struct amdgpu_reset_context;
>>
>> enum kfd_mem_attachment_type {
>> KFD_MEM_ATT_SHARED, /* Share kgd_mem->bo or another attachment's */
>> @@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
>>
>> bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
>>
>> -int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
>> +int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
>> + struct amdgpu_reset_context *reset_context);
>>
>> int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
>>
>> @@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>> void kgd2kfd_device_exit(struct kfd_dev *kfd);
>> void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
>> int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
>> -int kgd2kfd_pre_reset(struct kfd_dev *kfd);
>> +int kgd2kfd_pre_reset(struct kfd_dev *kfd,
>> + struct amdgpu_reset_context *reset_context);
>> int kgd2kfd_post_reset(struct kfd_dev *kfd);
>> void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
>> void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
>> @@ -459,7 +462,8 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>> return 0;
>> }
>>
>> -static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>> +static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd,
>> + struct amdgpu_reset_context *reset_context)
>> {
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 6711836054f9..4096cb3e937e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5775,7 +5775,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>
>> cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
>>
>> - amdgpu_amdkfd_pre_reset(tmp_adev);
>> + amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
>>
>> /*
>> * Mark these ASICs to be reseted as untracked first
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index fba9b9a258a5..52be4e340fb1 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -924,7 +924,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>> kfree(kfd);
>> }
>>
>> -int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>> +int kgd2kfd_pre_reset(struct kfd_dev *kfd,
>> + struct amdgpu_reset_context *reset_context)
>> {
>> struct kfd_node *node;
>> int i;
>> @@ -934,7 +935,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>
>> for (i = 0; i < kfd->num_nodes; i++) {
>> node = kfd->nodes[i];
>> - kfd_smi_event_update_gpu_reset(node, false);
>> + kfd_smi_event_update_gpu_reset(node, false, reset_context);
>> node->dqm->ops.pre_reset(node->dqm);
>> }
>>
>> @@ -974,7 +975,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>> for (i = 0; i < kfd->num_nodes; i++) {
>> node = kfd->nodes[i];
>> atomic_set(&node->sram_ecc_flag, 0);
>> - kfd_smi_event_update_gpu_reset(node, true);
>> + kfd_smi_event_update_gpu_reset(node, true, NULL);
>> }
>>
>> return 0;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> index 06ac835190f9..ea6a8e43bd5b 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> @@ -29,6 +29,7 @@
>> #include "amdgpu_vm.h"
>> #include "kfd_priv.h"
>> #include "kfd_smi_events.h"
>> +#include "amdgpu_reset.h"
>>
>> struct kfd_smi_client {
>> struct list_head list;
>> @@ -215,9 +216,11 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev,
>> add_event_to_kfifo(pid, dev, event, fifo_in, len);
>> }
>>
>> -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
>> +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
>> + struct amdgpu_reset_context *reset_context)
>> {
>> unsigned int event;
>> + char reset_cause[64];
>>
>> if (post_reset) {
>> event = KFD_SMI_EVENT_GPU_POST_RESET;
>> @@ -225,7 +228,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
>> event = KFD_SMI_EVENT_GPU_PRE_RESET;
>> ++(dev->reset_seq_num);
>> }
>> - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
>> +
>> + memset(reset_cause, 0, sizeof(reset_cause));
>> +
>> + if (reset_context)
>> + amdgpu_reset_get_desc(reset_context, reset_cause,
>> + sizeof(reset_cause));
>> +
>> + kfd_smi_event_add(0, dev, event, "%x %s\n",
>> + dev->reset_seq_num,
>> + reset_cause);
>> }
>>
>> void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> index fa95c2dfd587..85010b8307f8 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> @@ -24,11 +24,14 @@
>> #ifndef KFD_SMI_EVENTS_H_INCLUDED
>> #define KFD_SMI_EVENTS_H_INCLUDED
>>
>> +struct amdgpu_reset_context;
>> +
>> int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd);
>> void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid);
>> void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
>> uint64_t throttle_bitmask);
>> -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset);
>> +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
>> + struct amdgpu_reset_context *reset_context);
>> void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
>> unsigned long address, bool write_fault,
>> ktime_t ts);
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2024-06-04 13:55 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-06-03 18:12 [PATCH 1/2] drm/amdgpu: add reset sources in gpu reset context Eric Huang
2024-06-03 18:12 ` [PATCH 2/2] drm/amdkfd: add reset cause in gpu pre-reset smi event Eric Huang
2024-06-04 7:26 ` Lazar, Lijo
2024-06-04 13:55 ` Eric Huang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox