AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2]  Improve the dev coredump
@ 2024-08-19  9:53 Trigger.Huang
  2024-08-19  9:53 ` [PATCH 1/2] drm/amdgpu: skip printing vram_lost if needed Trigger.Huang
  2024-08-19  9:53 ` [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo Trigger.Huang
  0 siblings, 2 replies; 12+ messages in thread
From: Trigger.Huang @ 2024-08-19  9:53 UTC (permalink / raw)
  To: amd-gfx; +Cc: sunil.khatri, alexander.deucher, Trigger Huang

From: Trigger Huang <Trigger.Huang@amd.com>

The current dev coredump implementation sometimes cannot fully satisfy customers' requirements due to:
1. dev coredump is called in the GPU reset function, so if GPU reset is disabled, the dev coredump is also disabled
2. When a job timeout happens, the GPU status dump happens only after a lot of operations, like soft_reset. The concern here is that the dumped status is then not so close to the GPU's real error status

The new solution will unconditionally call dev coredump immediately after a job timeout to get a closer representation of GPU's error status

Trigger Huang (2):
  drm/amdgpu: skip printing vram_lost if needed
  drm/amdgpu: Do core dump immediately when job tmo

 .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c  | 20 +++---
 .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h  |  7 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 62 +++++++++++++++++++
 4 files changed, 77 insertions(+), 14 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/2] drm/amdgpu: skip printing vram_lost if needed
  2024-08-19  9:53 [PATCH 0/2] Improve the dev coredump Trigger.Huang
@ 2024-08-19  9:53 ` Trigger.Huang
  2024-08-19  9:53 ` [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo Trigger.Huang
  1 sibling, 0 replies; 12+ messages in thread
From: Trigger.Huang @ 2024-08-19  9:53 UTC (permalink / raw)
  To: amd-gfx; +Cc: sunil.khatri, alexander.deucher, Trigger Huang

From: Trigger Huang <Trigger.Huang@amd.com>

The vram lost status can only be obtained after a GPU reset occurs, but
sometimes a dev core dump can happen before the GPU reset. So a new
argument is added to tell the dev core dump implementation whether to
skip printing the vram_lost status in the dump.
And this patch is also trying to decouple the core dump function from
the GPU reset function, by replacing the argument amdgpu_reset_context
with amdgpu_job to specify the context for core dump.

V2: Inform user if VRAM lost check is skipped so users don't assume
VRAM wasn't lost (Alex)

Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c  | 20 ++++++++++---------
 .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h  |  7 +++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  2 +-
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
index cf2b4dd4d865..5ac59b62020c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
@@ -28,8 +28,8 @@
 #include "atom.h"
 
 #ifndef CONFIG_DEV_COREDUMP
-void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
-		     struct amdgpu_reset_context *reset_context)
+void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
+		     bool vram_lost, struct amdgpu_job *job)
 {
 }
 #else
@@ -315,7 +315,9 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
 		}
 	}
 
-	if (coredump->reset_vram_lost)
+	if (coredump->skip_vram_check)
+		drm_printf(&p, "VRAM lost check is skipped!\n");
+	else if (coredump->reset_vram_lost)
 		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
 
 	return count - iter.remain;
@@ -326,12 +328,11 @@ static void amdgpu_devcoredump_free(void *data)
 	kfree(data);
 }
 
-void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
-		     struct amdgpu_reset_context *reset_context)
+void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
+		     bool vram_lost, struct amdgpu_job *job)
 {
-	struct amdgpu_coredump_info *coredump;
 	struct drm_device *dev = adev_to_drm(adev);
-	struct amdgpu_job *job = reset_context->job;
+	struct amdgpu_coredump_info *coredump;
 	struct drm_sched_job *s_job;
 
 	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
@@ -341,11 +342,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
 		return;
 	}
 
+	coredump->skip_vram_check = skip_vram_check;
 	coredump->reset_vram_lost = vram_lost;
 
-	if (reset_context->job && reset_context->job->vm) {
+	if (job && job->vm) {
+		struct amdgpu_vm *vm = job->vm;
 		struct amdgpu_task_info *ti;
-		struct amdgpu_vm *vm = reset_context->job->vm;
 
 		ti = amdgpu_vm_get_task_info_vm(vm);
 		if (ti) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
index 52459512cb2b..ef9772c6bcc9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
@@ -26,7 +26,6 @@
 #define __AMDGPU_DEV_COREDUMP_H__
 
 #include "amdgpu.h"
-#include "amdgpu_reset.h"
 
 #ifdef CONFIG_DEV_COREDUMP
 
@@ -36,12 +35,12 @@ struct amdgpu_coredump_info {
 	struct amdgpu_device            *adev;
 	struct amdgpu_task_info         reset_task_info;
 	struct timespec64               reset_time;
+	bool                            skip_vram_check;
 	bool                            reset_vram_lost;
 	struct amdgpu_ring              *ring;
 };
 #endif
 
-void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
-		     struct amdgpu_reset_context *reset_context);
-
+void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
+		     bool vram_lost, struct amdgpu_job *job);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ad97f03f1358..59a443abc11e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5468,7 +5468,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
 
 				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
-					amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
 
 				if (vram_lost) {
 					DRM_INFO("VRAM is lost due to GPU reset!\n");
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-19  9:53 [PATCH 0/2] Improve the dev coredump Trigger.Huang
  2024-08-19  9:53 ` [PATCH 1/2] drm/amdgpu: skip printing vram_lost if needed Trigger.Huang
@ 2024-08-19  9:53 ` Trigger.Huang
  2024-08-19 10:30   ` Khatri, Sunil
  1 sibling, 1 reply; 12+ messages in thread
From: Trigger.Huang @ 2024-08-19  9:53 UTC (permalink / raw)
  To: amd-gfx; +Cc: sunil.khatri, alexander.deucher, Trigger Huang

From: Trigger Huang <Trigger.Huang@amd.com>

Do the coredump immediately after a job timeout to get a closer
representation of GPU's error status.

V2: This will skip printing vram_lost as the GPU reset has not
happened yet (Alex)

V3: Unconditionally call the core dump as we care about all the reset
functions(soft-recovery and queue reset and full adapter reset, Alex)

Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62 +++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index c6a1783fc9ef..ebbb1434073e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -30,6 +30,61 @@
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_dev_coredump.h"
+#include "amdgpu_xgmi.h"
+
+static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
+				    struct amdgpu_job *job)
+{
+	int i;
+
+	dev_info(adev->dev, "Dumping IP State\n");
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (adev->ip_blocks[i].version->funcs->dump_ip_state)
+			adev->ip_blocks[i].version->funcs
+				->dump_ip_state((void *)adev);
+		dev_info(adev->dev, "Dumping IP State Completed\n");
+	}
+
+	amdgpu_coredump(adev, true, false, job);
+}
+
+static void amdgpu_job_core_dump(struct amdgpu_device *adev,
+				 struct amdgpu_job *job)
+{
+	struct list_head device_list, *device_list_handle =  NULL;
+	struct amdgpu_device *tmp_adev = NULL;
+	struct amdgpu_hive_info *hive = NULL;
+
+	if (!amdgpu_sriov_vf(adev))
+		hive = amdgpu_get_xgmi_hive(adev);
+	if (hive)
+		mutex_lock(&hive->hive_lock);
+	/*
+	 * Reuse the logic in amdgpu_device_gpu_recover() to build list of
+	 * devices for code dump
+	 */
+	INIT_LIST_HEAD(&device_list);
+	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+			list_add_tail(&tmp_adev->reset_list, &device_list);
+		if (!list_is_first(&adev->reset_list, &device_list))
+			list_rotate_to_front(&adev->reset_list, &device_list);
+		device_list_handle = &device_list;
+	} else {
+		list_add_tail(&adev->reset_list, &device_list);
+		device_list_handle = &device_list;
+	}
+
+	/* Do the coredump for each device */
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+		amdgpu_job_do_core_dump(tmp_adev, job);
+
+	if (hive) {
+		mutex_unlock(&hive->hive_lock);
+		amdgpu_put_xgmi_hive(hive);
+	}
+}
 
 static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 {
@@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		return DRM_GPU_SCHED_STAT_ENODEV;
 	}
 
+	amdgpu_job_core_dump(adev, job);
 
 	adev->job_hang = true;
 
@@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		reset_context.src = AMDGPU_RESET_SRC_JOB;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
+		/*
+		 * To avoid an unnecessary extra coredump, as we have already
+		 * got the very close representation of GPU's error status
+		 */
+		set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
 		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
 		if (r)
 			dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-19  9:53 ` [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo Trigger.Huang
@ 2024-08-19 10:30   ` Khatri, Sunil
  2024-08-20  7:30     ` Huang, Trigger
  0 siblings, 1 reply; 12+ messages in thread
From: Khatri, Sunil @ 2024-08-19 10:30 UTC (permalink / raw)
  To: Trigger.Huang, amd-gfx; +Cc: alexander.deucher


On 8/19/2024 3:23 PM, Trigger.Huang@amd.com wrote:
> From: Trigger Huang <Trigger.Huang@amd.com>
>
> Do the coredump immediately after a job timeout to get a closer
> representation of GPU's error status.
>
> V2: This will skip printing vram_lost as the GPU reset is not
> happened yet (Alex)
>
> V3: Unconditionally call the core dump as we care about all the reset
> functions(soft-recovery and queue reset and full adapter reset, Alex)
>
> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62 +++++++++++++++++++++++++
>   1 file changed, 62 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index c6a1783fc9ef..ebbb1434073e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -30,6 +30,61 @@
>   #include "amdgpu.h"
>   #include "amdgpu_trace.h"
>   #include "amdgpu_reset.h"
> +#include "amdgpu_dev_coredump.h"
> +#include "amdgpu_xgmi.h"
> +
> +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
> +				    struct amdgpu_job *job)
> +{
> +	int i;
> +
> +	dev_info(adev->dev, "Dumping IP State\n");
> +	for (i = 0; i < adev->num_ip_blocks; i++) {
> +		if (adev->ip_blocks[i].version->funcs->dump_ip_state)
> +			adev->ip_blocks[i].version->funcs
> +				->dump_ip_state((void *)adev);
> +		dev_info(adev->dev, "Dumping IP State Completed\n");
> +	}
> +
> +	amdgpu_coredump(adev, true, false, job);
> +}
> +
> +static void amdgpu_job_core_dump(struct amdgpu_device *adev,
> +				 struct amdgpu_job *job)
> +{
> +	struct list_head device_list, *device_list_handle =  NULL;
> +	struct amdgpu_device *tmp_adev = NULL;
> +	struct amdgpu_hive_info *hive = NULL;
> +
> +	if (!amdgpu_sriov_vf(adev))
> +		hive = amdgpu_get_xgmi_hive(adev);
> +	if (hive)
> +		mutex_lock(&hive->hive_lock);
> +	/*
> +	 * Reuse the logic in amdgpu_device_gpu_recover() to build list of
> +	 * devices for code dump
> +	 */
> +	INIT_LIST_HEAD(&device_list);
> +	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
> +		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
> +			list_add_tail(&tmp_adev->reset_list, &device_list);
> +		if (!list_is_first(&adev->reset_list, &device_list))
> +			list_rotate_to_front(&adev->reset_list, &device_list);
> +		device_list_handle = &device_list;
> +	} else {
> +		list_add_tail(&adev->reset_list, &device_list);
> +		device_list_handle = &device_list;
> +	}
> +
> +	/* Do the coredump for each device */
> +	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
> +		amdgpu_job_do_core_dump(tmp_adev, job);
> +
> +	if (hive) {
> +		mutex_unlock(&hive->hive_lock);
> +		amdgpu_put_xgmi_hive(hive);
> +	}
> +}
>   
>   static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   {
> @@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		return DRM_GPU_SCHED_STAT_ENODEV;
>   	}
>   
> +	amdgpu_job_core_dump(adev, job);
The philosophy of hang and recovery is to let the HW and software try to 
recover. Here we try to do a soft recovery first, and I think we should 
wait for soft recovery, and if it fails, then we do the dump — and that's 
exactly what we are doing here.

Also we need to make sure that the tasks which are already in the queue are 
put on hold and their sync points are signalled before we dump. 
Check once what all steps are taken before we dump in the current 
implementation.

Regards

Sunil khatri

>   
>   	adev->job_hang = true;
>   
> @@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		reset_context.src = AMDGPU_RESET_SRC_JOB;
>   		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>   
> +		/*
> +		 * To avoid an unnecessary extra coredump, as we have already
> +		 * got the very close representation of GPU's error status
> +		 */
> +		set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
> +
>   		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
>   		if (r)
>   			dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-19 10:30   ` Khatri, Sunil
@ 2024-08-20  7:30     ` Huang, Trigger
  2024-08-20 14:06       ` Alex Deucher
  2024-08-20 15:31       ` Khatri, Sunil
  0 siblings, 2 replies; 12+ messages in thread
From: Huang, Trigger @ 2024-08-20  7:30 UTC (permalink / raw)
  To: Khatri, Sunil, amd-gfx@lists.freedesktop.org; +Cc: Deucher, Alexander

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Khatri, Sunil <Sunil.Khatri@amd.com>
> Sent: Monday, August 19, 2024 6:31 PM
> To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
> Subject: Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job
> tmo
>
>
> On 8/19/2024 3:23 PM, Trigger.Huang@amd.com wrote:
> > From: Trigger Huang <Trigger.Huang@amd.com>
> >
> > Do the coredump immediately after a job timeout to get a closer
> > representation of GPU's error status.
> >
> > V2: This will skip printing vram_lost as the GPU reset is not happened
> > yet (Alex)
> >
> > V3: Unconditionally call the core dump as we care about all the reset
> > functions(soft-recovery and queue reset and full adapter reset, Alex)
> >
> > Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62
> +++++++++++++++++++++++++
> >   1 file changed, 62 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > index c6a1783fc9ef..ebbb1434073e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > @@ -30,6 +30,61 @@
> >   #include "amdgpu.h"
> >   #include "amdgpu_trace.h"
> >   #include "amdgpu_reset.h"
> > +#include "amdgpu_dev_coredump.h"
> > +#include "amdgpu_xgmi.h"
> > +
> > +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
> > +                               struct amdgpu_job *job)
> > +{
> > +   int i;
> > +
> > +   dev_info(adev->dev, "Dumping IP State\n");
> > +   for (i = 0; i < adev->num_ip_blocks; i++) {
> > +           if (adev->ip_blocks[i].version->funcs->dump_ip_state)
> > +                   adev->ip_blocks[i].version->funcs
> > +                           ->dump_ip_state((void *)adev);
> > +           dev_info(adev->dev, "Dumping IP State Completed\n");
> > +   }
> > +
> > +   amdgpu_coredump(adev, true, false, job); }
> > +
> > +static void amdgpu_job_core_dump(struct amdgpu_device *adev,
> > +                            struct amdgpu_job *job)
> > +{
> > +   struct list_head device_list, *device_list_handle =  NULL;
> > +   struct amdgpu_device *tmp_adev = NULL;
> > +   struct amdgpu_hive_info *hive = NULL;
> > +
> > +   if (!amdgpu_sriov_vf(adev))
> > +           hive = amdgpu_get_xgmi_hive(adev);
> > +   if (hive)
> > +           mutex_lock(&hive->hive_lock);
> > +   /*
> > +    * Reuse the logic in amdgpu_device_gpu_recover() to build list of
> > +    * devices for code dump
> > +    */
> > +   INIT_LIST_HEAD(&device_list);
> > +   if (!amdgpu_sriov_vf(adev) && (adev-
> >gmc.xgmi.num_physical_nodes > 1) && hive) {
> > +           list_for_each_entry(tmp_adev, &hive->device_list,
> gmc.xgmi.head)
> > +                   list_add_tail(&tmp_adev->reset_list, &device_list);
> > +           if (!list_is_first(&adev->reset_list, &device_list))
> > +                   list_rotate_to_front(&adev->reset_list, &device_list);
> > +           device_list_handle = &device_list;
> > +   } else {
> > +           list_add_tail(&adev->reset_list, &device_list);
> > +           device_list_handle = &device_list;
> > +   }
> > +
> > +   /* Do the coredump for each device */
> > +   list_for_each_entry(tmp_adev, device_list_handle, reset_list)
> > +           amdgpu_job_do_core_dump(tmp_adev, job);
> > +
> > +   if (hive) {
> > +           mutex_unlock(&hive->hive_lock);
> > +           amdgpu_put_xgmi_hive(hive);
> > +   }
> > +}
> >
> >   static enum drm_gpu_sched_stat amdgpu_job_timedout(struct
> drm_sched_job *s_job)
> >   {
> > @@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat
> amdgpu_job_timedout(struct drm_sched_job *s_job)
> >             return DRM_GPU_SCHED_STAT_ENODEV;
> >     }
> >
> > +   amdgpu_job_core_dump(adev, job);
> The philosophy is hang and recovery is to let the HW and software try to
> recover. Here we try to do a soft recovery first and i think we should wait for
> seft recovery and if fails then we do dump and thats exactly we are doing here.

Hi Sunil ,
thanks for the suggestion, and that's reasonable. But my concern is that after soft recovery happens, the GPU's status may change (take gfx 9 for example, it will try to kill the currently hung wave).
 Actually, in most cases, a real shader hang cannot be resolved through soft recovery, and at that moment, we need to get a very close dump/snapshot/representation of the GPU's current error status.
Just like the scandump: when we try to do a scandump for a shader hang, we will disable gpu_recovery, and no soft recovery/per-queue reset/HW reset will happen before the scandump, right?
On most products, there are no scandump interfaces, so core dump is even more important for debugging GPU hang issue.

Regards,
Trigger

>
> Also we need to make sure that the tasks which are already in queue are put
> on hold and the their sync points are signalled before we dump.
> check once what all steps are taken before we dump in the current
> implementation.

Do you mean something like:
        drm_sched_wqueue_stop(&ring->sched);
        amdgpu_fence_driver_force_completion(ring); // Since there is no GPU reset happened, is it reasonable to call it here?
        amdgpu_job_core_dump(adev, job);


Regards,
Trigger

>
> Regards
>
> Sunil khatri
>
> >
> >     adev->job_hang = true;
> >
> > @@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat
> amdgpu_job_timedout(struct drm_sched_job *s_job)
> >             reset_context.src = AMDGPU_RESET_SRC_JOB;
> >             clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> >
> > +           /*
> > +            * To avoid an unnecessary extra coredump, as we have
> already
> > +            * got the very close representation of GPU's error status
> > +            */
> > +           set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
> > +
> >             r = amdgpu_device_gpu_recover(ring->adev, job,
> &reset_context);
> >             if (r)
> >                     dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-20  7:30     ` Huang, Trigger
@ 2024-08-20 14:06       ` Alex Deucher
  2024-08-20 15:07         ` Khatri, Sunil
  2024-08-20 15:31       ` Khatri, Sunil
  1 sibling, 1 reply; 12+ messages in thread
From: Alex Deucher @ 2024-08-20 14:06 UTC (permalink / raw)
  To: Huang, Trigger
  Cc: Khatri, Sunil, amd-gfx@lists.freedesktop.org, Deucher, Alexander

On Tue, Aug 20, 2024 at 3:30 AM Huang, Trigger <Trigger.Huang@amd.com> wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> > -----Original Message-----
> > From: Khatri, Sunil <Sunil.Khatri@amd.com>
> > Sent: Monday, August 19, 2024 6:31 PM
> > To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> > Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
> > Subject: Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job
> > tmo
> >
> >
> > On 8/19/2024 3:23 PM, Trigger.Huang@amd.com wrote:
> > > From: Trigger Huang <Trigger.Huang@amd.com>
> > >
> > > Do the coredump immediately after a job timeout to get a closer
> > > representation of GPU's error status.
> > >
> > > V2: This will skip printing vram_lost as the GPU reset is not happened
> > > yet (Alex)
> > >
> > > V3: Unconditionally call the core dump as we care about all the reset
> > > functions(soft-recovery and queue reset and full adapter reset, Alex)
> > >
> > > Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62
> > +++++++++++++++++++++++++
> > >   1 file changed, 62 insertions(+)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > index c6a1783fc9ef..ebbb1434073e 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > @@ -30,6 +30,61 @@
> > >   #include "amdgpu.h"
> > >   #include "amdgpu_trace.h"
> > >   #include "amdgpu_reset.h"
> > > +#include "amdgpu_dev_coredump.h"
> > > +#include "amdgpu_xgmi.h"
> > > +
> > > +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
> > > +                               struct amdgpu_job *job)
> > > +{
> > > +   int i;
> > > +
> > > +   dev_info(adev->dev, "Dumping IP State\n");
> > > +   for (i = 0; i < adev->num_ip_blocks; i++) {
> > > +           if (adev->ip_blocks[i].version->funcs->dump_ip_state)
> > > +                   adev->ip_blocks[i].version->funcs
> > > +                           ->dump_ip_state((void *)adev);
> > > +           dev_info(adev->dev, "Dumping IP State Completed\n");
> > > +   }
> > > +
> > > +   amdgpu_coredump(adev, true, false, job); }
> > > +
> > > +static void amdgpu_job_core_dump(struct amdgpu_device *adev,
> > > +                            struct amdgpu_job *job)
> > > +{
> > > +   struct list_head device_list, *device_list_handle =  NULL;
> > > +   struct amdgpu_device *tmp_adev = NULL;
> > > +   struct amdgpu_hive_info *hive = NULL;
> > > +
> > > +   if (!amdgpu_sriov_vf(adev))
> > > +           hive = amdgpu_get_xgmi_hive(adev);
> > > +   if (hive)
> > > +           mutex_lock(&hive->hive_lock);
> > > +   /*
> > > +    * Reuse the logic in amdgpu_device_gpu_recover() to build list of
> > > +    * devices for code dump
> > > +    */
> > > +   INIT_LIST_HEAD(&device_list);
> > > +   if (!amdgpu_sriov_vf(adev) && (adev-
> > >gmc.xgmi.num_physical_nodes > 1) && hive) {
> > > +           list_for_each_entry(tmp_adev, &hive->device_list,
> > gmc.xgmi.head)
> > > +                   list_add_tail(&tmp_adev->reset_list, &device_list);
> > > +           if (!list_is_first(&adev->reset_list, &device_list))
> > > +                   list_rotate_to_front(&adev->reset_list, &device_list);
> > > +           device_list_handle = &device_list;
> > > +   } else {
> > > +           list_add_tail(&adev->reset_list, &device_list);
> > > +           device_list_handle = &device_list;
> > > +   }
> > > +
> > > +   /* Do the coredump for each device */
> > > +   list_for_each_entry(tmp_adev, device_list_handle, reset_list)
> > > +           amdgpu_job_do_core_dump(tmp_adev, job);
> > > +
> > > +   if (hive) {
> > > +           mutex_unlock(&hive->hive_lock);
> > > +           amdgpu_put_xgmi_hive(hive);
> > > +   }
> > > +}
> > >
> > >   static enum drm_gpu_sched_stat amdgpu_job_timedout(struct
> > drm_sched_job *s_job)
> > >   {
> > > @@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat
> > amdgpu_job_timedout(struct drm_sched_job *s_job)
> > >             return DRM_GPU_SCHED_STAT_ENODEV;
> > >     }
> > >
> > > +   amdgpu_job_core_dump(adev, job);
> > The philosophy is hang and recovery is to let the HW and software try to
> > recover. Here we try to do a soft recovery first and i think we should wait for
> > seft recovery and if fails then we do dump and thats exactly we are doing here.
>
> Hi Sunil ,
> thanks for the suggestion, and that's reasonable. But my concern is that after soft recovery happened, the GPU's status may change(take gfx 9 for example, it will try to kill the current hang wave)
>  Actually, in most cases, a real shader hang cannot be resolved through soft recovery, and at that moment, we need to get a very close dump/snapshot/representation of GPU's current error status.
> Just like the scandump, when we trying to do a scandump for a shader hang, we will disable gpu_recovery, and no soft recovery/per-queue reset/HW reset will happen before the scandump, right?
> On most products, there are no scandump interfaces, so core dump is even more important for debugging GPU hang issue.

I agree that it makes sense to take the dump as early as possible.  It
makes sense to move it up now that we are starting to have finer
grained resets.  We may want to wire devcoredump into the KFD side as
well at some point.

Alex

>
> Regards,
> Trigger
>
> >
> > Also we need to make sure that the tasks which are already in queue are put
> > on hold and the their sync points are signalled before we dump.
> > check once what all steps are taken before we dump in the current
> > implementation.
>
> Do you mean sometimes like:
>         drm_sched_wqueue_stop(&ring->sched);
>         amdgpu_fence_driver_force_completion(ring); // Since there is no GPU reset happened, is it reasonable to call it here?
>         amdgpu_job_core_dump(adev, job);
>
>
> Regards,
> Trigger
>
> >
> > Regards
> >
> > Sunil khatri
> >
> > >
> > >     adev->job_hang = true;
> > >
> > > @@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat
> > amdgpu_job_timedout(struct drm_sched_job *s_job)
> > >             reset_context.src = AMDGPU_RESET_SRC_JOB;
> > >             clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> > >
> > > +           /*
> > > +            * To avoid an unnecessary extra coredump, as we have
> > already
> > > +            * got the very close representation of GPU's error status
> > > +            */
> > > +           set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
> > > +
> > >             r = amdgpu_device_gpu_recover(ring->adev, job,
> > &reset_context);
> > >             if (r)
> > >                     dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-20 14:06       ` Alex Deucher
@ 2024-08-20 15:07         ` Khatri, Sunil
  2024-08-20 15:29           ` Alex Deucher
  0 siblings, 1 reply; 12+ messages in thread
From: Khatri, Sunil @ 2024-08-20 15:07 UTC (permalink / raw)
  To: Alex Deucher, Huang, Trigger
  Cc: amd-gfx@lists.freedesktop.org, Deucher, Alexander


On 8/20/2024 7:36 PM, Alex Deucher wrote:
> On Tue, Aug 20, 2024 at 3:30 AM Huang, Trigger <Trigger.Huang@amd.com> wrote:
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>>> -----Original Message-----
>>> From: Khatri, Sunil <Sunil.Khatri@amd.com>
>>> Sent: Monday, August 19, 2024 6:31 PM
>>> To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
>>> Subject: Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job
>>> tmo
>>>
>>>
>>> On 8/19/2024 3:23 PM, Trigger.Huang@amd.com wrote:
>>>> From: Trigger Huang <Trigger.Huang@amd.com>
>>>>
>>>> Do the coredump immediately after a job timeout to get a closer
>>>> representation of GPU's error status.
>>>>
>>>> V2: This will skip printing vram_lost as the GPU reset is not happened
>>>> yet (Alex)
>>>>
>>>> V3: Unconditionally call the core dump as we care about all the reset
>>>> functions(soft-recovery and queue reset and full adapter reset, Alex)
>>>>
>>>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62
>>> +++++++++++++++++++++++++
>>>>    1 file changed, 62 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> index c6a1783fc9ef..ebbb1434073e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> @@ -30,6 +30,61 @@
>>>>    #include "amdgpu.h"
>>>>    #include "amdgpu_trace.h"
>>>>    #include "amdgpu_reset.h"
>>>> +#include "amdgpu_dev_coredump.h"
>>>> +#include "amdgpu_xgmi.h"
>>>> +
>>>> +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
>>>> +                               struct amdgpu_job *job)
>>>> +{
>>>> +   int i;
>>>> +
>>>> +   dev_info(adev->dev, "Dumping IP State\n");
>>>> +   for (i = 0; i < adev->num_ip_blocks; i++) {
>>>> +           if (adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>> +                   adev->ip_blocks[i].version->funcs
>>>> +                           ->dump_ip_state((void *)adev);
>>>> +           dev_info(adev->dev, "Dumping IP State Completed\n");
>>>> +   }
>>>> +
>>>> +   amdgpu_coredump(adev, true, false, job); }
>>>> +
>>>> +static void amdgpu_job_core_dump(struct amdgpu_device *adev,
>>>> +                            struct amdgpu_job *job)
>>>> +{
>>>> +   struct list_head device_list, *device_list_handle =  NULL;
>>>> +   struct amdgpu_device *tmp_adev = NULL;
>>>> +   struct amdgpu_hive_info *hive = NULL;
>>>> +
>>>> +   if (!amdgpu_sriov_vf(adev))
>>>> +           hive = amdgpu_get_xgmi_hive(adev);
>>>> +   if (hive)
>>>> +           mutex_lock(&hive->hive_lock);
>>>> +   /*
>>>> +    * Reuse the logic in amdgpu_device_gpu_recover() to build list of
>>>> +    * devices for code dump
>>>> +    */
>>>> +   INIT_LIST_HEAD(&device_list);
>>>> +   if (!amdgpu_sriov_vf(adev) && (adev-
>>>> gmc.xgmi.num_physical_nodes > 1) && hive) {
>>>> +           list_for_each_entry(tmp_adev, &hive->device_list,
>>> gmc.xgmi.head)
>>>> +                   list_add_tail(&tmp_adev->reset_list, &device_list);
>>>> +           if (!list_is_first(&adev->reset_list, &device_list))
>>>> +                   list_rotate_to_front(&adev->reset_list, &device_list);
>>>> +           device_list_handle = &device_list;
>>>> +   } else {
>>>> +           list_add_tail(&adev->reset_list, &device_list);
>>>> +           device_list_handle = &device_list;
>>>> +   }
>>>> +
>>>> +   /* Do the coredump for each device */
>>>> +   list_for_each_entry(tmp_adev, device_list_handle, reset_list)
>>>> +           amdgpu_job_do_core_dump(tmp_adev, job);
>>>> +
>>>> +   if (hive) {
>>>> +           mutex_unlock(&hive->hive_lock);
>>>> +           amdgpu_put_xgmi_hive(hive);
>>>> +   }
>>>> +}
>>>>
>>>>    static enum drm_gpu_sched_stat amdgpu_job_timedout(struct
>>> drm_sched_job *s_job)
>>>>    {
>>>> @@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat
>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>>              return DRM_GPU_SCHED_STAT_ENODEV;
>>>>      }
>>>>
>>>> +   amdgpu_job_core_dump(adev, job);
>>> The philosophy is hang and recovery is to let the HW and software try to
>>> recover. Here we try to do a soft recovery first and i think we should wait for
>>> seft recovery and if fails then we do dump and thats exactly we are doing here.
>> Hi Sunil ,
>> thanks for the suggestion, and that's reasonable. But my concern is that after soft recovery happened, the GPU's status may change(take gfx 9 for example, it will try to kill the current hang wave)
>>   Actually, in most cases, a real shader hang cannot be resolved through soft recovery, and at that moment, we need to get a very close dump/snapshot/representation of GPU's current error status.
>> Just like the scandump, when we trying to do a scandump for a shader hang, we will disable gpu_recovery, and no soft recovery/per-queue reset/HW reset will happen before the scandump, right?
>> On most products, there are no scandump interfaces, so core dump is even more important for debugging GPU hang issue.
> I agree that it makes sense to take the dump as early as possible.  It
> makes sense to move it up now that we are starting to have finer
> grained resets.  We may want to wire devcoredump into the KFD side as
> well at some point.
In the current implementation we do stop the scheduler and wait for 
existing fences to complete or signal them. But the new place to dump 
will not have anything like that, and even though we have hit a timeout, 
some of the threads or waves might still be progressing, so the IP dump 
might be changing during that time.

But I am not sure if we want to stop the scheduling of new tasks and end 
the existing ones.  How about we have multiple levels of dumps, i.e. just 
capture and do not dump, as the first dump is what is captured, not the last.
a. Capture after soft reset and let it be overwritten by next level
b. After soft reset fails capture again before hard reset is triggered.

So eventually we would have the IP dump file generated containing what we 
are interested in.

Regards
Sunil K



>
> Alex
>
>> Regards,
>> Trigger
>>
>>> Also we need to make sure that the tasks which are already in queue are put
>>> on hold and the their sync points are signalled before we dump.
>>> check once what all steps are taken before we dump in the current
>>> implementation.
>> Do you mean sometimes like:
>>          drm_sched_wqueue_stop(&ring->sched);
>>          amdgpu_fence_driver_force_completion(ring); // Since there is no GPU reset happened, is it reasonable to call it here?
>>          amdgpu_job_core_dump(adev, job);
>>
>>
>> Regards,
>> Trigger
>>
>>> Regards
>>>
>>> Sunil khatri
>>>
>>>>      adev->job_hang = true;
>>>>
>>>> @@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat
>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>>              reset_context.src = AMDGPU_RESET_SRC_JOB;
>>>>              clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>>>>
>>>> +           /*
>>>> +            * To avoid an unnecessary extra coredump, as we have
>>> already
>>>> +            * got the very close representation of GPU's error status
>>>> +            */
>>>> +           set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
>>>> +
>>>>              r = amdgpu_device_gpu_recover(ring->adev, job,
>>> &reset_context);
>>>>              if (r)
>>>>                      dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-20 15:07         ` Khatri, Sunil
@ 2024-08-20 15:29           ` Alex Deucher
  0 siblings, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2024-08-20 15:29 UTC (permalink / raw)
  To: Khatri, Sunil
  Cc: Huang, Trigger, amd-gfx@lists.freedesktop.org, Deucher, Alexander

On Tue, Aug 20, 2024 at 11:07 AM Khatri, Sunil <sunil.khatri@amd.com> wrote:
>
>
> On 8/20/2024 7:36 PM, Alex Deucher wrote:
> > On Tue, Aug 20, 2024 at 3:30 AM Huang, Trigger <Trigger.Huang@amd.com> wrote:
> >> [AMD Official Use Only - AMD Internal Distribution Only]
> >>
> >>> -----Original Message-----
> >>> From: Khatri, Sunil <Sunil.Khatri@amd.com>
> >>> Sent: Monday, August 19, 2024 6:31 PM
> >>> To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> >>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
> >>> Subject: Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job
> >>> tmo
> >>>
> >>>
> >>> On 8/19/2024 3:23 PM, Trigger.Huang@amd.com wrote:
> >>>> From: Trigger Huang <Trigger.Huang@amd.com>
> >>>>
> >>>> Do the coredump immediately after a job timeout to get a closer
> >>>> representation of GPU's error status.
> >>>>
> >>>> V2: This will skip printing vram_lost as the GPU reset is not happened
> >>>> yet (Alex)
> >>>>
> >>>> V3: Unconditionally call the core dump as we care about all the reset
> >>>> functions(soft-recovery and queue reset and full adapter reset, Alex)
> >>>>
> >>>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
> >>>> ---
> >>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62
> >>> +++++++++++++++++++++++++
> >>>>    1 file changed, 62 insertions(+)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>>> index c6a1783fc9ef..ebbb1434073e 100644
> >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>>> @@ -30,6 +30,61 @@
> >>>>    #include "amdgpu.h"
> >>>>    #include "amdgpu_trace.h"
> >>>>    #include "amdgpu_reset.h"
> >>>> +#include "amdgpu_dev_coredump.h"
> >>>> +#include "amdgpu_xgmi.h"
> >>>> +
> >>>> +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
> >>>> +                               struct amdgpu_job *job)
> >>>> +{
> >>>> +   int i;
> >>>> +
> >>>> +   dev_info(adev->dev, "Dumping IP State\n");
> >>>> +   for (i = 0; i < adev->num_ip_blocks; i++) {
> >>>> +           if (adev->ip_blocks[i].version->funcs->dump_ip_state)
> >>>> +                   adev->ip_blocks[i].version->funcs
> >>>> +                           ->dump_ip_state((void *)adev);
> >>>> +           dev_info(adev->dev, "Dumping IP State Completed\n");
> >>>> +   }
> >>>> +
> >>>> +   amdgpu_coredump(adev, true, false, job); }
> >>>> +
> >>>> +static void amdgpu_job_core_dump(struct amdgpu_device *adev,
> >>>> +                            struct amdgpu_job *job)
> >>>> +{
> >>>> +   struct list_head device_list, *device_list_handle =  NULL;
> >>>> +   struct amdgpu_device *tmp_adev = NULL;
> >>>> +   struct amdgpu_hive_info *hive = NULL;
> >>>> +
> >>>> +   if (!amdgpu_sriov_vf(adev))
> >>>> +           hive = amdgpu_get_xgmi_hive(adev);
> >>>> +   if (hive)
> >>>> +           mutex_lock(&hive->hive_lock);
> >>>> +   /*
> >>>> +    * Reuse the logic in amdgpu_device_gpu_recover() to build list of
> >>>> +    * devices for code dump
> >>>> +    */
> >>>> +   INIT_LIST_HEAD(&device_list);
> >>>> +   if (!amdgpu_sriov_vf(adev) && (adev-
> >>>> gmc.xgmi.num_physical_nodes > 1) && hive) {
> >>>> +           list_for_each_entry(tmp_adev, &hive->device_list,
> >>> gmc.xgmi.head)
> >>>> +                   list_add_tail(&tmp_adev->reset_list, &device_list);
> >>>> +           if (!list_is_first(&adev->reset_list, &device_list))
> >>>> +                   list_rotate_to_front(&adev->reset_list, &device_list);
> >>>> +           device_list_handle = &device_list;
> >>>> +   } else {
> >>>> +           list_add_tail(&adev->reset_list, &device_list);
> >>>> +           device_list_handle = &device_list;
> >>>> +   }
> >>>> +
> >>>> +   /* Do the coredump for each device */
> >>>> +   list_for_each_entry(tmp_adev, device_list_handle, reset_list)
> >>>> +           amdgpu_job_do_core_dump(tmp_adev, job);
> >>>> +
> >>>> +   if (hive) {
> >>>> +           mutex_unlock(&hive->hive_lock);
> >>>> +           amdgpu_put_xgmi_hive(hive);
> >>>> +   }
> >>>> +}
> >>>>
> >>>>    static enum drm_gpu_sched_stat amdgpu_job_timedout(struct
> >>> drm_sched_job *s_job)
> >>>>    {
> >>>> @@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat
> >>> amdgpu_job_timedout(struct drm_sched_job *s_job)
> >>>>              return DRM_GPU_SCHED_STAT_ENODEV;
> >>>>      }
> >>>>
> >>>> +   amdgpu_job_core_dump(adev, job);
> >>> The philosophy is hang and recovery is to let the HW and software try to
> >>> recover. Here we try to do a soft recovery first and i think we should wait for
> >>> seft recovery and if fails then we do dump and thats exactly we are doing here.
> >> Hi Sunil ,
> >> thanks for the suggestion, and that's reasonable. But my concern is that after soft recovery happened, the GPU's status may change(take gfx 9 for example, it will try to kill the current hang wave)
> >>   Actually, in most cases, a real shader hang cannot be resolved through soft recovery, and at that moment, we need to get a very close dump/snapshot/representation of GPU's current error status.
> >> Just like the scandump, when we trying to do a scandump for a shader hang, we will disable gpu_recovery, and no soft recovery/per-queue reset/HW reset will happen before the scandump, right?
> >> On most products, there are no scandump interfaces, so core dump is even more important for debugging GPU hang issue.
> > I agree that it makes sense to take the dump as early as possible.  It
> > makes sense to move it up now that we are starting to have finer
> > grained resets.  We may want to wire devcoredump into the KFD side as
> > well at some point.
> In the current implementation we do stop the Scheduler and wait for
> existing fences to complete or signal them. But the new place to dump
> will not have anything like that and even though we have timeout some of
> the threads or waves might be still progressing the IP dump might be
> changing during that time.

Sure, but that is all sw state.  It shouldn't affect the hw state.  Or
if it does, it would be indirect.  E.g., signalling a fence might
unblock a dependent job, but the root issue would still be valid.

>
> But i am not sure if we want to stop the scheduling of new tasks and end
> the existing one.  How about we have multiple level of dumps i.e just
> capture and not dump as first dump is what is captured not last.
> a. Capture after soft reset and let it be overwritten by next level
> b. After soft reset fails capture again before hard reset is triggered.
>
> So eventually we would have the ip dump file generated of what we are
> interested in.

It might make more sense to slice and dice the dumps more finely, but
I think we'll need to see how things work out as is before we worry
about those details.

Alex

>
> Regards
> Sunil K
>
>
>
> >
> > Alex
> >
> >> Regards,
> >> Trigger
> >>
> >>> Also we need to make sure that the tasks which are already in queue are put
> >>> on hold and the their sync points are signalled before we dump.
> >>> check once what all steps are taken before we dump in the current
> >>> implementation.
> >> Do you mean sometimes like:
> >>          drm_sched_wqueue_stop(&ring->sched);
> >>          amdgpu_fence_driver_force_completion(ring); // Since there is no GPU reset happened, is it reasonable to call it here?
> >>          amdgpu_job_core_dump(adev, job);
> >>
> >>
> >> Regards,
> >> Trigger
> >>
> >>> Regards
> >>>
> >>> Sunil khatri
> >>>
> >>>>      adev->job_hang = true;
> >>>>
> >>>> @@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat
> >>> amdgpu_job_timedout(struct drm_sched_job *s_job)
> >>>>              reset_context.src = AMDGPU_RESET_SRC_JOB;
> >>>>              clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> >>>>
> >>>> +           /*
> >>>> +            * To avoid an unnecessary extra coredump, as we have
> >>> already
> >>>> +            * got the very close representation of GPU's error status
> >>>> +            */
> >>>> +           set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
> >>>> +
> >>>>              r = amdgpu_device_gpu_recover(ring->adev, job,
> >>> &reset_context);
> >>>>              if (r)
> >>>>                      dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-20  7:30     ` Huang, Trigger
  2024-08-20 14:06       ` Alex Deucher
@ 2024-08-20 15:31       ` Khatri, Sunil
  2024-08-20 16:01         ` Alex Deucher
  1 sibling, 1 reply; 12+ messages in thread
From: Khatri, Sunil @ 2024-08-20 15:31 UTC (permalink / raw)
  To: Huang, Trigger, amd-gfx@lists.freedesktop.org; +Cc: Deucher, Alexander


On 8/20/2024 1:00 PM, Huang, Trigger wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>> -----Original Message-----
>> From: Khatri, Sunil <Sunil.Khatri@amd.com>
>> Sent: Monday, August 19, 2024 6:31 PM
>> To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
>> Subject: Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job
>> tmo
>>
>>
>> On 8/19/2024 3:23 PM, Trigger.Huang@amd.com wrote:
>>> From: Trigger Huang <Trigger.Huang@amd.com>
>>>
>>> Do the coredump immediately after a job timeout to get a closer
>>> representation of GPU's error status.
>>>
>>> V2: This will skip printing vram_lost as the GPU reset is not happened
>>> yet (Alex)
>>>
>>> V3: Unconditionally call the core dump as we care about all the reset
>>> functions(soft-recovery and queue reset and full adapter reset, Alex)
>>>
>>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62
>> +++++++++++++++++++++++++
>>>    1 file changed, 62 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index c6a1783fc9ef..ebbb1434073e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -30,6 +30,61 @@
>>>    #include "amdgpu.h"
>>>    #include "amdgpu_trace.h"
>>>    #include "amdgpu_reset.h"
>>> +#include "amdgpu_dev_coredump.h"
>>> +#include "amdgpu_xgmi.h"
>>> +
>>> +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
>>> +                               struct amdgpu_job *job)
>>> +{
>>> +   int i;
>>> +
>>> +   dev_info(adev->dev, "Dumping IP State\n");
>>> +   for (i = 0; i < adev->num_ip_blocks; i++) {
>>> +           if (adev->ip_blocks[i].version->funcs->dump_ip_state)
>>> +                   adev->ip_blocks[i].version->funcs
>>> +                           ->dump_ip_state((void *)adev);
>>> +           dev_info(adev->dev, "Dumping IP State Completed\n");
>>> +   }
>>> +
>>> +   amdgpu_coredump(adev, true, false, job); }
>>> +
>>> +static void amdgpu_job_core_dump(struct amdgpu_device *adev,
>>> +                            struct amdgpu_job *job)
>>> +{
>>> +   struct list_head device_list, *device_list_handle =  NULL;
>>> +   struct amdgpu_device *tmp_adev = NULL;
>>> +   struct amdgpu_hive_info *hive = NULL;
>>> +
>>> +   if (!amdgpu_sriov_vf(adev))
>>> +           hive = amdgpu_get_xgmi_hive(adev);
>>> +   if (hive)
>>> +           mutex_lock(&hive->hive_lock);
>>> +   /*
>>> +    * Reuse the logic in amdgpu_device_gpu_recover() to build list of
>>> +    * devices for code dump
>>> +    */
>>> +   INIT_LIST_HEAD(&device_list);
>>> +   if (!amdgpu_sriov_vf(adev) && (adev-
>>> gmc.xgmi.num_physical_nodes > 1) && hive) {
>>> +           list_for_each_entry(tmp_adev, &hive->device_list,
>> gmc.xgmi.head)
>>> +                   list_add_tail(&tmp_adev->reset_list, &device_list);
>>> +           if (!list_is_first(&adev->reset_list, &device_list))
>>> +                   list_rotate_to_front(&adev->reset_list, &device_list);
>>> +           device_list_handle = &device_list;
>>> +   } else {
>>> +           list_add_tail(&adev->reset_list, &device_list);
>>> +           device_list_handle = &device_list;
>>> +   }
>>> +
>>> +   /* Do the coredump for each device */
>>> +   list_for_each_entry(tmp_adev, device_list_handle, reset_list)
>>> +           amdgpu_job_do_core_dump(tmp_adev, job);
>>> +
>>> +   if (hive) {
>>> +           mutex_unlock(&hive->hive_lock);
>>> +           amdgpu_put_xgmi_hive(hive);
>>> +   }
>>> +}
>>>
>>>    static enum drm_gpu_sched_stat amdgpu_job_timedout(struct
>> drm_sched_job *s_job)
>>>    {
>>> @@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>              return DRM_GPU_SCHED_STAT_ENODEV;
>>>      }
>>>
>>> +   amdgpu_job_core_dump(adev, job);
>> The philosophy is hang and recovery is to let the HW and software try to
>> recover. Here we try to do a soft recovery first and i think we should wait for
>> seft recovery and if fails then we do dump and thats exactly we are doing here.
> Hi Sunil ,
> thanks for the suggestion, and that's reasonable. But my concern is that after soft recovery happened, the GPU's status may change(take gfx 9 for example, it will try to kill the current hang wave)
>   Actually, in most cases, a real shader hang cannot be resolved through soft recovery, and at that moment, we need to get a very close dump/snapshot/representation of GPU's current error status.
> Just like the scandump, when we trying to do a scandump for a shader hang, we will disable gpu_recovery, and no soft recovery/per-queue reset/HW reset will happen before the scandump, right?
> On most products, there are no scandump interfaces, so core dump is even more important for debugging GPU hang issue.
>
> Regards,
> Trigger
>
>> Also we need to make sure that the tasks which are already in queue are put
>> on hold and the their sync points are signalled before we dump.
>> check once what all steps are taken before we dump in the current
>> implementation.
> Do you mean sometimes like:
>          drm_sched_wqueue_stop(&ring->sched);
>          amdgpu_fence_driver_force_completion(ring); // Since there is no GPU reset happened, is it reasonable to call it here?
>          amdgpu_job_core_dump(adev, job);
Before hard reset we do all those things. But I guess we may not need 
that in case we are capturing before the soft reset itself, but you need to 
test it, to see whether the dump values are correct or not.
Also, apart from the hardware state we dump a lot of other information, like 
ring buffers, and in case jobs are still being submitted we might not be able 
to get the right data, as the ring might be in use and being 
consumed/filled up at that time — that's why stopping the scheduler helps. But in 
case the soft reset is successful we do not want to do that.

So here is what I think, but Alex, please suggest if it makes sense.
If recovery is disabled: capture the IP dump before soft reset. (Gives a close 
register state, but the ring buffer needs to be seen as in use, since the 
scheduler is still running.)
If recovery is enabled: capture the IP dump. (The current implementation makes 
sure to disable the drm scheduler and fence timeout.)

The function pointer print_ip_state could be called to capture the dump when 
it's needed in both of the above cases. Right now print is called when the 
dump is actually dumped, which is when the data file generated by 
devcoredump is read.

Regards
Sunil Khatri


> Regards,
> Trigger
>
>> Regards
>>
>> Sunil khatri
>>
>>>      adev->job_hang = true;
>>>
>>> @@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>              reset_context.src = AMDGPU_RESET_SRC_JOB;
>>>              clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>>>
>>> +           /*
>>> +            * To avoid an unnecessary extra coredump, as we have
>> already
>>> +            * got the very close representation of GPU's error status
>>> +            */
>>> +           set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
>>> +
>>>              r = amdgpu_device_gpu_recover(ring->adev, job,
>> &reset_context);
>>>              if (r)
>>>                      dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-20 15:31       ` Khatri, Sunil
@ 2024-08-20 16:01         ` Alex Deucher
  2024-08-20 16:54           ` Khatri, Sunil
  0 siblings, 1 reply; 12+ messages in thread
From: Alex Deucher @ 2024-08-20 16:01 UTC (permalink / raw)
  To: Khatri, Sunil
  Cc: Huang, Trigger, amd-gfx@lists.freedesktop.org, Deucher, Alexander

On Tue, Aug 20, 2024 at 11:31 AM Khatri, Sunil <sunil.khatri@amd.com> wrote:
>
>
> On 8/20/2024 1:00 PM, Huang, Trigger wrote:
> > [AMD Official Use Only - AMD Internal Distribution Only]
> >
> >> -----Original Message-----
> >> From: Khatri, Sunil <Sunil.Khatri@amd.com>
> >> Sent: Monday, August 19, 2024 6:31 PM
> >> To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
> >> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
> >> Subject: Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job
> >> tmo
> >>
> >>
> >> On 8/19/2024 3:23 PM, Trigger.Huang@amd.com wrote:
> >>> From: Trigger Huang <Trigger.Huang@amd.com>
> >>>
> >>> Do the coredump immediately after a job timeout to get a closer
> >>> representation of GPU's error status.
> >>>
> >>> V2: This will skip printing vram_lost as the GPU reset is not happened
> >>> yet (Alex)
> >>>
> >>> V3: Unconditionally call the core dump as we care about all the reset
> >>> functions(soft-recovery and queue reset and full adapter reset, Alex)
> >>>
> >>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
> >>> ---
> >>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62
> >> +++++++++++++++++++++++++
> >>>    1 file changed, 62 insertions(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> index c6a1783fc9ef..ebbb1434073e 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >>> @@ -30,6 +30,61 @@
> >>>    #include "amdgpu.h"
> >>>    #include "amdgpu_trace.h"
> >>>    #include "amdgpu_reset.h"
> >>> +#include "amdgpu_dev_coredump.h"
> >>> +#include "amdgpu_xgmi.h"
> >>> +
> >>> +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
> >>> +                               struct amdgpu_job *job)
> >>> +{
> >>> +   int i;
> >>> +
> >>> +   dev_info(adev->dev, "Dumping IP State\n");
> >>> +   for (i = 0; i < adev->num_ip_blocks; i++) {
> >>> +           if (adev->ip_blocks[i].version->funcs->dump_ip_state)
> >>> +                   adev->ip_blocks[i].version->funcs
> >>> +                           ->dump_ip_state((void *)adev);
> >>> +           dev_info(adev->dev, "Dumping IP State Completed\n");
> >>> +   }
> >>> +
> >>> +   amdgpu_coredump(adev, true, false, job); }
> >>> +
> >>> +static void amdgpu_job_core_dump(struct amdgpu_device *adev,
> >>> +                            struct amdgpu_job *job)
> >>> +{
> >>> +   struct list_head device_list, *device_list_handle =  NULL;
> >>> +   struct amdgpu_device *tmp_adev = NULL;
> >>> +   struct amdgpu_hive_info *hive = NULL;
> >>> +
> >>> +   if (!amdgpu_sriov_vf(adev))
> >>> +           hive = amdgpu_get_xgmi_hive(adev);
> >>> +   if (hive)
> >>> +           mutex_lock(&hive->hive_lock);
> >>> +   /*
> >>> +    * Reuse the logic in amdgpu_device_gpu_recover() to build list of
> >>> +    * devices for code dump
> >>> +    */
> >>> +   INIT_LIST_HEAD(&device_list);
> >>> +   if (!amdgpu_sriov_vf(adev) && (adev-
> >>> gmc.xgmi.num_physical_nodes > 1) && hive) {
> >>> +           list_for_each_entry(tmp_adev, &hive->device_list,
> >> gmc.xgmi.head)
> >>> +                   list_add_tail(&tmp_adev->reset_list, &device_list);
> >>> +           if (!list_is_first(&adev->reset_list, &device_list))
> >>> +                   list_rotate_to_front(&adev->reset_list, &device_list);
> >>> +           device_list_handle = &device_list;
> >>> +   } else {
> >>> +           list_add_tail(&adev->reset_list, &device_list);
> >>> +           device_list_handle = &device_list;
> >>> +   }
> >>> +
> >>> +   /* Do the coredump for each device */
> >>> +   list_for_each_entry(tmp_adev, device_list_handle, reset_list)
> >>> +           amdgpu_job_do_core_dump(tmp_adev, job);
> >>> +
> >>> +   if (hive) {
> >>> +           mutex_unlock(&hive->hive_lock);
> >>> +           amdgpu_put_xgmi_hive(hive);
> >>> +   }
> >>> +}
> >>>
> >>>    static enum drm_gpu_sched_stat amdgpu_job_timedout(struct
> >> drm_sched_job *s_job)
> >>>    {
> >>> @@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat
> >> amdgpu_job_timedout(struct drm_sched_job *s_job)
> >>>              return DRM_GPU_SCHED_STAT_ENODEV;
> >>>      }
> >>>
> >>> +   amdgpu_job_core_dump(adev, job);
> >> The philosophy is hang and recovery is to let the HW and software try to
> >> recover. Here we try to do a soft recovery first and i think we should wait for
> >> seft recovery and if fails then we do dump and thats exactly we are doing here.
> > Hi Sunil ,
> > thanks for the suggestion, and that's reasonable. But my concern is that after soft recovery happened, the GPU's status may change(take gfx 9 for example, it will try to kill the current hang wave)
> >   Actually, in most cases, a real shader hang cannot be resolved through soft recovery, and at that moment, we need to get a very close dump/snapshot/representation of GPU's current error status.
> > Just like the scandump, when we trying to do a scandump for a shader hang, we will disable gpu_recovery, and no soft recovery/per-queue reset/HW reset will happen before the scandump, right?
> > On most products, there are no scandump interfaces, so core dump is even more important for debugging GPU hang issue.
> >
> > Regards,
> > Trigger
> >
> >> Also we need to make sure that the tasks which are already in queue are put
> >> on hold and the their sync points are signalled before we dump.
> >> check once what all steps are taken before we dump in the current
> >> implementation.
> > Do you mean sometimes like:
> >          drm_sched_wqueue_stop(&ring->sched);
> >          amdgpu_fence_driver_force_completion(ring); // Since there is no GPU reset happened, is it reasonable to call it here?
> >          amdgpu_job_core_dump(adev, job);
> Before hard reset we do all those things. But i guess we may not need
> that in case we capturing before soft reset itself but you need to test
> it, to see the dump values are true or not.
> Also apart from hardware state we dump a lot of other information like
> ring buffers and in case jobs are still submitting we might be not able
> to get the right data as the ring might be in use and being
> consumed/filled up that time and that's why scheduler stop helps. But in
> case soft reset is successful we do not want to do that.
>
> So here is what i think but Alex please suggest if it make sense.
> If recovery is disabled : Capture ip dump before soft reset. (Give close
> register state but ring buffer need to be seen as it is in use as
> scheduler is running)
> if recovery is enabled : capture ip dump (Current implementation make
> sure to disable drm sched and fence time out)

I think Trigger's proposed logic makes sense.  I don't see a reason
not to dump earlier if we can.  I don't really see what it buys us to
have different behavior depending on whether or not recovery is
enabled.  If per queue resets are successful, then we'd miss the dump
in that case.  If we only dump in job_timedout, then we'd miss the
dump when there is a reset due to something the kernel driver or KFD
has done.

I think what we want is for job_timedout or the similar logic on the
KFD side to call the coredump code when we detect a stuck queue.
We'll need to keep the coredump code in gpu_recover to cover the cases
where we need to reset due to something outside of the user submission
paths.

Alex

>
> function ptr print ip state could be called to capture dump when its
> needed in both above cases. Right now print is called when dump is
> actually dumped which is when data file which is generated in
> devcoredump is read.
>
> Regards
> Sunil Khatri
>
>
> > Regards,
> > Trigger
> >
> >> Regards
> >>
> >> Sunil khatri
> >>
> >>>      adev->job_hang = true;
> >>>
> >>> @@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat
> >> amdgpu_job_timedout(struct drm_sched_job *s_job)
> >>>              reset_context.src = AMDGPU_RESET_SRC_JOB;
> >>>              clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> >>>
> >>> +           /*
> >>> +            * To avoid an unnecessary extra coredump, as we have
> >> already
> >>> +            * got the very close representation of GPU's error status
> >>> +            */
> >>> +           set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
> >>> +
> >>>              r = amdgpu_device_gpu_recover(ring->adev, job,
> >> &reset_context);
> >>>              if (r)
> >>>                      dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-20 16:01         ` Alex Deucher
@ 2024-08-20 16:54           ` Khatri, Sunil
  2024-08-21  8:19             ` Huang, Trigger
  0 siblings, 1 reply; 12+ messages in thread
From: Khatri, Sunil @ 2024-08-20 16:54 UTC (permalink / raw)
  To: Alex Deucher
  Cc: Huang, Trigger, amd-gfx@lists.freedesktop.org, Deucher, Alexander

[-- Attachment #1: Type: text/plain, Size: 9173 bytes --]


On 8/20/2024 9:31 PM, Alex Deucher wrote:
> On Tue, Aug 20, 2024 at 11:31 AM Khatri, Sunil <sunil.khatri@amd.com> wrote:
>>
>> On 8/20/2024 1:00 PM, Huang, Trigger wrote:
>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>
>>>> -----Original Message-----
>>>> From: Khatri, Sunil <Sunil.Khatri@amd.com>
>>>> Sent: Monday, August 19, 2024 6:31 PM
>>>> To: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>
>>>> Subject: Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job
>>>> tmo
>>>>
>>>>
>>>> On 8/19/2024 3:23 PM, Trigger.Huang@amd.com wrote:
>>>>> From: Trigger Huang <Trigger.Huang@amd.com>
>>>>>
>>>>> Do the coredump immediately after a job timeout to get a closer
>>>>> representation of GPU's error status.
>>>>>
>>>>> V2: This will skip printing vram_lost as the GPU reset is not happened
>>>>> yet (Alex)
>>>>>
>>>>> V3: Unconditionally call the core dump as we care about all the reset
>>>>> functions(soft-recovery and queue reset and full adapter reset, Alex)
>>>>>
>>>>> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62
>>>> +++++++++++++++++++++++++
>>>>>     1 file changed, 62 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> index c6a1783fc9ef..ebbb1434073e 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> @@ -30,6 +30,61 @@
>>>>>     #include "amdgpu.h"
>>>>>     #include "amdgpu_trace.h"
>>>>>     #include "amdgpu_reset.h"
>>>>> +#include "amdgpu_dev_coredump.h"
>>>>> +#include "amdgpu_xgmi.h"
>>>>> +
>>>>> +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
>>>>> +                               struct amdgpu_job *job)
>>>>> +{
>>>>> +   int i;
>>>>> +
>>>>> +   dev_info(adev->dev, "Dumping IP State\n");
>>>>> +   for (i = 0; i < adev->num_ip_blocks; i++) {
>>>>> +           if (adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>> +                   adev->ip_blocks[i].version->funcs
>>>>> +                           ->dump_ip_state((void *)adev);
>>>>> +           dev_info(adev->dev, "Dumping IP State Completed\n");
>>>>> +   }
>>>>> +
>>>>> +   amdgpu_coredump(adev, true, false, job); }
>>>>> +
>>>>> +static void amdgpu_job_core_dump(struct amdgpu_device *adev,
>>>>> +                            struct amdgpu_job *job)
>>>>> +{
>>>>> +   struct list_head device_list, *device_list_handle =  NULL;
>>>>> +   struct amdgpu_device *tmp_adev = NULL;
>>>>> +   struct amdgpu_hive_info *hive = NULL;
>>>>> +
>>>>> +   if (!amdgpu_sriov_vf(adev))
>>>>> +           hive = amdgpu_get_xgmi_hive(adev);
>>>>> +   if (hive)
>>>>> +           mutex_lock(&hive->hive_lock);
>>>>> +   /*
>>>>> +    * Reuse the logic in amdgpu_device_gpu_recover() to build list of
>>>>> +    * devices for core dump
>>>>> +    */
>>>>> +   INIT_LIST_HEAD(&device_list);
>>>>> +   if (!amdgpu_sriov_vf(adev) && (adev-
>>>>> gmc.xgmi.num_physical_nodes > 1) && hive) {
>>>>> +           list_for_each_entry(tmp_adev, &hive->device_list,
>>>> gmc.xgmi.head)
>>>>> +                   list_add_tail(&tmp_adev->reset_list, &device_list);
>>>>> +           if (!list_is_first(&adev->reset_list, &device_list))
>>>>> +                   list_rotate_to_front(&adev->reset_list, &device_list);
>>>>> +           device_list_handle = &device_list;
>>>>> +   } else {
>>>>> +           list_add_tail(&adev->reset_list, &device_list);
>>>>> +           device_list_handle = &device_list;
>>>>> +   }
>>>>> +
>>>>> +   /* Do the coredump for each device */
>>>>> +   list_for_each_entry(tmp_adev, device_list_handle, reset_list)
>>>>> +           amdgpu_job_do_core_dump(tmp_adev, job);
>>>>> +
>>>>> +   if (hive) {
>>>>> +           mutex_unlock(&hive->hive_lock);
>>>>> +           amdgpu_put_xgmi_hive(hive);
>>>>> +   }
>>>>> +}
>>>>>
>>>>>     static enum drm_gpu_sched_stat amdgpu_job_timedout(struct
>>>> drm_sched_job *s_job)
>>>>>     {
>>>>> @@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat
>>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>>>               return DRM_GPU_SCHED_STAT_ENODEV;
>>>>>       }
>>>>>
>>>>> +   amdgpu_job_core_dump(adev, job);

Let's call this after adev->job_hang = true;

regards
Sunil

>>>> The philosophy is hang and recovery is to let the HW and software try to
>>>> recover. Here we try to do a soft recovery first and i think we should wait for
>>>> soft recovery, and if it fails then we do the dump, and that's exactly what we are doing here.
>>> Hi Sunil ,
>>> thanks for the suggestion, and that's reasonable. But my concern is that after soft recovery happened, the GPU's status may change(take gfx 9 for example, it will try to kill the current hang wave)
>>>    Actually, in most cases, a real shader hang cannot be resolved through soft recovery, and at that moment, we need to get a very close dump/snapshot/representation of GPU's current error status.
>>> Just like the scandump, when we trying to do a scandump for a shader hang, we will disable gpu_recovery, and no soft recovery/per-queue reset/HW reset will happen before the scandump, right?
>>> On most products, there are no scandump interfaces, so core dump is even more important for debugging GPU hang issue.
>>>
>>> Regards,
>>> Trigger
>>>
>>>> Also we need to make sure that the tasks which are already in queue are put
>>>> on hold and their sync points are signalled before we dump.
>>>> check once what all steps are taken before we dump in the current
>>>> implementation.
>>> Do you mean something like:
>>>           drm_sched_wqueue_stop(&ring->sched);
>>>           amdgpu_fence_driver_force_completion(ring); // Since there is no GPU reset happened, is it reasonable to call it here?
>>>           amdgpu_job_core_dump(adev, job);
>> Before hard reset we do all those things. But i guess we may not need
>> that in case we capturing before soft reset itself but you need to test
>> it, to see the dump values are true or not.
>> Also apart from hardware state we dump a lot of other information like
>> ring buffers and in case jobs are still submitting we might be not able
>> to get the right data as the ring might be in use and being
>> consumed/filled up that time and that's why scheduler stop helps. But in
>> case soft reset is successful we do not want to do that.
>>
>> So here is what i think but Alex please suggest if it make sense.
>> If recovery is disabled : Capture ip dump before soft reset. (Give close
>> register state but ring buffer need to be seen as it is in use as
>> scheduler is running)
>> if recovery is enabled : capture ip dump (Current implementation make
>> sure to disable drm sched and fence time out)
> I think Trigger's proposed logic makes sense.  I don't see a reason
> not to dump earlier if we can.  I don't really see what it buys us to
> have different behavior depending on whether or not recovery is
> enabled.  If per queue resets are successful, then we'd miss the dump
> in that case.  If we only dump in job_timedout, then we'd miss the
> dump when there is a reset due to something the kernel driver or KFD
> has done.
I am totally in for dumping the ip registers as early as possible and 
its a good idea to dump before soft reset.
I am little worried if we need to stop scheduling any further jobs and 
process the ones already scheduled. If that's not a problem it looks 
good to me.


With one minor change the above patch looks fine to me. Just try it
and see the time it takes between "Dumping IP State" and "Dumping IP
State Completed", i.e. reading registers.

Acked-by: Sunil Khatri <sunil.khatri@amd.com>

Regards
Sunil K


> I think what we want is for job_timedout or the similar logic on the
> KFD side to call the coredump code when we detect a stuck queue.
> We'll need to keep the coredump code in gpu_recover to cover the cases
> where we need to reset due to something outside of the user submission
> paths.
>
> Alex
>
>> function ptr print ip state could be called to capture dump when its
>> needed in both above cases. Right now print is called when dump is
>> actually dumped which is when data file which is generated in
>> devcoredump is read.
>>
>> Regards
>> Sunil Khatri
>>
>>
>>> Regards,
>>> Trigger
>>>
>>>> Regards
>>>>
>>>> Sunil khatri
>>>>
>>>>>       adev->job_hang = true;
>>>>>
>>>>> @@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat
>>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>>>               reset_context.src = AMDGPU_RESET_SRC_JOB;
>>>>>               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>>>>>
>>>>> +           /*
>>>>> +            * To avoid an unnecessary extra coredump, as we have
>>>> already
>>>>> +            * got the very close representation of GPU's error status
>>>>> +            */
>>>>> +           set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
>>>>> +
>>>>>               r = amdgpu_device_gpu_recover(ring->adev, job,
>>>> &reset_context);
>>>>>               if (r)
>>>>>                       dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

[-- Attachment #2: Type: text/html, Size: 49481 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo
  2024-08-20 16:54           ` Khatri, Sunil
@ 2024-08-21  8:19             ` Huang, Trigger
  0 siblings, 0 replies; 12+ messages in thread
From: Huang, Trigger @ 2024-08-21  8:19 UTC (permalink / raw)
  To: Khatri, Sunil, Alex Deucher
  Cc: amd-gfx@lists.freedesktop.org, Deucher,  Alexander

[-- Attachment #1: Type: text/plain, Size: 11578 bytes --]

[AMD Official Use Only - AMD Internal Distribution Only]




Hi Sunil,



Move your two comments here since the email format has changed.







>+   amdgpu_job_core_dump(adev, job);

>Let's call this after adev->job_hang = true;



Sure, I will modify it in new patch



Thanks,

Trigger



>regards

>Sunil







>I am totally in for dumping the ip registers as early as possible and its a good idea to dump before soft reset.

>I am little worried if we need to stop scheduling any further jobs and process the ones already scheduled. If that's not a problem it looks good to me.

As mentioned by Alex, it shouldn't affect the hw state.

Since the default value of amdgpu_sched_hw_submission is set to 2, there can be 2 jobs in the ring buffer when GPU hang, so when dumping the ring buffer, we may get some PM4 packets that do not belong to this hang job, but CP should not issue new job to graphics pipeline if the current job is not finished, so the HW state is still there for the core dump.

Probably we can set amdgpu_sched_hw_submission=1 for a more precise dump when debugging some specific hang issue.



Thanks,

Trigger





>With one minor change above patch looks fine to me. Just make your try it and see the  time it takes between  "Dumping IP State" and "Dumping IP State Completed" i.e reading registers.



I tested several times on Renoir APU platform, it will cost less than 1 ms to dump the status of gfx_v9_0 + sdma_v4_0+vcn_v2_0.



[  189.186995] amdgpu 0000:02:00.0: amdgpu: Dumping all IP State took 946695 nanoseconds
[  227.226338] amdgpu 0000:02:00.0: amdgpu: Dumping all IP State took 927643 nanoseconds
[  274.512442] amdgpu 0000:02:00.0: amdgpu: Dumping all IP State took 957721 nanoseconds
[  291.186766] amdgpu 0000:02:00.0: amdgpu: Dumping all IP State took 999311 nanoseconds
[  313.559718] amdgpu 0000:02:00.0: amdgpu: Dumping all IP State took 922235 nanoseconds



Thanks,

Trigger



>Acked-by:  Sunil Khatri sunil.khatri@amd.com<mailto:sunil.khatri@amd.com>

>Regards

>Sunil K



From: Khatri, Sunil <Sunil.Khatri@amd.com>
Sent: Wednesday, August 21, 2024 12:54 AM
To: Alex Deucher <alexdeucher@gmail.com>
Cc: Huang, Trigger <Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo



On 8/20/2024 9:31 PM, Alex Deucher wrote:

On Tue, Aug 20, 2024 at 11:31 AM Khatri, Sunil <sunil.khatri@amd.com><mailto:sunil.khatri@amd.com> wrote:





On 8/20/2024 1:00 PM, Huang, Trigger wrote:

[AMD Official Use Only - AMD Internal Distribution Only]



-----Original Message-----

From: Khatri, Sunil <Sunil.Khatri@amd.com><mailto:Sunil.Khatri@amd.com>

Sent: Monday, August 19, 2024 6:31 PM

To: Huang, Trigger <Trigger.Huang@amd.com><mailto:Trigger.Huang@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>

Cc: Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>

Subject: Re: [PATCH 2/2] drm/amdgpu: Do core dump immediately when job

tmo





On 8/19/2024 3:23 PM, Trigger.Huang@amd.com<mailto:Trigger.Huang@amd.com> wrote:

From: Trigger Huang <Trigger.Huang@amd.com><mailto:Trigger.Huang@amd.com>



Do the coredump immediately after a job timeout to get a closer

representation of GPU's error status.



V2: This will skip printing vram_lost as the GPU reset is not happened

yet (Alex)



V3: Unconditionally call the core dump as we care about all the reset

functions(soft-recovery and queue reset and full adapter reset, Alex)



Signed-off-by: Trigger Huang <Trigger.Huang@amd.com><mailto:Trigger.Huang@amd.com>

---

   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 62

+++++++++++++++++++++++++

   1 file changed, 62 insertions(+)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

index c6a1783fc9ef..ebbb1434073e 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

@@ -30,6 +30,61 @@

   #include "amdgpu.h"

   #include "amdgpu_trace.h"

   #include "amdgpu_reset.h"

+#include "amdgpu_dev_coredump.h"

+#include "amdgpu_xgmi.h"

+

+static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,

+                               struct amdgpu_job *job)

+{

+   int i;

+

+   dev_info(adev->dev, "Dumping IP State\n");

+   for (i = 0; i < adev->num_ip_blocks; i++) {

+           if (adev->ip_blocks[i].version->funcs->dump_ip_state)

+                   adev->ip_blocks[i].version->funcs

+                           ->dump_ip_state((void *)adev);

+           dev_info(adev->dev, "Dumping IP State Completed\n");

+   }

+

+   amdgpu_coredump(adev, true, false, job); }

+

+static void amdgpu_job_core_dump(struct amdgpu_device *adev,

+                            struct amdgpu_job *job)

+{

+   struct list_head device_list, *device_list_handle =  NULL;

+   struct amdgpu_device *tmp_adev = NULL;

+   struct amdgpu_hive_info *hive = NULL;

+

+   if (!amdgpu_sriov_vf(adev))

+           hive = amdgpu_get_xgmi_hive(adev);

+   if (hive)

+           mutex_lock(&hive->hive_lock);

+   /*

+    * Reuse the logic in amdgpu_device_gpu_recover() to build list of

+    * devices for core dump

+    */

+   INIT_LIST_HEAD(&device_list);

+   if (!amdgpu_sriov_vf(adev) && (adev-

gmc.xgmi.num_physical_nodes > 1) && hive) {

+           list_for_each_entry(tmp_adev, &hive->device_list,

gmc.xgmi.head)

+                   list_add_tail(&tmp_adev->reset_list, &device_list);

+           if (!list_is_first(&adev->reset_list, &device_list))

+                   list_rotate_to_front(&adev->reset_list, &device_list);

+           device_list_handle = &device_list;

+   } else {

+           list_add_tail(&adev->reset_list, &device_list);

+           device_list_handle = &device_list;

+   }

+

+   /* Do the coredump for each device */

+   list_for_each_entry(tmp_adev, device_list_handle, reset_list)

+           amdgpu_job_do_core_dump(tmp_adev, job);

+

+   if (hive) {

+           mutex_unlock(&hive->hive_lock);

+           amdgpu_put_xgmi_hive(hive);

+   }

+}



   static enum drm_gpu_sched_stat amdgpu_job_timedout(struct

drm_sched_job *s_job)

   {

@@ -48,6 +103,7 @@ static enum drm_gpu_sched_stat

amdgpu_job_timedout(struct drm_sched_job *s_job)

             return DRM_GPU_SCHED_STAT_ENODEV;

     }



+   amdgpu_job_core_dump(adev, job);

Let's call this after adev->job_hang = true;

regards
Sunil



The philosophy is hang and recovery is to let the HW and software try to

recover. Here we try to do a soft recovery first and i think we should wait for

soft recovery, and if it fails then we do the dump, and that's exactly what we are doing here.

Hi Sunil ,

thanks for the suggestion, and that's reasonable. But my concern is that after soft recovery happened, the GPU's status may change(take gfx 9 for example, it will try to kill the current hang wave)

  Actually, in most cases, a real shader hang cannot be resolved through soft recovery, and at that moment, we need to get a very close dump/snapshot/representation of GPU's current error status.

Just like the scandump, when we trying to do a scandump for a shader hang, we will disable gpu_recovery, and no soft recovery/per-queue reset/HW reset will happen before the scandump, right?

On most products, there are no scandump interfaces, so core dump is even more important for debugging GPU hang issue.



Regards,

Trigger



Also we need to make sure that the tasks which are already in queue are put

on hold and their sync points are signalled before we dump.

check once what all steps are taken before we dump in the current

implementation.

Do you mean something like:

         drm_sched_wqueue_stop(&ring->sched);

         amdgpu_fence_driver_force_completion(ring); // Since there is no GPU reset happened, is it reasonable to call it here?

         amdgpu_job_core_dump(adev, job);

Before hard reset we do all those things. But i guess we may not need

that in case we capturing before soft reset itself but you need to test

it, to see the dump values are true or not.

Also apart from hardware state we dump a lot of other information like

ring buffers and in case jobs are still submitting we might be not able

to get the right data as the ring might be in use and being

consumed/filled up that time and that's why scheduler stop helps. But in

case soft reset is successful we do not want to do that.



So here is what i think but Alex please suggest if it make sense.

If recovery is disabled : Capture ip dump before soft reset. (Give close

register state but ring buffer need to be seen as it is in use as

scheduler is running)

if recovery is enabled : capture ip dump (Current implementation make

sure to disable drm sched and fence time out)



I think Trigger's proposed logic makes sense.  I don't see a reason

not to dump earlier if we can.  I don't really see what it buys us to

have different behavior depending on whether or not recovery is

enabled.  If per queue resets are successful, then we'd miss the dump

in that case.  If we only dump in job_timedout, then we'd miss the

dump when there is a reset due to something the kernel driver or KFD

has done.
I am totally in for dumping the ip registers as early as possible and its a good idea to dump before soft reset.
I am little worried if we need to stop scheduling any further jobs and process the ones already scheduled. If that's not a problem it looks good to me.




With one minor change the above patch looks fine to me. Just try it and see the time it takes between "Dumping IP State" and "Dumping IP State Completed", i.e. reading registers.

Acked-by:  Sunil Khatri <sunil.khatri@amd.com><mailto:sunil.khatri@amd.com>

Regards
Sunil K



I think what we want is for job_timedout or the similar logic on the

KFD side to call the coredump code when we detect a stuck queue.

We'll need to keep the coredump code in gpu_recover to cover the cases

where we need to reset due to something outside of the user submission

paths.



Alex





function ptr print ip state could be called to capture dump when its

needed in both above cases. Right now print is called when dump is

actually dumped which is when data file which is generated in

devcoredump is read.



Regards

Sunil Khatri





Regards,

Trigger



Regards



Sunil khatri



     adev->job_hang = true;



@@ -101,6 +157,12 @@ static enum drm_gpu_sched_stat

amdgpu_job_timedout(struct drm_sched_job *s_job)

             reset_context.src = AMDGPU_RESET_SRC_JOB;

             clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);



+           /*

+            * To avoid an unnecessary extra coredump, as we have

already

+            * got the very close representation of GPU's error status

+            */

+           set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

+

             r = amdgpu_device_gpu_recover(ring->adev, job,

&reset_context);

             if (r)

                     dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

[-- Attachment #2: Type: text/html, Size: 24866 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-08-21  8:19 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-19  9:53 [PATCH 0/2] Improve the dev coredump Trigger.Huang
2024-08-19  9:53 ` [PATCH 1/2] drm/amdgpu: skip printing vram_lost if needed Trigger.Huang
2024-08-19  9:53 ` [PATCH 2/2] drm/amdgpu: Do core dump immediately when job tmo Trigger.Huang
2024-08-19 10:30   ` Khatri, Sunil
2024-08-20  7:30     ` Huang, Trigger
2024-08-20 14:06       ` Alex Deucher
2024-08-20 15:07         ` Khatri, Sunil
2024-08-20 15:29           ` Alex Deucher
2024-08-20 15:31       ` Khatri, Sunil
2024-08-20 16:01         ` Alex Deucher
2024-08-20 16:54           ` Khatri, Sunil
2024-08-21  8:19             ` Huang, Trigger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox