* [PATCH v4 0/2] Improve the dev coredump for gfx job timeout scenario
@ 2024-08-21 8:38 Trigger.Huang
2024-08-21 8:38 ` [PATCH v4 1/2] drm/amdgpu: skip printing vram_lost if needed Trigger.Huang
2024-08-21 8:38 ` [PATCH v4 2/2] drm/amdgpu: Do core dump immediately when job tmo Trigger.Huang
0 siblings, 2 replies; 5+ messages in thread
From: Trigger.Huang @ 2024-08-21 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: sunil.khatri, alexander.deucher, Trigger Huang
From: Trigger Huang <Trigger.Huang@amd.com>
The current dev coredump implementation sometimes cannot fully satisfy customer's requirements due to:
1, dev coredump is called in GPU reset function, so if GPU reset is disabled, the dev coredump is also disabled
2, When a job timeout happens, the GPU status dump happens only after a lot of operations, like soft_reset. The concern here is that the captured status is then not so close to the real GPU's error status
The new solution will unconditionally call dev coredump immediately after a job timeout to get a closer representation of GPU's error status
Trigger Huang (2):
drm/amdgpu: skip printing vram_lost if needed
drm/amdgpu: Do core dump immediately when job tmo
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 20 +++---
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h | 7 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 68 ++++++++++++++++++-
4 files changed, 82 insertions(+), 15 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v4 1/2] drm/amdgpu: skip printing vram_lost if needed
2024-08-21 8:38 [PATCH v4 0/2] Improve the dev coredump for gfx job timeout scenario Trigger.Huang
@ 2024-08-21 8:38 ` Trigger.Huang
2024-08-21 8:38 ` [PATCH v4 2/2] drm/amdgpu: Do core dump immediately when job tmo Trigger.Huang
1 sibling, 0 replies; 5+ messages in thread
From: Trigger.Huang @ 2024-08-21 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: sunil.khatri, alexander.deucher, Trigger Huang
From: Trigger Huang <Trigger.Huang@amd.com>
The vram lost status can only be obtained after a GPU reset occurs, but
sometimes a dev core dump can happen before a GPU reset. So a new
argument is added to tell the dev core dump implementation whether to
skip printing the vram_lost status in the dump.
And this patch is also trying to decouple the core dump function from
the GPU reset function, by replacing the argument amdgpu_reset_context
with amdgpu_job to specify the context for core dump.
V2: Inform user if VRAM lost check is skipped so users don't assume
VRAM wasn't lost (Alex)
Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
---
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 20 ++++++++++---------
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h | 7 +++----
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
3 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
index cf2b4dd4d865..5ac59b62020c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
@@ -28,8 +28,8 @@
#include "atom.h"
#ifndef CONFIG_DEV_COREDUMP
-void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
- struct amdgpu_reset_context *reset_context)
+void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
+ bool vram_lost, struct amdgpu_job *job)
{
}
#else
@@ -315,7 +315,9 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
}
}
- if (coredump->reset_vram_lost)
+ if (coredump->skip_vram_check)
+ drm_printf(&p, "VRAM lost check is skipped!\n");
+ else if (coredump->reset_vram_lost)
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
return count - iter.remain;
@@ -326,12 +328,11 @@ static void amdgpu_devcoredump_free(void *data)
kfree(data);
}
-void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
- struct amdgpu_reset_context *reset_context)
+void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
+ bool vram_lost, struct amdgpu_job *job)
{
- struct amdgpu_coredump_info *coredump;
struct drm_device *dev = adev_to_drm(adev);
- struct amdgpu_job *job = reset_context->job;
+ struct amdgpu_coredump_info *coredump;
struct drm_sched_job *s_job;
coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
@@ -341,11 +342,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
return;
}
+ coredump->skip_vram_check = skip_vram_check;
coredump->reset_vram_lost = vram_lost;
- if (reset_context->job && reset_context->job->vm) {
+ if (job && job->vm) {
+ struct amdgpu_vm *vm = job->vm;
struct amdgpu_task_info *ti;
- struct amdgpu_vm *vm = reset_context->job->vm;
ti = amdgpu_vm_get_task_info_vm(vm);
if (ti) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
index 52459512cb2b..ef9772c6bcc9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
@@ -26,7 +26,6 @@
#define __AMDGPU_DEV_COREDUMP_H__
#include "amdgpu.h"
-#include "amdgpu_reset.h"
#ifdef CONFIG_DEV_COREDUMP
@@ -36,12 +35,12 @@ struct amdgpu_coredump_info {
struct amdgpu_device *adev;
struct amdgpu_task_info reset_task_info;
struct timespec64 reset_time;
+ bool skip_vram_check;
bool reset_vram_lost;
struct amdgpu_ring *ring;
};
#endif
-void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
- struct amdgpu_reset_context *reset_context);
-
+void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
+ bool vram_lost, struct amdgpu_job *job);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ad97f03f1358..59a443abc11e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5468,7 +5468,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
- amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+ amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v4 2/2] drm/amdgpu: Do core dump immediately when job tmo
2024-08-21 8:38 [PATCH v4 0/2] Improve the dev coredump for gfx job timeout scenario Trigger.Huang
2024-08-21 8:38 ` [PATCH v4 1/2] drm/amdgpu: skip printing vram_lost if needed Trigger.Huang
@ 2024-08-21 8:38 ` Trigger.Huang
2024-08-21 10:02 ` Khatri, Sunil
2024-08-21 17:01 ` Deucher, Alexander
1 sibling, 2 replies; 5+ messages in thread
From: Trigger.Huang @ 2024-08-21 8:38 UTC (permalink / raw)
To: amd-gfx; +Cc: sunil.khatri, alexander.deucher, Trigger Huang
From: Trigger Huang <Trigger.Huang@amd.com>
Do the coredump immediately after a job timeout to get a closer
representation of GPU's error status.
V2: This will skip printing vram_lost as the GPU reset has not
happened yet (Alex)
V3: Unconditionally call the core dump as we care about all the reset
functions(soft-recovery and queue reset and full adapter reset, Alex)
V4: Do the dump after adev->job_hang = true (Sunil)
Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 68 ++++++++++++++++++++++++-
1 file changed, 67 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index c6a1783fc9ef..3000a49b3e5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -30,6 +30,61 @@
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"
+#include "amdgpu_dev_coredump.h"
+#include "amdgpu_xgmi.h"
+
+static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
+ struct amdgpu_job *job)
+{
+ int i;
+
+ dev_info(adev->dev, "Dumping IP State\n");
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (adev->ip_blocks[i].version->funcs->dump_ip_state)
+ adev->ip_blocks[i].version->funcs
+ ->dump_ip_state((void *)adev);
+ dev_info(adev->dev, "Dumping IP State Completed\n");
+ }
+
+ amdgpu_coredump(adev, true, false, job);
+}
+
+static void amdgpu_job_core_dump(struct amdgpu_device *adev,
+ struct amdgpu_job *job)
+{
+ struct list_head device_list, *device_list_handle = NULL;
+ struct amdgpu_device *tmp_adev = NULL;
+ struct amdgpu_hive_info *hive = NULL;
+
+ if (!amdgpu_sriov_vf(adev))
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (hive)
+ mutex_lock(&hive->hive_lock);
+ /*
+ * Reuse the logic in amdgpu_device_gpu_recover() to build list of
+ * devices for core dump
+ */
+ INIT_LIST_HEAD(&device_list);
+ if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ list_add_tail(&tmp_adev->reset_list, &device_list);
+ if (!list_is_first(&adev->reset_list, &device_list))
+ list_rotate_to_front(&adev->reset_list, &device_list);
+ device_list_handle = &device_list;
+ } else {
+ list_add_tail(&adev->reset_list, &device_list);
+ device_list_handle = &device_list;
+ }
+
+ /* Do the coredump for each device */
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+ amdgpu_job_do_core_dump(tmp_adev, job);
+
+ if (hive) {
+ mutex_unlock(&hive->hive_lock);
+ amdgpu_put_xgmi_hive(hive);
+ }
+}
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
@@ -48,9 +103,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}
-
adev->job_hang = true;
+ /*
+ * Do the coredump immediately after a job timeout to get a very
+ * close dump/snapshot/representation of GPU's current error status
+ */
+ amdgpu_job_core_dump(adev, job);
+
if (amdgpu_gpu_recovery &&
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
@@ -101,6 +161,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
reset_context.src = AMDGPU_RESET_SRC_JOB;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*
+ * To avoid an unnecessary extra coredump, as we have already
+ * got the very close representation of GPU's error status
+ */
+ set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
if (r)
dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v4 2/2] drm/amdgpu: Do core dump immediately when job tmo
2024-08-21 8:38 ` [PATCH v4 2/2] drm/amdgpu: Do core dump immediately when job tmo Trigger.Huang
@ 2024-08-21 10:02 ` Khatri, Sunil
2024-08-21 17:01 ` Deucher, Alexander
1 sibling, 0 replies; 5+ messages in thread
From: Khatri, Sunil @ 2024-08-21 10:02 UTC (permalink / raw)
To: Trigger.Huang, amd-gfx; +Cc: alexander.deucher
[-- Attachment #1: Type: text/plain, Size: 4144 bytes --]
Acked-by: Sunil Khatri <sunil.khatri@amd.com> <mailto:sunil.khatri@amd.com>
On 8/21/2024 2:08 PM, Trigger.Huang@amd.com wrote:
> From: Trigger Huang <Trigger.Huang@amd.com>
>
> Do the coredump immediately after a job timeout to get a closer
> representation of GPU's error status.
>
> V2: This will skip printing vram_lost as the GPU reset is not
> happened yet (Alex)
>
> V3: Unconditionally call the core dump as we care about all the reset
> functions(soft-recovery and queue reset and full adapter reset, Alex)
>
> V4: Do the dump after adev->job_hang = true (Sunil)
>
> Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 68 ++++++++++++++++++++++++-
> 1 file changed, 67 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index c6a1783fc9ef..3000a49b3e5c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -30,6 +30,61 @@
> #include "amdgpu.h"
> #include "amdgpu_trace.h"
> #include "amdgpu_reset.h"
> +#include "amdgpu_dev_coredump.h"
> +#include "amdgpu_xgmi.h"
> +
> +static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
> + struct amdgpu_job *job)
> +{
> + int i;
> +
> + dev_info(adev->dev, "Dumping IP State\n");
> + for (i = 0; i < adev->num_ip_blocks; i++) {
> + if (adev->ip_blocks[i].version->funcs->dump_ip_state)
> + adev->ip_blocks[i].version->funcs
> + ->dump_ip_state((void *)adev);
> + dev_info(adev->dev, "Dumping IP State Completed\n");
> + }
> +
> + amdgpu_coredump(adev, true, false, job);
> +}
> +
> +static void amdgpu_job_core_dump(struct amdgpu_device *adev,
> + struct amdgpu_job *job)
> +{
> + struct list_head device_list, *device_list_handle = NULL;
> + struct amdgpu_device *tmp_adev = NULL;
> + struct amdgpu_hive_info *hive = NULL;
> +
> + if (!amdgpu_sriov_vf(adev))
> + hive = amdgpu_get_xgmi_hive(adev);
> + if (hive)
> + mutex_lock(&hive->hive_lock);
> + /*
> + * Reuse the logic in amdgpu_device_gpu_recover() to build list of
> + * devices for code dump
> + */
> + INIT_LIST_HEAD(&device_list);
> + if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
> + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
> + list_add_tail(&tmp_adev->reset_list, &device_list);
> + if (!list_is_first(&adev->reset_list, &device_list))
> + list_rotate_to_front(&adev->reset_list, &device_list);
> + device_list_handle = &device_list;
> + } else {
> + list_add_tail(&adev->reset_list, &device_list);
> + device_list_handle = &device_list;
> + }
> +
> + /* Do the coredump for each device */
> + list_for_each_entry(tmp_adev, device_list_handle, reset_list)
> + amdgpu_job_do_core_dump(tmp_adev, job);
> +
> + if (hive) {
> + mutex_unlock(&hive->hive_lock);
> + amdgpu_put_xgmi_hive(hive);
> + }
> +}
>
> static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> {
> @@ -48,9 +103,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> return DRM_GPU_SCHED_STAT_ENODEV;
> }
>
> -
> adev->job_hang = true;
>
> + /*
> + * Do the coredump immediately after a job timeout to get a very
> + * close dump/snapshot/representation of GPU's current error status
> + */
> + amdgpu_job_core_dump(adev, job);
> +
> if (amdgpu_gpu_recovery &&
> amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
> @@ -101,6 +161,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> reset_context.src = AMDGPU_RESET_SRC_JOB;
> clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>
> + /*
> + * To avoid an unnecessary extra coredump, as we have already
> + * got the very close representation of GPU's error status
> + */
> + set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
> +
> r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
> if (r)
> dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);
[-- Attachment #2: Type: text/html, Size: 40380 bytes --]
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v4 2/2] drm/amdgpu: Do core dump immediately when job tmo
2024-08-21 8:38 ` [PATCH v4 2/2] drm/amdgpu: Do core dump immediately when job tmo Trigger.Huang
2024-08-21 10:02 ` Khatri, Sunil
@ 2024-08-21 17:01 ` Deucher, Alexander
1 sibling, 0 replies; 5+ messages in thread
From: Deucher, Alexander @ 2024-08-21 17:01 UTC (permalink / raw)
To: Huang, Trigger, amd-gfx@lists.freedesktop.org; +Cc: Khatri, Sunil
[-- Attachment #1: Type: text/plain, Size: 5077 bytes --]
[AMD Official Use Only - AMD Internal Distribution Only]
Series is:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
________________________________
From: Huang, Trigger <Trigger.Huang@amd.com>
Sent: Wednesday, August 21, 2024 4:38 AM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Khatri, Sunil <Sunil.Khatri@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Trigger <Trigger.Huang@amd.com>
Subject: [PATCH v4 2/2] drm/amdgpu: Do core dump immediately when job tmo
From: Trigger Huang <Trigger.Huang@amd.com>
Do the coredump immediately after a job timeout to get a closer
representation of GPU's error status.
V2: This will skip printing vram_lost as the GPU reset is not
happened yet (Alex)
V3: Unconditionally call the core dump as we care about all the reset
functions(soft-recovery and queue reset and full adapter reset, Alex)
V4: Do the dump after adev->job_hang = true (Sunil)
Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 68 ++++++++++++++++++++++++-
1 file changed, 67 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index c6a1783fc9ef..3000a49b3e5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -30,6 +30,61 @@
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"
+#include "amdgpu_dev_coredump.h"
+#include "amdgpu_xgmi.h"
+
+static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
+ struct amdgpu_job *job)
+{
+ int i;
+
+ dev_info(adev->dev, "Dumping IP State\n");
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (adev->ip_blocks[i].version->funcs->dump_ip_state)
+ adev->ip_blocks[i].version->funcs
+ ->dump_ip_state((void *)adev);
+ dev_info(adev->dev, "Dumping IP State Completed\n");
+ }
+
+ amdgpu_coredump(adev, true, false, job);
+}
+
+static void amdgpu_job_core_dump(struct amdgpu_device *adev,
+ struct amdgpu_job *job)
+{
+ struct list_head device_list, *device_list_handle = NULL;
+ struct amdgpu_device *tmp_adev = NULL;
+ struct amdgpu_hive_info *hive = NULL;
+
+ if (!amdgpu_sriov_vf(adev))
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (hive)
+ mutex_lock(&hive->hive_lock);
+ /*
+ * Reuse the logic in amdgpu_device_gpu_recover() to build list of
+ * devices for code dump
+ */
+ INIT_LIST_HEAD(&device_list);
+ if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ list_add_tail(&tmp_adev->reset_list, &device_list);
+ if (!list_is_first(&adev->reset_list, &device_list))
+ list_rotate_to_front(&adev->reset_list, &device_list);
+ device_list_handle = &device_list;
+ } else {
+ list_add_tail(&adev->reset_list, &device_list);
+ device_list_handle = &device_list;
+ }
+
+ /* Do the coredump for each device */
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+ amdgpu_job_do_core_dump(tmp_adev, job);
+
+ if (hive) {
+ mutex_unlock(&hive->hive_lock);
+ amdgpu_put_xgmi_hive(hive);
+ }
+}
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
@@ -48,9 +103,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}
-
adev->job_hang = true;
+ /*
+ * Do the coredump immediately after a job timeout to get a very
+ * close dump/snapshot/representation of GPU's current error status
+ */
+ amdgpu_job_core_dump(adev, job);
+
if (amdgpu_gpu_recovery &&
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
@@ -101,6 +161,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
reset_context.src = AMDGPU_RESET_SRC_JOB;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*
+ * To avoid an unnecessary extra coredump, as we have already
+ * got the very close representation of GPU's error status
+ */
+ set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
if (r)
dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);
--
2.34.1
[-- Attachment #2: Type: text/html, Size: 10665 bytes --]
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-08-21 17:01 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-21 8:38 [PATCH v4 0/2] Improve the dev coredump for gfx job timeout scenario Trigger.Huang
2024-08-21 8:38 ` [PATCH v4 1/2] drm/amdgpu: skip printing vram_lost if needed Trigger.Huang
2024-08-21 8:38 ` [PATCH v4 2/2] drm/amdgpu: Do core dump immediately when job tmo Trigger.Huang
2024-08-21 10:02 ` Khatri, Sunil
2024-08-21 17:01 ` Deucher, Alexander
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox