[PATCH 4/4] drm/amdgpu: Do core dump immediately when job tmo

All of lore.kernel.org
 help / color / mirror / Atom feed

From: <Trigger.Huang@amd.com>
To: <amd-gfx@lists.freedesktop.org>
Cc: <sunil.khatri@amd.com>, <alexander.deucher@amd.com>,
	Trigger Huang <Trigger.Huang@amd.com>
Subject: [PATCH 4/4] drm/amdgpu: Do core dump immediately when job tmo
Date: Fri, 16 Aug 2024 15:54:47 +0800	[thread overview]
Message-ID: <20240816075447.442983-5-Trigger.Huang@amd.com> (raw)
In-Reply-To: <20240816075447.442983-1-Trigger.Huang@amd.com>

From: Trigger Huang <Trigger.Huang@amd.com>

Do the coredump immediately after a job timeout to get a closer
representation of GPU's error status.

V2: This will skip printing vram_lost as the GPU reset is not
happened yet (Alex)

Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 64 +++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index c6a1783fc9ef..009551335d05 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -28,9 +28,63 @@
 #include <drm/drm_drv.h>
 
 #include "amdgpu.h"
+#include "amdgpu_dev_coredump.h"
+#include "amdgpu_xgmi.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_reset.h"
 
+static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
+			       struct amdgpu_job *job)
+{
+	int i = 0;
+
+	dev_info(adev->dev, "Dumping IP State\n");
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (adev->ip_blocks[i].version->funcs->dump_ip_state)
+			adev->ip_blocks[i].version->funcs
+				->dump_ip_state((void *)adev);
+		dev_info(adev->dev, "Dumping IP State Completed\n");
+	}
+
+	amdgpu_coredump(adev, true, false, job);
+}
+
+static void amdgpu_job_core_dump(struct amdgpu_device *adev, struct amdgpu_job *job)
+{
+	struct list_head device_list, *device_list_handle =  NULL;
+	struct amdgpu_device *tmp_adev = NULL;
+	struct amdgpu_hive_info *hive = NULL;
+
+	if (!amdgpu_sriov_vf(adev))
+		hive = amdgpu_get_xgmi_hive(adev);
+	if (hive)
+		mutex_lock(&hive->hive_lock);
+	/*
+	 * Reuse the logic in amdgpu_device_gpu_recover() to build list of
+	 * devices for code dump
+	 */
+	INIT_LIST_HEAD(&device_list);
+	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+			list_add_tail(&tmp_adev->reset_list, &device_list);
+		if (!list_is_first(&adev->reset_list, &device_list))
+			list_rotate_to_front(&adev->reset_list, &device_list);
+		device_list_handle = &device_list;
+	} else {
+		list_add_tail(&adev->reset_list, &device_list);
+		device_list_handle = &device_list;
+	}
+
+	/* Do the coredump for each device */
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+		amdgpu_job_do_core_dump(tmp_adev, job);
+
+	if (hive) {
+		mutex_unlock(&hive->hive_lock);
+		amdgpu_put_xgmi_hive(hive);
+	}
+}
+
 static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 {
 	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
@@ -48,6 +102,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		return DRM_GPU_SCHED_STAT_ENODEV;
 	}
 
+	/*
+	 * Do the coredump immediately after a job timeout to get a closer
+	 * representation of GPU's error status.
+	 */
+	if (amdgpu_gpu_coredump)
+		amdgpu_job_core_dump(adev, job);
 
 	adev->job_hang = true;
 
@@ -101,6 +161,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		reset_context.src = AMDGPU_RESET_SRC_JOB;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
+		/* To avoid a double coredump, as we have already done it */
+		if (amdgpu_gpu_coredump)
+			set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
 		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
 		if (r)
 			dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);
-- 
2.34.1

next prev parent reply	other threads:[~2024-08-16  7:55 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-08-16  7:54 [PATCH 0/4] Improve the dev coredump Trigger.Huang
2024-08-16  7:54 ` [PATCH 1/4] drm/amdgpu: Add gpu_coredump parameter Trigger.Huang
2024-08-16 13:55   ` Alex Deucher
2024-08-16  7:54 ` [PATCH 2/4] drm/amdgpu: Use gpu_coredump to control core dump Trigger.Huang
2024-08-16  7:54 ` [PATCH 3/4] drm/amdgpu: skip printing vram_lost if needed Trigger.Huang
2024-08-16 13:52   ` Alex Deucher
2024-08-19  9:30     ` Huang, Trigger
2024-08-16  7:54 ` Trigger.Huang [this message]
2024-08-16 13:58   ` [PATCH 4/4] drm/amdgpu: Do core dump immediately when job tmo Alex Deucher
2024-08-19  9:37     ` Huang, Trigger

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c6a1783fc9e dfblob:009551335d0 )
 OR (
bs:"[PATCH 4/4] drm/amdgpu: Do core dump immediately when job tmo" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240816075447.442983-5-Trigger.Huang@amd.com \
    --to=trigger.huang@amd.com \
    --cc=alexander.deucher@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=sunil.khatri@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.