All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jordan Crouse <jcrouse-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
To: freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: linux-arm-msm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: [PATCH 09/13] drm/msm/gpu: Capture the GPU state on a GPU hang
Date: Fri, 29 Jun 2018 10:56:37 -0600	[thread overview]
Message-ID: <20180629165641.1348-10-jcrouse@codeaurora.org> (raw)
In-Reply-To: <20180629165641.1348-1-jcrouse-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>

Capture the GPU state on a GPU hang and store it for later playback
via the devcoredump facility. Only one crash state is stored at a
time on the assumption that the first hang is usually the most
interesting. The existing crash state can be cleared once it has been
captured, after which a new one will be captured on the next hang.

Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
---
 drivers/gpu/drm/msm/Kconfig             |  1 +
 drivers/gpu/drm/msm/adreno/a3xx_gpu.c   |  2 +-
 drivers/gpu/drm/msm/adreno/a4xx_gpu.c   |  2 +-
 drivers/gpu/drm/msm/adreno/a5xx_gpu.c   |  4 +-
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 36 +++++++----
 drivers/gpu/drm/msm/adreno/adreno_gpu.h |  6 +-
 drivers/gpu/drm/msm/msm_debugfs.c       |  5 +-
 drivers/gpu/drm/msm/msm_gpu.c           | 83 ++++++++++++++++++++++++-
 drivers/gpu/drm/msm/msm_gpu.h           | 38 ++++++++++-
 9 files changed, 154 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
index 38cbde971b48..843a9d40c05e 100644
--- a/drivers/gpu/drm/msm/Kconfig
+++ b/drivers/gpu/drm/msm/Kconfig
@@ -12,6 +12,7 @@ config DRM_MSM
 	select SHMEM
 	select TMPFS
 	select QCOM_SCM
+	select WANT_DEV_COREDUMP
 	select SND_SOC_HDMI_CODEC if SND_SOC
 	select SYNC_FILE
 	select PM_OPP
diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
index 4cffec2b6adc..fc502e412132 100644
--- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
@@ -454,7 +454,7 @@ static const struct adreno_gpu_funcs funcs = {
 		.active_ring = adreno_active_ring,
 		.irq = a3xx_irq,
 		.destroy = a3xx_destroy,
-#ifdef CONFIG_DEBUG_FS
+#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
 		.show = adreno_show,
 #endif
 		.gpu_state_get = a3xx_gpu_state_get,
diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
index 95f08c22e8d7..8129cf037db1 100644
--- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
@@ -540,7 +540,7 @@ static const struct adreno_gpu_funcs funcs = {
 		.active_ring = adreno_active_ring,
 		.irq = a4xx_irq,
 		.destroy = a4xx_destroy,
-#ifdef CONFIG_DEBUG_FS
+#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
 		.show = adreno_show,
 #endif
 		.gpu_state_get = a4xx_gpu_state_get,
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 5f1aab3c1cb1..16074fa6bf1e 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -1243,8 +1243,10 @@ static const struct adreno_gpu_funcs funcs = {
 		.active_ring = a5xx_active_ring,
 		.irq = a5xx_irq,
 		.destroy = a5xx_destroy,
-#ifdef CONFIG_DEBUG_FS
+#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
 		.show = adreno_show,
+#endif
+#if defined(CONFIG_DEBUG_FS)
 		.debugfs_init = a5xx_debugfs_init,
 #endif
 		.gpu_busy = a5xx_gpu_busy,
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 0e937eedcec5..163542487e2c 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -378,6 +378,8 @@ struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu)
 	if (!state)
 		return ERR_PTR(-ENOMEM);
 
+	kref_init(&state->ref);
+
 	do_gettimeofday(&state->time);
 
 	for (i = 0; i < gpu->nr_rings; i++) {
@@ -413,18 +415,28 @@ struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu)
 	return state;
 }
 
-void adreno_gpu_state_put(struct msm_gpu_state *state)
+static void adreno_gpu_state_destroy(struct kref *kref)
 {
-	if (IS_ERR_OR_NULL(state))
-		return;
+	struct msm_gpu_state *state = container_of(kref,
+		struct msm_gpu_state, ref);
 
+	kfree(state->comm);
+	kfree(state->cmd);
 	kfree(state->registers);
 	kfree(state);
 }
 
-#ifdef CONFIG_DEBUG_FS
+int adreno_gpu_state_put(struct msm_gpu_state *state)
+{
+	if (IS_ERR_OR_NULL(state))
+		return 1;
+
+	return kref_put(&state->ref, adreno_gpu_state_destroy);
+}
+
+#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
 void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
-		struct seq_file *m)
+		struct drm_printer *p)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	int i;
@@ -432,23 +444,23 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
 	if (IS_ERR_OR_NULL(state))
 		return;
 
-	seq_printf(m, "status:   %08x\n", state->rbbm_status);
-	seq_printf(m, "revision: %d (%d.%d.%d.%d)\n",
+	drm_printf(p, "status:   %08x\n", state->rbbm_status);
+	drm_printf(p, "revision: %d (%d.%d.%d.%d)\n",
 			adreno_gpu->info->revn, adreno_gpu->rev.core,
 			adreno_gpu->rev.major, adreno_gpu->rev.minor,
 			adreno_gpu->rev.patchid);
 
 	for (i = 0; i < gpu->nr_rings; i++) {
-		seq_printf(m, "rb %d: fence:    %d/%d\n", i,
+		drm_printf(p, "rb %d: fence:    %d/%d\n", i,
 			state->ring[i].fence, state->ring[i].seqno);
 
-		seq_printf(m, "      rptr:     %d\n", state->ring[i].rptr);
-		seq_printf(m, "rb wptr:  %d\n", state->ring[i].wptr);
+		drm_printf(p, "      rptr:     %d\n", state->ring[i].rptr);
+		drm_printf(p, "rb wptr:  %d\n", state->ring[i].wptr);
 	}
 
-	seq_printf(m, "IO:region %s 00000000 00020000\n", gpu->name);
+	drm_printf(p, "IO:region %s 00000000 00020000\n", gpu->name);
 	for (i = 0; i < state->nr_registers; i++) {
-		seq_printf(m, "IO:R %08x %08x\n",
+		drm_printf(p, "IO:R %08x %08x\n",
 			state->registers[i * 2] << 2,
 			state->registers[(i * 2) + 1]);
 	}
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
index 90b6b59252af..4a868aaf1a70 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
@@ -215,9 +215,9 @@ void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 		struct msm_file_private *ctx);
 void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
 bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
-#ifdef CONFIG_DEBUG_FS
+#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
 void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
-		struct seq_file *m);
+		struct drm_printer *p);
 #endif
 void adreno_dump_info(struct msm_gpu *gpu);
 void adreno_dump(struct msm_gpu *gpu);
@@ -231,7 +231,7 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu);
 
 
 struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu);
-void adreno_gpu_state_put(struct msm_gpu_state *state);
+int adreno_gpu_state_put(struct msm_gpu_state *state);
 
 /* ringbuffer helpers (the parts that are adreno specific) */
 
diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
index c3da12179888..f0da0d3c8a80 100644
--- a/drivers/gpu/drm/msm/msm_debugfs.c
+++ b/drivers/gpu/drm/msm/msm_debugfs.c
@@ -29,6 +29,7 @@ struct msm_gpu_show_priv {
 
 static int msm_gpu_show(struct seq_file *m, void *arg)
 {
+	struct drm_printer p = drm_seq_file_printer(m);
 	struct msm_gpu_show_priv *show_priv = m->private;
 	struct msm_drm_private *priv = show_priv->dev->dev_private;
 	struct msm_gpu *gpu = priv->gpu;
@@ -38,8 +39,8 @@ static int msm_gpu_show(struct seq_file *m, void *arg)
 	if (ret)
 		return ret;
 
-	seq_printf(m, "%s Status:\n", gpu->name);
-	gpu->funcs->show(gpu, show_priv->state, m);
+	drm_printf(&p, "%s Status:\n", gpu->name);
+	gpu->funcs->show(gpu, show_priv->state, &p);
 
 	mutex_unlock(&show_priv->dev->struct_mutex);
 
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 2ca354047250..1945736fc448 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -20,10 +20,11 @@
 #include "msm_mmu.h"
 #include "msm_fence.h"
 
+#include <generated/utsrelease.h>
 #include <linux/string_helpers.h>
 #include <linux/pm_opp.h>
 #include <linux/devfreq.h>
-
+#include <linux/devcoredump.h>
 
 /*
  * Power Management:
@@ -273,6 +274,81 @@ int msm_gpu_hw_init(struct msm_gpu *gpu)
 	return ret;
 }
 
+#ifdef CONFIG_DEV_COREDUMP
+static ssize_t msm_gpu_devcoredump_read(char *buffer, loff_t offset,
+		size_t count, void *data, size_t datalen)
+{
+	struct msm_gpu *gpu = data;
+	struct drm_print_iterator iter;
+	struct drm_printer p;
+	struct msm_gpu_state *state;
+
+	state = msm_gpu_crashstate_get(gpu);
+	if (!state)
+		return 0;
+
+	iter.data = buffer;
+	iter.offset = 0;
+	iter.start = offset;
+	iter.remain = count;
+
+	p = drm_coredump_printer(&iter);
+
+	drm_printf(&p, "---\n");
+	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
+	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
+	drm_printf(&p, "time: %ld.%ld\n",
+		state->time.tv_sec, state->time.tv_usec);
+	if (state->comm)
+		drm_printf(&p, "comm: %s\n", state->comm);
+	if (state->cmd)
+		drm_printf(&p, "cmdline: %s\n", state->cmd);
+
+	gpu->funcs->show(gpu, state, &p);
+
+	msm_gpu_crashstate_put(gpu);
+
+	return count - iter.remain;
+}
+
+static void msm_gpu_devcoredump_free(void *data)
+{
+	struct msm_gpu *gpu = data;
+
+	msm_gpu_crashstate_put(gpu);
+}
+
+static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, char *comm,
+		char *cmd)
+{
+	struct msm_gpu_state *state;
+
+	/* Only save one crash state at a time */
+	if (gpu->crashstate)
+		return;
+
+	state = gpu->funcs->gpu_state_get(gpu);
+	if (IS_ERR_OR_NULL(state))
+		return;
+
+	/* Fill in the additional crash state information */
+	state->comm = kstrdup(comm, GFP_KERNEL);
+	state->cmd = kstrdup(cmd, GFP_KERNEL);
+
+	/* Set the active crash state to be dumped on failure */
+	gpu->crashstate = state;
+
+	/* FIXME: Release the crashstate if this errors out? */
+	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
+		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
+}
+#else
+static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, char *comm,
+		char *cmd)
+{
+}
+#endif
+
 /*
  * Hangcheck detection for locked gpu:
  */
@@ -356,6 +432,11 @@ static void recover_worker(struct work_struct *work)
 			msm_rd_dump_submit(priv->hangrd, submit, NULL);
 	}
 
+	/* Record the crash state */
+	pm_runtime_get_sync(&gpu->pdev->dev);
+	msm_gpu_crashstate_capture(gpu, comm, cmd);
+	pm_runtime_put_sync(&gpu->pdev->dev);
+
 	kfree(cmd);
 	kfree(comm);
 
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 470f3bb5f834..e65f507954c0 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -66,13 +66,13 @@ struct msm_gpu_funcs {
 #ifdef CONFIG_DEBUG_FS
 	/* show GPU status in debugfs: */
 	void (*show)(struct msm_gpu *gpu, struct msm_gpu_state *state,
-			struct seq_file *m);
+			struct drm_printer *p);
 	/* for generation specific debugfs: */
 	int (*debugfs_init)(struct msm_gpu *gpu, struct drm_minor *minor);
 #endif
 	int (*gpu_busy)(struct msm_gpu *gpu, uint64_t *value);
 	struct msm_gpu_state *(*gpu_state_get)(struct msm_gpu *gpu);
-	void (*gpu_state_put)(struct msm_gpu_state *state);
+	int (*gpu_state_put)(struct msm_gpu_state *state);
 };
 
 struct msm_gpu {
@@ -133,6 +133,8 @@ struct msm_gpu {
 		u64 busy_cycles;
 		ktime_t time;
 	} devfreq;
+
+	struct msm_gpu_state *crashstate;
 };
 
 /* It turns out that all targets use the same ringbuffer size */
@@ -180,6 +182,7 @@ struct msm_gpu_submitqueue {
 };
 
 struct msm_gpu_state {
+	struct kref ref;
 	struct timeval time;
 
 	struct {
@@ -193,6 +196,9 @@ struct msm_gpu_state {
 	u32 *registers;
 
 	u32 rbbm_status;
+
+	char *comm;
+	char *cmd;
 };
 
 static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
@@ -274,4 +280,32 @@ static inline void msm_submitqueue_put(struct msm_gpu_submitqueue *queue)
 		kref_put(&queue->ref, msm_submitqueue_destroy);
 }
 
+static inline struct msm_gpu_state *msm_gpu_crashstate_get(struct msm_gpu *gpu)
+{
+	struct msm_gpu_state *state = NULL;
+
+	mutex_lock(&gpu->dev->struct_mutex);
+
+	if (gpu->crashstate) {
+		kref_get(&gpu->crashstate->ref);
+		state = gpu->crashstate;
+	}
+
+	mutex_unlock(&gpu->dev->struct_mutex);
+
+	return state;
+}
+
+static inline void msm_gpu_crashstate_put(struct msm_gpu *gpu)
+{
+	mutex_lock(&gpu->dev->struct_mutex);
+
+	if (gpu->crashstate) {
+		if (gpu->funcs->gpu_state_put(gpu->crashstate))
+			gpu->crashstate = NULL;
+	}
+
+	mutex_unlock(&gpu->dev->struct_mutex);
+}
+
 #endif /* __MSM_GPU_H__ */
-- 
2.17.1

_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno

  parent reply	other threads:[~2018-06-29 16:56 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-29 16:56 [v6 00/13] drm/msm: Capture and dump the GPU crash state Jordan Crouse
2018-06-29 16:56 ` [PATCH 01/13] include: Move ascii85 functions from i915 to linux/ascii85.h Jordan Crouse
     [not found] ` <20180629165641.1348-1-jcrouse-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-06-29 16:56   ` [PATCH 02/13] drm: drm_printer: Add printer for devcoredump Jordan Crouse
2018-06-29 16:56   ` [PATCH 03/13] drm: Add drm_puts() to complement drm_printf() Jordan Crouse
2018-06-29 16:56   ` [PATCH 04/13] drm: Add puts function for the seq_file printer Jordan Crouse
2018-06-29 16:56   ` [PATCH 05/13] drm: Add put callback for the coredump printer Jordan Crouse
2018-06-29 16:56   ` [PATCH 06/13] drm/msm/gpu: Capture the state of the GPU Jordan Crouse
2018-06-29 16:56   ` [PATCH 07/13] drm/msm/gpu: Convert the GPU show function to use the GPU state Jordan Crouse
2018-06-29 16:56   ` [PATCH 08/13] drm/msm/gpu: Rearrange the code that collects the task during a hang Jordan Crouse
2018-06-29 16:56   ` Jordan Crouse [this message]
2018-06-29 20:51     ` [PATCH 09/13] drm/msm/gpu: Capture the GPU state on a GPU hang kbuild test robot
2018-06-29 16:56   ` [PATCH 10/13] drm/msm/adreno: Convert the show/crash file format Jordan Crouse
2018-06-29 16:56   ` [PATCH 11/13] drm/msm/adreno: Add ringbuffer data to the GPU state Jordan Crouse
2018-06-29 16:56   ` [PATCH 12/13] drm/msm/adreno: Add a5xx specific registers for " Jordan Crouse
2018-06-29 16:56   ` [PATCH 13/13] drm/msm/gpu: Add the buffer objects from the submit to the crash dump Jordan Crouse
     [not found]     ` <20180629165641.1348-14-jcrouse-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-06-29 19:31       ` kbuild test robot
  -- strict thread matches above, loose matches on Subject: below --
2018-07-12 18:59 [v7 00/13] drm/msm: Capture and dump the GPU crash state Jordan Crouse
     [not found] ` <20180712185930.2492-1-jcrouse-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-07-12 18:59   ` [PATCH 09/13] drm/msm/gpu: Capture the GPU state on a GPU hang Jordan Crouse
2018-07-24 16:33 [v8 PATCH 00/13] drm/msm: Capture and dump the GPU crash state Jordan Crouse
     [not found] ` <20180724163331.18250-1-jcrouse-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-07-24 16:33   ` [PATCH 09/13] drm/msm/gpu: Capture the GPU state on a GPU hang Jordan Crouse

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180629165641.1348-10-jcrouse@codeaurora.org \
    --to=jcrouse-sgv2jx0feol9jmxxk+q4oq@public.gmane.org \
    --cc=dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org \
    --cc=freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org \
    --cc=linux-arm-msm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.