linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/9] drm/panthor: add devcoredump support
@ 2025-07-20  0:01 Chia-I Wu
  2025-07-20  0:01 ` [PATCH 1/9] " Chia-I Wu
                   ` (9 more replies)
  0 siblings, 10 replies; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

This series adds devcoredump support to panthor.

This is written from scratch and is not based on the prior work[1]. The
main differences are

 - coredump triggers on all faulty/fatal/timeout events
 - state capture and state process are two separated steps, with
   GFP_NOWAIT being used for state capture
 - state capture captures both sw states and hw regs that are
   potentially interesting
 - coredump data is in text format, similar to what msm and xe do

A sample devcoredump can be found at
https://gitlab.freedesktop.org/panfrost/linux/-/issues/44

[1] https://lore.kernel.org/lkml/20240821143826.3720-1-daniel.almeida@collabora.com/

Chia-I Wu (9):
  drm/panthor: add devcoredump support
  drm/panthor: capture GPU state for devcoredump
  drm/panthor: capture GLB state for devcoredump
  drm/panthor: capture CSG state for devcoredump
  drm/panthor: capture CS state for devcoredump
  drm/panthor: capture AS state for devcoredump
  drm/panthor: capture VMA state for devcoredump
  drm/panthor: check bo offset alignment in vm bind
  drm/panthor: add DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE

 drivers/gpu/drm/panthor/Makefile           |   2 +
 drivers/gpu/drm/panthor/panthor_coredump.c | 617 +++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_coredump.h | 178 ++++++
 drivers/gpu/drm/panthor/panthor_device.h   |   6 +
 drivers/gpu/drm/panthor/panthor_drv.c      |   3 +-
 drivers/gpu/drm/panthor/panthor_mmu.c      |  54 +-
 drivers/gpu/drm/panthor/panthor_mmu.h      |   4 +
 drivers/gpu/drm/panthor/panthor_regs.h     |   6 +
 drivers/gpu/drm/panthor/panthor_sched.c    | 104 ++++
 drivers/gpu/drm/panthor/panthor_sched.h    |  14 +
 include/uapi/drm/panthor_drm.h             |   7 +
 11 files changed, 989 insertions(+), 6 deletions(-)
 create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.c
 create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.h

-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 1/9] drm/panthor: add devcoredump support
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
@ 2025-07-20  0:01 ` Chia-I Wu
  2025-07-20  3:17   ` kernel test robot
  2025-07-28 11:24   ` Steven Price
  2025-07-20  0:01 ` [PATCH 2/9] drm/panthor: capture GPU state for devcoredump Chia-I Wu
                   ` (8 subsequent siblings)
  9 siblings, 2 replies; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

Create a devcoredump on any faulty or fatal event. The coredump data is
in YAML format for readability and flexibility.

Only panthor_group state is captured for now.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/Makefile           |   2 +
 drivers/gpu/drm/panthor/panthor_coredump.c | 225 +++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_coredump.h |  68 +++++++
 drivers/gpu/drm/panthor/panthor_device.h   |   6 +
 drivers/gpu/drm/panthor/panthor_sched.c    |  69 +++++++
 drivers/gpu/drm/panthor/panthor_sched.h    |   5 +
 6 files changed, 375 insertions(+)
 create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.c
 create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.h

diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
index 15294719b09c..9fd1e74af1df 100644
--- a/drivers/gpu/drm/panthor/Makefile
+++ b/drivers/gpu/drm/panthor/Makefile
@@ -11,4 +11,6 @@ panthor-y := \
 	panthor_mmu.o \
 	panthor_sched.o
 
+panthor-$(CONFIG_DEV_COREDUMP) += panthor_coredump.o
+
 obj-$(CONFIG_DRM_PANTHOR) += panthor.o
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
new file mode 100644
index 000000000000..767f3327e3e8
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2025 Google LLC */
+
+#include <drm/drm_drv.h>
+#include <drm/drm_print.h>
+#include <drm/drm_managed.h>
+#include <generated/utsrelease.h>
+#include <linux/devcoredump.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+
+#include "panthor_coredump.h"
+#include "panthor_device.h"
+#include "panthor_sched.h"
+
+/**
+ * enum panthor_coredump_mask - Coredump state
+ */
+enum panthor_coredump_mask {
+	PANTHOR_COREDUMP_GROUP = BIT(0),
+};
+
+/**
+ * struct panthor_coredump_header - Coredump header
+ */
+struct panthor_coredump_header {
+	enum panthor_coredump_reason reason;
+	ktime_t timestamp;
+};
+
+/**
+ * struct panthor_coredump - Coredump
+ */
+struct panthor_coredump {
+	/** @ptdev: Device. */
+	struct panthor_device *ptdev;
+
+	/** @work: Bottom half of panthor_coredump_capture. */
+	struct work_struct work;
+
+	/** @header: Header. */
+	struct panthor_coredump_header header;
+
+	/** @mask: Bitmask of captured states. */
+	u32 mask;
+
+	struct panthor_coredump_group_state group;
+
+	/* @data: Serialized coredump data. */
+	void *data;
+
+	/* @size: Serialized coredump size. */
+	size_t size;
+};
+
+static const char *reason_str(enum panthor_coredump_reason reason)
+{
+	switch (reason) {
+	case PANTHOR_COREDUMP_REASON_MMU_FAULT:
+		return "MMU_FAULT";
+	case PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT:
+		return "CSG_REQ_TIMEOUT";
+	case PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE:
+		return "CSG_UNKNOWN_STATE";
+	case PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT:
+		return "CSG_PROGRESS_TIMEOUT";
+	case PANTHOR_COREDUMP_REASON_CS_FATAL:
+		return "CS_FATAL";
+	case PANTHOR_COREDUMP_REASON_CS_FAULT:
+		return "CS_FAULT";
+	case PANTHOR_COREDUMP_REASON_CS_TILER_OOM:
+		return "CS_TILER_OOM";
+	case PANTHOR_COREDUMP_REASON_JOB_TIMEOUT:
+		return "JOB_TIMEOUT";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+static void print_group(struct drm_printer *p,
+			const struct panthor_coredump_group_state *group)
+{
+	drm_puts(p, "group:\n");
+	drm_printf(p, "  priority: %d\n", group->priority);
+	drm_printf(p, "  queue_count: %u\n", group->queue_count);
+	drm_printf(p, "  pid: %d\n", group->pid);
+	drm_printf(p, "  comm: %s\n", group->comm);
+	drm_printf(p, "  destroyed: %d\n", group->destroyed);
+	drm_printf(p, "  csg_id: %d\n", group->csg_id);
+}
+
+static void print_header(struct drm_printer *p,
+			 const struct panthor_coredump_header *header,
+			 const struct drm_driver *drv)
+{
+	drm_puts(p, "header:\n");
+	drm_puts(p, "  kernel: " UTS_RELEASE "\n");
+	drm_puts(p, "  module: " KBUILD_MODNAME "\n");
+	drm_printf(p, "  driver_version: %d.%d\n", drv->major, drv->minor);
+
+	drm_printf(p, "  reason: %s\n", reason_str(header->reason));
+	drm_printf(p, "  timestamp: %lld\n", ktime_to_ns(header->timestamp));
+}
+
+static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
+{
+	/* in YAML format */
+	drm_puts(p, "---\n");
+	print_header(p, &cd->header, cd->ptdev->base.driver);
+
+	if (cd->mask & PANTHOR_COREDUMP_GROUP)
+		print_group(p, &cd->group);
+}
+
+static void process_cd(struct panthor_device *ptdev,
+		       struct panthor_coredump *cd)
+{
+	struct drm_print_iterator iter = {
+		.remain = SSIZE_MAX,
+	};
+	struct drm_printer p = drm_coredump_printer(&iter);
+
+	print_cd(&p, cd);
+
+	iter.remain = SSIZE_MAX - iter.remain;
+	iter.data = kvmalloc(iter.remain, GFP_USER);
+	if (!iter.data)
+		return;
+
+	cd->data = iter.data;
+	cd->size = iter.remain;
+
+	drm_info(&ptdev->base, "generating coredump of size %zu\n", cd->size);
+
+	p = drm_coredump_printer(&iter);
+	print_cd(&p, cd);
+}
+
+static void capture_cd(struct panthor_device *ptdev,
+		       struct panthor_coredump *cd, struct panthor_group *group)
+{
+	drm_info(&ptdev->base, "capturing coredump states\n");
+
+	if (group) {
+		panthor_group_capture_coredump(group, &cd->group);
+		cd->mask |= PANTHOR_COREDUMP_GROUP;
+	}
+}
+
+static void panthor_coredump_free(void *data)
+{
+	struct panthor_coredump *cd = data;
+	struct panthor_device *ptdev = cd->ptdev;
+
+	kvfree(cd->data);
+	kfree(cd);
+
+	atomic_set(&ptdev->coredump.pending, 0);
+}
+
+static ssize_t panthor_coredump_read(char *buffer, loff_t offset, size_t count,
+				     void *data, size_t datalen)
+{
+	const struct panthor_coredump *cd = data;
+
+	if (offset >= cd->size)
+		return 0;
+
+	if (count > cd->size - offset)
+		count = cd->size - offset;
+
+	memcpy(buffer, cd->data + offset, count);
+
+	return count;
+}
+
+static void panthor_coredump_process_work(struct work_struct *work)
+{
+	struct panthor_coredump *cd =
+		container_of(work, struct panthor_coredump, work);
+	struct panthor_device *ptdev = cd->ptdev;
+
+	process_cd(ptdev, cd);
+
+	dev_coredumpm(ptdev->base.dev, THIS_MODULE, cd, 0, GFP_KERNEL,
+		      panthor_coredump_read, panthor_coredump_free);
+}
+
+void panthor_coredump_capture(struct panthor_coredump *cd,
+			      struct panthor_group *group)
+{
+	struct panthor_device *ptdev = cd->ptdev;
+
+	capture_cd(ptdev, cd, group);
+
+	queue_work(system_unbound_wq, &cd->work);
+}
+
+struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+		       enum panthor_coredump_reason reason, gfp_t gfp)
+{
+	struct panthor_coredump *cd;
+
+	/* reject all but the first coredump until it is handled */
+	if (atomic_cmpxchg(&ptdev->coredump.pending, 0, 1)) {
+		drm_dbg(&ptdev->base, "skip subsequent coredump\n");
+		return NULL;
+	}
+
+	cd = kzalloc(sizeof(*cd), gfp);
+	if (!cd) {
+		atomic_set(&ptdev->coredump.pending, 0);
+		return NULL;
+	}
+
+	cd->ptdev = ptdev;
+	INIT_WORK(&cd->work, panthor_coredump_process_work);
+
+	cd->header.reason = reason;
+	cd->header.timestamp = ktime_get_real();
+
+	return cd;
+}
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
new file mode 100644
index 000000000000..dd1fe1c2e175
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2019 Collabora ltd. */
+
+#ifndef __PANTHOR_COREDUMP_H__
+#define __PANTHOR_COREDUMP_H__
+
+#include <drm/panthor_drm.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+struct panthor_coredump;
+struct panthor_device;
+struct panthor_group;
+
+/**
+ * enum panthor_coredump_reason - Coredump reason
+ */
+enum panthor_coredump_reason {
+	PANTHOR_COREDUMP_REASON_MMU_FAULT,
+	PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
+	PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
+	PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT,
+	PANTHOR_COREDUMP_REASON_CS_FATAL,
+	PANTHOR_COREDUMP_REASON_CS_FAULT,
+	PANTHOR_COREDUMP_REASON_CS_TILER_OOM,
+	PANTHOR_COREDUMP_REASON_JOB_TIMEOUT,
+};
+
+/**
+ * struct panthor_coredump_group_state - Coredump group state
+ *
+ * Interesting panthor_group fields.
+ */
+struct panthor_coredump_group_state {
+	enum drm_panthor_group_priority priority;
+	u32 queue_count;
+	pid_t pid;
+	char comm[TASK_COMM_LEN];
+	bool destroyed;
+	int csg_id;
+};
+
+#ifdef CONFIG_DEV_COREDUMP
+
+struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+		       enum panthor_coredump_reason reason, gfp_t gfp);
+
+void panthor_coredump_capture(struct panthor_coredump *cd,
+			      struct panthor_group *group);
+
+#else /* CONFIG_DEV_COREDUMP */
+
+static inline struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+		       enum panthor_coredump_reason reason, gfp_t gfp)
+{
+	return NULL;
+}
+
+static inline void panthor_coredump_capture(struct panthor_coredump *cd,
+					    struct panthor_group *group)
+{
+}
+
+#endif /* CONFIG_DEV_COREDUMP */
+
+#endif /* __PANTHOR_COREDUMP_H__ */
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 4fc7cf2aeed5..766e53c25cfa 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -197,6 +197,12 @@ struct panthor_device {
 		atomic_t recovery_needed;
 	} pm;
 
+	/** @coredump: Coredump-related data. */
+	struct {
+		/** @pending: True if there is a pending coredump. */
+		atomic_t pending;
+	} coredump;
+
 	/** @profile_mask: User-set profiling flags for job accounting. */
 	u32 profile_mask;
 
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index a2248f692a03..eb45b5ad9774 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -23,6 +23,7 @@
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 
+#include "panthor_coredump.h"
 #include "panthor_devfreq.h"
 #include "panthor_device.h"
 #include "panthor_fw.h"
@@ -1031,6 +1032,10 @@ group_unbind_locked(struct panthor_group *group)
 	return 0;
 }
 
+static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
+					  enum panthor_coredump_reason reason,
+					  struct panthor_group *group);
+
 /**
  * cs_slot_prog_locked() - Program a queue slot
  * @ptdev: Device.
@@ -1249,6 +1254,10 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
 		drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
 			csg_id, csg_state);
 		new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
+
+		panthor_sched_coredump_locked(
+			ptdev, PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
+			group);
 		break;
 	}
 
@@ -1378,6 +1387,9 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
 		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
 		 (unsigned int)CS_EXCEPTION_DATA(fatal),
 		 info);
+
+	panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FATAL,
+				      group);
 }
 
 static void
@@ -1426,6 +1438,9 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
 		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
 		 (unsigned int)CS_EXCEPTION_DATA(fault),
 		 info);
+
+	panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FAULT,
+				      group);
 }
 
 static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
@@ -1480,6 +1495,10 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
 		drm_warn(&ptdev->base, "Failed to extend the tiler heap\n");
 		group->fatal_queues |= BIT(cs_id);
 		sched_queue_delayed_work(sched, tick, 0);
+
+		panthor_sched_coredump_locked(
+			ptdev, PANTHOR_COREDUMP_REASON_CS_TILER_OOM, group);
+
 		goto out_put_heap_pool;
 	}
 
@@ -1639,6 +1658,9 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
 		group->timedout = true;
 
 	sched_queue_delayed_work(sched, tick, 0);
+
+	panthor_sched_coredump_locked(
+		ptdev, PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT, group);
 }
 
 static void sched_process_csg_irq_locked(struct panthor_device *ptdev, u32 csg_id)
@@ -1858,8 +1880,16 @@ static int csgs_upd_ctx_apply_locked(struct panthor_device *ptdev,
 
 		if (ret && acked != req_mask &&
 		    ((csg_iface->input->req ^ csg_iface->output->ack) & req_mask) != 0) {
+			struct panthor_csg_slot *csg_slot =
+				&sched->csg_slots[csg_id];
+			struct panthor_group *group = csg_slot->group;
+
 			drm_err(&ptdev->base, "CSG %d update request timedout", csg_id);
 			ctx->timedout_mask |= BIT(csg_id);
+
+			panthor_sched_coredump_locked(
+				ptdev, PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
+				group);
 		}
 	}
 
@@ -2027,6 +2057,10 @@ tick_ctx_init(struct panthor_scheduler *sched,
 		 * CSG IRQs, so we can flag the faulty queue.
 		 */
 		if (panthor_vm_has_unhandled_faults(group->vm)) {
+			panthor_sched_coredump_locked(
+				ptdev, PANTHOR_COREDUMP_REASON_MMU_FAULT,
+				group);
+
 			sched_process_csg_irq_locked(ptdev, i);
 
 			/* No fatal fault reported, flag all queues as faulty. */
@@ -3237,6 +3271,10 @@ queue_timedout_job(struct drm_sched_job *sched_job)
 
 		group_queue_work(group, term);
 	}
+
+	panthor_sched_coredump_locked(
+		ptdev, PANTHOR_COREDUMP_REASON_JOB_TIMEOUT, group);
+
 	mutex_unlock(&sched->lock);
 
 	queue_start(queue);
@@ -3627,6 +3665,37 @@ int panthor_group_get_state(struct panthor_file *pfile,
 	return 0;
 }
 
+static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
+					  enum panthor_coredump_reason reason,
+					  struct panthor_group *group)
+{
+	struct panthor_coredump *cd;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	/* GFP_NOWAIT because this may be called from fence signaling path */
+	cd = panthor_coredump_alloc(ptdev, reason, GFP_NOWAIT);
+	if (!cd)
+		return;
+
+	panthor_coredump_capture(cd, group);
+}
+
+void panthor_group_capture_coredump(const struct panthor_group *group,
+				    struct panthor_coredump_group_state *state)
+{
+	const struct panthor_device *ptdev = group->ptdev;
+
+	/* this is called from panthor_coredump_capture */
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	state->priority = group->priority;
+	state->queue_count = group->queue_count;
+	/* TODO state->pid and state->comm */
+	state->destroyed = group->destroyed;
+	state->csg_id = group->csg_id;
+}
+
 int panthor_group_pool_create(struct panthor_file *pfile)
 {
 	struct panthor_group_pool *gpool;
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
index 742b0b4ff3a3..6c564153133e 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.h
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -14,8 +14,10 @@ struct drm_panthor_group_create;
 struct drm_panthor_queue_create;
 struct drm_panthor_group_get_state;
 struct drm_panthor_queue_submit;
+struct panthor_coredump_group_state;
 struct panthor_device;
 struct panthor_file;
+struct panthor_group;
 struct panthor_group_pool;
 struct panthor_job;
 
@@ -26,6 +28,9 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle);
 int panthor_group_get_state(struct panthor_file *pfile,
 			    struct drm_panthor_group_get_state *get_state);
 
+void panthor_group_capture_coredump(const struct panthor_group *group,
+				    struct panthor_coredump_group_state *state);
+
 struct drm_sched_job *
 panthor_job_create(struct panthor_file *pfile,
 		   u16 group_handle,
-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 2/9] drm/panthor: capture GPU state for devcoredump
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
  2025-07-20  0:01 ` [PATCH 1/9] " Chia-I Wu
@ 2025-07-20  0:01 ` Chia-I Wu
  2025-07-20  4:29   ` kernel test robot
  2025-07-20  0:01 ` [PATCH 3/9] drm/panthor: capture GLB " Chia-I Wu
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

Capture interesting GPU_CONTROL regs for devcoredump.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_coredump.c | 85 ++++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_coredump.h | 16 ++++
 drivers/gpu/drm/panthor/panthor_regs.h     |  6 ++
 drivers/gpu/drm/panthor/panthor_sched.c    |  6 ++
 4 files changed, 113 insertions(+)

diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
index 767f3327e3e8..a41d0bbcb4f1 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.c
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -7,11 +7,13 @@
 #include <generated/utsrelease.h>
 #include <linux/devcoredump.h>
 #include <linux/err.h>
+#include <linux/pm_runtime.h>
 #include <linux/slab.h>
 #include <linux/timekeeping.h>
 
 #include "panthor_coredump.h"
 #include "panthor_device.h"
+#include "panthor_regs.h"
 #include "panthor_sched.h"
 
 /**
@@ -19,6 +21,7 @@
  */
 enum panthor_coredump_mask {
 	PANTHOR_COREDUMP_GROUP = BIT(0),
+	PANTHOR_COREDUMP_GPU = BIT(1),
 };
 
 /**
@@ -46,6 +49,7 @@ struct panthor_coredump {
 	u32 mask;
 
 	struct panthor_coredump_group_state group;
+	struct panthor_coredump_gpu_state gpu;
 
 	/* @data: Serialized coredump data. */
 	void *data;
@@ -78,6 +82,63 @@ static const char *reason_str(enum panthor_coredump_reason reason)
 	}
 }
 
+static void print_gpu(struct drm_printer *p,
+		      const struct panthor_coredump_gpu_state *gpu,
+		      const struct drm_panthor_gpu_info *info)
+{
+	drm_puts(p, "gpu:\n");
+	drm_printf(p, "  GPU_ID: 0x%x\n", info->gpu_id);
+	drm_printf(p, "  L2_FEATURES: 0x%x\n", info->l2_features);
+	drm_printf(p, "  CORE_FEATURES: 0x%x\n", info->core_features);
+	drm_printf(p, "  TILER_FEATURES: 0x%x\n", info->tiler_features);
+	drm_printf(p, "  MEM_FEATURES: 0x%x\n", info->mem_features);
+	drm_printf(p, "  MMU_FEATURES: 0x%x\n", info->mmu_features);
+	drm_printf(p, "  AS_PRESENT: 0x%x\n", info->as_present);
+	drm_printf(p, "  CSF_ID: 0x%x\n", info->csf_id);
+	drm_printf(p, "  MMU_FEATURES: 0x%x\n", info->mmu_features);
+
+	if (gpu) {
+		drm_printf(p, "  GPU_STATUS: 0x%x\n", gpu->gpu_status);
+		drm_printf(p, "  GPU_FAULTSTATUS: 0x%x\n",
+			   gpu->gpu_faultstatus);
+		drm_printf(p, "  GPU_FAULTADDRESS: 0x%llx\n",
+			   gpu->gpu_faultaddress);
+		drm_printf(p, "  L2_CONFIG: 0x%x\n", gpu->l2_config);
+	}
+
+	drm_printf(p, "  THREAD_MAX_THREADS: 0x%x\n", info->max_threads);
+	drm_printf(p, "  THREAD_MAX_WORKGROUP_SIZE: 0x%x\n",
+		   info->thread_max_workgroup_size);
+	drm_printf(p, "  THREAD_MAX_BARRIER_SIZE: 0x%x\n",
+		   info->thread_max_barrier_size);
+	drm_printf(p, "  THREAD_FEATURES: 0x%x\n", info->thread_features);
+	drm_printf(p, "  TEXTURE_FEATURES_0: 0x%x\n",
+		   info->texture_features[0]);
+	drm_printf(p, "  TEXTURE_FEATURES_1: 0x%x\n",
+		   info->texture_features[1]);
+	drm_printf(p, "  TEXTURE_FEATURES_2: 0x%x\n",
+		   info->texture_features[2]);
+	drm_printf(p, "  TEXTURE_FEATURES_3: 0x%x\n",
+		   info->texture_features[3]);
+
+	if (gpu) {
+		drm_printf(p, "  DOORBELL_FEATURES: 0x%x\n",
+			   gpu->doorbell_features);
+	}
+
+	drm_printf(p, "  SHADER_PRESENT: 0x%llx\n", info->shader_present);
+	drm_printf(p, "  TILER_PRESENT: 0x%llx\n", info->tiler_present);
+	drm_printf(p, "  L2_PRESENT: 0x%llx\n", info->l2_present);
+	drm_printf(p, "  REVIDR: 0x%x\n", info->gpu_rev);
+	drm_printf(p, "  AMBA_FEATURES: 0x%x\n", info->coherency_features);
+
+	if (gpu) {
+		drm_printf(p, "  AMBA_ENABLE: 0x%x\n", gpu->amba_enable);
+		drm_printf(p, "  MCU_STATUS: 0x%x\n", gpu->mcu_status);
+		drm_printf(p, "  MCU_FEATURES: 0x%x\n", gpu->mcu_features);
+	}
+}
+
 static void print_group(struct drm_printer *p,
 			const struct panthor_coredump_group_state *group)
 {
@@ -111,6 +172,10 @@ static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
 
 	if (cd->mask & PANTHOR_COREDUMP_GROUP)
 		print_group(p, &cd->group);
+
+	/* many gpu states are static and are captured in drm_panthor_gpu_info */
+	print_gpu(p, cd->mask & PANTHOR_COREDUMP_GPU ? &cd->gpu : NULL,
+		  &cd->ptdev->gpu_info);
 }
 
 static void process_cd(struct panthor_device *ptdev,
@@ -137,6 +202,19 @@ static void process_cd(struct panthor_device *ptdev,
 	print_cd(&p, cd);
 }
 
+static void capture_gpu(struct panthor_device *ptdev,
+			struct panthor_coredump_gpu_state *gpu)
+{
+	gpu->gpu_status = gpu_read(ptdev, GPU_STATUS);
+	gpu->gpu_faultstatus = gpu_read(ptdev, GPU_FAULT_STATUS);
+	gpu->gpu_faultaddress = gpu_read64(ptdev, GPU_FAULT_ADDR);
+	gpu->l2_config = gpu_read(ptdev, GPU_L2_CONFIG);
+	gpu->doorbell_features = gpu_read(ptdev, GPU_DOORBELL_FEATURES);
+	gpu->amba_enable = gpu_read(ptdev, GPU_COHERENCY_PROTOCOL);
+	gpu->mcu_status = gpu_read(ptdev, MCU_STATUS);
+	gpu->mcu_features = gpu_read(ptdev, MCU_FEATURES);
+}
+
 static void capture_cd(struct panthor_device *ptdev,
 		       struct panthor_coredump *cd, struct panthor_group *group)
 {
@@ -146,6 +224,13 @@ static void capture_cd(struct panthor_device *ptdev,
 		panthor_group_capture_coredump(group, &cd->group);
 		cd->mask |= PANTHOR_COREDUMP_GROUP;
 	}
+
+	/* remaining states require the device to be powered on */
+	if (!pm_runtime_active(ptdev->base.dev))
+		return;
+
+	capture_gpu(ptdev, &cd->gpu);
+	cd->mask |= PANTHOR_COREDUMP_GPU;
 }
 
 static void panthor_coredump_free(void *data)
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
index dd1fe1c2e175..9e30c02ab962 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.h
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -40,6 +40,22 @@ struct panthor_coredump_group_state {
 	int csg_id;
 };
 
+/**
+ * struct panthor_coredump_gpu_state - Coredump GPU state
+ *
+ * Interesting GPU_CONTROL regs.
+ */
+struct panthor_coredump_gpu_state {
+	u32 gpu_status;
+	u32 gpu_faultstatus;
+	u64 gpu_faultaddress;
+	u32 l2_config;
+	u32 doorbell_features;
+	u32 amba_enable;
+	u32 mcu_status;
+	u32 mcu_features;
+};
+
 #ifdef CONFIG_DEV_COREDUMP
 
 struct panthor_coredump *
diff --git a/drivers/gpu/drm/panthor/panthor_regs.h b/drivers/gpu/drm/panthor/panthor_regs.h
index 48bbfd40138c..062f939e075c 100644
--- a/drivers/gpu/drm/panthor/panthor_regs.h
+++ b/drivers/gpu/drm/panthor/panthor_regs.h
@@ -65,6 +65,8 @@
 #define GPU_FAULT_STATUS				0x3C
 #define GPU_FAULT_ADDR					0x40
 
+#define GPU_L2_CONFIG					0x48
+
 #define GPU_PWR_KEY					0x50
 #define  GPU_PWR_KEY_UNLOCK				0x2968A819
 #define GPU_PWR_OVERRIDE0				0x54
@@ -81,6 +83,8 @@
 
 #define GPU_TEXTURE_FEATURES(n)				(0xB0 + ((n) * 4))
 
+#define GPU_DOORBELL_FEATURES				0xC0
+
 #define GPU_SHADER_PRESENT				0x100
 #define GPU_TILER_PRESENT				0x110
 #define GPU_L2_PRESENT					0x120
@@ -126,6 +130,8 @@
 #define MCU_STATUS_HALT					2
 #define MCU_STATUS_FATAL				3
 
+#define MCU_FEATURES					0x708
+
 /* Job Control regs */
 #define JOB_INT_RAWSTAT					0x1000
 #define JOB_INT_CLEAR					0x1004
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index eb45b5ad9774..a9fd71fa984b 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -3670,6 +3670,7 @@ static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
 					  struct panthor_group *group)
 {
 	struct panthor_coredump *cd;
+	int pm_active;
 
 	lockdep_assert_held(&ptdev->scheduler->lock);
 
@@ -3678,7 +3679,12 @@ static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
 	if (!cd)
 		return;
 
+	pm_active = pm_runtime_get_if_active(ptdev->base.dev);
+
 	panthor_coredump_capture(cd, group);
+
+	if (pm_active == 1)
+		pm_runtime_put(ptdev->base.dev);
 }
 
 void panthor_group_capture_coredump(const struct panthor_group *group,
-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 3/9] drm/panthor: capture GLB state for devcoredump
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
  2025-07-20  0:01 ` [PATCH 1/9] " Chia-I Wu
  2025-07-20  0:01 ` [PATCH 2/9] drm/panthor: capture GPU state for devcoredump Chia-I Wu
@ 2025-07-20  0:01 ` Chia-I Wu
  2025-07-20  5:41   ` kernel test robot
  2025-07-20  0:01 ` [PATCH 4/9] drm/panthor: capture CSG " Chia-I Wu
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

Capture interesting panthor_fw_global_iface fields for devcoredump.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_coredump.c | 33 ++++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_coredump.h | 13 +++++++++
 2 files changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
index a41d0bbcb4f1..44d711e2f310 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.c
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -13,6 +13,7 @@
 
 #include "panthor_coredump.h"
 #include "panthor_device.h"
+#include "panthor_fw.h"
 #include "panthor_regs.h"
 #include "panthor_sched.h"
 
@@ -22,6 +23,7 @@
 enum panthor_coredump_mask {
 	PANTHOR_COREDUMP_GROUP = BIT(0),
 	PANTHOR_COREDUMP_GPU = BIT(1),
+	PANTHOR_COREDUMP_GLB = BIT(2),
 };
 
 /**
@@ -50,6 +52,7 @@ struct panthor_coredump {
 
 	struct panthor_coredump_group_state group;
 	struct panthor_coredump_gpu_state gpu;
+	struct panthor_coredump_glb_state glb;
 
 	/* @data: Serialized coredump data. */
 	void *data;
@@ -82,6 +85,17 @@ static const char *reason_str(enum panthor_coredump_reason reason)
 	}
 }
 
+static void print_glb(struct drm_printer *p,
+		      const struct panthor_coredump_glb_state *glb)
+{
+	drm_puts(p, "glb:\n");
+	drm_printf(p, "  GLB_VERSION: 0x%x\n", glb->version);
+	drm_printf(p, "  GLB_FEATURES: 0x%x\n", glb->features);
+	drm_printf(p, "  GLB_GROUP_NUM: 0x%x\n", glb->group_num);
+	drm_printf(p, "  GLB_REQ: 0x%x\n", glb->req);
+	drm_printf(p, "  GLB_ACK: 0x%x\n", glb->ack);
+}
+
 static void print_gpu(struct drm_printer *p,
 		      const struct panthor_coredump_gpu_state *gpu,
 		      const struct drm_panthor_gpu_info *info)
@@ -176,6 +190,9 @@ static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
 	/* many gpu states are static and are captured in drm_panthor_gpu_info */
 	print_gpu(p, cd->mask & PANTHOR_COREDUMP_GPU ? &cd->gpu : NULL,
 		  &cd->ptdev->gpu_info);
+
+	if (cd->mask & PANTHOR_COREDUMP_GLB)
+		print_glb(p, &cd->glb);
 }
 
 static void process_cd(struct panthor_device *ptdev,
@@ -202,6 +219,19 @@ static void process_cd(struct panthor_device *ptdev,
 	print_cd(&p, cd);
 }
 
+static void capture_glb(struct panthor_device *ptdev,
+			struct panthor_coredump_glb_state *glb)
+{
+	const struct panthor_fw_global_iface *glb_iface =
+		panthor_fw_get_glb_iface(ptdev);
+
+	glb->version = glb_iface->control->version;
+	glb->features = glb_iface->control->features;
+	glb->group_num = glb_iface->control->group_num;
+	glb->req = glb_iface->input->req;
+	glb->ack = glb_iface->output->ack;
+}
+
 static void capture_gpu(struct panthor_device *ptdev,
 			struct panthor_coredump_gpu_state *gpu)
 {
@@ -231,6 +261,9 @@ static void capture_cd(struct panthor_device *ptdev,
 
 	capture_gpu(ptdev, &cd->gpu);
 	cd->mask |= PANTHOR_COREDUMP_GPU;
+
+	capture_glb(ptdev, &cd->glb);
+	cd->mask |= PANTHOR_COREDUMP_GLB;
 }
 
 static void panthor_coredump_free(void *data)
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
index 9e30c02ab962..e578298e9b57 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.h
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -56,6 +56,19 @@ struct panthor_coredump_gpu_state {
 	u32 mcu_features;
 };
 
+/**
+ * struct panthor_coredump_glb_state - Coredump GLB state
+ *
+ * Interesting panthor_fw_global_iface fields.
+ */
+struct panthor_coredump_glb_state {
+	u32 version;
+	u32 features;
+	u32 group_num;
+	u32 req;
+	u32 ack;
+};
+
 #ifdef CONFIG_DEV_COREDUMP
 
 struct panthor_coredump *
-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 4/9] drm/panthor: capture CSG state for devcoredump
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
                   ` (2 preceding siblings ...)
  2025-07-20  0:01 ` [PATCH 3/9] drm/panthor: capture GLB " Chia-I Wu
@ 2025-07-20  0:01 ` Chia-I Wu
  2025-07-20  0:01 ` [PATCH 5/9] drm/panthor: capture CS " Chia-I Wu
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

Capture interesting panthor_fw_csg_iface fields for devcoredump.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_coredump.c | 58 ++++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_coredump.h | 23 +++++++++
 drivers/gpu/drm/panthor/panthor_sched.c    | 13 +++++
 3 files changed, 94 insertions(+)

diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
index 44d711e2f310..e08bd33b3554 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.c
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -24,6 +24,7 @@ enum panthor_coredump_mask {
 	PANTHOR_COREDUMP_GROUP = BIT(0),
 	PANTHOR_COREDUMP_GPU = BIT(1),
 	PANTHOR_COREDUMP_GLB = BIT(2),
+	PANTHOR_COREDUMP_CSG = BIT(3),
 };
 
 /**
@@ -53,6 +54,7 @@ struct panthor_coredump {
 	struct panthor_coredump_group_state group;
 	struct panthor_coredump_gpu_state gpu;
 	struct panthor_coredump_glb_state glb;
+	struct panthor_coredump_csg_state csg;
 
 	/* @data: Serialized coredump data. */
 	void *data;
@@ -85,6 +87,28 @@ static const char *reason_str(enum panthor_coredump_reason reason)
 	}
 }
 
+static void print_csg(struct drm_printer *p,
+		      const struct panthor_coredump_csg_state *csg, u32 csg_id)
+{
+	drm_printf(p, "csg%d:\n", csg_id);
+	drm_printf(p, "  GROUP_FEATURES: 0x%x\n", csg->features);
+	drm_printf(p, "  GROUP_STREAM_NUM: 0x%x\n", csg->stream_num);
+
+	drm_printf(p, "  CSG_REQ: 0x%x\n", csg->req);
+	drm_printf(p, "  CSG_ALLOW_COMPUTE: 0x%llx\n", csg->allow_compute);
+	drm_printf(p, "  CSG_ALLOW_FRAGMENT: 0x%llx\n", csg->allow_fragment);
+	drm_printf(p, "  CSG_ALLOW_OTHER: 0x%x\n", csg->allow_other);
+	drm_printf(p, "  CSG_EP_REQ: 0x%x\n", csg->ep_req);
+	drm_printf(p, "  CSG_CONFIG: 0x%x\n", csg->config);
+
+	drm_printf(p, "  CSG_ACK: 0x%x\n", csg->ack);
+	drm_printf(p, "  CSG_STATUS_EP_CURRENT: 0x%x\n",
+		   csg->status_ep_current);
+	drm_printf(p, "  CSG_STATUS_EP_REQ: 0x%x\n", csg->status_ep_req);
+	drm_printf(p, "  CSG_STATUS_STATE: 0x%x\n", csg->status_state);
+	drm_printf(p, "  CSG_RESOURCE_DEP: 0x%x\n", csg->resource_dep);
+}
+
 static void print_glb(struct drm_printer *p,
 		      const struct panthor_coredump_glb_state *glb)
 {
@@ -193,6 +217,10 @@ static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
 
 	if (cd->mask & PANTHOR_COREDUMP_GLB)
 		print_glb(p, &cd->glb);
+
+	if (cd->mask & PANTHOR_COREDUMP_CSG) {
+		print_csg(p, &cd->csg, cd->group.csg_id);
+	}
 }
 
 static void process_cd(struct panthor_device *ptdev,
@@ -219,6 +247,29 @@ static void process_cd(struct panthor_device *ptdev,
 	print_cd(&p, cd);
 }
 
+static void capture_csg(struct panthor_device *ptdev,
+			struct panthor_coredump_csg_state *csg, u32 csg_id)
+{
+	const struct panthor_fw_csg_iface *csg_iface =
+		panthor_fw_get_csg_iface(ptdev, csg_id);
+
+	csg->features = csg_iface->control->features;
+	csg->stream_num = csg_iface->control->stream_num;
+
+	csg->req = csg_iface->input->req;
+	csg->allow_compute = csg_iface->input->allow_compute;
+	csg->allow_fragment = csg_iface->input->allow_fragment;
+	csg->allow_other = csg_iface->input->allow_other;
+	csg->ep_req = csg_iface->input->endpoint_req;
+	csg->config = csg_iface->input->config;
+
+	csg->ack = csg_iface->output->ack;
+	csg->status_ep_current = csg_iface->output->status_endpoint_current;
+	csg->status_ep_req = csg_iface->output->status_endpoint_req;
+	csg->status_state = csg_iface->output->status_state;
+	csg->resource_dep = csg_iface->output->resource_dep;
+}
+
 static void capture_glb(struct panthor_device *ptdev,
 			struct panthor_coredump_glb_state *glb)
 {
@@ -264,6 +315,13 @@ static void capture_cd(struct panthor_device *ptdev,
 
 	capture_glb(ptdev, &cd->glb);
 	cd->mask |= PANTHOR_COREDUMP_GLB;
+
+	/* remaining states require an active group */
+	if (!group || cd->group.csg_id < 0)
+		return;
+
+	capture_csg(ptdev, &cd->csg, cd->group.csg_id);
+	cd->mask |= PANTHOR_COREDUMP_CSG;
 }
 
 static void panthor_coredump_free(void *data)
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
index e578298e9b57..d965ebc545d3 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.h
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -69,6 +69,29 @@ struct panthor_coredump_glb_state {
 	u32 ack;
 };
 
+/**
+ * struct panthor_coredump_csg_state - Coredump CSG state
+ *
+ * Interesting panthor_fw_csg_iface fields.
+ */
+struct panthor_coredump_csg_state {
+	u32 features;
+	u32 stream_num;
+
+	u32 req;
+	u64 allow_compute;
+	u64 allow_fragment;
+	u32 allow_other;
+	u32 ep_req;
+	u32 config;
+
+	u32 ack;
+	u32 status_ep_current;
+	u32 status_ep_req;
+	u32 status_state;
+	u32 resource_dep;
+};
+
 #ifdef CONFIG_DEV_COREDUMP
 
 struct panthor_coredump *
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index a9fd71fa984b..504fc097ebfe 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -3681,6 +3681,19 @@ static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
 
 	pm_active = pm_runtime_get_if_active(ptdev->base.dev);
 
+	/* force a CSG_STATUS_UPDATE */
+	if (pm_active && group && group->csg_id >= 0) {
+		struct panthor_fw_csg_iface *csg_iface;
+		u32 acked;
+
+		csg_iface = panthor_fw_get_csg_iface(ptdev, group->csg_id);
+
+		panthor_fw_toggle_reqs(csg_iface, req, ack, CSG_STATUS_UPDATE);
+		panthor_fw_ring_csg_doorbells(ptdev, BIT(group->csg_id));
+		panthor_fw_csg_wait_acks(ptdev, group->csg_id,
+					 CSG_STATUS_UPDATE, &acked, 100);
+	}
+
 	panthor_coredump_capture(cd, group);
 
 	if (pm_active == 1)
-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 5/9] drm/panthor: capture CS state for devcoredump
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
                   ` (3 preceding siblings ...)
  2025-07-20  0:01 ` [PATCH 4/9] drm/panthor: capture CSG " Chia-I Wu
@ 2025-07-20  0:01 ` Chia-I Wu
  2025-07-20  0:01 ` [PATCH 6/9] drm/panthor: capture AS " Chia-I Wu
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

Capture interesting panthor_fw_cs_iface, panthor_fw_ringbuf_input_iface,
and panthor_fw_ringbuf_output_iface fields for devcoredump.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_coredump.c | 79 ++++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_coredump.h | 32 +++++++++
 drivers/gpu/drm/panthor/panthor_sched.c    | 11 +++
 drivers/gpu/drm/panthor/panthor_sched.h    |  7 ++
 4 files changed, 129 insertions(+)

diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
index e08bd33b3554..60d651a8468a 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.c
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -25,6 +25,7 @@ enum panthor_coredump_mask {
 	PANTHOR_COREDUMP_GPU = BIT(1),
 	PANTHOR_COREDUMP_GLB = BIT(2),
 	PANTHOR_COREDUMP_CSG = BIT(3),
+	PANTHOR_COREDUMP_CS = BIT(4),
 };
 
 /**
@@ -55,6 +56,7 @@ struct panthor_coredump {
 	struct panthor_coredump_gpu_state gpu;
 	struct panthor_coredump_glb_state glb;
 	struct panthor_coredump_csg_state csg;
+	struct panthor_coredump_cs_state cs[MAX_CS_PER_CSG];
 
 	/* @data: Serialized coredump data. */
 	void *data;
@@ -87,6 +89,37 @@ static const char *reason_str(enum panthor_coredump_reason reason)
 	}
 }
 
+static void print_cs(struct drm_printer *p,
+		     const struct panthor_coredump_cs_state *cs, u32 cs_id)
+{
+	drm_printf(p, "cs%d:\n", cs_id);
+	drm_printf(p, "  STREAM_FEATURES: 0x%x\n", cs->features);
+
+	drm_printf(p, "  CS_REQ: 0x%x\n", cs->req);
+	drm_printf(p, "  CS_CONFIG: 0x%x\n", cs->config);
+	drm_printf(p, "  CS_BASE: 0x%llx\n", cs->base);
+	drm_printf(p, "  CS_SIZE: 0x%x\n", cs->size);
+
+	drm_printf(p, "  CS_ACK: 0x%x\n", cs->ack);
+	drm_printf(p, "  CS_STATUS_CMD_PTR: 0x%llx\n", cs->status_cmd_ptr);
+	drm_printf(p, "  CS_STATUS_WAIT: 0x%x\n", cs->status_wait);
+	drm_printf(p, "  CS_STATUS_REQ_RESOURCE: 0x%x\n",
+		   cs->status_req_resource);
+	drm_printf(p, "  CS_STATUS_SCOREBOARDS: 0x%x\n",
+		   cs->status_scoreboards);
+	drm_printf(p, "  CS_STATUS_BLOCKED_REASON: 0x%x\n",
+		   cs->status_blocked_reason);
+	drm_printf(p, "  CS_FAULT: 0x%x\n", cs->fault);
+	drm_printf(p, "  CS_FATAL: 0x%x\n", cs->fatal);
+	drm_printf(p, "  CS_FAULT_INFO: 0x%llx\n", cs->fault_info);
+	drm_printf(p, "  CS_FATAL_INFO: 0x%llx\n", cs->fatal_info);
+
+	drm_printf(p, "  CS_INSERT: 0x%llx\n", cs->insert);
+	drm_printf(p, "  CS_EXTRACT_INIT: 0x%llx\n", cs->extract_init);
+	drm_printf(p, "  CS_EXTRACT: 0x%llx\n", cs->extract);
+	drm_printf(p, "  CS_ACTIVE: 0x%x\n", cs->active);
+}
+
 static void print_csg(struct drm_printer *p,
 		      const struct panthor_coredump_csg_state *csg, u32 csg_id)
 {
@@ -221,6 +254,11 @@ static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
 	if (cd->mask & PANTHOR_COREDUMP_CSG) {
 		print_csg(p, &cd->csg, cd->group.csg_id);
 	}
+
+	if (cd->mask & PANTHOR_COREDUMP_CS) {
+		for (u32 i = 0; i < cd->group.queue_count; i++)
+			print_cs(p, &cd->cs[i], i);
+	}
 }
 
 static void process_cd(struct panthor_device *ptdev,
@@ -247,6 +285,43 @@ static void process_cd(struct panthor_device *ptdev,
 	print_cd(&p, cd);
 }
 
+static void capture_cs(struct panthor_device *ptdev,
+		       struct panthor_coredump_cs_state *cs, u32 csg_id,
+		       u32 cs_id, const struct panthor_group *group)
+{
+	const struct panthor_fw_cs_iface *cs_iface =
+		panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
+	const struct panthor_fw_ringbuf_input_iface *input_iface;
+	const struct panthor_fw_ringbuf_output_iface *output_iface;
+
+	cs->features = cs_iface->control->features;
+
+	cs->req = cs_iface->input->req;
+	cs->config = cs_iface->input->config;
+	cs->base = cs_iface->input->ringbuf_base;
+	cs->size = cs_iface->input->ringbuf_size;
+
+	cs->ack = cs_iface->output->ack;
+	cs->status_cmd_ptr = cs_iface->output->status_cmd_ptr;
+	cs->status_wait = cs_iface->output->status_wait;
+	cs->status_req_resource = cs_iface->output->status_req_resource;
+	cs->status_scoreboards = cs_iface->output->status_scoreboards;
+	cs->status_blocked_reason = cs_iface->output->status_blocked_reason;
+	cs->fault = cs_iface->output->fault;
+	cs->fatal = cs_iface->output->fatal;
+	cs->fault_info = cs_iface->output->fault_info;
+	cs->fatal_info = cs_iface->output->fatal_info;
+
+	panthor_group_get_ringbuf_iface(group, cs_id, &input_iface,
+					&output_iface);
+
+	cs->insert = input_iface->insert;
+	cs->extract_init = input_iface->extract;
+
+	cs->extract = output_iface->extract;
+	cs->active = output_iface->active;
+}
+
 static void capture_csg(struct panthor_device *ptdev,
 			struct panthor_coredump_csg_state *csg, u32 csg_id)
 {
@@ -322,6 +397,10 @@ static void capture_cd(struct panthor_device *ptdev,
 
 	capture_csg(ptdev, &cd->csg, cd->group.csg_id);
 	cd->mask |= PANTHOR_COREDUMP_CSG;
+
+	for (u32 i = 0; i < cd->group.queue_count; i++)
+		capture_cs(ptdev, &cd->cs[i], cd->group.csg_id, i, group);
+	cd->mask |= PANTHOR_COREDUMP_CS;
 }
 
 static void panthor_coredump_free(void *data)
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
index d965ebc545d3..44402c6142cb 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.h
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -92,6 +92,38 @@ struct panthor_coredump_csg_state {
 	u32 resource_dep;
 };
 
+/**
+ * struct panthor_coredump_cs_state - Coredump CS state
+ *
+ * Interesting panthor_fw_cs_iface, panthor_fw_ringbuf_input_iface, and
+ * panthor_fw_ringbuf_output_iface fields.
+ */
+struct panthor_coredump_cs_state {
+	u32 features;
+
+	u32 req;
+	u32 config;
+	u64 base;
+	u32 size;
+
+	u32 ack;
+	u64 status_cmd_ptr;
+	u32 status_wait;
+	u32 status_req_resource;
+	u32 status_scoreboards;
+	u32 status_blocked_reason;
+	u32 fault;
+	u32 fatal;
+	u64 fault_info;
+	u64 fatal_info;
+
+	u64 insert;
+	u64 extract_init;
+
+	u64 extract;
+	u32 active;
+};
+
 #ifdef CONFIG_DEV_COREDUMP
 
 struct panthor_coredump *
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 504fc097ebfe..4bc31c5f667d 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -3715,6 +3715,17 @@ void panthor_group_capture_coredump(const struct panthor_group *group,
 	state->csg_id = group->csg_id;
 }
 
+void panthor_group_get_ringbuf_iface(
+	const struct panthor_group *group, u32 cs_id,
+	const struct panthor_fw_ringbuf_input_iface **input_iface,
+	const struct panthor_fw_ringbuf_output_iface **output_iface)
+{
+	const struct panthor_queue *queue = group->queues[cs_id];
+
+	*input_iface = queue->iface.input;
+	*output_iface = queue->iface.output;
+}
+
 int panthor_group_pool_create(struct panthor_file *pfile)
 {
 	struct panthor_group_pool *gpool;
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
index 6c564153133e..284ba39f958a 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.h
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -17,6 +17,8 @@ struct drm_panthor_queue_submit;
 struct panthor_coredump_group_state;
 struct panthor_device;
 struct panthor_file;
+struct panthor_fw_ringbuf_input_iface;
+struct panthor_fw_ringbuf_output_iface;
 struct panthor_group;
 struct panthor_group_pool;
 struct panthor_job;
@@ -31,6 +33,11 @@ int panthor_group_get_state(struct panthor_file *pfile,
 void panthor_group_capture_coredump(const struct panthor_group *group,
 				    struct panthor_coredump_group_state *state);
 
+void panthor_group_get_ringbuf_iface(
+	const struct panthor_group *group, u32 cs_id,
+	const struct panthor_fw_ringbuf_input_iface **input_iface,
+	const struct panthor_fw_ringbuf_output_iface **output_iface);
+
 struct drm_sched_job *
 panthor_job_create(struct panthor_file *pfile,
 		   u16 group_handle,
-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 6/9] drm/panthor: capture AS state for devcoredump
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
                   ` (4 preceding siblings ...)
  2025-07-20  0:01 ` [PATCH 5/9] drm/panthor: capture CS " Chia-I Wu
@ 2025-07-20  0:01 ` Chia-I Wu
  2025-07-20  0:01 ` [PATCH 7/9] drm/panthor: capture VMA " Chia-I Wu
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

Capture interesting MMU_AS_CONTROL regs for devcoredump.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_coredump.c | 33 ++++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_coredump.h | 11 ++++++++
 drivers/gpu/drm/panthor/panthor_sched.c    |  5 ++++
 drivers/gpu/drm/panthor/panthor_sched.h    |  2 ++
 4 files changed, 51 insertions(+)

diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
index 60d651a8468a..acc8ad4cc498 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.c
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -14,6 +14,7 @@
 #include "panthor_coredump.h"
 #include "panthor_device.h"
 #include "panthor_fw.h"
+#include "panthor_mmu.h"
 #include "panthor_regs.h"
 #include "panthor_sched.h"
 
@@ -26,6 +27,7 @@ enum panthor_coredump_mask {
 	PANTHOR_COREDUMP_GLB = BIT(2),
 	PANTHOR_COREDUMP_CSG = BIT(3),
 	PANTHOR_COREDUMP_CS = BIT(4),
+	PANTHOR_COREDUMP_AS = BIT(5),
 };
 
 /**
@@ -57,6 +59,7 @@ struct panthor_coredump {
 	struct panthor_coredump_glb_state glb;
 	struct panthor_coredump_csg_state csg;
 	struct panthor_coredump_cs_state cs[MAX_CS_PER_CSG];
+	struct panthor_coredump_as_state as;
 
 	/* @data: Serialized coredump data. */
 	void *data;
@@ -89,6 +92,15 @@ static const char *reason_str(enum panthor_coredump_reason reason)
 	}
 }
 
+static void print_as(struct drm_printer *p,
+		     const struct panthor_coredump_as_state *as, u32 as_id)
+{
+	drm_printf(p, "as%d:\n", as_id);
+	drm_printf(p, "  FAULTSTATUS: 0x%x\n", as->faultstatus);
+	drm_printf(p, "  FAULTADDRESS: 0x%llx\n", as->faultaddress);
+	drm_printf(p, "  FAULTEXTRA: 0x%llx\n", as->faultextra);
+}
+
 static void print_cs(struct drm_printer *p,
 		     const struct panthor_coredump_cs_state *cs, u32 cs_id)
 {
@@ -259,6 +271,12 @@ static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
 		for (u32 i = 0; i < cd->group.queue_count; i++)
 			print_cs(p, &cd->cs[i], i);
 	}
+
+	if (cd->mask & PANTHOR_COREDUMP_AS) {
+		const u32 as_id = cd->csg.config & 0xf;
+
+		print_as(p, &cd->as, as_id);
+	}
 }
 
 static void process_cd(struct panthor_device *ptdev,
@@ -285,6 +303,14 @@ static void process_cd(struct panthor_device *ptdev,
 	print_cd(&p, cd);
 }
 
+static void capture_as(struct panthor_device *ptdev,
+		       struct panthor_coredump_as_state *as, u32 as_id)
+{
+	as->faultstatus = gpu_read(ptdev, AS_FAULTSTATUS(as_id));
+	as->faultaddress = gpu_read64(ptdev, AS_FAULTADDRESS(as_id));
+	as->faultextra = gpu_read64(ptdev, AS_FAULTEXTRA(as_id));
+}
+
 static void capture_cs(struct panthor_device *ptdev,
 		       struct panthor_coredump_cs_state *cs, u32 csg_id,
 		       u32 cs_id, const struct panthor_group *group)
@@ -374,6 +400,8 @@ static void capture_gpu(struct panthor_device *ptdev,
 static void capture_cd(struct panthor_device *ptdev,
 		       struct panthor_coredump *cd, struct panthor_group *group)
 {
+	struct panthor_vm *vm;
+
 	drm_info(&ptdev->base, "capturing coredump states\n");
 
 	if (group) {
@@ -401,6 +429,11 @@ static void capture_cd(struct panthor_device *ptdev,
 	for (u32 i = 0; i < cd->group.queue_count; i++)
 		capture_cs(ptdev, &cd->cs[i], cd->group.csg_id, i, group);
 	cd->mask |= PANTHOR_COREDUMP_CS;
+
+	vm = panthor_group_vm(group);
+
+	capture_as(ptdev, &cd->as, panthor_vm_as(vm));
+	cd->mask |= PANTHOR_COREDUMP_AS;
 }
 
 static void panthor_coredump_free(void *data)
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
index 44402c6142cb..8aceb0c7d0d4 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.h
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -124,6 +124,17 @@ struct panthor_coredump_cs_state {
 	u32 active;
 };
 
+/**
+ * struct panthor_coredump_as_state - Coredump AS state
+ *
+ * Interesting MMU_AS_CONTROL regs.
+ */
+struct panthor_coredump_as_state {
+	u32 faultstatus;
+	u64 faultaddress;
+	u64 faultextra;
+};
+
 #ifdef CONFIG_DEV_COREDUMP
 
 struct panthor_coredump *
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 4bc31c5f667d..82e43b7ca7aa 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -3726,6 +3726,11 @@ void panthor_group_get_ringbuf_iface(
 	*output_iface = queue->iface.output;
 }
 
+struct panthor_vm *panthor_group_vm(struct panthor_group *group)
+{
+	return group->vm;
+}
+
 int panthor_group_pool_create(struct panthor_file *pfile)
 {
 	struct panthor_group_pool *gpool;
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
index 284ba39f958a..0cb58212fd44 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.h
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -38,6 +38,8 @@ void panthor_group_get_ringbuf_iface(
 	const struct panthor_fw_ringbuf_input_iface **input_iface,
 	const struct panthor_fw_ringbuf_output_iface **output_iface);
 
+struct panthor_vm *panthor_group_vm(struct panthor_group *group);
+
 struct drm_sched_job *
 panthor_job_create(struct panthor_file *pfile,
 		   u16 group_handle,
-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 7/9] drm/panthor: capture VMA state for devcoredump
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
                   ` (5 preceding siblings ...)
  2025-07-20  0:01 ` [PATCH 6/9] drm/panthor: capture AS " Chia-I Wu
@ 2025-07-20  0:01 ` Chia-I Wu
  2025-07-20  0:01 ` [PATCH 8/9] drm/panthor: check bo offset alignment in vm bind Chia-I Wu
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

Capture interesting panthor_vma fields for devcoredump.

Because bo->label can change at anytime, we cap it to 32 chars to
simplify size estimation.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_coredump.c | 78 ++++++++++++++++++++--
 drivers/gpu/drm/panthor/panthor_coredump.h | 15 +++++
 drivers/gpu/drm/panthor/panthor_mmu.c      | 43 ++++++++++++
 drivers/gpu/drm/panthor/panthor_mmu.h      |  4 ++
 4 files changed, 135 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
index acc8ad4cc498..5502452a5baa 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.c
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -14,6 +14,7 @@
 #include "panthor_coredump.h"
 #include "panthor_device.h"
 #include "panthor_fw.h"
+#include "panthor_gem.h"
 #include "panthor_mmu.h"
 #include "panthor_regs.h"
 #include "panthor_sched.h"
@@ -28,6 +29,7 @@ enum panthor_coredump_mask {
 	PANTHOR_COREDUMP_CSG = BIT(3),
 	PANTHOR_COREDUMP_CS = BIT(4),
 	PANTHOR_COREDUMP_AS = BIT(5),
+	PANTHOR_COREDUMP_VMA = BIT(6),
 };
 
 /**
@@ -45,6 +47,9 @@ struct panthor_coredump {
 	/** @ptdev: Device. */
 	struct panthor_device *ptdev;
 
+	/** @gfp: Allocation flags for panthor_coredump_capture. */
+	gfp_t gfp;
+
 	/** @work: Bottom half of panthor_coredump_capture. */
 	struct work_struct work;
 
@@ -60,6 +65,8 @@ struct panthor_coredump {
 	struct panthor_coredump_csg_state csg;
 	struct panthor_coredump_cs_state cs[MAX_CS_PER_CSG];
 	struct panthor_coredump_as_state as;
+	struct panthor_coredump_vma_state *vma;
+	u32 vma_count;
 
 	/* @data: Serialized coredump data. */
 	void *data;
@@ -92,6 +99,38 @@ static const char *reason_str(enum panthor_coredump_reason reason)
 	}
 }
 
+static void print_vma(struct drm_printer *p,
+		      const struct panthor_coredump_vma_state *vma, u32 vma_id,
+		      size_t *max_dyn_size)
+{
+	struct panthor_gem_object *bo = vma->bo;
+
+	if (!vma_id)
+		drm_puts(p, "vma:\n");
+
+	drm_printf(p, "  - flags: 0x%x\n", vma->flags);
+	drm_printf(p, "    iova: 0x%llx\n", vma->iova);
+	drm_printf(p, "    size: 0x%llx\n", vma->size);
+
+	if (!bo)
+		return;
+
+	/* bo->label is dynamic */
+	if (max_dyn_size) {
+		drm_puts(p, "    label: |\n");
+		drm_puts(p, "      \n");
+		*max_dyn_size += 32;
+	} else {
+		scoped_guard(mutex, &bo->label.lock)
+		{
+			if (bo->label.str) {
+				drm_puts(p, "    label: |\n");
+				drm_printf(p, "      %.32s\n", bo->label.str);
+			}
+		}
+	}
+}
+
 static void print_as(struct drm_printer *p,
 		     const struct panthor_coredump_as_state *as, u32 as_id)
 {
@@ -247,7 +286,8 @@ static void print_header(struct drm_printer *p,
 	drm_printf(p, "  timestamp: %lld\n", ktime_to_ns(header->timestamp));
 }
 
-static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
+static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd,
+		     size_t *max_dyn_size)
 {
 	/* in YAML format */
 	drm_puts(p, "---\n");
@@ -277,6 +317,11 @@ static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
 
 		print_as(p, &cd->as, as_id);
 	}
+
+	if (cd->mask & PANTHOR_COREDUMP_VMA) {
+		for (u32 i = 0; i < cd->vma_count; i++)
+			print_vma(p, &cd->vma[i], i, max_dyn_size);
+	}
 }
 
 static void process_cd(struct panthor_device *ptdev,
@@ -286,10 +331,13 @@ static void process_cd(struct panthor_device *ptdev,
 		.remain = SSIZE_MAX,
 	};
 	struct drm_printer p = drm_coredump_printer(&iter);
+	size_t max_dyn_size = 0;
 
-	print_cd(&p, cd);
+	print_cd(&p, cd, &max_dyn_size);
+	if (max_dyn_size > iter.remain)
+		return;
 
-	iter.remain = SSIZE_MAX - iter.remain;
+	iter.remain = SSIZE_MAX - iter.remain + max_dyn_size;
 	iter.data = kvmalloc(iter.remain, GFP_USER);
 	if (!iter.data)
 		return;
@@ -297,10 +345,25 @@ static void process_cd(struct panthor_device *ptdev,
 	cd->data = iter.data;
 	cd->size = iter.remain;
 
-	drm_info(&ptdev->base, "generating coredump of size %zu\n", cd->size);
+	drm_info(&ptdev->base, "generating coredump of estimated size %zu\n",
+		 cd->size);
 
 	p = drm_coredump_printer(&iter);
-	print_cd(&p, cd);
+	print_cd(&p, cd, NULL);
+
+	cd->size -= iter.remain;
+
+	/* free vma now */
+	if (cd->mask & PANTHOR_COREDUMP_VMA) {
+		for (u32 i = 0; i < cd->vma_count; i++) {
+			struct panthor_coredump_vma_state *vma = &cd->vma[i];
+
+			drm_gem_object_put(&vma->bo->base.base);
+		}
+		kfree(cd->vma);
+
+		cd->mask &= ~PANTHOR_COREDUMP_VMA;
+	}
 }
 
 static void capture_as(struct panthor_device *ptdev,
@@ -434,6 +497,10 @@ static void capture_cd(struct panthor_device *ptdev,
 
 	capture_as(ptdev, &cd->as, panthor_vm_as(vm));
 	cd->mask |= PANTHOR_COREDUMP_AS;
+
+	cd->vma = panthor_vm_capture_coredump(vm, &cd->vma_count, cd->gfp);
+	if (cd->vma_count)
+		cd->mask |= PANTHOR_COREDUMP_VMA;
 }
 
 static void panthor_coredump_free(void *data)
@@ -504,6 +571,7 @@ panthor_coredump_alloc(struct panthor_device *ptdev,
 	}
 
 	cd->ptdev = ptdev;
+	cd->gfp = gfp;
 	INIT_WORK(&cd->work, panthor_coredump_process_work);
 
 	cd->header.reason = reason;
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
index 8aceb0c7d0d4..8a89c39cf2f5 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.h
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -10,6 +10,7 @@
 
 struct panthor_coredump;
 struct panthor_device;
+struct panthor_gem_object;
 struct panthor_group;
 
 /**
@@ -135,6 +136,20 @@ struct panthor_coredump_as_state {
 	u64 faultextra;
 };
 
+/**
+ * struct panthor_coredump_vma_state - Coredump VMA state
+ *
+ * Interesting panthor_vma fields.
+ */
+struct panthor_coredump_vma_state {
+	u32 flags;
+	u64 iova;
+	u64 size;
+
+	struct panthor_gem_object *bo;
+	u64 bo_offset;
+};
+
 #ifdef CONFIG_DEV_COREDUMP
 
 struct panthor_coredump *
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index b39ea6acc6a9..a857a0dd1099 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -27,6 +27,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/sizes.h>
 
+#include "panthor_coredump.h"
 #include "panthor_device.h"
 #include "panthor_gem.h"
 #include "panthor_heap.h"
@@ -2694,6 +2695,48 @@ int panthor_vm_prepare_mapped_bos_resvs(struct drm_exec *exec, struct panthor_vm
 	return drm_gpuvm_prepare_objects(&vm->base, exec, slot_count);
 }
 
+struct panthor_coredump_vma_state *
+panthor_vm_capture_coredump(struct panthor_vm *vm, u32 *vma_count, gfp_t gfp)
+{
+	struct drm_gpuva *gpuva;
+	u32 count;
+
+	guard(mutex)(&vm->op_lock);
+
+	count = 0;
+	drm_gpuvm_for_each_va(gpuva, &vm->base)
+		count++;
+
+	struct panthor_coredump_vma_state *states =
+		kcalloc(count, sizeof(*states), gfp);
+	if (!states) {
+		*vma_count = 0;
+		return NULL;
+	}
+
+	count = 0;
+	drm_gpuvm_for_each_va(gpuva, &vm->base) {
+		const struct panthor_vma *vma =
+			container_of(gpuva, struct panthor_vma, base);
+		struct panthor_coredump_vma_state *state = &states[count];
+
+		state->flags = vma->flags;
+		state->iova = vma->base.va.addr;
+		state->size = vma->base.va.range;
+		if (vma->base.gem.obj) {
+			state->bo = to_panthor_bo(vma->base.gem.obj);
+			state->bo_offset = vma->base.gem.offset;
+			drm_gem_object_get(&state->bo->base.base);
+		}
+
+		count++;
+	}
+
+	*vma_count = count;
+
+	return states;
+}
+
 /**
  * panthor_mmu_unplug() - Unplug the MMU logic
  * @ptdev: Device.
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.h b/drivers/gpu/drm/panthor/panthor_mmu.h
index fc274637114e..c775b92d0502 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.h
+++ b/drivers/gpu/drm/panthor/panthor_mmu.h
@@ -10,6 +10,7 @@
 struct drm_exec;
 struct drm_sched_job;
 struct drm_memory_stats;
+struct panthor_coredump_vma_state;
 struct panthor_gem_object;
 struct panthor_heap_pool;
 struct panthor_vm;
@@ -97,6 +98,9 @@ void panthor_vm_update_resvs(struct panthor_vm *vm, struct drm_exec *exec,
 			     enum dma_resv_usage private_usage,
 			     enum dma_resv_usage extobj_usage);
 
+struct panthor_coredump_vma_state *
+panthor_vm_capture_coredump(struct panthor_vm *vm, u32 *vma_count, gfp_t gfp);
+
 int panthor_mmu_pt_cache_init(void);
 void panthor_mmu_pt_cache_fini(void);
 
-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 8/9] drm/panthor: check bo offset alignment in vm bind
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
                   ` (6 preceding siblings ...)
  2025-07-20  0:01 ` [PATCH 7/9] drm/panthor: capture VMA " Chia-I Wu
@ 2025-07-20  0:01 ` Chia-I Wu
  2025-08-21  7:33   ` Boris Brezillon
  2025-07-20  0:01 ` [PATCH 9/9] drm/panthor: add DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE Chia-I Wu
  2025-07-20  0:41 ` [PATCH 0/9] drm/panthor: add devcoredump support Daniel Almeida
  9 siblings, 1 reply; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

Fail early from panthor_vm_bind_prepare_op_ctx instead of late from
ops->map_pages.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index a857a0dd1099..7862c99984b6 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -1206,7 +1206,7 @@ static int panthor_vm_prepare_map_op_ctx(struct panthor_vm_op_ctx *op_ctx,
 	    (flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) != DRM_PANTHOR_VM_BIND_OP_TYPE_MAP)
 		return -EINVAL;
 
-	/* Make sure the VA and size are aligned and in-bounds. */
+	/* Make sure the VA and size are in-bounds. */
 	if (size > bo->base.base.size || offset > bo->base.base.size - size)
 		return -EINVAL;
 
@@ -2423,7 +2423,7 @@ panthor_vm_bind_prepare_op_ctx(struct drm_file *file,
 	int ret;
 
 	/* Aligned on page size. */
-	if (!IS_ALIGNED(op->va | op->size, vm_pgsz))
+	if (!IS_ALIGNED(op->va | op->size | op->bo_offset, vm_pgsz))
 		return -EINVAL;
 
 	switch (op->flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) {
-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 9/9] drm/panthor: add DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
                   ` (7 preceding siblings ...)
  2025-07-20  0:01 ` [PATCH 8/9] drm/panthor: check bo offset alignment in vm bind Chia-I Wu
@ 2025-07-20  0:01 ` Chia-I Wu
  2025-08-21  7:55   ` Boris Brezillon
  2025-07-20  0:41 ` [PATCH 0/9] drm/panthor: add devcoredump support Daniel Almeida
  9 siblings, 1 reply; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  0:01 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

When the flag is set, bo data is captured for devcoredump.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
---
 drivers/gpu/drm/panthor/panthor_coredump.c | 36 ++++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_drv.c      |  3 +-
 drivers/gpu/drm/panthor/panthor_mmu.c      |  7 +++--
 include/uapi/drm/panthor_drm.h             |  7 +++++
 4 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
index 5502452a5baa..db5695b38c2d 100644
--- a/drivers/gpu/drm/panthor/panthor_coredump.c
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -5,6 +5,7 @@
 #include <drm/drm_print.h>
 #include <drm/drm_managed.h>
 #include <generated/utsrelease.h>
+#include <linux/ascii85.h>
 #include <linux/devcoredump.h>
 #include <linux/err.h>
 #include <linux/pm_runtime.h>
@@ -99,6 +100,26 @@ static const char *reason_str(enum panthor_coredump_reason reason)
 	}
 }
 
+static void print_bo(struct drm_printer *p, struct panthor_gem_object *bo,
+		     u64 offset, u64 size)
+{
+	struct iosys_map map;
+	const u32 *vals;
+	u64 count;
+	char buf[ASCII85_BUFSZ];
+
+	if (drm_gem_vmap(&bo->base.base, &map))
+		return;
+
+	/* offset and size are aligned to panthor_vm_page_size, which is SZ_4K */
+	vals = map.vaddr + offset;
+	count = size / sizeof(u32);
+	for (u64 i = 0; i < count; i++)
+		drm_puts(p, ascii85_encode(vals[i], buf));
+
+	drm_gem_vunmap(&bo->base.base, &map);
+}
+
 static void print_vma(struct drm_printer *p,
 		      const struct panthor_coredump_vma_state *vma, u32 vma_id,
 		      size_t *max_dyn_size)
@@ -129,6 +150,21 @@ static void print_vma(struct drm_printer *p,
 			}
 		}
 	}
+
+	if (vma->flags & DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE) {
+		drm_puts(p, "    data: |\n");
+		drm_puts(p, "      ");
+
+		/* bo data is dynamic */
+		if (max_dyn_size) {
+			*max_dyn_size +=
+				vma->size / sizeof(u32) * (ASCII85_BUFSZ - 1);
+		} else {
+			print_bo(p, bo, vma->bo_offset, vma->size);
+		}
+
+		drm_puts(p, "\n");
+	}
 }
 
 static void print_as(struct drm_printer *p,
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index 1116f2d2826e..6c4de1e73cd1 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -1608,6 +1608,7 @@ static void panthor_debugfs_init(struct drm_minor *minor)
  * - 1.3 - adds DRM_PANTHOR_GROUP_STATE_INNOCENT flag
  * - 1.4 - adds DRM_IOCTL_PANTHOR_BO_SET_LABEL ioctl
  * - 1.5 - adds DRM_PANTHOR_SET_USER_MMIO_OFFSET ioctl
+ * - 1.6 - adds DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE flag
  */
 static const struct drm_driver panthor_drm_driver = {
 	.driver_features = DRIVER_RENDER | DRIVER_GEM | DRIVER_SYNCOBJ |
@@ -1621,7 +1622,7 @@ static const struct drm_driver panthor_drm_driver = {
 	.name = "panthor",
 	.desc = "Panthor DRM driver",
 	.major = 1,
-	.minor = 5,
+	.minor = 6,
 
 	.gem_create_object = panthor_gem_create_object,
 	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 7862c99984b6..72b1b2799b65 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -2045,10 +2045,11 @@ static void panthor_vma_init(struct panthor_vma *vma, u32 flags)
 	vma->flags = flags;
 }
 
-#define PANTHOR_VM_MAP_FLAGS \
+#define PANTHOR_VM_MAP_FLAGS                   \
 	(DRM_PANTHOR_VM_BIND_OP_MAP_READONLY | \
-	 DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | \
-	 DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED)
+	 DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |   \
+	 DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED | \
+	 DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE)
 
 static int panthor_gpuva_sm_step_map(struct drm_gpuva_op *op, void *priv)
 {
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index e1f43deb7eca..c4c5e38365e9 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -496,6 +496,13 @@ enum drm_panthor_vm_bind_op_flags {
 	 */
 	DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED = 1 << 2,
 
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE: Dump the VMA for devcoredump.
+	 *
+	 * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE = 1 << 3,
+
 	/**
 	 * @DRM_PANTHOR_VM_BIND_OP_TYPE_MASK: Mask used to determine the type of operation.
 	 */
-- 
2.50.0.727.gbf7dc18ff4-goog


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 0/9] drm/panthor: add devcoredump support
  2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
                   ` (8 preceding siblings ...)
  2025-07-20  0:01 ` [PATCH 9/9] drm/panthor: add DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE Chia-I Wu
@ 2025-07-20  0:41 ` Daniel Almeida
  2025-07-20  1:13   ` Chia-I Wu
  9 siblings, 1 reply; 19+ messages in thread
From: Daniel Almeida @ 2025-07-20  0:41 UTC (permalink / raw)
  To: Chia-I Wu
  Cc: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

Hi Chia-I Wu :)

> On 19 Jul 2025, at 21:01, Chia-I Wu <olvaffe@gmail.com> wrote:
> 
> This series adds devcoredump support to panthor.
> 
> This is written from scratch and is not based on the prior work[1]. The
> main differences are

I wonder why this was started from scratch? IIRC, that work stopped, among
other things, because we were not sure about what exactly to include in the
dump. I don't think it warranted a completely new implementation, IMHO.

Do you plan to work on the userspace part as well?

-- Daniel


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 0/9] drm/panthor: add devcoredump support
  2025-07-20  0:41 ` [PATCH 0/9] drm/panthor: add devcoredump support Daniel Almeida
@ 2025-07-20  1:13   ` Chia-I Wu
  0 siblings, 0 replies; 19+ messages in thread
From: Chia-I Wu @ 2025-07-20  1:13 UTC (permalink / raw)
  To: Daniel Almeida
  Cc: Boris Brezillon, Steven Price, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

On Sat, Jul 19, 2025 at 5:41 PM Daniel Almeida
<daniel.almeida@collabora.com> wrote:
>
> Hi Chia-I Wu :)
>
> > On 19 Jul 2025, at 21:01, Chia-I Wu <olvaffe@gmail.com> wrote:
> >
> > This series adds devcoredump support to panthor.
> >
> > This is written from scratch and is not based on the prior work[1]. The
> > main differences are
>
> I wonder why this was started from scratch? IIRC, that work stopped, among
> other things, because we were not sure about what exactly to include in the
> dump. I don't think it warranted a completely new implementation, IMHO.
As noted in the listed differences, this impl triggers coredumps in
more places (e.g., mmu faults), captures lower-level hw regs,
separates capturing and processing, and outputs in text format.  It
turns out there is little code that can be inherited from the prior
work.

It also does not support dumping successful jobs.

>
> Do you plan to work on the userspace part as well?
Yes, there is a very early tool in
https://gitlab.freedesktop.org/panfrost/linux/-/issues/44.  There is
also a sample dump that shows the raw dump, the decoded one, and the
decoded ringbufs / cmdbufs.

>
> -- Daniel
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/9] drm/panthor: add devcoredump support
  2025-07-20  0:01 ` [PATCH 1/9] " Chia-I Wu
@ 2025-07-20  3:17   ` kernel test robot
  2025-07-28 11:24   ` Steven Price
  1 sibling, 0 replies; 19+ messages in thread
From: kernel test robot @ 2025-07-20  3:17 UTC (permalink / raw)
  To: Chia-I Wu, Boris Brezillon, Steven Price, Liviu Dudau,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, linux-kernel, dri-devel
  Cc: llvm, oe-kbuild-all

Hi Chia-I,

kernel test robot noticed the following build warnings:

[auto build test WARNING on drm-misc/drm-misc-next]
[also build test WARNING on v6.16-rc6 next-20250718]
[cannot apply to linus/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Chia-I-Wu/drm-panthor-add-devcoredump-support/20250720-080312
base:   git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link:    https://lore.kernel.org/r/20250720000146.1405060-2-olvaffe%40gmail.com
patch subject: [PATCH 1/9] drm/panthor: add devcoredump support
config: x86_64-buildonly-randconfig-004-20250720 (https://download.01.org/0day-ci/archive/20250720/202507201010.Tou41l73-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250720/202507201010.Tou41l73-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507201010.Tou41l73-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:21 Enum value 'PANTHOR_COREDUMP_GROUP' not described in enum 'panthor_coredump_mask'
>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:29 struct member 'reason' not described in 'panthor_coredump_header'
>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:29 struct member 'timestamp' not described in 'panthor_coredump_header'
>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:54 struct member 'group' not described in 'panthor_coredump'
>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:54 struct member 'data' not described in 'panthor_coredump'
>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:54 struct member 'size' not described in 'panthor_coredump'

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/9] drm/panthor: capture GPU state for devcoredump
  2025-07-20  0:01 ` [PATCH 2/9] drm/panthor: capture GPU state for devcoredump Chia-I Wu
@ 2025-07-20  4:29   ` kernel test robot
  0 siblings, 0 replies; 19+ messages in thread
From: kernel test robot @ 2025-07-20  4:29 UTC (permalink / raw)
  To: Chia-I Wu, Boris Brezillon, Steven Price, Liviu Dudau,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, linux-kernel, dri-devel
  Cc: llvm, oe-kbuild-all

Hi Chia-I,

kernel test robot noticed the following build warnings:

[auto build test WARNING on drm-misc/drm-misc-next]
[also build test WARNING on next-20250718]
[cannot apply to linus/master v6.16-rc6]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Chia-I-Wu/drm-panthor-add-devcoredump-support/20250720-080312
base:   git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link:    https://lore.kernel.org/r/20250720000146.1405060-3-olvaffe%40gmail.com
patch subject: [PATCH 2/9] drm/panthor: capture GPU state for devcoredump
config: x86_64-buildonly-randconfig-004-20250720 (https://download.01.org/0day-ci/archive/20250720/202507201259.fG0O42j1-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250720/202507201259.fG0O42j1-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507201259.fG0O42j1-lkp@intel.com/

All warnings (new ones prefixed by >>):

   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:24 Enum value 'PANTHOR_COREDUMP_GROUP' not described in enum 'panthor_coredump_mask'
>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:24 Enum value 'PANTHOR_COREDUMP_GPU' not described in enum 'panthor_coredump_mask'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:32 struct member 'reason' not described in 'panthor_coredump_header'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:32 struct member 'timestamp' not described in 'panthor_coredump_header'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:58 struct member 'group' not described in 'panthor_coredump'
>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:58 struct member 'gpu' not described in 'panthor_coredump'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:58 struct member 'data' not described in 'panthor_coredump'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:58 struct member 'size' not described in 'panthor_coredump'

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/9] drm/panthor: capture GLB state for devcoredump
  2025-07-20  0:01 ` [PATCH 3/9] drm/panthor: capture GLB " Chia-I Wu
@ 2025-07-20  5:41   ` kernel test robot
  0 siblings, 0 replies; 19+ messages in thread
From: kernel test robot @ 2025-07-20  5:41 UTC (permalink / raw)
  To: Chia-I Wu, Boris Brezillon, Steven Price, Liviu Dudau,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, linux-kernel, dri-devel
  Cc: llvm, oe-kbuild-all

Hi Chia-I,

kernel test robot noticed the following build warnings:

[auto build test WARNING on drm-misc/drm-misc-next]
[also build test WARNING on next-20250718]
[cannot apply to linus/master v6.16-rc6]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Chia-I-Wu/drm-panthor-add-devcoredump-support/20250720-080312
base:   git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link:    https://lore.kernel.org/r/20250720000146.1405060-4-olvaffe%40gmail.com
patch subject: [PATCH 3/9] drm/panthor: capture GLB state for devcoredump
config: x86_64-buildonly-randconfig-004-20250720 (https://download.01.org/0day-ci/archive/20250720/202507201318.sVNbKtUN-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250720/202507201318.sVNbKtUN-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507201318.sVNbKtUN-lkp@intel.com/

All warnings (new ones prefixed by >>):

   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:26 Enum value 'PANTHOR_COREDUMP_GROUP' not described in enum 'panthor_coredump_mask'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:26 Enum value 'PANTHOR_COREDUMP_GPU' not described in enum 'panthor_coredump_mask'
>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:26 Enum value 'PANTHOR_COREDUMP_GLB' not described in enum 'panthor_coredump_mask'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:34 struct member 'reason' not described in 'panthor_coredump_header'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:34 struct member 'timestamp' not described in 'panthor_coredump_header'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:61 struct member 'group' not described in 'panthor_coredump'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:61 struct member 'gpu' not described in 'panthor_coredump'
>> Warning: drivers/gpu/drm/panthor/panthor_coredump.c:61 struct member 'glb' not described in 'panthor_coredump'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:61 struct member 'data' not described in 'panthor_coredump'
   Warning: drivers/gpu/drm/panthor/panthor_coredump.c:61 struct member 'size' not described in 'panthor_coredump'

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/9] drm/panthor: add devcoredump support
  2025-07-20  0:01 ` [PATCH 1/9] " Chia-I Wu
  2025-07-20  3:17   ` kernel test robot
@ 2025-07-28 11:24   ` Steven Price
  2025-08-21  8:16     ` Boris Brezillon
  1 sibling, 1 reply; 19+ messages in thread
From: Steven Price @ 2025-07-28 11:24 UTC (permalink / raw)
  To: Chia-I Wu, Boris Brezillon, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter,
	linux-kernel, dri-devel

On 20/07/2025 01:01, Chia-I Wu wrote:
> Create a devcoredump on any faulty or fatal event. The coredump data is
> in YAML format for readability and flexibility.
> 
> Only panthor_group state is captured for now.
> 
> Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
> ---
>  drivers/gpu/drm/panthor/Makefile           |   2 +
>  drivers/gpu/drm/panthor/panthor_coredump.c | 225 +++++++++++++++++++++
>  drivers/gpu/drm/panthor/panthor_coredump.h |  68 +++++++
>  drivers/gpu/drm/panthor/panthor_device.h   |   6 +
>  drivers/gpu/drm/panthor/panthor_sched.c    |  69 +++++++
>  drivers/gpu/drm/panthor/panthor_sched.h    |   5 +
>  6 files changed, 375 insertions(+)
>  create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.c
>  create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.h
> 
> diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
> index 15294719b09c..9fd1e74af1df 100644
> --- a/drivers/gpu/drm/panthor/Makefile
> +++ b/drivers/gpu/drm/panthor/Makefile
> @@ -11,4 +11,6 @@ panthor-y := \
>  	panthor_mmu.o \
>  	panthor_sched.o
>  
> +panthor-$(CONFIG_DEV_COREDUMP) += panthor_coredump.o
> +
>  obj-$(CONFIG_DRM_PANTHOR) += panthor.o
> diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
> new file mode 100644
> index 000000000000..767f3327e3e8
> --- /dev/null
> +++ b/drivers/gpu/drm/panthor/panthor_coredump.c
> @@ -0,0 +1,225 @@
> +// SPDX-License-Identifier: GPL-2.0 or MIT
> +/* Copyright 2025 Google LLC */
> +
> +#include <drm/drm_drv.h>
> +#include <drm/drm_print.h>
> +#include <drm/drm_managed.h>
> +#include <generated/utsrelease.h>
> +#include <linux/devcoredump.h>
> +#include <linux/err.h>
> +#include <linux/slab.h>
> +#include <linux/timekeeping.h>
> +
> +#include "panthor_coredump.h"
> +#include "panthor_device.h"
> +#include "panthor_sched.h"
> +
> +/**
> + * enum panthor_coredump_mask - Coredump state
> + */
> +enum panthor_coredump_mask {
> +	PANTHOR_COREDUMP_GROUP = BIT(0),
> +};
> +
> +/**
> + * struct panthor_coredump_header - Coredump header
> + */
> +struct panthor_coredump_header {
> +	enum panthor_coredump_reason reason;
> +	ktime_t timestamp;
> +};
> +
> +/**
> + * struct panthor_coredump - Coredump
> + */
> +struct panthor_coredump {
> +	/** @ptdev: Device. */
> +	struct panthor_device *ptdev;
> +
> +	/** @work: Bottom half of panthor_coredump_capture. */
> +	struct work_struct work;
> +
> +	/** @header: Header. */
> +	struct panthor_coredump_header header;
> +
> +	/** @mask: Bitmask of captured states. */
> +	u32 mask;
> +
> +	struct panthor_coredump_group_state group;
> +
> +	/* @data: Serialized coredump data. */
> +	void *data;
> +
> +	/* @size: Serialized coredump size. */
> +	size_t size;
> +};
> +
> +static const char *reason_str(enum panthor_coredump_reason reason)
> +{
> +	switch (reason) {
> +	case PANTHOR_COREDUMP_REASON_MMU_FAULT:
> +		return "MMU_FAULT";
> +	case PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT:
> +		return "CSG_REQ_TIMEOUT";
> +	case PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE:
> +		return "CSG_UNKNOWN_STATE";
> +	case PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT:
> +		return "CSG_PROGRESS_TIMEOUT";
> +	case PANTHOR_COREDUMP_REASON_CS_FATAL:
> +		return "CS_FATAL";
> +	case PANTHOR_COREDUMP_REASON_CS_FAULT:
> +		return "CS_FAULT";
> +	case PANTHOR_COREDUMP_REASON_CS_TILER_OOM:
> +		return "CS_TILER_OOM";
> +	case PANTHOR_COREDUMP_REASON_JOB_TIMEOUT:
> +		return "JOB_TIMEOUT";
> +	default:
> +		return "UNKNOWN";
> +	}
> +}

I'd recommend using a macro to reduce the repetition, e.g. take a look
at PANTHOR_EXCEPTION().

> +
> +static void print_group(struct drm_printer *p,
> +			const struct panthor_coredump_group_state *group)
> +{
> +	drm_puts(p, "group:\n");
> +	drm_printf(p, "  priority: %d\n", group->priority);
> +	drm_printf(p, "  queue_count: %u\n", group->queue_count);
> +	drm_printf(p, "  pid: %d\n", group->pid);
> +	drm_printf(p, "  comm: %s\n", group->comm);

I can see the attraction of YAML, but here "comm" might contain
characters that break the YAML parsing. So either we need to correctly
quote such characters, or accept this isn't YAML.

In particular YAML starts to become ugly in the final patch when you are
dumping buffer objects. Although AFAICT that is quoted successfully.

> +	drm_printf(p, "  destroyed: %d\n", group->destroyed);
> +	drm_printf(p, "  csg_id: %d\n", group->csg_id);
> +}
> +
> +static void print_header(struct drm_printer *p,
> +			 const struct panthor_coredump_header *header,
> +			 const struct drm_driver *drv)
> +{
> +	drm_puts(p, "header:\n");
> +	drm_puts(p, "  kernel: " UTS_RELEASE "\n");
> +	drm_puts(p, "  module: " KBUILD_MODNAME "\n");
> +	drm_printf(p, "  driver_version: %d.%d\n", drv->major, drv->minor);
> +
> +	drm_printf(p, "  reason: %s\n", reason_str(header->reason));
> +	drm_printf(p, "  timestamp: %lld\n", ktime_to_ns(header->timestamp));
> +}
> +
> +static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
> +{
> +	/* in YAML format */
> +	drm_puts(p, "---\n");
> +	print_header(p, &cd->header, cd->ptdev->base.driver);
> +
> +	if (cd->mask & PANTHOR_COREDUMP_GROUP)
> +		print_group(p, &cd->group);
> +}
> +
> +static void process_cd(struct panthor_device *ptdev,
> +		       struct panthor_coredump *cd)
> +{
> +	struct drm_print_iterator iter = {
> +		.remain = SSIZE_MAX,
> +	};
> +	struct drm_printer p = drm_coredump_printer(&iter);
> +
> +	print_cd(&p, cd);
> +
> +	iter.remain = SSIZE_MAX - iter.remain;
> +	iter.data = kvmalloc(iter.remain, GFP_USER);
> +	if (!iter.data)
> +		return;
> +
> +	cd->data = iter.data;
> +	cd->size = iter.remain;
> +
> +	drm_info(&ptdev->base, "generating coredump of size %zu\n", cd->size);
> +
> +	p = drm_coredump_printer(&iter);
> +	print_cd(&p, cd);
> +}

I think this would be better written in the style suggested in the
drm_print.h header, moving the iterator into print_cd():

static ssize_t print_cd(char *buffer, ssize_t count, const struct
panthor_coredump *cd)
{
	struct drm_print_iterator iter = {
		.data = buffer,
		.remain = count,
	};
	struct drm_printer p = drm_coredump_printer(&iter);

	/* in YAML format */
	drm_puts(p, "---\n");
	print_header(p, &cd->header, cd->ptdev->base.driver);

	if (cd->mask & PANTHOR_COREDUMP_GROUP)
		print_group(p, &cd->group);

	return count - iter.remain;
}

static void process_cd(struct panthor_device *ptdev,
		       struct panthor_coredump *cd)
{
	ssize_t count = print_cd(NULL, SSIZE_MAX, cd);

	cd->data = kvmalloc(count, GFP_USER);
	if (!cd->data)
		return;
	cd->size = count;

	drm_info(&ptdev->base, "generating coredump of size %zu\n", count);
	print_cd(cd->data, cd->size, cd);
}

> +
> +static void capture_cd(struct panthor_device *ptdev,
> +		       struct panthor_coredump *cd, struct panthor_group *group)
> +{
> +	drm_info(&ptdev->base, "capturing coredump states\n");
> +
> +	if (group) {
> +		panthor_group_capture_coredump(group, &cd->group);
> +		cd->mask |= PANTHOR_COREDUMP_GROUP;
> +	}
> +}
> +
> +static void panthor_coredump_free(void *data)
> +{
> +	struct panthor_coredump *cd = data;
> +	struct panthor_device *ptdev = cd->ptdev;
> +
> +	kvfree(cd->data);
> +	kfree(cd);
> +
> +	atomic_set(&ptdev->coredump.pending, 0);
> +}
> +
> +static ssize_t panthor_coredump_read(char *buffer, loff_t offset, size_t count,
> +				     void *data, size_t datalen)
> +{
> +	const struct panthor_coredump *cd = data;
> +
> +	if (offset >= cd->size)
> +		return 0;
> +
> +	if (count > cd->size - offset)
> +		count = cd->size - offset;
> +
> +	memcpy(buffer, cd->data + offset, count);
> +
> +	return count;
> +}
> +
> +static void panthor_coredump_process_work(struct work_struct *work)
> +{
> +	struct panthor_coredump *cd =
> +		container_of(work, struct panthor_coredump, work);
> +	struct panthor_device *ptdev = cd->ptdev;
> +
> +	process_cd(ptdev, cd);
> +
> +	dev_coredumpm(ptdev->base.dev, THIS_MODULE, cd, 0, GFP_KERNEL,
> +		      panthor_coredump_read, panthor_coredump_free);

Is there a good reason to reinvent the read/free functionality of
devcoredump? Can we not just use dev_coredumpv() instead? The only
benefit I can see if the automatic rearming of coredump.pending, but
panfrost handles this by having a "panfrost_dump_core" flag which is
re-armed manually from user space.

Given core dumps might be large and fairly expensive to create, it seems
sensible to not automatically re-arm.

> +}
> +
> +void panthor_coredump_capture(struct panthor_coredump *cd,
> +			      struct panthor_group *group)
> +{
> +	struct panthor_device *ptdev = cd->ptdev;
> +
> +	capture_cd(ptdev, cd, group);
> +
> +	queue_work(system_unbound_wq, &cd->work);
> +}

So I can see why you want to move the work onto a workqueue, but I'm a
little worried about lifetimes.

It seems slightly odd that you are capturing the data into a binary
format (struct panthor_coredump_group_state, and later
panthor_coredump_gpu_state, panthor_coredump_glb_state,
panthor_coredump_csg_state etc) and then kicking off a separate
workqueue item to convert it all to YAML.

> +
> +struct panthor_coredump *
> +panthor_coredump_alloc(struct panthor_device *ptdev,
> +		       enum panthor_coredump_reason reason, gfp_t gfp)
> +{
> +	struct panthor_coredump *cd;
> +
> +	/* reject all but the first coredump until it is handled */
> +	if (atomic_cmpxchg(&ptdev->coredump.pending, 0, 1)) {
> +		drm_dbg(&ptdev->base, "skip subsequent coredump\n");
> +		return NULL;
> +	}
> +
> +	cd = kzalloc(sizeof(*cd), gfp);
> +	if (!cd) {
> +		atomic_set(&ptdev->coredump.pending, 0);
> +		return NULL;
> +	}
> +
> +	cd->ptdev = ptdev;
> +	INIT_WORK(&cd->work, panthor_coredump_process_work);
> +
> +	cd->header.reason = reason;
> +	cd->header.timestamp = ktime_get_real();
> +
> +	return cd;
> +}
> diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
> new file mode 100644
> index 000000000000..dd1fe1c2e175
> --- /dev/null
> +++ b/drivers/gpu/drm/panthor/panthor_coredump.h
> @@ -0,0 +1,68 @@
> +/* SPDX-License-Identifier: GPL-2.0 or MIT */
> +/* Copyright 2019 Collabora ltd. */
> +
> +#ifndef __PANTHOR_COREDUMP_H__
> +#define __PANTHOR_COREDUMP_H__
> +
> +#include <drm/panthor_drm.h>
> +#include <linux/sched.h>
> +#include <linux/types.h>
> +
> +struct panthor_coredump;
> +struct panthor_device;
> +struct panthor_group;
> +
> +/**
> + * enum panthor_coredump_reason - Coredump reason
> + */
> +enum panthor_coredump_reason {
> +	PANTHOR_COREDUMP_REASON_MMU_FAULT,
> +	PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
> +	PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
> +	PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT,
> +	PANTHOR_COREDUMP_REASON_CS_FATAL,
> +	PANTHOR_COREDUMP_REASON_CS_FAULT,
> +	PANTHOR_COREDUMP_REASON_CS_TILER_OOM,
> +	PANTHOR_COREDUMP_REASON_JOB_TIMEOUT,
> +};
> +
> +/**
> + * struct panthor_coredump_group_state - Coredump group state
> + *
> + * Interesting panthor_group fields.
> + */
> +struct panthor_coredump_group_state {
> +	enum drm_panthor_group_priority priority;
> +	u32 queue_count;
> +	pid_t pid;
> +	char comm[TASK_COMM_LEN];
> +	bool destroyed;
> +	int csg_id;
> +};
> +
> +#ifdef CONFIG_DEV_COREDUMP
> +
> +struct panthor_coredump *
> +panthor_coredump_alloc(struct panthor_device *ptdev,
> +		       enum panthor_coredump_reason reason, gfp_t gfp);
> +
> +void panthor_coredump_capture(struct panthor_coredump *cd,
> +			      struct panthor_group *group);
> +
> +#else /* CONFIG_DEV_COREDUMP */
> +
> +static inline struct panthor_coredump *
> +panthor_coredump_alloc(struct panthor_device *ptdev,
> +		       enum panthor_coredump_reason reason, gfp_t gfp)
> +{
> +	return NULL;
> +}
> +
> +static inline void panthor_coredump_capture(struct panthor_coredump *cd,
> +					    struct panthor_group *group)
> +{
> +}

panthor_coredump_alloc() is always called immediately before
panthor_coredump_capture(). So instead we could just export a wrapper
than combines both functions. This also avoids the caller having to deal
with panthor_coredump_alloc() failing.

Thanks,
Steve

> +
> +#endif /* CONFIG_DEV_COREDUMP */
> +
> +#endif /* __PANTHOR_COREDUMP_H__ */
> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> index 4fc7cf2aeed5..766e53c25cfa 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.h
> +++ b/drivers/gpu/drm/panthor/panthor_device.h
> @@ -197,6 +197,12 @@ struct panthor_device {
>  		atomic_t recovery_needed;
>  	} pm;
>  
> +	/** @coredump: Coredump-related data. */
> +	struct {
> +		/** @pending: True if there is a pending coredump. */
> +		atomic_t pending;
> +	} coredump;
> +
>  	/** @profile_mask: User-set profiling flags for job accounting. */
>  	u32 profile_mask;
>  
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index a2248f692a03..eb45b5ad9774 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -23,6 +23,7 @@
>  #include <linux/platform_device.h>
>  #include <linux/pm_runtime.h>
>  
> +#include "panthor_coredump.h"
>  #include "panthor_devfreq.h"
>  #include "panthor_device.h"
>  #include "panthor_fw.h"
> @@ -1031,6 +1032,10 @@ group_unbind_locked(struct panthor_group *group)
>  	return 0;
>  }
>  
> +static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
> +					  enum panthor_coredump_reason reason,
> +					  struct panthor_group *group);
> +
>  /**
>   * cs_slot_prog_locked() - Program a queue slot
>   * @ptdev: Device.
> @@ -1249,6 +1254,10 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
>  		drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
>  			csg_id, csg_state);
>  		new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
> +
> +		panthor_sched_coredump_locked(
> +			ptdev, PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
> +			group);
>  		break;
>  	}
>  
> @@ -1378,6 +1387,9 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
>  		 (unsigned int)CS_EXCEPTION_DATA(fatal),
>  		 info);
> +
> +	panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FATAL,
> +				      group);
>  }
>  
>  static void
> @@ -1426,6 +1438,9 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
>  		 (unsigned int)CS_EXCEPTION_DATA(fault),
>  		 info);
> +
> +	panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FAULT,
> +				      group);
>  }
>  
>  static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
> @@ -1480,6 +1495,10 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
>  		drm_warn(&ptdev->base, "Failed to extend the tiler heap\n");
>  		group->fatal_queues |= BIT(cs_id);
>  		sched_queue_delayed_work(sched, tick, 0);
> +
> +		panthor_sched_coredump_locked(
> +			ptdev, PANTHOR_COREDUMP_REASON_CS_TILER_OOM, group);
> +
>  		goto out_put_heap_pool;
>  	}
>  
> @@ -1639,6 +1658,9 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
>  		group->timedout = true;
>  
>  	sched_queue_delayed_work(sched, tick, 0);
> +
> +	panthor_sched_coredump_locked(
> +		ptdev, PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT, group);
>  }
>  
>  static void sched_process_csg_irq_locked(struct panthor_device *ptdev, u32 csg_id)
> @@ -1858,8 +1880,16 @@ static int csgs_upd_ctx_apply_locked(struct panthor_device *ptdev,
>  
>  		if (ret && acked != req_mask &&
>  		    ((csg_iface->input->req ^ csg_iface->output->ack) & req_mask) != 0) {
> +			struct panthor_csg_slot *csg_slot =
> +				&sched->csg_slots[csg_id];
> +			struct panthor_group *group = csg_slot->group;
> +
>  			drm_err(&ptdev->base, "CSG %d update request timedout", csg_id);
>  			ctx->timedout_mask |= BIT(csg_id);
> +
> +			panthor_sched_coredump_locked(
> +				ptdev, PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
> +				group);
>  		}
>  	}
>  
> @@ -2027,6 +2057,10 @@ tick_ctx_init(struct panthor_scheduler *sched,
>  		 * CSG IRQs, so we can flag the faulty queue.
>  		 */
>  		if (panthor_vm_has_unhandled_faults(group->vm)) {
> +			panthor_sched_coredump_locked(
> +				ptdev, PANTHOR_COREDUMP_REASON_MMU_FAULT,
> +				group);
> +
>  			sched_process_csg_irq_locked(ptdev, i);
>  
>  			/* No fatal fault reported, flag all queues as faulty. */
> @@ -3237,6 +3271,10 @@ queue_timedout_job(struct drm_sched_job *sched_job)
>  
>  		group_queue_work(group, term);
>  	}
> +
> +	panthor_sched_coredump_locked(
> +		ptdev, PANTHOR_COREDUMP_REASON_JOB_TIMEOUT, group);
> +
>  	mutex_unlock(&sched->lock);
>  
>  	queue_start(queue);
> @@ -3627,6 +3665,37 @@ int panthor_group_get_state(struct panthor_file *pfile,
>  	return 0;
>  }
>  
> +static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
> +					  enum panthor_coredump_reason reason,
> +					  struct panthor_group *group)
> +{
> +	struct panthor_coredump *cd;
> +
> +	lockdep_assert_held(&ptdev->scheduler->lock);
> +
> +	/* GFP_NOWAIT because this may be called from fence signaling path */
> +	cd = panthor_coredump_alloc(ptdev, reason, GFP_NOWAIT);
> +	if (!cd)
> +		return;
> +
> +	panthor_coredump_capture(cd, group);
> +}
> +
> +void panthor_group_capture_coredump(const struct panthor_group *group,
> +				    struct panthor_coredump_group_state *state)
> +{
> +	const struct panthor_device *ptdev = group->ptdev;
> +
> +	/* this is called from panthor_coredump_capture */
> +	lockdep_assert_held(&ptdev->scheduler->lock);
> +
> +	state->priority = group->priority;
> +	state->queue_count = group->queue_count;
> +	/* TODO state->pid and state->comm */
> +	state->destroyed = group->destroyed;
> +	state->csg_id = group->csg_id;
> +}
> +
>  int panthor_group_pool_create(struct panthor_file *pfile)
>  {
>  	struct panthor_group_pool *gpool;
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
> index 742b0b4ff3a3..6c564153133e 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.h
> +++ b/drivers/gpu/drm/panthor/panthor_sched.h
> @@ -14,8 +14,10 @@ struct drm_panthor_group_create;
>  struct drm_panthor_queue_create;
>  struct drm_panthor_group_get_state;
>  struct drm_panthor_queue_submit;
> +struct panthor_coredump_group_state;
>  struct panthor_device;
>  struct panthor_file;
> +struct panthor_group;
>  struct panthor_group_pool;
>  struct panthor_job;
>  
> @@ -26,6 +28,9 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle);
>  int panthor_group_get_state(struct panthor_file *pfile,
>  			    struct drm_panthor_group_get_state *get_state);
>  
> +void panthor_group_capture_coredump(const struct panthor_group *group,
> +				    struct panthor_coredump_group_state *state);
> +
>  struct drm_sched_job *
>  panthor_job_create(struct panthor_file *pfile,
>  		   u16 group_handle,


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 8/9] drm/panthor: check bo offset alignment in vm bind
  2025-07-20  0:01 ` [PATCH 8/9] drm/panthor: check bo offset alignment in vm bind Chia-I Wu
@ 2025-08-21  7:33   ` Boris Brezillon
  0 siblings, 0 replies; 19+ messages in thread
From: Boris Brezillon @ 2025-08-21  7:33 UTC (permalink / raw)
  To: Chia-I Wu
  Cc: Steven Price, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, linux-kernel,
	dri-devel

On Sat, 19 Jul 2025 17:01:45 -0700
Chia-I Wu <olvaffe@gmail.com> wrote:

> Fail early from panthor_vm_bind_prepare_op_ctx instead of late from
> ops->map_pages.
> 
> Signed-off-by: Chia-I Wu <olvaffe@gmail.com>

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>

We can probably merge this one ahead of the coredump stuff.

> ---
>  drivers/gpu/drm/panthor/panthor_mmu.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index a857a0dd1099..7862c99984b6 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -1206,7 +1206,7 @@ static int panthor_vm_prepare_map_op_ctx(struct panthor_vm_op_ctx *op_ctx,
>  	    (flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) != DRM_PANTHOR_VM_BIND_OP_TYPE_MAP)
>  		return -EINVAL;
>  
> -	/* Make sure the VA and size are aligned and in-bounds. */
> +	/* Make sure the VA and size are in-bounds. */
>  	if (size > bo->base.base.size || offset > bo->base.base.size - size)
>  		return -EINVAL;
>  
> @@ -2423,7 +2423,7 @@ panthor_vm_bind_prepare_op_ctx(struct drm_file *file,
>  	int ret;
>  
>  	/* Aligned on page size. */
> -	if (!IS_ALIGNED(op->va | op->size, vm_pgsz))
> +	if (!IS_ALIGNED(op->va | op->size | op->bo_offset, vm_pgsz))
>  		return -EINVAL;
>  
>  	switch (op->flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) {


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 9/9] drm/panthor: add DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE
  2025-07-20  0:01 ` [PATCH 9/9] drm/panthor: add DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE Chia-I Wu
@ 2025-08-21  7:55   ` Boris Brezillon
  0 siblings, 0 replies; 19+ messages in thread
From: Boris Brezillon @ 2025-08-21  7:55 UTC (permalink / raw)
  To: Chia-I Wu
  Cc: Steven Price, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, linux-kernel,
	dri-devel

On Sat, 19 Jul 2025 17:01:46 -0700
Chia-I Wu <olvaffe@gmail.com> wrote:

> When the flag is set, bo data is captured for devcoredump.
> 
> Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
> ---
>  drivers/gpu/drm/panthor/panthor_coredump.c | 36 ++++++++++++++++++++++
>  drivers/gpu/drm/panthor/panthor_drv.c      |  3 +-
>  drivers/gpu/drm/panthor/panthor_mmu.c      |  7 +++--
>  include/uapi/drm/panthor_drm.h             |  7 +++++
>  4 files changed, 49 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
> index 5502452a5baa..db5695b38c2d 100644
> --- a/drivers/gpu/drm/panthor/panthor_coredump.c
> +++ b/drivers/gpu/drm/panthor/panthor_coredump.c
> @@ -5,6 +5,7 @@
>  #include <drm/drm_print.h>
>  #include <drm/drm_managed.h>
>  #include <generated/utsrelease.h>
> +#include <linux/ascii85.h>
>  #include <linux/devcoredump.h>
>  #include <linux/err.h>
>  #include <linux/pm_runtime.h>
> @@ -99,6 +100,26 @@ static const char *reason_str(enum panthor_coredump_reason reason)
>  	}
>  }
>  
> +static void print_bo(struct drm_printer *p, struct panthor_gem_object *bo,
> +		     u64 offset, u64 size)
> +{
> +	struct iosys_map map;
> +	const u32 *vals;
> +	u64 count;
> +	char buf[ASCII85_BUFSZ];
> +
> +	if (drm_gem_vmap(&bo->base.base, &map))
> +		return;
> +
> +	/* offset and size are aligned to panthor_vm_page_size, which is SZ_4K */
> +	vals = map.vaddr + offset;
> +	count = size / sizeof(u32);
> +	for (u64 i = 0; i < count; i++)
> +		drm_puts(p, ascii85_encode(vals[i], buf));
> +
> +	drm_gem_vunmap(&bo->base.base, &map);
> +}
> +
>  static void print_vma(struct drm_printer *p,
>  		      const struct panthor_coredump_vma_state *vma, u32 vma_id,
>  		      size_t *max_dyn_size)
> @@ -129,6 +150,21 @@ static void print_vma(struct drm_printer *p,
>  			}
>  		}
>  	}
> +
> +	if (vma->flags & DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE) {
> +		drm_puts(p, "    data: |\n");
> +		drm_puts(p, "      ");
> +
> +		/* bo data is dynamic */
> +		if (max_dyn_size) {
> +			*max_dyn_size +=
> +				vma->size / sizeof(u32) * (ASCII85_BUFSZ - 1);
> +		} else {
> +			print_bo(p, bo, vma->bo_offset, vma->size);
> +		}

Back when Daniel was working on it, I suggested dumping VAs and BOs
content separately, so we can shrink the dumps when sparse is involved.
Otherwise you'll have these huge VA range filled with repeated dummy
pages. It's then up to the coredump analysis tool to reconstruct the
mapping between VAs and BOs.

> +
> +		drm_puts(p, "\n");
> +	}
>  }
>  
>  static void print_as(struct drm_printer *p,
> diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
> index 1116f2d2826e..6c4de1e73cd1 100644
> --- a/drivers/gpu/drm/panthor/panthor_drv.c
> +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> @@ -1608,6 +1608,7 @@ static void panthor_debugfs_init(struct drm_minor *minor)
>   * - 1.3 - adds DRM_PANTHOR_GROUP_STATE_INNOCENT flag
>   * - 1.4 - adds DRM_IOCTL_PANTHOR_BO_SET_LABEL ioctl
>   * - 1.5 - adds DRM_PANTHOR_SET_USER_MMIO_OFFSET ioctl
> + * - 1.6 - adds DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE flag
>   */
>  static const struct drm_driver panthor_drm_driver = {
>  	.driver_features = DRIVER_RENDER | DRIVER_GEM | DRIVER_SYNCOBJ |
> @@ -1621,7 +1622,7 @@ static const struct drm_driver panthor_drm_driver = {
>  	.name = "panthor",
>  	.desc = "Panthor DRM driver",
>  	.major = 1,
> -	.minor = 5,
> +	.minor = 6,
>  
>  	.gem_create_object = panthor_gem_create_object,
>  	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index 7862c99984b6..72b1b2799b65 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -2045,10 +2045,11 @@ static void panthor_vma_init(struct panthor_vma *vma, u32 flags)
>  	vma->flags = flags;
>  }
>  
> -#define PANTHOR_VM_MAP_FLAGS \
> +#define PANTHOR_VM_MAP_FLAGS                   \
>  	(DRM_PANTHOR_VM_BIND_OP_MAP_READONLY | \
> -	 DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | \
> -	 DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED)
> +	 DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |   \
> +	 DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED | \
> +	 DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE)
>  
>  static int panthor_gpuva_sm_step_map(struct drm_gpuva_op *op, void *priv)
>  {
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index e1f43deb7eca..c4c5e38365e9 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -496,6 +496,13 @@ enum drm_panthor_vm_bind_op_flags {
>  	 */
>  	DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED = 1 << 2,
>  
> +	/**
> +	 * @DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE: Dump the VMA for devcoredump.
> +	 *
> +	 * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP.
> +	 */
> +	DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE = 1 << 3,

It feels weird to have this verbose-dump option exposed as a VM
bind flag. Is there anything in the Vulkan GPU crash extension that
allows flagging individual memory objects are dumpable? I understand
that dumping all the VM data means generating potentially huge dumps,
and that you sometimes could trim that out because all you care about
in your debug session is CS/shader binaries, but other times it proves
useful to have regular buffers dumped too. If the coredump is
partial, it means you'll have to go and ask for users to try and
reproduce the issue with a dump_all flags set.

Given devcoredump is a device interface (meaning we can't really filter
coredumps per context), I'd be tempted to make this 'dont-dump-BOs'
option an opt-out debugfs knob, so that, by default, everything is
dumped.

> +
>  	/**
>  	 * @DRM_PANTHOR_VM_BIND_OP_TYPE_MASK: Mask used to determine the type of operation.
>  	 */


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/9] drm/panthor: add devcoredump support
  2025-07-28 11:24   ` Steven Price
@ 2025-08-21  8:16     ` Boris Brezillon
  0 siblings, 0 replies; 19+ messages in thread
From: Boris Brezillon @ 2025-08-21  8:16 UTC (permalink / raw)
  To: Steven Price
  Cc: Chia-I Wu, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, linux-kernel,
	dri-devel

On Mon, 28 Jul 2025 12:24:02 +0100
Steven Price <steven.price@arm.com> wrote:

> On 20/07/2025 01:01, Chia-I Wu wrote:
> > Create a devcoredump on any faulty or fatal event. The coredump data is
> > in YAML format for readability and flexibility.
> > 
> > Only panthor_group state is captured for now.
> > 
> > Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
> > ---
> >  drivers/gpu/drm/panthor/Makefile           |   2 +
> >  drivers/gpu/drm/panthor/panthor_coredump.c | 225 +++++++++++++++++++++
> >  drivers/gpu/drm/panthor/panthor_coredump.h |  68 +++++++
> >  drivers/gpu/drm/panthor/panthor_device.h   |   6 +
> >  drivers/gpu/drm/panthor/panthor_sched.c    |  69 +++++++
> >  drivers/gpu/drm/panthor/panthor_sched.h    |   5 +
> >  6 files changed, 375 insertions(+)
> >  create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.c
> >  create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.h
> > 
> > diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
> > index 15294719b09c..9fd1e74af1df 100644
> > --- a/drivers/gpu/drm/panthor/Makefile
> > +++ b/drivers/gpu/drm/panthor/Makefile
> > @@ -11,4 +11,6 @@ panthor-y := \
> >  	panthor_mmu.o \
> >  	panthor_sched.o
> >  
> > +panthor-$(CONFIG_DEV_COREDUMP) += panthor_coredump.o
> > +
> >  obj-$(CONFIG_DRM_PANTHOR) += panthor.o
> > diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
> > new file mode 100644
> > index 000000000000..767f3327e3e8
> > --- /dev/null
> > +++ b/drivers/gpu/drm/panthor/panthor_coredump.c
> > @@ -0,0 +1,225 @@
> > +// SPDX-License-Identifier: GPL-2.0 or MIT
> > +/* Copyright 2025 Google LLC */
> > +
> > +#include <drm/drm_drv.h>
> > +#include <drm/drm_print.h>
> > +#include <drm/drm_managed.h>
> > +#include <generated/utsrelease.h>
> > +#include <linux/devcoredump.h>
> > +#include <linux/err.h>
> > +#include <linux/slab.h>
> > +#include <linux/timekeeping.h>
> > +
> > +#include "panthor_coredump.h"
> > +#include "panthor_device.h"
> > +#include "panthor_sched.h"
> > +
> > +/**
> > + * enum panthor_coredump_mask - Coredump state
> > + */
> > +enum panthor_coredump_mask {
> > +	PANTHOR_COREDUMP_GROUP = BIT(0),
> > +};
> > +
> > +/**
> > + * struct panthor_coredump_header - Coredump header
> > + */
> > +struct panthor_coredump_header {
> > +	enum panthor_coredump_reason reason;
> > +	ktime_t timestamp;
> > +};
> > +
> > +/**
> > + * struct panthor_coredump - Coredump
> > + */
> > +struct panthor_coredump {
> > +	/** @ptdev: Device. */
> > +	struct panthor_device *ptdev;
> > +
> > +	/** @work: Bottom half of panthor_coredump_capture. */
> > +	struct work_struct work;
> > +
> > +	/** @header: Header. */
> > +	struct panthor_coredump_header header;
> > +
> > +	/** @mask: Bitmask of captured states. */
> > +	u32 mask;
> > +
> > +	struct panthor_coredump_group_state group;
> > +
> > +	/* @data: Serialized coredump data. */
> > +	void *data;
> > +
> > +	/* @size: Serialized coredump size. */
> > +	size_t size;
> > +};
> > +
> > +static const char *reason_str(enum panthor_coredump_reason reason)
> > +{
> > +	switch (reason) {
> > +	case PANTHOR_COREDUMP_REASON_MMU_FAULT:
> > +		return "MMU_FAULT";
> > +	case PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT:
> > +		return "CSG_REQ_TIMEOUT";
> > +	case PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE:
> > +		return "CSG_UNKNOWN_STATE";
> > +	case PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT:
> > +		return "CSG_PROGRESS_TIMEOUT";
> > +	case PANTHOR_COREDUMP_REASON_CS_FATAL:
> > +		return "CS_FATAL";
> > +	case PANTHOR_COREDUMP_REASON_CS_FAULT:
> > +		return "CS_FAULT";
> > +	case PANTHOR_COREDUMP_REASON_CS_TILER_OOM:
> > +		return "CS_TILER_OOM";
> > +	case PANTHOR_COREDUMP_REASON_JOB_TIMEOUT:
> > +		return "JOB_TIMEOUT";
> > +	default:
> > +		return "UNKNOWN";
> > +	}
> > +}  
> 
> I'd recommend using a macro to reduce the repetition, e.g. take a look
> at PANTHOR_EXCEPTION().
> 
> > +
> > +static void print_group(struct drm_printer *p,
> > +			const struct panthor_coredump_group_state *group)
> > +{
> > +	drm_puts(p, "group:\n");
> > +	drm_printf(p, "  priority: %d\n", group->priority);
> > +	drm_printf(p, "  queue_count: %u\n", group->queue_count);
> > +	drm_printf(p, "  pid: %d\n", group->pid);
> > +	drm_printf(p, "  comm: %s\n", group->comm);  
> 
> I can see the attraction of YAML, but here "comm" might contain
> characters that break the YAML parsing. So either we need to correctly
> quote such characters, or accept this isn't YAML.
> 
> In particular YAML starts to become ugly in the final patch when you are
> dumping buffer objects. Although AFAICT that is quoted successfully.

I honestly have mixed feelings about human-readable coredumps. On one
hand it gives you general GPU state info very quickly, on the other
hand, it becomes a lot bigger when you get to dump the info you need
for an actual post-mortem debugging session (VAs and BOs). And let's be
honest, those dumps will be passed to the GPU-specific coredump analysis
tool 99.99% of the time, so it's not like human-readability is important
in practice.

I know most drivers (Xe, AMD, MSM, ...) are using the drm printer and
generating human readable dumps, which I guess is one more reason to go
for this approach, but I wish we had some kind of ELF-like format for
these dumps, with separate sections and an easy way for the coredump
analysis tools to navigate among these sections easily instead of
having to parse text. Oh well, looks like this ship has long sailed,
and we get to follow others lead here.

> 
> > +	drm_printf(p, "  destroyed: %d\n", group->destroyed);
> > +	drm_printf(p, "  csg_id: %d\n", group->csg_id);
> > +}
> > +
> > +static void print_header(struct drm_printer *p,
> > +			 const struct panthor_coredump_header *header,
> > +			 const struct drm_driver *drv)
> > +{
> > +	drm_puts(p, "header:\n");
> > +	drm_puts(p, "  kernel: " UTS_RELEASE "\n");
> > +	drm_puts(p, "  module: " KBUILD_MODNAME "\n");
> > +	drm_printf(p, "  driver_version: %d.%d\n", drv->major, drv->minor);
> > +
> > +	drm_printf(p, "  reason: %s\n", reason_str(header->reason));
> > +	drm_printf(p, "  timestamp: %lld\n", ktime_to_ns(header->timestamp));
> > +}
> > +
> > +static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
> > +{
> > +	/* in YAML format */
> > +	drm_puts(p, "---\n");
> > +	print_header(p, &cd->header, cd->ptdev->base.driver);
> > +
> > +	if (cd->mask & PANTHOR_COREDUMP_GROUP)
> > +		print_group(p, &cd->group);
> > +}
> > +
> > +static void process_cd(struct panthor_device *ptdev,
> > +		       struct panthor_coredump *cd)
> > +{
> > +	struct drm_print_iterator iter = {
> > +		.remain = SSIZE_MAX,
> > +	};
> > +	struct drm_printer p = drm_coredump_printer(&iter);
> > +
> > +	print_cd(&p, cd);
> > +
> > +	iter.remain = SSIZE_MAX - iter.remain;
> > +	iter.data = kvmalloc(iter.remain, GFP_USER);
> > +	if (!iter.data)
> > +		return;
> > +
> > +	cd->data = iter.data;
> > +	cd->size = iter.remain;
> > +
> > +	drm_info(&ptdev->base, "generating coredump of size %zu\n", cd->size);
> > +
> > +	p = drm_coredump_printer(&iter);
> > +	print_cd(&p, cd);
> > +}  
> 
> I think this would be better written in the style suggested in the
> drm_print.h header, moving the iterator into print_cd():
> 
> static ssize_t print_cd(char *buffer, ssize_t count, const struct
> panthor_coredump *cd)
> {
> 	struct drm_print_iterator iter = {
> 		.data = buffer,
> 		.remain = count,
> 	};
> 	struct drm_printer p = drm_coredump_printer(&iter);
> 
> 	/* in YAML format */
> 	drm_puts(p, "---\n");
> 	print_header(p, &cd->header, cd->ptdev->base.driver);
> 
> 	if (cd->mask & PANTHOR_COREDUMP_GROUP)
> 		print_group(p, &cd->group);
> 
> 	return count - iter.remain;
> }
> 
> static void process_cd(struct panthor_device *ptdev,
> 		       struct panthor_coredump *cd)
> {
> 	ssize_t count = print_cd(NULL, SSIZE_MAX, cd);
> 
> 	cd->data = kvmalloc(count, GFP_USER);
> 	if (!cd->data)
> 		return;
> 	cd->size = count;
> 
> 	drm_info(&ptdev->base, "generating coredump of size %zu\n", count);
> 	print_cd(cd->data, cd->size, cd);
> }
> 
> > +
> > +static void capture_cd(struct panthor_device *ptdev,
> > +		       struct panthor_coredump *cd, struct panthor_group *group)
> > +{
> > +	drm_info(&ptdev->base, "capturing coredump states\n");
> > +
> > +	if (group) {
> > +		panthor_group_capture_coredump(group, &cd->group);
> > +		cd->mask |= PANTHOR_COREDUMP_GROUP;
> > +	}
> > +}
> > +
> > +static void panthor_coredump_free(void *data)
> > +{
> > +	struct panthor_coredump *cd = data;
> > +	struct panthor_device *ptdev = cd->ptdev;
> > +
> > +	kvfree(cd->data);
> > +	kfree(cd);
> > +
> > +	atomic_set(&ptdev->coredump.pending, 0);
> > +}
> > +
> > +static ssize_t panthor_coredump_read(char *buffer, loff_t offset, size_t count,
> > +				     void *data, size_t datalen)
> > +{
> > +	const struct panthor_coredump *cd = data;
> > +
> > +	if (offset >= cd->size)
> > +		return 0;
> > +
> > +	if (count > cd->size - offset)
> > +		count = cd->size - offset;
> > +
> > +	memcpy(buffer, cd->data + offset, count);
> > +
> > +	return count;
> > +}
> > +
> > +static void panthor_coredump_process_work(struct work_struct *work)
> > +{
> > +	struct panthor_coredump *cd =
> > +		container_of(work, struct panthor_coredump, work);
> > +	struct panthor_device *ptdev = cd->ptdev;
> > +
> > +	process_cd(ptdev, cd);
> > +
> > +	dev_coredumpm(ptdev->base.dev, THIS_MODULE, cd, 0, GFP_KERNEL,
> > +		      panthor_coredump_read, panthor_coredump_free);  
> 
> Is there a good reason to reinvent the read/free functionality of
> devcoredump? Can we not just use dev_coredumpv() instead? The only
> benefit I can see if the automatic rearming of coredump.pending, but
> panfrost handles this by having a "panfrost_dump_core" flag which is
> re-armed manually from user space.
> 
> Given core dumps might be large and fairly expensive to create, it seems
> sensible to not automatically re-arm.
> 
> > +}
> > +
> > +void panthor_coredump_capture(struct panthor_coredump *cd,
> > +			      struct panthor_group *group)
> > +{
> > +	struct panthor_device *ptdev = cd->ptdev;
> > +
> > +	capture_cd(ptdev, cd, group);
> > +
> > +	queue_work(system_unbound_wq, &cd->work);
> > +}  
> 
> So I can see why you want to move the work onto a workqueue, but I'm a
> little worried about lifetimes.
> 
> It seems slightly odd that you are capturing the data into a binary
> format (struct panthor_coredump_group_state, and later
> panthor_coredump_gpu_state, panthor_coredump_glb_state,
> panthor_coredump_csg_state etc) and then kicking off a separate
> workqueue item to convert it all to YAML.
> 
> > +
> > +struct panthor_coredump *
> > +panthor_coredump_alloc(struct panthor_device *ptdev,
> > +		       enum panthor_coredump_reason reason, gfp_t gfp)
> > +{
> > +	struct panthor_coredump *cd;
> > +
> > +	/* reject all but the first coredump until it is handled */
> > +	if (atomic_cmpxchg(&ptdev->coredump.pending, 0, 1)) {
> > +		drm_dbg(&ptdev->base, "skip subsequent coredump\n");
> > +		return NULL;
> > +	}
> > +
> > +	cd = kzalloc(sizeof(*cd), gfp);
> > +	if (!cd) {
> > +		atomic_set(&ptdev->coredump.pending, 0);
> > +		return NULL;
> > +	}
> > +
> > +	cd->ptdev = ptdev;
> > +	INIT_WORK(&cd->work, panthor_coredump_process_work);
> > +
> > +	cd->header.reason = reason;
> > +	cd->header.timestamp = ktime_get_real();
> > +
> > +	return cd;
> > +}
> > diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
> > new file mode 100644
> > index 000000000000..dd1fe1c2e175
> > --- /dev/null
> > +++ b/drivers/gpu/drm/panthor/panthor_coredump.h
> > @@ -0,0 +1,68 @@
> > +/* SPDX-License-Identifier: GPL-2.0 or MIT */
> > +/* Copyright 2019 Collabora ltd. */
> > +
> > +#ifndef __PANTHOR_COREDUMP_H__
> > +#define __PANTHOR_COREDUMP_H__
> > +
> > +#include <drm/panthor_drm.h>
> > +#include <linux/sched.h>
> > +#include <linux/types.h>
> > +
> > +struct panthor_coredump;
> > +struct panthor_device;
> > +struct panthor_group;
> > +
> > +/**
> > + * enum panthor_coredump_reason - Coredump reason
> > + */
> > +enum panthor_coredump_reason {
> > +	PANTHOR_COREDUMP_REASON_MMU_FAULT,
> > +	PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
> > +	PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
> > +	PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT,
> > +	PANTHOR_COREDUMP_REASON_CS_FATAL,
> > +	PANTHOR_COREDUMP_REASON_CS_FAULT,
> > +	PANTHOR_COREDUMP_REASON_CS_TILER_OOM,
> > +	PANTHOR_COREDUMP_REASON_JOB_TIMEOUT,
> > +};
> > +
> > +/**
> > + * struct panthor_coredump_group_state - Coredump group state
> > + *
> > + * Interesting panthor_group fields.
> > + */
> > +struct panthor_coredump_group_state {
> > +	enum drm_panthor_group_priority priority;
> > +	u32 queue_count;
> > +	pid_t pid;
> > +	char comm[TASK_COMM_LEN];
> > +	bool destroyed;
> > +	int csg_id;
> > +};
> > +
> > +#ifdef CONFIG_DEV_COREDUMP
> > +
> > +struct panthor_coredump *
> > +panthor_coredump_alloc(struct panthor_device *ptdev,
> > +		       enum panthor_coredump_reason reason, gfp_t gfp);
> > +
> > +void panthor_coredump_capture(struct panthor_coredump *cd,
> > +			      struct panthor_group *group);
> > +
> > +#else /* CONFIG_DEV_COREDUMP */
> > +
> > +static inline struct panthor_coredump *
> > +panthor_coredump_alloc(struct panthor_device *ptdev,
> > +		       enum panthor_coredump_reason reason, gfp_t gfp)
> > +{
> > +	return NULL;
> > +}
> > +
> > +static inline void panthor_coredump_capture(struct panthor_coredump *cd,
> > +					    struct panthor_group *group)
> > +{
> > +}  
> 
> panthor_coredump_alloc() is always called immediately before
> panthor_coredump_capture(). So instead we could just export a wrapper
> than combines both functions. This also avoids the caller having to deal
> with panthor_coredump_alloc() failing.
> 
> Thanks,
> Steve
> 
> > +
> > +#endif /* CONFIG_DEV_COREDUMP */
> > +
> > +#endif /* __PANTHOR_COREDUMP_H__ */
> > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> > index 4fc7cf2aeed5..766e53c25cfa 100644
> > --- a/drivers/gpu/drm/panthor/panthor_device.h
> > +++ b/drivers/gpu/drm/panthor/panthor_device.h
> > @@ -197,6 +197,12 @@ struct panthor_device {
> >  		atomic_t recovery_needed;
> >  	} pm;
> >  
> > +	/** @coredump: Coredump-related data. */
> > +	struct {
> > +		/** @pending: True if there is a pending coredump. */
> > +		atomic_t pending;
> > +	} coredump;
> > +
> >  	/** @profile_mask: User-set profiling flags for job accounting. */
> >  	u32 profile_mask;
> >  
> > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > index a2248f692a03..eb45b5ad9774 100644
> > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > @@ -23,6 +23,7 @@
> >  #include <linux/platform_device.h>
> >  #include <linux/pm_runtime.h>
> >  
> > +#include "panthor_coredump.h"
> >  #include "panthor_devfreq.h"
> >  #include "panthor_device.h"
> >  #include "panthor_fw.h"
> > @@ -1031,6 +1032,10 @@ group_unbind_locked(struct panthor_group *group)
> >  	return 0;
> >  }
> >  
> > +static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
> > +					  enum panthor_coredump_reason reason,
> > +					  struct panthor_group *group);
> > +
> >  /**
> >   * cs_slot_prog_locked() - Program a queue slot
> >   * @ptdev: Device.
> > @@ -1249,6 +1254,10 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
> >  		drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
> >  			csg_id, csg_state);
> >  		new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
> > +
> > +		panthor_sched_coredump_locked(
> > +			ptdev, PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
> > +			group);
> >  		break;
> >  	}
> >  
> > @@ -1378,6 +1387,9 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
> >  		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
> >  		 (unsigned int)CS_EXCEPTION_DATA(fatal),
> >  		 info);
> > +
> > +	panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FATAL,
> > +				      group);
> >  }
> >  
> >  static void
> > @@ -1426,6 +1438,9 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
> >  		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
> >  		 (unsigned int)CS_EXCEPTION_DATA(fault),
> >  		 info);
> > +
> > +	panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FAULT,
> > +				      group);
> >  }
> >  
> >  static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
> > @@ -1480,6 +1495,10 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
> >  		drm_warn(&ptdev->base, "Failed to extend the tiler heap\n");
> >  		group->fatal_queues |= BIT(cs_id);
> >  		sched_queue_delayed_work(sched, tick, 0);
> > +
> > +		panthor_sched_coredump_locked(
> > +			ptdev, PANTHOR_COREDUMP_REASON_CS_TILER_OOM, group);
> > +
> >  		goto out_put_heap_pool;
> >  	}
> >  
> > @@ -1639,6 +1658,9 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
> >  		group->timedout = true;
> >  
> >  	sched_queue_delayed_work(sched, tick, 0);
> > +
> > +	panthor_sched_coredump_locked(
> > +		ptdev, PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT, group);
> >  }
> >  
> >  static void sched_process_csg_irq_locked(struct panthor_device *ptdev, u32 csg_id)
> > @@ -1858,8 +1880,16 @@ static int csgs_upd_ctx_apply_locked(struct panthor_device *ptdev,
> >  
> >  		if (ret && acked != req_mask &&
> >  		    ((csg_iface->input->req ^ csg_iface->output->ack) & req_mask) != 0) {
> > +			struct panthor_csg_slot *csg_slot =
> > +				&sched->csg_slots[csg_id];
> > +			struct panthor_group *group = csg_slot->group;
> > +
> >  			drm_err(&ptdev->base, "CSG %d update request timedout", csg_id);
> >  			ctx->timedout_mask |= BIT(csg_id);
> > +
> > +			panthor_sched_coredump_locked(
> > +				ptdev, PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
> > +				group);
> >  		}
> >  	}
> >  
> > @@ -2027,6 +2057,10 @@ tick_ctx_init(struct panthor_scheduler *sched,
> >  		 * CSG IRQs, so we can flag the faulty queue.
> >  		 */
> >  		if (panthor_vm_has_unhandled_faults(group->vm)) {
> > +			panthor_sched_coredump_locked(
> > +				ptdev, PANTHOR_COREDUMP_REASON_MMU_FAULT,
> > +				group);
> > +
> >  			sched_process_csg_irq_locked(ptdev, i);
> >  
> >  			/* No fatal fault reported, flag all queues as faulty. */
> > @@ -3237,6 +3271,10 @@ queue_timedout_job(struct drm_sched_job *sched_job)
> >  
> >  		group_queue_work(group, term);
> >  	}
> > +
> > +	panthor_sched_coredump_locked(
> > +		ptdev, PANTHOR_COREDUMP_REASON_JOB_TIMEOUT, group);
> > +
> >  	mutex_unlock(&sched->lock);
> >  
> >  	queue_start(queue);
> > @@ -3627,6 +3665,37 @@ int panthor_group_get_state(struct panthor_file *pfile,
> >  	return 0;
> >  }
> >  
> > +static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
> > +					  enum panthor_coredump_reason reason,
> > +					  struct panthor_group *group)
> > +{
> > +	struct panthor_coredump *cd;
> > +
> > +	lockdep_assert_held(&ptdev->scheduler->lock);
> > +
> > +	/* GFP_NOWAIT because this may be called from fence signaling path */
> > +	cd = panthor_coredump_alloc(ptdev, reason, GFP_NOWAIT);
> > +	if (!cd)
> > +		return;
> > +
> > +	panthor_coredump_capture(cd, group);
> > +}
> > +
> > +void panthor_group_capture_coredump(const struct panthor_group *group,
> > +				    struct panthor_coredump_group_state *state)
> > +{
> > +	const struct panthor_device *ptdev = group->ptdev;
> > +
> > +	/* this is called from panthor_coredump_capture */
> > +	lockdep_assert_held(&ptdev->scheduler->lock);
> > +
> > +	state->priority = group->priority;
> > +	state->queue_count = group->queue_count;
> > +	/* TODO state->pid and state->comm */
> > +	state->destroyed = group->destroyed;
> > +	state->csg_id = group->csg_id;
> > +}
> > +
> >  int panthor_group_pool_create(struct panthor_file *pfile)
> >  {
> >  	struct panthor_group_pool *gpool;
> > diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
> > index 742b0b4ff3a3..6c564153133e 100644
> > --- a/drivers/gpu/drm/panthor/panthor_sched.h
> > +++ b/drivers/gpu/drm/panthor/panthor_sched.h
> > @@ -14,8 +14,10 @@ struct drm_panthor_group_create;
> >  struct drm_panthor_queue_create;
> >  struct drm_panthor_group_get_state;
> >  struct drm_panthor_queue_submit;
> > +struct panthor_coredump_group_state;
> >  struct panthor_device;
> >  struct panthor_file;
> > +struct panthor_group;
> >  struct panthor_group_pool;
> >  struct panthor_job;
> >  
> > @@ -26,6 +28,9 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle);
> >  int panthor_group_get_state(struct panthor_file *pfile,
> >  			    struct drm_panthor_group_get_state *get_state);
> >  
> > +void panthor_group_capture_coredump(const struct panthor_group *group,
> > +				    struct panthor_coredump_group_state *state);
> > +
> >  struct drm_sched_job *
> >  panthor_job_create(struct panthor_file *pfile,
> >  		   u16 group_handle,  
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2025-08-21  8:16 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-20  0:01 [PATCH 0/9] drm/panthor: add devcoredump support Chia-I Wu
2025-07-20  0:01 ` [PATCH 1/9] " Chia-I Wu
2025-07-20  3:17   ` kernel test robot
2025-07-28 11:24   ` Steven Price
2025-08-21  8:16     ` Boris Brezillon
2025-07-20  0:01 ` [PATCH 2/9] drm/panthor: capture GPU state for devcoredump Chia-I Wu
2025-07-20  4:29   ` kernel test robot
2025-07-20  0:01 ` [PATCH 3/9] drm/panthor: capture GLB " Chia-I Wu
2025-07-20  5:41   ` kernel test robot
2025-07-20  0:01 ` [PATCH 4/9] drm/panthor: capture CSG " Chia-I Wu
2025-07-20  0:01 ` [PATCH 5/9] drm/panthor: capture CS " Chia-I Wu
2025-07-20  0:01 ` [PATCH 6/9] drm/panthor: capture AS " Chia-I Wu
2025-07-20  0:01 ` [PATCH 7/9] drm/panthor: capture VMA " Chia-I Wu
2025-07-20  0:01 ` [PATCH 8/9] drm/panthor: check bo offset alignment in vm bind Chia-I Wu
2025-08-21  7:33   ` Boris Brezillon
2025-07-20  0:01 ` [PATCH 9/9] drm/panthor: add DRM_PANTHOR_VM_BIND_OP_MAP_DUMPABLE Chia-I Wu
2025-08-21  7:55   ` Boris Brezillon
2025-07-20  0:41 ` [PATCH 0/9] drm/panthor: add devcoredump support Daniel Almeida
2025-07-20  1:13   ` Chia-I Wu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).