dri-devel.lists.freedesktop.org archive mirror
* [PATCH v1 0/5] drm/panthor: Implement fault information propagation
@ 2025-12-15 11:54 Lukas Zapolskas
  2025-12-15 11:54 ` [PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace Lukas Zapolskas
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Lukas Zapolskas @ 2025-12-15 11:54 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter
  Cc: nd, dri-devel, linux-kernel, Lukas Zapolskas

The Panthor CSF firmware and hardware interfaces provide additional
metadata when faults occur. In particular, there are three relevant
categories of faults here:
* faults raised at the address space level. These are fatal, and every
  group bound to that AS is affected.
* fatal faults raised on one of a group's queues, preventing further
  submissions on that group.
* recoverable faults raised on one of a group's queues. The queue may
  progress past the fault, so multiple such faults may be generated.
Each of these categories provides fault information describing the
source of the error, the read/write/execute access type and, where
available, the virtual address at which the fault occurred.

This series extends the GROUP_GET_STATE ioctl to propagate this fault
metadata to userspace, enabling the VK_EXT_device_fault(3) extension.
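
As a rough illustration, user space could check the new state bits as
follows once the series is applied (the helper name is made up, the flag
and field names are the ones introduced later in this series; the snippet
is untested and only meant to show the intended usage):

  #include <sys/ioctl.h>
  #include <drm/panthor_drm.h>

  /* Returns <0 on ioctl failure, 0 if the group is healthy, 1 on a fatal
   * fault, 2 if only recoverable faults were seen.
   */
  static int check_group_health(int fd, __u32 group_handle)
  {
          struct drm_panthor_group_get_state st = { .group_handle = group_handle };

          if (ioctl(fd, DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &st))
                  return -1;

          if (st.state & DRM_PANTHOR_GROUP_STATE_FATAL_FAULT)
                  return 1; /* st.fatal_queues: queues that can no longer be used */
          if (st.state & DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT)
                  return 2; /* st.fault_queues: queues that hit recoverable faults */
          return 0;
  }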

Lukas Zapolskas (4):
  drm/panthor: Store queue fault and fatal information
  drm/panthor: Track VM faults
  drm/panthor: Propagate VM-level faults to groups
  drm/panthor: Use GROUP_GET_STATE to provide group and queue errors

Paul Toadere (1):
  drm/panthor: Implement CS_FAULT propagation to userspace

 drivers/gpu/drm/panthor/panthor_drv.c   |  85 ++++++++-
 drivers/gpu/drm/panthor/panthor_mmu.c   |  24 ++-
 drivers/gpu/drm/panthor/panthor_mmu.h   |  22 +++
 drivers/gpu/drm/panthor/panthor_regs.h  |   3 +
 drivers/gpu/drm/panthor/panthor_sched.c | 237 ++++++++++++++++++++----
 drivers/gpu/drm/panthor/panthor_sched.h |   4 +-
 include/uapi/drm/panthor_drm.h          |  52 +++++-
 7 files changed, 378 insertions(+), 49 deletions(-)

--
2.33.0.dirty


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace
  2025-12-15 11:54 [PATCH v1 0/5] drm/panthor: Implement fault information propagation Lukas Zapolskas
@ 2025-12-15 11:54 ` Lukas Zapolskas
  2025-12-15 12:03   ` Boris Brezillon
  2025-12-15 11:54 ` [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information Lukas Zapolskas
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 16+ messages in thread
From: Lukas Zapolskas @ 2025-12-15 11:54 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter
  Cc: nd, dri-devel, linux-kernel, Paul Toadere, Lukas Zapolskas

From: Paul Toadere <paul.toadere@arm.com>

Though recoverable faults do not prevent further submission on the
affected queues, they may have consequences which are worth recording
and reporting to the user.

Signed-off-by: Paul Toadere <paul.toadere@arm.com>
Co-developed-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
---
 drivers/gpu/drm/panthor/panthor_sched.c | 18 +++++++++++++++---
 include/uapi/drm/panthor_drm.h          | 11 +++++++++--
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index a17b067a0439..eb8841beba39 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -569,6 +569,14 @@ struct panthor_group {
 	/** @fatal_queues: Bitmask reflecting the queues that hit a fatal exception. */
 	u32 fatal_queues;
 
+	/**
+	 * @fault_queues: Bitmask reflecting the queues that hit a recoverable exception.
+	 *
+	 * This field is reset when the GROUP_GET_STATE ioctl is used to collect the fault
+	 * information.
+	 */
+	u32 fault_queues;
+
 	/** @tiler_oom: Mask of queues that have a tiler OOM event to process. */
 	atomic_t tiler_oom;
 
@@ -1553,6 +1561,8 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
 	if (group) {
 		drm_warn(&ptdev->base, "CS_FAULT: pid=%d, comm=%s\n",
 			 group->task_info.pid, group->task_info.comm);
+
+		group->fault_queues |= BIT(cs_id);
 	}
 
 	drm_warn(&ptdev->base,
@@ -3807,9 +3817,6 @@ int panthor_group_get_state(struct panthor_file *pfile,
 	struct panthor_scheduler *sched = ptdev->scheduler;
 	struct panthor_group *group;
 
-	if (get_state->pad)
-		return -EINVAL;
-
 	group = group_from_handle(gpool, get_state->group_handle);
 	if (!group)
 		return -EINVAL;
@@ -3825,6 +3832,11 @@ int panthor_group_get_state(struct panthor_file *pfile,
 	}
 	if (group->innocent)
 		get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
+	if (group->fault_queues) {
+		get_state->state |= DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT;
+		get_state->fault_queues = group->fault_queues;
+		group->fault_queues = 0;
+	}
 	mutex_unlock(&sched->lock);
 
 	group_put(group);
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index e238c6264fa1..77262d2b9672 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -965,6 +965,13 @@ enum drm_panthor_group_state_flags {
 	 * DRM_PANTHOR_GROUP_STATE_FATAL_FAULT is not.
 	 */
 	DRM_PANTHOR_GROUP_STATE_INNOCENT = 1 << 2,
+
+	/**
+	 * @DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT: Group had recoverable faults.
+	 *
+	 * When a group ends up with this flag set, jobs can still be submitted to its queues.
+	 */
+	DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT = 1 << 3,
 };
 
 /**
@@ -986,8 +993,8 @@ struct drm_panthor_group_get_state {
 	/** @fatal_queues: Bitmask of queues that faced fatal faults. */
 	__u32 fatal_queues;
 
-	/** @pad: MBZ */
-	__u32 pad;
+	/** @fatal_queues: Bitmask of queues that faced fatal faults. */
+	__u32 fault_queues;
 };
 
 /**
-- 
2.33.0.dirty


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information
  2025-12-15 11:54 [PATCH v1 0/5] drm/panthor: Implement fault information propagation Lukas Zapolskas
  2025-12-15 11:54 ` [PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace Lukas Zapolskas
@ 2025-12-15 11:54 ` Lukas Zapolskas
  2025-12-15 12:11   ` Boris Brezillon
  2025-12-17 11:37   ` Steven Price
  2025-12-15 11:54 ` [PATCH v1 3/5] drm/panthor: Track VM faults Lukas Zapolskas
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 16+ messages in thread
From: Lukas Zapolskas @ 2025-12-15 11:54 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter
  Cc: nd, dri-devel, linux-kernel, Lukas Zapolskas

A queue may encounter either one fatal fault or any number of
recoverable faults during execution. The CSF FW provides the
FAULT/FATAL registers, indicating the fault type, and another
set of registers providing more metadata about why the fault
was generated. Storing the information allows it to be
reported to the user using the GROUP_GET_STATE ioctl.

Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
---
 drivers/gpu/drm/panthor/panthor_sched.c | 116 +++++++++++++++++-------
 include/uapi/drm/panthor_drm.h          |  17 ++++
 2 files changed, 100 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index eb8841beba39..a77399e95620 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -342,6 +342,14 @@ struct panthor_syncobj_64b {
 	u32 pad;
 };
 
+struct panthor_queue_event {
+	/** @link: Link to a list of Panthor event errors. */
+	struct list_head link;
+
+	/** @event: The event containing all of the fault/fatal metadata. */
+	struct drm_panthor_queue_event event;
+};
+
 /**
  * struct panthor_queue - Execution queue
  */
@@ -485,6 +493,9 @@ struct panthor_queue {
 		/** @seqno: Index of the next available profiling information slot. */
 		u32 seqno;
 	} profiling;
+
+	/** @events: List of fault or fatal events reported on this queue. */
+	struct list_head events;
 };
 
 /**
@@ -918,6 +929,8 @@ panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue
 
 static void group_free_queue(struct panthor_group *group, struct panthor_queue *queue)
 {
+	struct panthor_queue_event *evt, *tmp;
+
 	if (IS_ERR_OR_NULL(queue))
 		return;
 
@@ -934,6 +947,11 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
 
 	panthor_queue_put_syncwait_obj(queue);
 
+	list_for_each_entry_safe(evt, tmp, &queue->events, link) {
+		list_del(&evt->link);
+		kfree(evt);
+	}
+
 	panthor_kernel_bo_destroy(queue->ringbuf);
 	panthor_kernel_bo_destroy(queue->iface.mem);
 	panthor_kernel_bo_destroy(queue->profiling.slots);
@@ -1476,6 +1494,69 @@ csg_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 priority)
 	return 0;
 }
 
+static struct panthor_queue_event *
+panthor_queue_create_event(unsigned long event_type, u32 cs_id, u32 exception)
+{
+	struct panthor_queue_event *event;
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (!event)
+		return ERR_PTR(-ENOMEM);
+
+	event->event = (struct drm_panthor_queue_event){
+		.queue_id = cs_id,
+		.event_type = event_type,
+		.exception_type = CS_EXCEPTION_TYPE(exception),
+		.exception_data = CS_EXCEPTION_DATA(exception),
+	};
+	INIT_LIST_HEAD(&event->link);
+
+	return event;
+}
+
+#define PANTHOR_DEFINE_EVENT_INFO(__type, __msg, __event) \
+static u32 panthor_queue_set_ ## __type ## _info(struct panthor_device *ptdev,			\
+						 struct panthor_group *group,			\
+						 u32 csg_id, u32 cs_id)				\
+{												\
+	struct panthor_scheduler *sched = ptdev->scheduler;					\
+	struct panthor_fw_cs_iface *iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);	\
+	struct panthor_queue *queue = group && cs_id < group->queue_count ?			\
+				      group->queues[cs_id] : NULL;				\
+	struct panthor_queue_event *event;							\
+												\
+	lockdep_assert_held(&sched->lock);							\
+												\
+	if (!iface || !queue)									\
+		return 0;									\
+												\
+	const u32 exception = iface->output->__type;						\
+	const u64 info = iface->output->__type ## _info;					\
+												\
+	event = panthor_queue_create_event((__event), cs_id, exception);			\
+												\
+	if (!IS_ERR(event))									\
+		list_add_tail(&event->link, &queue->events);					\
+	else											\
+		drm_err(&ptdev->base, "Could not store fault notification, err = %ld",		\
+			PTR_ERR(event));							\
+												\
+	drm_warn(&ptdev->base,									\
+		 "CSG slot %d CS slot: %d\n"							\
+		 "CS_" __msg  ".EXCEPTION_TYPE: 0x%x (%s)\n"					\
+		 "CS_" __msg  ".EXCEPTION_DATA: 0x%x\n"						\
+		 "CS_" __msg  "_INFO.EXCEPTION_DATA: 0x%llx\n",					\
+		 csg_id, cs_id,									\
+		 (unsigned int)CS_EXCEPTION_TYPE(exception),					\
+		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(exception)),			\
+		 (unsigned int)CS_EXCEPTION_DATA(exception), info);				\
+												\
+	return exception;									\
+}
+
+PANTHOR_DEFINE_EVENT_INFO(fatal, "FATAL", DRM_PANTHOR_GROUP_STATE_FATAL_FAULT);
+PANTHOR_DEFINE_EVENT_INFO(fault, "FAULT", DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT);
+
 static void
 cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
 				   u32 csg_id, u32 cs_id)
@@ -1483,15 +1564,11 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
 	struct panthor_scheduler *sched = ptdev->scheduler;
 	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
 	struct panthor_group *group = csg_slot->group;
-	struct panthor_fw_cs_iface *cs_iface;
 	u32 fatal;
-	u64 info;
 
 	lockdep_assert_held(&sched->lock);
 
-	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
-	fatal = cs_iface->output->fatal;
-	info = cs_iface->output->fatal_info;
+	fatal = panthor_queue_set_fatal_info(ptdev, group, csg_id, cs_id);
 
 	if (group) {
 		drm_warn(&ptdev->base, "CS_FATAL: pid=%d, comm=%s\n",
@@ -1509,17 +1586,6 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
 	} else {
 		sched_queue_delayed_work(sched, tick, 0);
 	}
-
-	drm_warn(&ptdev->base,
-		 "CSG slot %d CS slot: %d\n"
-		 "CS_FATAL.EXCEPTION_TYPE: 0x%x (%s)\n"
-		 "CS_FATAL.EXCEPTION_DATA: 0x%x\n"
-		 "CS_FATAL_INFO.EXCEPTION_DATA: 0x%llx\n",
-		 csg_id, cs_id,
-		 (unsigned int)CS_EXCEPTION_TYPE(fatal),
-		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
-		 (unsigned int)CS_EXCEPTION_DATA(fatal),
-		 info);
 }
 
 static void
@@ -1531,15 +1597,10 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
 	struct panthor_group *group = csg_slot->group;
 	struct panthor_queue *queue = group && cs_id < group->queue_count ?
 				      group->queues[cs_id] : NULL;
-	struct panthor_fw_cs_iface *cs_iface;
-	u32 fault;
-	u64 info;
 
 	lockdep_assert_held(&sched->lock);
 
-	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
-	fault = cs_iface->output->fault;
-	info = cs_iface->output->fault_info;
+	panthor_queue_set_fault_info(ptdev, group, csg_id, cs_id);
 
 	if (queue) {
 		u64 cs_extract = queue->iface.output->extract;
@@ -1564,17 +1625,6 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
 
 		group->fault_queues |= BIT(cs_id);
 	}
-
-	drm_warn(&ptdev->base,
-		 "CSG slot %d CS slot: %d\n"
-		 "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
-		 "CS_FAULT.EXCEPTION_DATA: 0x%x\n"
-		 "CS_FAULT_INFO.EXCEPTION_DATA: 0x%llx\n",
-		 csg_id, cs_id,
-		 (unsigned int)CS_EXCEPTION_TYPE(fault),
-		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
-		 (unsigned int)CS_EXCEPTION_DATA(fault),
-		 info);
 }
 
 static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 77262d2b9672..083a02418d28 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -974,6 +974,23 @@ enum drm_panthor_group_state_flags {
 	DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT = 1 << 3,
 };
 
+/**
+ * struct drm_panthor_queue_event - Fault or fatal event occurring on a single queue.
+ */
+struct drm_panthor_queue_event {
+	/** @queue_id: The ID of the queue that faulted. */
+	__u32 queue_id;
+
+	/** @event_type: What kind of event is being propagated. */
+	__u32 event_type;
+
+	/** @exception_type: The type of exception that caused the fault. */
+	__u32 exception_type;
+
+	/** @exception_data: Exception-specific data. */
+	__u32 exception_data;
+};
+
 /**
  * struct drm_panthor_group_get_state - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_GET_STATE
  *
-- 
2.33.0.dirty


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v1 3/5] drm/panthor: Track VM faults
  2025-12-15 11:54 [PATCH v1 0/5] drm/panthor: Implement fault information propagation Lukas Zapolskas
  2025-12-15 11:54 ` [PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace Lukas Zapolskas
  2025-12-15 11:54 ` [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information Lukas Zapolskas
@ 2025-12-15 11:54 ` Lukas Zapolskas
  2025-12-15 12:37   ` Boris Brezillon
  2025-12-15 11:54 ` [PATCH v1 4/5] drm/panthor: Propagate VM-level faults to groups Lukas Zapolskas
  2025-12-15 11:54 ` [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors Lukas Zapolskas
  4 siblings, 1 reply; 16+ messages in thread
From: Lukas Zapolskas @ 2025-12-15 11:54 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter
  Cc: nd, dri-devel, linux-kernel, Lukas Zapolskas

Faults reported via the MMU_CONTROL register block result in fatal
faults for the groups running on that AS, so this information is also
useful to report to the user.

Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c  | 16 ++++++++++++++--
 drivers/gpu/drm/panthor/panthor_mmu.h  | 20 ++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_regs.h |  3 +++
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 473a8bebd61e..10a7418eecda 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -384,6 +384,9 @@ struct panthor_vm {
 		/** @locked_region.size: Size of the locked region. */
 		u64 size;
 	} locked_region;
+
+	/** @fault: Fault information (if any) for this VM. */
+	struct panthor_vm_fault fault;
 };
 
 /**
@@ -741,6 +744,7 @@ int panthor_vm_active(struct panthor_vm *vm)
 
 	/* If the VM is re-activated, we clear the fault. */
 	vm->unhandled_fault = false;
+	vm->fault = (struct panthor_vm_fault){ 0 };
 
 	/* Unhandled pagefault on this AS, clear the fault and re-enable interrupts
 	 * before enabling the AS.
@@ -1744,8 +1748,16 @@ static void panthor_mmu_irq_handler(struct panthor_device *ptdev, u32 status)
 		 */
 		ptdev->mmu->irq.mask = new_int_mask;
 
-		if (ptdev->mmu->as.slots[as].vm)
-			ptdev->mmu->as.slots[as].vm->unhandled_fault = true;
+		if (ptdev->mmu->as.slots[as].vm) {
+			struct panthor_vm *vm = ptdev->mmu->as.slots[as].vm;
+
+			vm->unhandled_fault = true;
+			vm->fault.exception_type = AS_FAULTSTATUS_EXCEPTION_TYPE(status);
+			vm->fault.access_type = AS_FAULTSTATUS_ACCESS_TYPE(status);
+			vm->fault.source_id = AS_FAULTSTATUS_SOURCE_ID(status);
+			vm->fault.valid_address = true;
+			vm->fault.address = addr;
+		}
 
 		/* Disable the MMU to kill jobs on this AS. */
 		panthor_mmu_as_disable(ptdev, as, false);
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.h b/drivers/gpu/drm/panthor/panthor_mmu.h
index 0e268fdfdb2f..023fdc79c231 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.h
+++ b/drivers/gpu/drm/panthor/panthor_mmu.h
@@ -16,6 +16,26 @@ struct panthor_vm;
 struct panthor_vma;
 struct panthor_mmu;
 
+/**
+ * struct panthor_vm_fault - Tracking information for VM-level faults.
+ */
+struct panthor_vm_fault {
+	/** @address: Virtual address of the faulting access. */
+	u64 address;
+
+	/** @exception_type: The type of exception that caused the fault. */
+	u32 exception_type;
+
+	/** @access_type: The direction of data transfer that caused the fault. */
+	u32 access_type;
+
+	/** @source_id: ID supplying further data about the source of the fault. */
+	u32 source_id;
+
+	/** @valid_address: Whether the virtual address is valid. */
+	bool valid_address;
+};
+
 int panthor_mmu_init(struct panthor_device *ptdev);
 void panthor_mmu_unplug(struct panthor_device *ptdev);
 void panthor_mmu_pre_reset(struct panthor_device *ptdev);
diff --git a/drivers/gpu/drm/panthor/panthor_regs.h b/drivers/gpu/drm/panthor/panthor_regs.h
index 08bf06c452d6..5aa5e37d29c9 100644
--- a/drivers/gpu/drm/panthor/panthor_regs.h
+++ b/drivers/gpu/drm/panthor/panthor_regs.h
@@ -178,10 +178,13 @@
 #define   AS_LOCK_REGION_MIN_SIZE			(1ULL << 15)
 #define AS_FAULTSTATUS(as)				(MMU_AS(as) + 0x1C)
 #define  AS_FAULTSTATUS_ACCESS_TYPE_MASK		(0x3 << 8)
+#define  AS_FAULTSTATUS_ACCESS_TYPE(x)			(((x) >> 8) & GENMASK(1, 0))
 #define  AS_FAULTSTATUS_ACCESS_TYPE_ATOMIC		(0x0 << 8)
 #define  AS_FAULTSTATUS_ACCESS_TYPE_EX			(0x1 << 8)
 #define  AS_FAULTSTATUS_ACCESS_TYPE_READ		(0x2 << 8)
 #define  AS_FAULTSTATUS_ACCESS_TYPE_WRITE		(0x3 << 8)
+#define  AS_FAULTSTATUS_EXCEPTION_TYPE(x)		((x) & GENMASK(7, 0))
+#define  AS_FAULTSTATUS_SOURCE_ID(x)			(((x) >> 16) & GENMASK(15, 0))
 #define AS_FAULTADDRESS(as)				(MMU_AS(as) + 0x20)
 #define AS_STATUS(as)					(MMU_AS(as) + 0x28)
 #define   AS_STATUS_AS_ACTIVE				BIT(0)
-- 
2.33.0.dirty


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v1 4/5] drm/panthor: Propagate VM-level faults to groups
  2025-12-15 11:54 [PATCH v1 0/5] drm/panthor: Implement fault information propagation Lukas Zapolskas
                   ` (2 preceding siblings ...)
  2025-12-15 11:54 ` [PATCH v1 3/5] drm/panthor: Track VM faults Lukas Zapolskas
@ 2025-12-15 11:54 ` Lukas Zapolskas
  2025-12-15 12:41   ` Boris Brezillon
  2025-12-15 12:46   ` Boris Brezillon
  2025-12-15 11:54 ` [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors Lukas Zapolskas
  4 siblings, 2 replies; 16+ messages in thread
From: Lukas Zapolskas @ 2025-12-15 11:54 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter
  Cc: nd, dri-devel, linux-kernel, Lukas Zapolskas

Receiving an MMU fault currently disables the AS, so each group bound
to that AS is marked with the appropriate fault. Since no further
submissions can occur on a group after a fatal fault, the fault
information does not have to be cleared until the group is terminated.

Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c   |  8 ++++++++
 drivers/gpu/drm/panthor/panthor_mmu.h   |  2 ++
 drivers/gpu/drm/panthor/panthor_sched.c | 13 +++++++++++++
 3 files changed, 23 insertions(+)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 10a7418eecda..9e78b0509f1a 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -2895,3 +2895,11 @@ void panthor_mmu_pt_cache_fini(void)
 {
 	kmem_cache_destroy(pt_cache);
 }
+
+struct panthor_vm_fault *panthor_vm_get_fault(struct panthor_vm *vm)
+{
+	if (!vm)
+		return NULL;
+
+	return &vm->fault;
+}
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.h b/drivers/gpu/drm/panthor/panthor_mmu.h
index 023fdc79c231..d69b4000a39e 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.h
+++ b/drivers/gpu/drm/panthor/panthor_mmu.h
@@ -123,4 +123,6 @@ void panthor_mmu_pt_cache_fini(void);
 void panthor_mmu_debugfs_init(struct drm_minor *minor);
 #endif
 
+struct panthor_vm_fault *panthor_vm_get_fault(struct panthor_vm *vm);
+
 #endif
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index a77399e95620..9ea0d2b27114 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -722,6 +722,11 @@ struct panthor_group {
 	 * panthor_group::groups::waiting list.
 	 */
 	struct list_head wait_node;
+
+	/**
+	 * @fatal: VM-level fault that caused a fatal error on the group.
+	 */
+	struct panthor_vm_fault fatal;
 };
 
 struct panthor_job_profiling_data {
@@ -1575,6 +1580,14 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
 			 group->task_info.pid, group->task_info.comm);
 
 		group->fatal_queues |= BIT(cs_id);
+
+		if (panthor_vm_has_unhandled_faults(group->vm)) {
+			struct panthor_vm_fault *fault;
+
+			fault = panthor_vm_get_fault(group->vm);
+			if (fault)
+				group->fatal = *fault;
+		}
 	}
 
 	if (CS_EXCEPTION_TYPE(fatal) == DRM_PANTHOR_EXCEPTION_CS_UNRECOVERABLE) {
-- 
2.33.0.dirty


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors
  2025-12-15 11:54 [PATCH v1 0/5] drm/panthor: Implement fault information propagation Lukas Zapolskas
                   ` (3 preceding siblings ...)
  2025-12-15 11:54 ` [PATCH v1 4/5] drm/panthor: Propagate VM-level faults to groups Lukas Zapolskas
@ 2025-12-15 11:54 ` Lukas Zapolskas
  2025-12-15 17:31   ` Boris Brezillon
                     ` (3 more replies)
  4 siblings, 4 replies; 16+ messages in thread
From: Lukas Zapolskas @ 2025-12-15 11:54 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter
  Cc: nd, dri-devel, linux-kernel, Lukas Zapolskas, Ketil Johnsen

GROUP_GET_STATE can now be called twice: the first time to query the
number of faults that occurred on the associated queues, and the
second time to copy out the fault information. The necessary
infrastructure to copy out drm_panthor_obj_array contents is pulled in from
https://lore.kernel.org/dri-devel/20240828172605.19176-7-mihail.atanassov@arm.com/
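
The expected call sequence from user space looks roughly like this
(illustrative and untested, the helper name is made up, but the fields
match the uapi added below):

  #include <stdint.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <drm/panthor_drm.h>

  static int drain_group_faults(int fd, __u32 group_handle)
  {
          struct drm_panthor_group_get_state st = { .group_handle = group_handle };
          struct drm_panthor_queue_event *events;
          int ret;

          /* First call: faults.{array,count} left at zero, the kernel reports
           * the number of pending events and the expected stride.
           */
          ret = ioctl(fd, DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &st);
          if (ret || !st.faults.count)
                  return ret;

          events = calloc(st.faults.count, sizeof(*events));
          if (!events)
                  return -1;

          /* Second call: provide a buffer so the kernel can drain the events. */
          st.faults.array = (uintptr_t)events;
          st.faults.stride = sizeof(*events);
          ret = ioctl(fd, DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &st);

          /* ... consume events[0..st.faults.count - 1] ... */

          free(events);
          return ret;
  }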

Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
Co-developed-by: Ketil Johnsen <ketil.johnsen@arm.com>
Signed-off-by: Ketil Johnsen <ketil.johnsen@arm.com>
---
 drivers/gpu/drm/panthor/panthor_drv.c   | 85 ++++++++++++++++++++++-
 drivers/gpu/drm/panthor/panthor_sched.c | 92 +++++++++++++++++++++++--
 drivers/gpu/drm/panthor/panthor_sched.h |  4 +-
 include/uapi/drm/panthor_drm.h          | 24 +++++++
 4 files changed, 195 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index 98d4e8d867ed..bdcaf85b98cd 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -75,6 +75,55 @@ panthor_set_uobj(u64 usr_ptr, u32 usr_size, u32 min_size, u32 kern_size, const v
 	return 0;
 }
 
+/**
+ * panthor_set_uobj_array() - Copy a kernel object array into a user object array.
+ * @out: The object array to copy to.
+ * @min_stride: Minimum array stride.
+ * @obj_size: Kernel object size.
+ * @in: Kernel object array to copy from.
+ *
+ * Helper automating kernel -> user object copies.
+ *
+ * Don't use this function directly, use PANTHOR_UOBJ_SET_ARRAY() instead.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int
+panthor_set_uobj_array(const struct drm_panthor_obj_array *out, u32 min_stride, u32 obj_size,
+		       const void *in)
+{
+	if (out->stride < min_stride)
+		return -EINVAL;
+
+	if (!out->count)
+		return 0;
+
+	if (obj_size == out->stride) {
+		if (copy_to_user(u64_to_user_ptr(out->array), in,
+				 (unsigned long)obj_size * out->count))
+			return -EFAULT;
+	} else {
+		u32 cpy_elem_size = min_t(u32, out->stride, obj_size);
+		void __user *out_ptr = u64_to_user_ptr(out->array);
+		const void *in_ptr = in;
+
+		for (u32 i = 0; i < out->count; i++) {
+			if (copy_to_user(out_ptr, in_ptr, cpy_elem_size))
+				return -EFAULT;
+
+			if (out->stride > obj_size &&
+			    clear_user(out_ptr + cpy_elem_size, out->stride - obj_size)) {
+				return -EFAULT;
+			}
+
+			out_ptr += out->stride;
+			in_ptr += obj_size;
+		}
+	}
+
+	return 0;
+}
+
 /**
  * panthor_get_uobj_array() - Copy a user object array into a kernel accessible object array.
  * @in: The object array to copy.
@@ -178,7 +227,8 @@ panthor_get_uobj_array(const struct drm_panthor_obj_array *in, u32 min_stride,
 		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_submit, syncs), \
 		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_create, ringbuf_size), \
 		 PANTHOR_UOBJ_DECL(struct drm_panthor_vm_bind_op, syncs), \
-		 PANTHOR_UOBJ_DECL(struct drm_panthor_bo_sync_op, size))
+		 PANTHOR_UOBJ_DECL(struct drm_panthor_bo_sync_op, size), \
+		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_event, exception_data))
 
 /**
  * PANTHOR_UOBJ_SET() - Copy a kernel object to a user object.
@@ -193,6 +243,20 @@ panthor_get_uobj_array(const struct drm_panthor_obj_array *in, u32 min_stride,
 			 PANTHOR_UOBJ_MIN_SIZE(_src_obj), \
 			 sizeof(_src_obj), &(_src_obj))
 
+/**
+ * PANTHOR_UOBJ_SET_ARRAY() - Copies from _src_array to @_dest_drm_panthor_obj_array.array.
+ * @_dest_drm_panthor_obj_array: The &struct drm_panthor_obj_array containing a __u64 raw
+ * pointer to the destination C array in user space and the size of each array
+ * element in user space (the 'stride').
+ * @_src_array: The source C array object in kernel space.
+ *
+ * Return: Error code. See panthor_set_uobj_array().
+ */
+#define PANTHOR_UOBJ_SET_ARRAY(_dest_drm_panthor_obj_array, _src_array) \
+	panthor_set_uobj_array(_dest_drm_panthor_obj_array, \
+			       PANTHOR_UOBJ_MIN_SIZE((_src_array)[0]), \
+			       sizeof((_src_array)[0]), _src_array)
+
 /**
  * PANTHOR_UOBJ_GET_ARRAY() - Copy a user object array to a kernel accessible
  * object array.
@@ -1128,9 +1192,23 @@ static int panthor_ioctl_group_get_state(struct drm_device *ddev, void *data,
 					 struct drm_file *file)
 {
 	struct panthor_file *pfile = file->driver_priv;
+	struct drm_panthor_queue_event *events __free(kvfree) = NULL;
 	struct drm_panthor_group_get_state *args = data;
+	int ret;
 
-	return panthor_group_get_state(pfile, args);
+	/* First call enumerates the faults. */
+	if (!args->faults.count && !args->faults.array)
+		return panthor_group_get_state(pfile, args, NULL, 0);
+
+	ret = PANTHOR_UOBJ_GET_ARRAY(events, &args->faults);
+	if (ret)
+		return ret;
+
+	ret = panthor_group_get_state(pfile, args, events, args->faults.count);
+	if (!ret)
+		ret = PANTHOR_UOBJ_SET_ARRAY(&args->faults, events);
+
+	return ret;
 }
 
 static int panthor_ioctl_tiler_heap_create(struct drm_device *ddev, void *data,
@@ -1678,6 +1756,7 @@ static void panthor_debugfs_init(struct drm_minor *minor)
  *       - adds DRM_IOCTL_PANTHOR_BO_SYNC ioctl
  *       - adds DRM_IOCTL_PANTHOR_BO_QUERY_INFO ioctl
  *       - adds drm_panthor_gpu_info::selected_coherency
+ * - 1.8 - extends GROUP_GET_STATE to propagate fault and fatal event metadata
  */
 static const struct drm_driver panthor_drm_driver = {
 	.driver_features = DRIVER_RENDER | DRIVER_GEM | DRIVER_SYNCOBJ |
@@ -1691,7 +1770,7 @@ static const struct drm_driver panthor_drm_driver = {
 	.name = "panthor",
 	.desc = "Panthor DRM driver",
 	.major = 1,
-	.minor = 7,
+	.minor = 8,
 
 	.gem_create_object = panthor_gem_create_object,
 	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 9ea0d2b27114..f58d9a21dec4 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -3872,21 +3872,86 @@ static struct panthor_group *group_from_handle(struct panthor_group_pool *pool,
 	return group;
 }
 
+static int panthor_group_count_faults(struct panthor_scheduler *sched, struct panthor_group *group)
+{
+	int count = 0;
+	unsigned long queue_events = group->fault_queues | group->fatal_queues;
+
+	lockdep_assert_held(&sched->lock);
+
+	for (size_t i = 0; i < group->queue_count; i++) {
+		struct panthor_queue *queue;
+		struct panthor_queue_event *evnt;
+
+		if (!test_bit(i, &queue_events))
+			continue;
+
+		queue = group->queues[i];
+		if (!queue)
+			continue;
+
+		list_for_each_entry(evnt, &queue->events, link)
+			count++;
+	}
+
+	return count;
+}
+
+static void panthor_group_get_faults(struct panthor_scheduler *sched, struct panthor_group *group,
+				     struct drm_panthor_queue_event *events, u32 count)
+{
+	int nr_events = 0;
+	unsigned long queue_events = group->fault_queues | group->fatal_queues;
+
+	lockdep_assert_held(&sched->lock);
+
+	for (u32 i = 0; i < group->queue_count; i++) {
+		struct panthor_queue *queue;
+		struct panthor_queue_event *evnt, *tmp;
+
+		if (!test_bit(i, &queue_events))
+			continue;
+
+		queue = group->queues[i];
+
+		if (!queue)
+			continue;
+
+		list_for_each_entry_safe(evnt, tmp, &queue->events, link) {
+			if (nr_events >= count)
+				return;
+
+			events[nr_events++] = evnt->event;
+			list_del(&evnt->link);
+			kfree(evnt);
+		}
+
+		/* If additional faults are generated between invocations of
+		 * GROUP_GET_STATE, there may not be enough space to drain all
+		 * events to the user-provided buffer. In that case, the queue
+		 * should remain listed as faulted.
+		 */
+		if ((group->fault_queues & BIT(i)) && list_empty(&queue->events))
+			group->fault_queues &= ~BIT(i);
+	}
+}
+
 int panthor_group_get_state(struct panthor_file *pfile,
-			    struct drm_panthor_group_get_state *get_state)
+			    struct drm_panthor_group_get_state *get_state,
+			    struct drm_panthor_queue_event *events, u32 count)
 {
 	struct panthor_group_pool *gpool = pfile->groups;
 	struct panthor_device *ptdev = pfile->ptdev;
 	struct panthor_scheduler *sched = ptdev->scheduler;
-	struct panthor_group *group;
+	struct panthor_group *group = NULL;
+	u32 fault_count;
 
 	group = group_from_handle(gpool, get_state->group_handle);
 	if (!group)
 		return -EINVAL;
 
-	memset(get_state, 0, sizeof(*get_state));
+	guard(mutex)(&sched->lock);
 
-	mutex_lock(&sched->lock);
 	if (group->timedout)
 		get_state->state |= DRM_PANTHOR_GROUP_STATE_TIMEDOUT;
 	if (group->fatal_queues) {
@@ -3898,10 +3963,25 @@ int panthor_group_get_state(struct panthor_file *pfile,
 	if (group->fault_queues) {
 		get_state->state |= DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT;
 		get_state->fault_queues = group->fault_queues;
-		group->fault_queues = 0;
 	}
-	mutex_unlock(&sched->lock);
 
+	get_state->exception_type = group->fatal.exception_type;
+	get_state->access_type = group->fatal.access_type;
+	get_state->source_id = group->fatal.source_id;
+	get_state->valid_address = group->fatal.valid_address;
+	get_state->address = group->fatal.address;
+
+	fault_count = panthor_group_count_faults(sched, group);
+
+	if (!count && !events) {
+		get_state->faults.count = fault_count;
+		get_state->faults.stride = sizeof(struct drm_panthor_queue_event);
+		goto exit;
+	}
+
+	panthor_group_get_faults(sched, group, events, min(get_state->faults.count, count));
+
+exit:
 	group_put(group);
 	return 0;
 }
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
index f4a475aa34c0..d75870a6d836 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.h
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -14,6 +14,7 @@ struct drm_panthor_group_create;
 struct drm_panthor_queue_create;
 struct drm_panthor_group_get_state;
 struct drm_panthor_queue_submit;
+struct drm_panthor_queue_event;
 struct panthor_device;
 struct panthor_file;
 struct panthor_group_pool;
@@ -25,7 +26,8 @@ int panthor_group_create(struct panthor_file *pfile,
 			 u64 drm_client_id);
 int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle);
 int panthor_group_get_state(struct panthor_file *pfile,
-			    struct drm_panthor_group_get_state *get_state);
+			    struct drm_panthor_group_get_state *get_state,
+			    struct drm_panthor_queue_event *events, u32 count);
 
 struct drm_sched_job *
 panthor_job_create(struct panthor_file *pfile,
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 083a02418d28..931c8371b1b6 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -1012,6 +1012,30 @@ struct drm_panthor_group_get_state {
 
 	/** @fatal_queues: Bitmask of queues that faced fatal faults. */
 	__u32 fault_queues;
+
+	/** @exception_type: The type of exception that caused the fault. */
+	__u32 exception_type;
+
+	/** @access_type: The direction of the data transfer that caused the fault, if known. */
+	__u32 access_type;
+
+	/** @source_id: ID supplying further data about the source of the fault. */
+	__u32 source_id;
+
+	/**
+	 * @valid_address: Whether the address is valid or not. Some faults may not come with
+	 * a valid GPU VA.
+	 */
+	__u8 valid_address;
+
+	/** @pad0: MBZ. */
+	__u8 pad0[3];
+
+	/** @address: GPU VA of the faulting access, if present. */
+	__u64 address;
+
+	/** @faults: Array of faults passed back to the user. */
+	struct drm_panthor_obj_array faults;
 };
 
 /**
-- 
2.33.0.dirty


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace
  2025-12-15 11:54 ` [PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace Lukas Zapolskas
@ 2025-12-15 12:03   ` Boris Brezillon
  0 siblings, 0 replies; 16+ messages in thread
From: Boris Brezillon @ 2025-12-15 12:03 UTC (permalink / raw)
  To: Lukas Zapolskas
  Cc: Liviu Dudau, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	David Airlie, Simona Vetter, nd, dri-devel, linux-kernel,
	Paul Toadere

On Mon, 15 Dec 2025 11:54:53 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:

> From: Paul Toadere <paul.toadere@arm.com>
> 
> Though recoverable faults do not prevent further submission on the
> affected queues, they may have consequences which are worth recording
> and reporting to the user.
> 
> Signed-off-by: Paul Toadere <paul.toadere@arm.com>
> Co-developed-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 18 +++++++++++++++---
>  include/uapi/drm/panthor_drm.h          | 11 +++++++++--
>  2 files changed, 24 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index a17b067a0439..eb8841beba39 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -569,6 +569,14 @@ struct panthor_group {
>  	/** @fatal_queues: Bitmask reflecting the queues that hit a fatal exception. */
>  	u32 fatal_queues;
>  
> +	/**
> +	 * @fault_queues: Bitmask reflecting the queues that hit a recoverable exception.
> +	 *
> +	 * This field is reset when the GROUP_GET_STATE ioctl is used to collect the fault
> +	 * information.
> +	 */
> +	u32 fault_queues;

s/fault_queues/faulty_queues/ ?

> +
>  	/** @tiler_oom: Mask of queues that have a tiler OOM event to process. */
>  	atomic_t tiler_oom;
>  
> @@ -1553,6 +1561,8 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  	if (group) {
>  		drm_warn(&ptdev->base, "CS_FAULT: pid=%d, comm=%s\n",
>  			 group->task_info.pid, group->task_info.comm);
> +
> +		group->fault_queues |= BIT(cs_id);
>  	}
>  
>  	drm_warn(&ptdev->base,
> @@ -3807,9 +3817,6 @@ int panthor_group_get_state(struct panthor_file *pfile,
>  	struct panthor_scheduler *sched = ptdev->scheduler;
>  	struct panthor_group *group;
>  
> -	if (get_state->pad)
> -		return -EINVAL;
> -
>  	group = group_from_handle(gpool, get_state->group_handle);
>  	if (!group)
>  		return -EINVAL;
> @@ -3825,6 +3832,11 @@ int panthor_group_get_state(struct panthor_file *pfile,
>  	}
>  	if (group->innocent)
>  		get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
> +	if (group->fault_queues) {
> +		get_state->state |= DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT;
> +		get_state->fault_queues = group->fault_queues;
> +		group->fault_queues = 0;
> +	}
>  	mutex_unlock(&sched->lock);
>  
>  	group_put(group);
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index e238c6264fa1..77262d2b9672 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -965,6 +965,13 @@ enum drm_panthor_group_state_flags {
>  	 * DRM_PANTHOR_GROUP_STATE_FATAL_FAULT is not.
>  	 */
>  	DRM_PANTHOR_GROUP_STATE_INNOCENT = 1 << 2,
> +
> +	/**
> +	 * @DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT: Group had recoverable faults.
> +	 *
> +	 * When a group ends up with this flag set, jobs can still be submitted to its queues.
> +	 */
> +	DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT = 1 << 3,
>  };
>  
>  /**
> @@ -986,8 +993,8 @@ struct drm_panthor_group_get_state {
>  	/** @fatal_queues: Bitmask of queues that faced fatal faults. */
>  	__u32 fatal_queues;
>  
> -	/** @pad: MBZ */
> -	__u32 pad;
> +	/** @fatal_queues: Bitmask of queues that faced fatal faults. */

s/fatal/recoverable/

> +	__u32 fault_queues;
>  };
>  
>  /**


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information
  2025-12-15 11:54 ` [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information Lukas Zapolskas
@ 2025-12-15 12:11   ` Boris Brezillon
  2025-12-17 11:37   ` Steven Price
  1 sibling, 0 replies; 16+ messages in thread
From: Boris Brezillon @ 2025-12-15 12:11 UTC (permalink / raw)
  To: Lukas Zapolskas
  Cc: Liviu Dudau, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	David Airlie, Simona Vetter, nd, dri-devel, linux-kernel

On Mon, 15 Dec 2025 11:54:54 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:

> A queue may encounter either one fatal fault or any number of
> recoverable faults during execution. The CSF FW provides the
> FAULT/FATAL registers, indicating the fault type, and another
> set of registers providing more metadata about why the fault
> was generated. Storing the information allows it to be
> reported to the user using the GROUP_GET_STATE ioctl.
> 
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 116 +++++++++++++++++-------
>  include/uapi/drm/panthor_drm.h          |  17 ++++
>  2 files changed, 100 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index eb8841beba39..a77399e95620 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -342,6 +342,14 @@ struct panthor_syncobj_64b {
>  	u32 pad;
>  };
>  
> +struct panthor_queue_event {
> +	/** @link: Link to a list of Panthor event errors. */
> +	struct list_head link;
> +
> +	/** @event: The event containing all of the fault/fatal metadata. */
> +	struct drm_panthor_queue_event event;
> +};
> +
>  /**
>   * struct panthor_queue - Execution queue
>   */
> @@ -485,6 +493,9 @@ struct panthor_queue {
>  		/** @seqno: Index of the next available profiling information slot. */
>  		u32 seqno;
>  	} profiling;
> +
> +	/** @events: List of fault or fatal events reported on this queue. */
> +	struct list_head events;
>  };
>  
>  /**
> @@ -918,6 +929,8 @@ panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue
>  
>  static void group_free_queue(struct panthor_group *group, struct panthor_queue *queue)
>  {
> +	struct panthor_queue_event *evt, *tmp;
> +
>  	if (IS_ERR_OR_NULL(queue))
>  		return;
>  
> @@ -934,6 +947,11 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>  
>  	panthor_queue_put_syncwait_obj(queue);
>  
> +	list_for_each_entry_safe(evt, tmp, &queue->events, link) {
> +		list_del(&evt->link);
> +		kfree(evt);
> +	}
> +
>  	panthor_kernel_bo_destroy(queue->ringbuf);
>  	panthor_kernel_bo_destroy(queue->iface.mem);
>  	panthor_kernel_bo_destroy(queue->profiling.slots);
> @@ -1476,6 +1494,69 @@ csg_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 priority)
>  	return 0;
>  }
>  
> +static struct panthor_queue_event *
> +panthor_queue_create_event(unsigned long event_type, u32 cs_id, u32 exception)
> +{
> +	struct panthor_queue_event *event;
> +
> +	event = kzalloc(sizeof(*event), GFP_KERNEL);

This is called from the dma-signalling path, from which we can't
allocate with GFP_KERNEL. I think it'd be preferable to pre-allocate a
fixed size event array at queue creation time (can be an extra param
passed to GROUP_CREATE), and report overflows if we're running out of
slots.
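
Something along these lines (standalone sketch only, the fixed capacity
and all names are made up to illustrate the idea, this is not actual
panthor code):

  #include <stdbool.h>

  #define QUEUE_MAX_EVENTS 8 /* illustrative, sized at queue creation */

  struct queue_event {
          unsigned int exception_type;
          unsigned int exception_data;
  };

  struct queue_event_log {
          /* Pre-allocated when the queue is created, so the event path
           * never has to allocate memory.
           */
          struct queue_event slots[QUEUE_MAX_EVENTS];
          unsigned int count;
          bool overflow; /* reported to userspace when the log is drained */
  };

  /* Safe to call from a non-sleeping context: just fill the next slot. */
  static void queue_event_log_push(struct queue_event_log *log,
                                   const struct queue_event *ev)
  {
          if (log->count == QUEUE_MAX_EVENTS) {
                  log->overflow = true;
                  return;
          }
          log->slots[log->count++] = *ev;
  }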

> +	if (!event)
> +		return ERR_PTR(-ENOMEM);
> +
> +	event->event = (struct drm_panthor_queue_event){
> +		.queue_id = cs_id,
> +		.event_type = event_type,
> +		.exception_type = CS_EXCEPTION_TYPE(exception),
> +		.exception_data = CS_EXCEPTION_DATA(exception),
> +	};
> +	INIT_LIST_HEAD(&event->link);
> +
> +	return event;
> +}
> +
> +#define PANTHOR_DEFINE_EVENT_INFO(__type, __msg, __event) \
> +static u32 panthor_queue_set_ ## __type ## _info(struct panthor_device *ptdev,			\
> +						 struct panthor_group *group,			\
> +						 u32 csg_id, u32 cs_id)				\
> +{												\
> +	struct panthor_scheduler *sched = ptdev->scheduler;					\
> +	struct panthor_fw_cs_iface *iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);	\
> +	struct panthor_queue *queue = group && cs_id < group->queue_count ?			\
> +				      group->queues[cs_id] : NULL;				\
> +	struct panthor_queue_event *event;							\
> +												\
> +	lockdep_assert_held(&sched->lock);							\
> +												\
> +	if (!iface || !queue)									\
> +		return 0;									\
> +												\
> +	const u32 exception = iface->output->__type;						\
> +	const u64 info = iface->output->__type ## _info;					\
> +												\
> +	event = panthor_queue_create_event((__event), cs_id, exception);			\
> +												\
> +	if (!IS_ERR(event))									\
> +		list_add_tail(&event->link, &queue->events);					\
> +	else											\
> +		drm_err(&ptdev->base, "Could not store fault notification, err = %ld",		\
> +			PTR_ERR(event));							\
> +												\
> +	drm_warn(&ptdev->base,									\
> +		 "CSG slot %d CS slot: %d\n"							\
> +		 "CS_" __msg  ".EXCEPTION_TYPE: 0x%x (%s)\n"					\
> +		 "CS_" __msg  ".EXCEPTION_DATA: 0x%x\n"						\
> +		 "CS_" __msg  "_INFO.EXCEPTION_DATA: 0x%llx\n",					\
> +		 csg_id, cs_id,									\
> +		 (unsigned int)CS_EXCEPTION_TYPE(exception),					\
> +		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(exception)),			\
> +		 (unsigned int)CS_EXCEPTION_DATA(exception), info);				\
> +												\
> +	return exception;									\
> +}
> +
> +PANTHOR_DEFINE_EVENT_INFO(fatal, "FATAL", DRM_PANTHOR_GROUP_STATE_FATAL_FAULT);
> +PANTHOR_DEFINE_EVENT_INFO(fault, "FAULT", DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT);
> +
>  static void
>  cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  				   u32 csg_id, u32 cs_id)
> @@ -1483,15 +1564,11 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  	struct panthor_scheduler *sched = ptdev->scheduler;
>  	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
>  	struct panthor_group *group = csg_slot->group;
> -	struct panthor_fw_cs_iface *cs_iface;
>  	u32 fatal;
> -	u64 info;
>  
>  	lockdep_assert_held(&sched->lock);
>  
> -	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
> -	fatal = cs_iface->output->fatal;
> -	info = cs_iface->output->fatal_info;
> +	fatal = panthor_queue_set_fatal_info(ptdev, group, csg_id, cs_id);
>  
>  	if (group) {
>  		drm_warn(&ptdev->base, "CS_FATAL: pid=%d, comm=%s\n",
> @@ -1509,17 +1586,6 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  	} else {
>  		sched_queue_delayed_work(sched, tick, 0);
>  	}
> -
> -	drm_warn(&ptdev->base,
> -		 "CSG slot %d CS slot: %d\n"
> -		 "CS_FATAL.EXCEPTION_TYPE: 0x%x (%s)\n"
> -		 "CS_FATAL.EXCEPTION_DATA: 0x%x\n"
> -		 "CS_FATAL_INFO.EXCEPTION_DATA: 0x%llx\n",
> -		 csg_id, cs_id,
> -		 (unsigned int)CS_EXCEPTION_TYPE(fatal),
> -		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
> -		 (unsigned int)CS_EXCEPTION_DATA(fatal),
> -		 info);
>  }
>  
>  static void
> @@ -1531,15 +1597,10 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  	struct panthor_group *group = csg_slot->group;
>  	struct panthor_queue *queue = group && cs_id < group->queue_count ?
>  				      group->queues[cs_id] : NULL;
> -	struct panthor_fw_cs_iface *cs_iface;
> -	u32 fault;
> -	u64 info;
>  
>  	lockdep_assert_held(&sched->lock);
>  
> -	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
> -	fault = cs_iface->output->fault;
> -	info = cs_iface->output->fault_info;
> +	panthor_queue_set_fault_info(ptdev, group, csg_id, cs_id);
>  
>  	if (queue) {
>  		u64 cs_extract = queue->iface.output->extract;
> @@ -1564,17 +1625,6 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  
>  		group->fault_queues |= BIT(cs_id);
>  	}
> -
> -	drm_warn(&ptdev->base,
> -		 "CSG slot %d CS slot: %d\n"
> -		 "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
> -		 "CS_FAULT.EXCEPTION_DATA: 0x%x\n"
> -		 "CS_FAULT_INFO.EXCEPTION_DATA: 0x%llx\n",
> -		 csg_id, cs_id,
> -		 (unsigned int)CS_EXCEPTION_TYPE(fault),
> -		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
> -		 (unsigned int)CS_EXCEPTION_DATA(fault),
> -		 info);
>  }
>  
>  static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index 77262d2b9672..083a02418d28 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -974,6 +974,23 @@ enum drm_panthor_group_state_flags {
>  	DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT = 1 << 3,
>  };
>  
> +/**
> + * struct drm_panthor_queue_event - Fault or fatal event occurring on a single queue.
> + */
> +struct drm_panthor_queue_event {
> +	/** @queue_id: The ID of the queue that faulted. */
> +	__u32 queue_id;
> +
> +	/** @event_type: What kind of event is being propagated. */
> +	__u32 event_type;
> +
> +	/** @exception_type: The type of exception that caused the fault. */
> +	__u32 exception_type;
> +
> +	/** @exception_data: Exception-specific data. */
> +	__u32 exception_data;
> +};
> +
>  /**
>   * struct drm_panthor_group_get_state - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_GET_STATE
>   *


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 3/5] drm/panthor: Track VM faults
  2025-12-15 11:54 ` [PATCH v1 3/5] drm/panthor: Track VM faults Lukas Zapolskas
@ 2025-12-15 12:37   ` Boris Brezillon
  0 siblings, 0 replies; 16+ messages in thread
From: Boris Brezillon @ 2025-12-15 12:37 UTC (permalink / raw)
  To: Lukas Zapolskas
  Cc: Liviu Dudau, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	David Airlie, Simona Vetter, nd, dri-devel, linux-kernel

On Mon, 15 Dec 2025 11:54:55 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:

> Faults reported via the MMU_CONTROL register block result in fatal
> faults for the groups running on that AS, so this information is also
> useful to report to the user.
> 
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> ---
>  drivers/gpu/drm/panthor/panthor_mmu.c  | 16 ++++++++++++++--
>  drivers/gpu/drm/panthor/panthor_mmu.h  | 20 ++++++++++++++++++++
>  drivers/gpu/drm/panthor/panthor_regs.h |  3 +++
>  3 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index 473a8bebd61e..10a7418eecda 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -384,6 +384,9 @@ struct panthor_vm {
>  		/** @locked_region.size: Size of the locked region. */
>  		u64 size;
>  	} locked_region;
> +
> +	/** @fault: Fault information (if any) for this VM. */
> +	struct panthor_vm_fault fault;
>  };
>  
>  /**
> @@ -741,6 +744,7 @@ int panthor_vm_active(struct panthor_vm *vm)
>  
>  	/* If the VM is re-activated, we clear the fault. */
>  	vm->unhandled_fault = false;
> +	vm->fault = (struct panthor_vm_fault){ 0 };

I'd rather go for a memset() here, to follow the kernel coding style.

	memset(&vm->fault, 0, sizeof(vm->fault));

>  
>  	/* Unhandled pagefault on this AS, clear the fault and re-enable interrupts
>  	 * before enabling the AS.
> @@ -1744,8 +1748,16 @@ static void panthor_mmu_irq_handler(struct panthor_device *ptdev, u32 status)
>  		 */
>  		ptdev->mmu->irq.mask = new_int_mask;
>  
> -		if (ptdev->mmu->as.slots[as].vm)
> -			ptdev->mmu->as.slots[as].vm->unhandled_fault = true;
> +		if (ptdev->mmu->as.slots[as].vm) {
> +			struct panthor_vm *vm = ptdev->mmu->as.slots[as].vm;
> +
> +			vm->unhandled_fault = true;
> +			vm->fault.exception_type = AS_FAULTSTATUS_EXCEPTION_TYPE(status);
> +			vm->fault.access_type = AS_FAULTSTATUS_ACCESS_TYPE(status);
> +			vm->fault.source_id = AS_FAULTSTATUS_SOURCE_ID(status);

I'd be tempted to return the raw value to userspace instead of parsing
it here. This way if the meaning/layout changes, it's the UMD's
responsibility to adjust the parsing (we do that with various other
things already, based on the GPU_ID).
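
i.e. something like this on the UMD side (rough sketch, the helper names
are made up, and the bit positions are the ones the current layout uses):

  #include <stdint.h>

  static inline uint32_t faultstatus_exception_type(uint32_t status)
  {
          return status & 0xff;          /* bits [7:0] */
  }

  static inline uint32_t faultstatus_access_type(uint32_t status)
  {
          return (status >> 8) & 0x3;    /* bits [9:8] */
  }

  static inline uint32_t faultstatus_source_id(uint32_t status)
  {
          return (status >> 16) & 0xffff; /* bits [31:16] */
  }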

> +			vm->fault.valid_address = true;

Do we really need an extra field to report address validity? Can't we
just let the UMD check the status and take a decision based on that?

> +			vm->fault.address = addr;
> +		}
>  
>  		/* Disable the MMU to kill jobs on this AS. */
>  		panthor_mmu_as_disable(ptdev, as, false);
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.h b/drivers/gpu/drm/panthor/panthor_mmu.h
> index 0e268fdfdb2f..023fdc79c231 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.h
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.h
> @@ -16,6 +16,26 @@ struct panthor_vm;
>  struct panthor_vma;
>  struct panthor_mmu;
>  
> +/**
> + * struct panthor_vm_fault - Tracking information for VM-level faults.
> + */
> +struct panthor_vm_fault {
> +	/** @address: Virtual address of the faulting access. */
> +	u64 address;
> +
> +	/** @exception_type: The type of exception that caused the fault. */
> +	u32 exception_type;
> +
> +	/** @access_type: The direction of data transfer that caused the fault. */
> +	u32 access_type;
> +
> +	/** @source_id: ID supplying further data about the source of the fault. */
> +	u32 source_id;
> +
> +	/** @valid_address: Whether the virtual address is valid. */
> +	bool valid_address;
> +};
> +
>  int panthor_mmu_init(struct panthor_device *ptdev);
>  void panthor_mmu_unplug(struct panthor_device *ptdev);
>  void panthor_mmu_pre_reset(struct panthor_device *ptdev);
> diff --git a/drivers/gpu/drm/panthor/panthor_regs.h b/drivers/gpu/drm/panthor/panthor_regs.h
> index 08bf06c452d6..5aa5e37d29c9 100644
> --- a/drivers/gpu/drm/panthor/panthor_regs.h
> +++ b/drivers/gpu/drm/panthor/panthor_regs.h
> @@ -178,10 +178,13 @@
>  #define   AS_LOCK_REGION_MIN_SIZE			(1ULL << 15)
>  #define AS_FAULTSTATUS(as)				(MMU_AS(as) + 0x1C)
>  #define  AS_FAULTSTATUS_ACCESS_TYPE_MASK		(0x3 << 8)
> +#define  AS_FAULTSTATUS_ACCESS_TYPE(x)			(((x) >> 8) & GENMASK(1, 0))
>  #define  AS_FAULTSTATUS_ACCESS_TYPE_ATOMIC		(0x0 << 8)
>  #define  AS_FAULTSTATUS_ACCESS_TYPE_EX			(0x1 << 8)
>  #define  AS_FAULTSTATUS_ACCESS_TYPE_READ		(0x2 << 8)
>  #define  AS_FAULTSTATUS_ACCESS_TYPE_WRITE		(0x3 << 8)
> +#define  AS_FAULTSTATUS_EXCEPTION_TYPE(x)		((x) & GENMASK(7, 0))
> +#define  AS_FAULTSTATUS_SOURCE_ID(x)			(((x) >> 16) & GENMASK(15, 0))
>  #define AS_FAULTADDRESS(as)				(MMU_AS(as) + 0x20)
>  #define AS_STATUS(as)					(MMU_AS(as) + 0x28)
>  #define   AS_STATUS_AS_ACTIVE				BIT(0)


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 4/5] drm/panthor: Propagate VM-level faults to groups
  2025-12-15 11:54 ` [PATCH v1 4/5] drm/panthor: Propagate VM-level faults to groups Lukas Zapolskas
@ 2025-12-15 12:41   ` Boris Brezillon
  2025-12-15 12:46   ` Boris Brezillon
  1 sibling, 0 replies; 16+ messages in thread
From: Boris Brezillon @ 2025-12-15 12:41 UTC (permalink / raw)
  To: Lukas Zapolskas
  Cc: Liviu Dudau, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	David Airlie, Simona Vetter, nd, dri-devel, linux-kernel

On Mon, 15 Dec 2025 11:54:56 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:

> Receiving an MMU fault currently disables the AS, so each group bound
> to that AS is marked with the appropriate fault. Since no further
> submissions can occur on a group after a fatal fault, the fault
> information does not have to be cleared until the group is terminated.
> 
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> ---
>  drivers/gpu/drm/panthor/panthor_mmu.c   |  8 ++++++++
>  drivers/gpu/drm/panthor/panthor_mmu.h   |  2 ++
>  drivers/gpu/drm/panthor/panthor_sched.c | 13 +++++++++++++
>  3 files changed, 23 insertions(+)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index 10a7418eecda..9e78b0509f1a 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -2895,3 +2895,11 @@ void panthor_mmu_pt_cache_fini(void)
>  {
>  	kmem_cache_destroy(pt_cache);
>  }
> +
> +struct panthor_vm_fault *panthor_vm_get_fault(struct panthor_vm *vm)

const struct panthor_vm_fault *
panthor_vm_get_fault(struct panthor_vm *vm)

or

struct panthor_vm_fault
panthor_vm_get_fault(struct panthor_vm *vm)

or

void
panthor_vm_get_fault(struct panthor_vm *vm,
		     struct panthor_vm_fault *fault)

but you shouldn't leave the caller with a writable pointer to vm->fault.

> +{
> +	if (!vm)
> +		return NULL;

I don't see a valid case where panthor_vm_get_fault() would be called
with a NULL pointer.

> +
> +	return &vm->fault;
> +}
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.h b/drivers/gpu/drm/panthor/panthor_mmu.h
> index 023fdc79c231..d69b4000a39e 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.h
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.h
> @@ -123,4 +123,6 @@ void panthor_mmu_pt_cache_fini(void);
>  void panthor_mmu_debugfs_init(struct drm_minor *minor);
>  #endif
>  
> +struct panthor_vm_fault *panthor_vm_get_fault(struct panthor_vm *vm);
> +
>  #endif
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index a77399e95620..9ea0d2b27114 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -722,6 +722,11 @@ struct panthor_group {
>  	 * panthor_group::groups::waiting list.
>  	 */
>  	struct list_head wait_node;
> +
> +	/**
> +	 * @fatal: VM-level fault that caused a fatal error on the group.
> +	 */
> +	struct panthor_vm_fault fatal;
>  };
>  
>  struct panthor_job_profiling_data {
> @@ -1575,6 +1580,14 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  			 group->task_info.pid, group->task_info.comm);
>  
>  		group->fatal_queues |= BIT(cs_id);
> +
> +		if (panthor_vm_has_unhandled_faults(group->vm)) {
> +			struct panthor_vm_fault *fault;
> +
> +			fault = panthor_vm_get_fault(group->vm);
> +			if (fault)
> +				group->fatal = *fault;
> +		}
>  	}
>  
>  	if (CS_EXCEPTION_TYPE(fatal) == DRM_PANTHOR_EXCEPTION_CS_UNRECOVERABLE) {


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 4/5] drm/panthor: Propagate VM-level faults to groups
  2025-12-15 11:54 ` [PATCH v1 4/5] drm/panthor: Propagate VM-level faults to groups Lukas Zapolskas
  2025-12-15 12:41   ` Boris Brezillon
@ 2025-12-15 12:46   ` Boris Brezillon
  1 sibling, 0 replies; 16+ messages in thread
From: Boris Brezillon @ 2025-12-15 12:46 UTC (permalink / raw)
  To: Lukas Zapolskas
  Cc: Liviu Dudau, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	David Airlie, Simona Vetter, nd, dri-devel, linux-kernel

On Mon, 15 Dec 2025 11:54:56 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:

>  			 group->task_info.pid, group->task_info.comm);
>  
>  		group->fatal_queues |= BIT(cs_id);
> +
> +		if (panthor_vm_has_unhandled_faults(group->vm)) {
> +			struct panthor_vm_fault *fault;
> +
> +			fault = panthor_vm_get_fault(group->vm);
> +			if (fault)
> +				group->fatal = *fault;

group->vm can't be NULL, meaning fault can't be NULL either.
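i.e. with the NULL checks dropped this could simply become (untested):

		if (panthor_vm_has_unhandled_faults(group->vm))
			group->fatal = *panthor_vm_get_fault(group->vm);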

> +		}
>  	}

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors
  2025-12-15 11:54 ` [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors Lukas Zapolskas
@ 2025-12-15 17:31   ` Boris Brezillon
  2025-12-16  5:45   ` kernel test robot
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 16+ messages in thread
From: Boris Brezillon @ 2025-12-15 17:31 UTC (permalink / raw)
  To: Lukas Zapolskas
  Cc: Liviu Dudau, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	David Airlie, Simona Vetter, nd, dri-devel, linux-kernel,
	Ketil Johnsen

On Mon, 15 Dec 2025 11:54:57 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:

> GROUP_GET_STATE can now be called twice: the first time to
> enumerate the faults that occurred on the associated queues, and
> the second time to copy out the fault information. The
> necessary infrastructure to copy out drm_panthor_obj_arrays
> is pulled in from
> https://lore.kernel.org/dri-devel/20240828172605.19176-7-mihail.atanassov@arm.com/
> 
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> Co-developed-by: Ketil Johnsen <ketil.johnsen@arm.com>
> Signed-off-by: Ketil Johnsen <ketil.johnsen@arm.com>
> ---
>  drivers/gpu/drm/panthor/panthor_drv.c   | 85 ++++++++++++++++++++++-
>  drivers/gpu/drm/panthor/panthor_sched.c | 92 +++++++++++++++++++++++--
>  drivers/gpu/drm/panthor/panthor_sched.h |  4 +-
>  include/uapi/drm/panthor_drm.h          | 24 +++++++
>  4 files changed, 195 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
> index 98d4e8d867ed..bdcaf85b98cd 100644
> --- a/drivers/gpu/drm/panthor/panthor_drv.c
> +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> @@ -75,6 +75,55 @@ panthor_set_uobj(u64 usr_ptr, u32 usr_size, u32 min_size, u32 kern_size, const v
>  	return 0;
>  }
>  
> +/**
> + * panthor_set_uobj_array() - Copy a kernel object array into a user object array.
> + * @out: The object array to copy to.
> + * @min_stride: Minimum array stride.
> + * @obj_size: Kernel object size.
> + * @in: Kernel object array to copy from.
> + *
> + * Helper automating kernel -> user object copies.
> + *
> + * Don't use this function directly, use PANTHOR_UOBJ_SET_ARRAY() instead.
> + *
> + * Return: 0 on success, a negative error code otherwise.
> + */
> +static int
> +panthor_set_uobj_array(const struct drm_panthor_obj_array *out, u32 min_stride, u32 obj_size,
> +		       const void *in)
> +{
> +	if (out->stride < min_stride)
> +		return -EINVAL;
> +
> +	if (!out->count)
> +		return 0;
> +
> +	if (obj_size == out->stride) {
> +		if (copy_to_user(u64_to_user_ptr(out->array), in,
> +				 (unsigned long)obj_size * out->count))
> +			return -EFAULT;
> +	} else {
> +		u32 cpy_elem_size = min_t(u32, out->stride, obj_size);
> +		void __user *out_ptr = u64_to_user_ptr(out->array);
> +		const void *in_ptr = in;
> +
> +		for (u32 i = 0; i < out->count; i++) {
> +			if (copy_to_user(out_ptr, in_ptr, cpy_elem_size))
> +				return -EFAULT;
> +
> +			if (out->stride > obj_size &&
> +			    clear_user(out_ptr + cpy_elem_size, out->stride - obj_size)) {
> +				return -EFAULT;
> +			}
> +
> +			out_ptr += out->stride;
> +			in_ptr += obj_size;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  /**
>   * panthor_get_uobj_array() - Copy a user object array into a kernel accessible object array.
>   * @in: The object array to copy.
> @@ -178,7 +227,8 @@ panthor_get_uobj_array(const struct drm_panthor_obj_array *in, u32 min_stride,
>  		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_submit, syncs), \
>  		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_create, ringbuf_size), \
>  		 PANTHOR_UOBJ_DECL(struct drm_panthor_vm_bind_op, syncs), \
> -		 PANTHOR_UOBJ_DECL(struct drm_panthor_bo_sync_op, size))
> +		 PANTHOR_UOBJ_DECL(struct drm_panthor_bo_sync_op, size), \
> +		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_event, exception_data))
>  
>  /**
>   * PANTHOR_UOBJ_SET() - Copy a kernel object to a user object.
> @@ -193,6 +243,20 @@ panthor_get_uobj_array(const struct drm_panthor_obj_array *in, u32 min_stride,
>  			 PANTHOR_UOBJ_MIN_SIZE(_src_obj), \
>  			 sizeof(_src_obj), &(_src_obj))
>  
> +/**
> + * PANTHOR_UOBJ_SET_ARRAY() - Copies from _src_array to @_dest_drm_panthor_obj_array.array.
> + * @_dest_drm_panthor_obj_array: The &struct drm_panthor_obj_array containing a __u64 raw
> + * pointer to the destination C array in user space and the size of each array
> + * element in user space (the 'stride').
> + * @_src_array: The source C array object in kernel space.
> + *
> + * Return: Error code. See panthor_set_uobj_array().
> + */
> +#define PANTHOR_UOBJ_SET_ARRAY(_dest_drm_panthor_obj_array, _src_array) \
> +	panthor_set_uobj_array(_dest_drm_panthor_obj_array, \
> +			       PANTHOR_UOBJ_MIN_SIZE((_src_array)[0]), \
> +			       sizeof((_src_array)[0]), _src_array)
> +
>  /**
>   * PANTHOR_UOBJ_GET_ARRAY() - Copy a user object array to a kernel accessible
>   * object array.
> @@ -1128,9 +1192,23 @@ static int panthor_ioctl_group_get_state(struct drm_device *ddev, void *data,
>  					 struct drm_file *file)
>  {
>  	struct panthor_file *pfile = file->driver_priv;
> +	struct drm_panthor_queue_event *events __free(kvfree) = NULL;
>  	struct drm_panthor_group_get_state *args = data;
> +	int ret;
>  
> -	return panthor_group_get_state(pfile, args);
> +	/* First call enumerates the faults. */
> +	if (!args->faults.count && !args->faults.array)
> +		return panthor_group_get_state(pfile, args, NULL, 0);
> +
> +	ret = PANTHOR_UOBJ_GET_ARRAY(events, &args->faults);
> +	if (ret)
> +		return ret;
> +
> +	ret = panthor_group_get_state(pfile, args, events, args->faults.count);
> +	if (!ret)
> +		ret = PANTHOR_UOBJ_SET_ARRAY(&args->faults, events);
> +
> +	return ret;
>  }
>  
>  static int panthor_ioctl_tiler_heap_create(struct drm_device *ddev, void *data,
> @@ -1678,6 +1756,7 @@ static void panthor_debugfs_init(struct drm_minor *minor)
>   *       - adds DRM_IOCTL_PANTHOR_BO_SYNC ioctl
>   *       - adds DRM_IOCTL_PANTHOR_BO_QUERY_INFO ioctl
>   *       - adds drm_panthor_gpu_info::selected_coherency
> + * - 1.8 - extends GROUP_GET_STATE to propagate fault and fatal event metadata
>   */
>  static const struct drm_driver panthor_drm_driver = {
>  	.driver_features = DRIVER_RENDER | DRIVER_GEM | DRIVER_SYNCOBJ |
> @@ -1691,7 +1770,7 @@ static const struct drm_driver panthor_drm_driver = {
>  	.name = "panthor",
>  	.desc = "Panthor DRM driver",
>  	.major = 1,
> -	.minor = 7,
> +	.minor = 8,
>  
>  	.gem_create_object = panthor_gem_create_object,
>  	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 9ea0d2b27114..f58d9a21dec4 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -3872,21 +3872,86 @@ static struct panthor_group *group_from_handle(struct panthor_group_pool *pool,
>  	return group;
>  }
>  
> +static int panthor_group_count_faults(struct panthor_scheduler *sched, struct panthor_group *group)
> +{
> +	int count = 0;
> +	unsigned long queue_events = group->fault_queues | group->fatal_queues;
> +
> +	lockdep_assert_held(&sched->lock);
> +
> +	for (size_t i = 0; i < group->queue_count; i++) {
> +		struct panthor_queue *queue;
> +		struct panthor_queue_event *evnt;
> +
> +		if (!test_bit(i, &queue_events))
> +			continue;
> +
> +		queue = group->queues[i];
> +		if (!queue)
> +			continue;
> +
> +		list_for_each_entry(evnt, &queue->events, link)
> +			count++;
> +	}
> +
> +	return count;
> +}
> +
> +static void panthor_group_get_faults(struct panthor_scheduler *sched, struct panthor_group *group,
> +				     struct drm_panthor_queue_event *events, u32 count)
> +{
> +	int nr_events = 0;
> +	unsigned long queue_events = group->fault_queues | group->fatal_queues;
> +
> +	lockdep_assert_held(&sched->lock);
> +
> +	for (u32 i = 0; i < group->queue_count; i++) {
> +		struct panthor_queue *queue;
> +		struct panthor_queue_event *evnt, *tmp;
> +
> +		if (!test_bit(i, &queue_events))
> +			continue;
> +
> +		queue = group->queues[i];
> +
> +		if (!queue)
> +			continue;
> +
> +		list_for_each_entry_safe(evnt, tmp, &queue->events, link) {
> +			if (nr_events >= count)
> +				return;
> +
> +			events[nr_events++] = evnt->event;
> +			list_del(&evnt->link);
> +			kfree(evnt);
> +		}
> +
> +		/* In case of additional faults being generated between invocations
> +		 * of group_get state, there may not be enough space to drain
> +		 * of GROUP_GET_STATE, there may not be enough space to drain
> +		 * should remain listed as faulted.
> +		 */
> +		if ((group->fault_queues & BIT(i)) && list_empty(&queue->events))
> +			group->fault_queues &= ~BIT(i);
> +	}
> +}
> +
>  int panthor_group_get_state(struct panthor_file *pfile,
> -			    struct drm_panthor_group_get_state *get_state)
> +			    struct drm_panthor_group_get_state *get_state,
> +			    struct drm_panthor_queue_event *events, u32 count)
>  {
>  	struct panthor_group_pool *gpool = pfile->groups;
>  	struct panthor_device *ptdev = pfile->ptdev;
>  	struct panthor_scheduler *sched = ptdev->scheduler;
> -	struct panthor_group *group;
> +	struct panthor_group *group = NULL;
> +	u32 fault_count;
>  
>  	group = group_from_handle(gpool, get_state->group_handle);
>  	if (!group)
>  		return -EINVAL;
>  
> -	memset(get_state, 0, sizeof(*get_state));
> +	guard(mutex)(&sched->lock);

If we go for a guard, can we do a scoped_guard() so the unlock happens
just before the put() on the group?
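Something like this (untested):

	scoped_guard(mutex, &sched->lock) {
		if (group->timedout)
			get_state->state |= DRM_PANTHOR_GROUP_STATE_TIMEDOUT;

		/* ... rest of the state/fault collection ... */
	}

	group_put(group);
	return 0;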

>  
> -	mutex_lock(&sched->lock);
>  	if (group->timedout)
>  		get_state->state |= DRM_PANTHOR_GROUP_STATE_TIMEDOUT;
>  	if (group->fatal_queues) {
> @@ -3898,10 +3963,25 @@ int panthor_group_get_state(struct panthor_file *pfile,
>  	if (group->fault_queues) {
>  		get_state->state |= DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT;
>  		get_state->fault_queues = group->fault_queues;
> -		group->fault_queues = 0;
>  	}
> -	mutex_unlock(&sched->lock);
>  
> +	get_state->exception_type = group->fault.exception_type;
> +	get_state->access_type = group->fault.access_type;
> +	get_state->source_id = group->fault.source_id;
> +	get_state->valid_address = group->fault.valid_address;
> +	get_state->address = group->fault.address;
> +
> +	fault_count = panthor_group_count_faults(sched, group);
> +
> +	if (!count && !events) {
> +		get_state->faults.count = fault_count;
> +		get_state->faults.stride = sizeof(struct drm_panthor_queue_event);

Is there any point reporting the kernel stride here? I'd expect the
kernel driver to adapt to the user stride when count != 0, and since
the stride is dictated by the kernel uAPI version, there's not much to
be gained by having userspace allocate an array with a stride that
doesn't match the object size the UMD knows about.
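For reference, the two-call flow on the UMD side would look roughly like
the sketch below (error handling trimmed; drmIoctl() is the usual libdrm
wrapper and 'faults' is the field name proposed in this patch), and the
UMD would naturally pass its own object size as the stride:

#include <stdint.h>
#include <stdlib.h>
#include <xf86drm.h>
#include "panthor_drm.h"	/* uAPI header from include/uapi/drm/ */

static int drain_group_faults(int fd, uint32_t group_handle)
{
	struct drm_panthor_group_get_state st = {
		.group_handle = group_handle,
	};
	struct drm_panthor_queue_event *evts;
	int ret;

	/* First call: enumerate the pending queue events. */
	if (drmIoctl(fd, DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &st))
		return -1;

	if (!st.faults.count)
		return 0;

	evts = calloc(st.faults.count, sizeof(*evts));
	if (!evts)
		return -1;

	st.faults.array = (uintptr_t)evts;
	st.faults.stride = sizeof(*evts);	/* the UMD's own object size */

	/* Second call: drain the events into the buffer. */
	ret = drmIoctl(fd, DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &st);

	/* ... on success, consume evts[0..st.faults.count) ... */

	free(evts);
	return ret;
}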

> +		goto exit;
> +	}
> +
> +	panthor_group_get_faults(sched, group, events, min(get_state->faults.count, count));

As I was mentioning earlier, we really need to reflect when some
events were lost. Right now you're assuming an infinite event list, but
capping it and reporting overflows is more future-proof, I think.
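e.g. something like this (rough sketch; PANTHOR_MAX_QUEUE_EVENTS,
event_count and events_lost are all made-up names):

	/* When queuing a new event in the CS_FAULT/CS_FATAL handlers: */
	if (queue->event_count < PANTHOR_MAX_QUEUE_EVENTS) {
		list_add_tail(&event->link, &queue->events);
		queue->event_count++;
	} else {
		/* Remember the truncation instead of growing forever. */
		queue->events_lost = true;
	}

and GROUP_GET_STATE could then surface events_lost through a new state
flag, so the UMD knows the list was truncated.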

> +
> +exit:
>  	group_put(group);
>  	return 0;
>  }
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
> index f4a475aa34c0..d75870a6d836 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.h
> +++ b/drivers/gpu/drm/panthor/panthor_sched.h
> @@ -14,6 +14,7 @@ struct drm_panthor_group_create;
>  struct drm_panthor_queue_create;
>  struct drm_panthor_group_get_state;
>  struct drm_panthor_queue_submit;
> +struct drm_panthor_queue_event;
>  struct panthor_device;
>  struct panthor_file;
>  struct panthor_group_pool;
> @@ -25,7 +26,8 @@ int panthor_group_create(struct panthor_file *pfile,
>  			 u64 drm_client_id);
>  int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle);
>  int panthor_group_get_state(struct panthor_file *pfile,
> -			    struct drm_panthor_group_get_state *get_state);
> +			    struct drm_panthor_group_get_state *get_state,
> +			    struct drm_panthor_queue_event *events, u32 count);
>  
>  struct drm_sched_job *
>  panthor_job_create(struct panthor_file *pfile,
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index 083a02418d28..931c8371b1b6 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -1012,6 +1012,30 @@ struct drm_panthor_group_get_state {
>  
>  	/** @fault_queues: Bitmask of queues that faced recoverable faults. */
>  	__u32 fault_queues;
> +
> +	/** @exception_type: The type of exception that caused the fault. */
> +	__u32 exception_type;
> +
> +	/** @access_type: The direction of the data transfer that caused the fault, if known. */
> +	__u32 access_type;
> +
> +	/** @source_id: ID supplying further data about the source of the fault. */
> +	__u32 source_id;
> +
> +	/**
> +	 * @valid_address: Whether the address is valid or not. Some faults may not come with
> +	 * a valid GPU VA.
> +	 */
> +	__u8 valid_address;

Could we actually have a drm_panthor_vm_fault object defined in the
uAPI header and re-used in the panthor_mmu.c code, so we don't have to
do this manual per-field copy dance and we can simply replace it with:

	get_state->vm_fault = group->vm_fault;
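
Something like this maybe (just a sketch, mirroring the fields this
patch already exposes):

/**
 * struct drm_panthor_vm_fault - VM-level fault description
 */
struct drm_panthor_vm_fault {
	/** @exception_type: Type of exception that caused the fault. */
	__u32 exception_type;

	/** @access_type: Direction of the access that faulted, if known. */
	__u32 access_type;

	/** @source_id: ID supplying further data about the source of the fault. */
	__u32 source_id;

	/** @valid_address: Non-zero if @address holds a valid GPU VA. */
	__u32 valid_address;

	/** @address: GPU VA of the faulting access, if present. */
	__u64 address;
};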

> +
> +	/** @pad0: MBZ. */
> +	__u8 pad0[3];
> +
> +	/** @address: GPU VA of the faulting access, if present. */
> +	__u64 address;
> +
> +	/** @faults: Array of faults passed back to the user. */
> +	struct drm_panthor_obj_array faults;

Can we name that one queue_faults?

>  };
>  
>  /**


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors
  2025-12-15 11:54 ` [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors Lukas Zapolskas
  2025-12-15 17:31   ` Boris Brezillon
@ 2025-12-16  5:45   ` kernel test robot
  2025-12-16  7:52   ` Marcin Ślusarz
  2025-12-16  9:29   ` kernel test robot
  3 siblings, 0 replies; 16+ messages in thread
From: kernel test robot @ 2025-12-16  5:45 UTC (permalink / raw)
  To: Lukas Zapolskas, Boris Brezillon, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter
  Cc: llvm, oe-kbuild-all, nd, dri-devel, linux-kernel, Lukas Zapolskas,
	Ketil Johnsen

Hi Lukas,

kernel test robot noticed the following build errors:

[auto build test ERROR on drm-misc/drm-misc-next]
[also build test ERROR on next-20251216]
[cannot apply to linus/master v6.19-rc1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Lukas-Zapolskas/drm-panthor-Implement-CS_FAULT-propagation-to-userspace/20251215-195920
base:   https://gitlab.freedesktop.org/drm/misc/kernel.git drm-misc-next
patch link:    https://lore.kernel.org/r/20251215115457.2137485-6-lukas.zapolskas%40arm.com
patch subject: [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors
config: x86_64-buildonly-randconfig-001-20251216 (https://download.01.org/0day-ci/archive/20251216/202512161359.YBFqhXuf-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251216/202512161359.YBFqhXuf-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512161359.YBFqhXuf-lkp@intel.com/

All errors (new ones prefixed by >>):

>> drivers/gpu/drm/panthor/panthor_sched.c:3968:37: error: no member named 'fault' in 'struct panthor_group'
    3968 |         get_state->exception_type = group->fault.exception_type;
         |                                     ~~~~~  ^
   drivers/gpu/drm/panthor/panthor_sched.c:3969:34: error: no member named 'fault' in 'struct panthor_group'
    3969 |         get_state->access_type = group->fault.access_type;
         |                                  ~~~~~  ^
   drivers/gpu/drm/panthor/panthor_sched.c:3970:32: error: no member named 'fault' in 'struct panthor_group'
    3970 |         get_state->source_id = group->fault.source_id;
         |                                ~~~~~  ^
   drivers/gpu/drm/panthor/panthor_sched.c:3971:36: error: no member named 'fault' in 'struct panthor_group'
    3971 |         get_state->valid_address = group->fault.valid_address;
         |                                    ~~~~~  ^
   drivers/gpu/drm/panthor/panthor_sched.c:3972:30: error: no member named 'fault' in 'struct panthor_group'
    3972 |         get_state->address = group->fault.address;
         |                              ~~~~~  ^
   5 errors generated.


vim +3968 drivers/gpu/drm/panthor/panthor_sched.c

  3938	
  3939	int panthor_group_get_state(struct panthor_file *pfile,
  3940				    struct drm_panthor_group_get_state *get_state,
  3941				    struct drm_panthor_queue_event *events, u32 count)
  3942	{
  3943		struct panthor_group_pool *gpool = pfile->groups;
  3944		struct panthor_device *ptdev = pfile->ptdev;
  3945		struct panthor_scheduler *sched = ptdev->scheduler;
  3946		struct panthor_group *group = NULL;
  3947		u32 fault_count;
  3948	
  3949		group = group_from_handle(gpool, get_state->group_handle);
  3950		if (!group)
  3951			return -EINVAL;
  3952	
  3953		guard(mutex)(&sched->lock);
  3954	
  3955		if (group->timedout)
  3956			get_state->state |= DRM_PANTHOR_GROUP_STATE_TIMEDOUT;
  3957		if (group->fatal_queues) {
  3958			get_state->state |= DRM_PANTHOR_GROUP_STATE_FATAL_FAULT;
  3959			get_state->fatal_queues = group->fatal_queues;
  3960		}
  3961		if (group->innocent)
  3962			get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
  3963		if (group->fault_queues) {
  3964			get_state->state |= DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT;
  3965			get_state->fault_queues = group->fault_queues;
  3966		}
  3967	
> 3968		get_state->exception_type = group->fault.exception_type;
  3969		get_state->access_type = group->fault.access_type;
  3970		get_state->source_id = group->fault.source_id;
  3971		get_state->valid_address = group->fault.valid_address;
  3972		get_state->address = group->fault.address;
  3973	
  3974		fault_count = panthor_group_count_faults(sched, group);
  3975	
  3976		if (!count && !events) {
  3977			get_state->faults.count = fault_count;
  3978			get_state->faults.stride = sizeof(struct drm_panthor_queue_event);
  3979			goto exit;
  3980		}
  3981	
  3982		panthor_group_get_faults(sched, group, events, min(get_state->faults.count, count));
  3983	
  3984	exit:
  3985		group_put(group);
  3986		return 0;
  3987	}
  3988	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors
  2025-12-15 11:54 ` [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors Lukas Zapolskas
  2025-12-15 17:31   ` Boris Brezillon
  2025-12-16  5:45   ` kernel test robot
@ 2025-12-16  7:52   ` Marcin Ślusarz
  2025-12-16  9:29   ` kernel test robot
  3 siblings, 0 replies; 16+ messages in thread
From: Marcin Ślusarz @ 2025-12-16  7:52 UTC (permalink / raw)
  To: Lukas Zapolskas
  Cc: Boris Brezillon, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, nd, dri-devel,
	linux-kernel, Ketil Johnsen

On Mon, Dec 15, 2025 at 11:54:57AM +0000, Lukas Zapolskas wrote:
>  int panthor_group_get_state(struct panthor_file *pfile,
> -			    struct drm_panthor_group_get_state *get_state)
> +			    struct drm_panthor_group_get_state *get_state,
> +			    struct drm_panthor_queue_event *events, u32 count)
>  {
>  	struct panthor_group_pool *gpool = pfile->groups;
>  	struct panthor_device *ptdev = pfile->ptdev;
>  	struct panthor_scheduler *sched = ptdev->scheduler;
> -	struct panthor_group *group;
> +	struct panthor_group *group = NULL;

Initialization is not needed. group is set as the very first thing.

> +	u32 fault_count;
>  
>  	group = group_from_handle(gpool, get_state->group_handle);
>  	if (!group)
>  		return -EINVAL;
>  

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors
  2025-12-15 11:54 ` [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors Lukas Zapolskas
                     ` (2 preceding siblings ...)
  2025-12-16  7:52   ` Marcin Ślusarz
@ 2025-12-16  9:29   ` kernel test robot
  3 siblings, 0 replies; 16+ messages in thread
From: kernel test robot @ 2025-12-16  9:29 UTC (permalink / raw)
  To: Lukas Zapolskas, Boris Brezillon, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter
  Cc: oe-kbuild-all, nd, dri-devel, linux-kernel, Lukas Zapolskas,
	Ketil Johnsen

Hi Lukas,

kernel test robot noticed the following build errors:

[auto build test ERROR on drm-misc/drm-misc-next]
[also build test ERROR on next-20251216]
[cannot apply to linus/master v6.19-rc1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Lukas-Zapolskas/drm-panthor-Implement-CS_FAULT-propagation-to-userspace/20251215-195920
base:   https://gitlab.freedesktop.org/drm/misc/kernel.git drm-misc-next
patch link:    https://lore.kernel.org/r/20251215115457.2137485-6-lukas.zapolskas%40arm.com
patch subject: [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors
config: alpha-allyesconfig (https://download.01.org/0day-ci/archive/20251216/202512161757.nqIBsB07-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251216/202512161757.nqIBsB07-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512161757.nqIBsB07-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/gpu/drm/panthor/panthor_sched.c: In function 'panthor_group_get_state':
>> drivers/gpu/drm/panthor/panthor_sched.c:3968:42: error: 'struct panthor_group' has no member named 'fault'
    3968 |         get_state->exception_type = group->fault.exception_type;
         |                                          ^~
   drivers/gpu/drm/panthor/panthor_sched.c:3969:39: error: 'struct panthor_group' has no member named 'fault'
    3969 |         get_state->access_type = group->fault.access_type;
         |                                       ^~
   drivers/gpu/drm/panthor/panthor_sched.c:3970:37: error: 'struct panthor_group' has no member named 'fault'
    3970 |         get_state->source_id = group->fault.source_id;
         |                                     ^~
   drivers/gpu/drm/panthor/panthor_sched.c:3971:41: error: 'struct panthor_group' has no member named 'fault'
    3971 |         get_state->valid_address = group->fault.valid_address;
         |                                         ^~
   drivers/gpu/drm/panthor/panthor_sched.c:3972:35: error: 'struct panthor_group' has no member named 'fault'
    3972 |         get_state->address = group->fault.address;
         |                                   ^~


vim +3968 drivers/gpu/drm/panthor/panthor_sched.c

  3938	
  3939	int panthor_group_get_state(struct panthor_file *pfile,
  3940				    struct drm_panthor_group_get_state *get_state,
  3941				    struct drm_panthor_queue_event *events, u32 count)
  3942	{
  3943		struct panthor_group_pool *gpool = pfile->groups;
  3944		struct panthor_device *ptdev = pfile->ptdev;
  3945		struct panthor_scheduler *sched = ptdev->scheduler;
  3946		struct panthor_group *group = NULL;
  3947		u32 fault_count;
  3948	
  3949		group = group_from_handle(gpool, get_state->group_handle);
  3950		if (!group)
  3951			return -EINVAL;
  3952	
  3953		guard(mutex)(&sched->lock);
  3954	
  3955		if (group->timedout)
  3956			get_state->state |= DRM_PANTHOR_GROUP_STATE_TIMEDOUT;
  3957		if (group->fatal_queues) {
  3958			get_state->state |= DRM_PANTHOR_GROUP_STATE_FATAL_FAULT;
  3959			get_state->fatal_queues = group->fatal_queues;
  3960		}
  3961		if (group->innocent)
  3962			get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
  3963		if (group->fault_queues) {
  3964			get_state->state |= DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT;
  3965			get_state->fault_queues = group->fault_queues;
  3966		}
  3967	
> 3968		get_state->exception_type = group->fault.exception_type;
  3969		get_state->access_type = group->fault.access_type;
  3970		get_state->source_id = group->fault.source_id;
  3971		get_state->valid_address = group->fault.valid_address;
  3972		get_state->address = group->fault.address;
  3973	
  3974		fault_count = panthor_group_count_faults(sched, group);
  3975	
  3976		if (!count && !events) {
  3977			get_state->faults.count = fault_count;
  3978			get_state->faults.stride = sizeof(struct drm_panthor_queue_event);
  3979			goto exit;
  3980		}
  3981	
  3982		panthor_group_get_faults(sched, group, events, min(get_state->faults.count, count));
  3983	
  3984	exit:
  3985		group_put(group);
  3986		return 0;
  3987	}
  3988	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information
  2025-12-15 11:54 ` [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information Lukas Zapolskas
  2025-12-15 12:11   ` Boris Brezillon
@ 2025-12-17 11:37   ` Steven Price
  1 sibling, 0 replies; 16+ messages in thread
From: Steven Price @ 2025-12-17 11:37 UTC (permalink / raw)
  To: Lukas Zapolskas, Boris Brezillon, Liviu Dudau, Maarten Lankhorst,
	Maxime Ripard, Thomas Zimmermann, David Airlie, Simona Vetter
  Cc: nd, dri-devel, linux-kernel

On 15/12/2025 11:54, Lukas Zapolskas wrote:
> A queue may encounter either one fatal fault or any number of
> recoverable faults during execution. The CSF FW provides the
> FAULT/FATAL registers, indicating the fault type, and another
> set of registers providing more metadata about why the fault
> was generated. Storing the information allows it to be
> reported to the user using the GROUP_GET_STATE ioctl.
> 
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 116 +++++++++++++++++-------
>  include/uapi/drm/panthor_drm.h          |  17 ++++
>  2 files changed, 100 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index eb8841beba39..a77399e95620 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -342,6 +342,14 @@ struct panthor_syncobj_64b {
>  	u32 pad;
>  };
>  
> +struct panthor_queue_event {
> +	/** @link: Link to a list of Panthor event errors. */
> +	struct list_head link;
> +
> +	/** @event: The event containing all of the fault/fatal metadata. */
> +	struct drm_panthor_queue_event event;
> +};
> +
>  /**
>   * struct panthor_queue - Execution queue
>   */
> @@ -485,6 +493,9 @@ struct panthor_queue {
>  		/** @seqno: Index of the next available profiling information slot. */
>  		u32 seqno;
>  	} profiling;
> +
> +	/** @events: List of fault or fatal events reported on this queue. */
> +	struct list_head events;
>  };
>  
>  /**
> @@ -918,6 +929,8 @@ panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue
>  
>  static void group_free_queue(struct panthor_group *group, struct panthor_queue *queue)
>  {
> +	struct panthor_queue_event *evt, *tmp;
> +
>  	if (IS_ERR_OR_NULL(queue))
>  		return;
>  
> @@ -934,6 +947,11 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>  
>  	panthor_queue_put_syncwait_obj(queue);
>  
> +	list_for_each_entry_safe(evt, tmp, &queue->events, link) {
> +		list_del(&evt->link);
> +		kfree(evt);
> +	}
> +
>  	panthor_kernel_bo_destroy(queue->ringbuf);
>  	panthor_kernel_bo_destroy(queue->iface.mem);
>  	panthor_kernel_bo_destroy(queue->profiling.slots);
> @@ -1476,6 +1494,69 @@ csg_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 priority)
>  	return 0;
>  }
>  
> +static struct panthor_queue_event *
> +panthor_queue_create_event(unsigned long event_type, u32 cs_id, u32 exception)
> +{
> +	struct panthor_queue_event *event;
> +
> +	event = kzalloc(sizeof(*event), GFP_KERNEL);
> +	if (!event)
> +		return ERR_PTR(-ENOMEM);
> +
> +	event->event = (struct drm_panthor_queue_event){
> +		.queue_id = cs_id,
> +		.event_type = event_type,
> +		.exception_type = CS_EXCEPTION_TYPE(exception),
> +		.exception_data = CS_EXCEPTION_DATA(exception),
> +	};
> +	INIT_LIST_HEAD(&event->link);
> +
> +	return event;
> +}
> +
> +#define PANTHOR_DEFINE_EVENT_INFO(__type, __msg, __event) \
> +static u32 panthor_queue_set_ ## __type ## _info(struct panthor_device *ptdev,			\
> +						 struct panthor_group *group,			\
> +						 u32 csg_id, u32 cs_id)				\
> +{												\
> +	struct panthor_scheduler *sched = ptdev->scheduler;					\
> +	struct panthor_fw_cs_iface *iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);	\
> +	struct panthor_queue *queue = group && cs_id < group->queue_count ?			\
> +				      group->queues[cs_id] : NULL;				\
> +	struct panthor_queue_event *event;							\
> +												\
> +	lockdep_assert_held(&sched->lock);							\
> +												\
> +	if (!iface || !queue)									\
> +		return 0;									\
> +												\
> +	const u32 exception = iface->output->__type;						\
> +	const u64 info = iface->output->__type ## _info;					\
> +												\
> +	event = panthor_queue_create_event((__event), cs_id, exception);			\
> +												\
> +	if (!IS_ERR(event))									\
> +		list_add_tail(&event->link, &queue->events);					\
> +	else											\
> +		drm_err(&ptdev->base, "Could not store fault notification, err = %ld",		\
> +			PTR_ERR(event));							\
> +												\
> +	drm_warn(&ptdev->base,									\
> +		 "CSG slot %d CS slot: %d\n"							\
> +		 "CS_" __msg  ".EXCEPTION_TYPE: 0x%x (%s)\n"					\
> +		 "CS_" __msg  ".EXCEPTION_DATA: 0x%x\n"						\
> +		 "CS_" __msg  "_INFO.EXCEPTION_DATA: 0x%llx\n",					\
> +		 csg_id, cs_id,									\
> +		 (unsigned int)CS_EXCEPTION_TYPE(exception),					\
> +		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(exception)),			\
> +		 (unsigned int)CS_EXCEPTION_DATA(exception), info);				\
> +												\
> +	return exception;									\
> +}
> +
> +PANTHOR_DEFINE_EVENT_INFO(fatal, "FATAL", DRM_PANTHOR_GROUP_STATE_FATAL_FAULT);
> +PANTHOR_DEFINE_EVENT_INFO(fault, "FAULT", DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT);

I can't say I'm particularly keen on a massive macro defining functions 
like this - it makes the code harder to read and breaks tools like 
ctags. In this case it's not much more code to just make a generic 
version (untested):

---8<---
static u32 panthor_queue_set_info(struct panthor_device *ptdev,
				  struct panthor_group *group,
				  u32 csg_id, u32 cs_id,
				  int event_type)
{
	struct panthor_scheduler *sched = ptdev->scheduler;
	struct panthor_fw_cs_iface *iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
	struct panthor_queue *queue = group && cs_id < group->queue_count ?
				      group->queues[cs_id] : NULL;
	struct panthor_queue_event *event;
	u32 exception;
	u64 info;
	const char *type;

	lockdep_assert_held(&sched->lock);

	if (!iface || !queue)
		return 0;

	switch (event_type) {
	case DRM_PANTHOR_GROUP_STATE_FATAL_FAULT:
		exception = iface->output->fatal;
		info = iface->output->fatal_info;
		type = "FATAL";
		break;
	case DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT:
		exception = iface->output->fault;
		info = iface->output->fault_info;
		type = "FAULT";
		break;
	default:
		WARN_ON(1);
		return 0;
	}

	event = panthor_queue_create_event(event_type, cs_id, exception);

	if (!IS_ERR(event))
		list_add_tail(&event->link, &queue->events);
	else
		drm_err(&ptdev->base, "Could not store fault notification, err = %ld",
			PTR_ERR(event));

	drm_warn(&ptdev->base,
		 "CSG slot %d CS slot: %d\n"
		 "CS_%s.EXCEPTION_TYPE: 0x%x (%s)\n"
		 "CS_%s.EXCEPTION_DATA: 0x%x\n"
		 "CS_%s_INFO.EXCEPTION_DATA: 0x%llx\n",
		 csg_id, cs_id,
		 type, (unsigned int)CS_EXCEPTION_TYPE(exception),
		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(exception)),
		 type, (unsigned int)CS_EXCEPTION_DATA(exception),
		 type, info);

	return exception;
}

static u32 panthor_queue_set_fatal_info(struct panthor_device *ptdev,
					struct panthor_group *group,
					u32 csg_id, u32 cs_id)
{
	return panthor_queue_set_info(ptdev, group, csg_id, cs_id,
				      DRM_PANTHOR_GROUP_STATE_FATAL_FAULT);
}

static u32 panthor_queue_set_fault_info(struct panthor_device *ptdev,
					struct panthor_group *group,
					u32 csg_id, u32 cs_id)
{
	return panthor_queue_set_info(ptdev, group, csg_id, cs_id,
				      DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT);
}
---8<---

I'm also wondering if "set_fault_info" is the right wording. Both are 
'faults' (FATAL_FAULT vs QUEUE_FAULT). I also suspect that the helper
wrappers are unneeded considering we only have the single call site
for each.

The drm_warn() message could also be simplified - we don't actually
need the 'official' register names in the output, so we could just have
an "is fatal" flag.

Thanks,
Steve

> +
>  static void
>  cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  				   u32 csg_id, u32 cs_id)
> @@ -1483,15 +1564,11 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  	struct panthor_scheduler *sched = ptdev->scheduler;
>  	struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
>  	struct panthor_group *group = csg_slot->group;
> -	struct panthor_fw_cs_iface *cs_iface;
>  	u32 fatal;
> -	u64 info;
>  
>  	lockdep_assert_held(&sched->lock);
>  
> -	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
> -	fatal = cs_iface->output->fatal;
> -	info = cs_iface->output->fatal_info;
> +	fatal = panthor_queue_set_fatal_info(ptdev, group, csg_id, cs_id);
>  
>  	if (group) {
>  		drm_warn(&ptdev->base, "CS_FATAL: pid=%d, comm=%s\n",
> @@ -1509,17 +1586,6 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
>  	} else {
>  		sched_queue_delayed_work(sched, tick, 0);
>  	}
> -
> -	drm_warn(&ptdev->base,
> -		 "CSG slot %d CS slot: %d\n"
> -		 "CS_FATAL.EXCEPTION_TYPE: 0x%x (%s)\n"
> -		 "CS_FATAL.EXCEPTION_DATA: 0x%x\n"
> -		 "CS_FATAL_INFO.EXCEPTION_DATA: 0x%llx\n",
> -		 csg_id, cs_id,
> -		 (unsigned int)CS_EXCEPTION_TYPE(fatal),
> -		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
> -		 (unsigned int)CS_EXCEPTION_DATA(fatal),
> -		 info);
>  }
>  
>  static void
> @@ -1531,15 +1597,10 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  	struct panthor_group *group = csg_slot->group;
>  	struct panthor_queue *queue = group && cs_id < group->queue_count ?
>  				      group->queues[cs_id] : NULL;
> -	struct panthor_fw_cs_iface *cs_iface;
> -	u32 fault;
> -	u64 info;
>  
>  	lockdep_assert_held(&sched->lock);
>  
> -	cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
> -	fault = cs_iface->output->fault;
> -	info = cs_iface->output->fault_info;
> +	panthor_queue_set_fault_info(ptdev, group, csg_id, cs_id);
>  
>  	if (queue) {
>  		u64 cs_extract = queue->iface.output->extract;
> @@ -1564,17 +1625,6 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>  
>  		group->fault_queues |= BIT(cs_id);
>  	}
> -
> -	drm_warn(&ptdev->base,
> -		 "CSG slot %d CS slot: %d\n"
> -		 "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
> -		 "CS_FAULT.EXCEPTION_DATA: 0x%x\n"
> -		 "CS_FAULT_INFO.EXCEPTION_DATA: 0x%llx\n",
> -		 csg_id, cs_id,
> -		 (unsigned int)CS_EXCEPTION_TYPE(fault),
> -		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
> -		 (unsigned int)CS_EXCEPTION_DATA(fault),
> -		 info);
>  }
>  
>  static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index 77262d2b9672..083a02418d28 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -974,6 +974,23 @@ enum drm_panthor_group_state_flags {
>  	DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT = 1 << 3,
>  };
>  
> +/**
> + * struct drm_panthor_queue_event - Fault or fatal event occurring on a single queue.
> + */
> +struct drm_panthor_queue_event {
> +	/** @queue_id: The ID of the queue that faulted. */
> +	__u32 queue_id;
> +
> +	/** @event_type: What kind of event is being propagated. */
> +	__u32 event_type;
> +
> +	/** @exception_type: The type of exception that caused the fault. */
> +	__u32 exception_type;
> +
> +	/** @exception_data: Exception-specific data. */
> +	__u32 exception_data;
> +};
> +
>  /**
>   * struct drm_panthor_group_get_state - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_GET_STATE
>   *


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2025-12-17 11:38 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-15 11:54 [PATCH v1 0/5] drm/panthor: Implement fault information propagation Lukas Zapolskas
2025-12-15 11:54 ` [PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace Lukas Zapolskas
2025-12-15 12:03   ` Boris Brezillon
2025-12-15 11:54 ` [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information Lukas Zapolskas
2025-12-15 12:11   ` Boris Brezillon
2025-12-17 11:37   ` Steven Price
2025-12-15 11:54 ` [PATCH v1 3/5] drm/panthor: Track VM faults Lukas Zapolskas
2025-12-15 12:37   ` Boris Brezillon
2025-12-15 11:54 ` [PATCH v1 4/5] drm/panthor: Propagate VM-level faults to groups Lukas Zapolskas
2025-12-15 12:41   ` Boris Brezillon
2025-12-15 12:46   ` Boris Brezillon
2025-12-15 11:54 ` [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors Lukas Zapolskas
2025-12-15 17:31   ` Boris Brezillon
2025-12-16  5:45   ` kernel test robot
2025-12-16  7:52   ` Marcin Ślusarz
2025-12-16  9:29   ` kernel test robot
