From: Boris Brezillon <boris.brezillon@collabora.com>
To: Lukas Zapolskas <lukas.zapolskas@arm.com>
Cc: Liviu Dudau <liviu.dudau@arm.com>,
Maarten Lankhorst <maarten.lankhorst@linux.intel.com>,
Maxime Ripard <mripard@kernel.org>,
Thomas Zimmermann <tzimmermann@suse.de>,
David Airlie <airlied@gmail.com>, Simona Vetter <simona@ffwll.ch>,
nd@arm.com, dri-devel@lists.freedesktop.org,
linux-kernel@vger.kernel.org
Subject: Re: [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information
Date: Mon, 15 Dec 2025 13:11:42 +0100 [thread overview]
Message-ID: <20251215131142.34bf5d74@fedora> (raw)
In-Reply-To: <20251215115457.2137485-3-lukas.zapolskas@arm.com>
On Mon, 15 Dec 2025 11:54:54 +0000
Lukas Zapolskas <lukas.zapolskas@arm.com> wrote:
> A queue may encounter either one fatal fault or any number of
> recoverable faults during execution. The CSF FW provides the
> FAULT/FATAL registers, indicating the fault type, and another
> set of registers providing more metadata about why the fault
> was generated. Storing the information allows it to be
> reported to the user using the GROUP_GET_STATE ioctl.
>
> Signed-off-by: Lukas Zapolskas <lukas.zapolskas@arm.com>
> ---
> drivers/gpu/drm/panthor/panthor_sched.c | 116 +++++++++++++++++-------
> include/uapi/drm/panthor_drm.h | 17 ++++
> 2 files changed, 100 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index eb8841beba39..a77399e95620 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -342,6 +342,14 @@ struct panthor_syncobj_64b {
> u32 pad;
> };
>
> +struct panthor_queue_event {
> + /** @link: Link to a list of Panthor event errors. */
> + struct list_head link;
> +
> + /** @event: The event containing all of the fault/fatal metadata. */
> + struct drm_panthor_queue_event event;
> +};
> +
> /**
> * struct panthor_queue - Execution queue
> */
> @@ -485,6 +493,9 @@ struct panthor_queue {
> /** @seqno: Index of the next available profiling information slot. */
> u32 seqno;
> } profiling;
> +
> + /** @events: List of fault or fatal events reported on this queue. */
> + struct list_head events;
> };
>
> /**
> @@ -918,6 +929,8 @@ panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue
>
> static void group_free_queue(struct panthor_group *group, struct panthor_queue *queue)
> {
> + struct panthor_queue_event *evt, *tmp;
> +
> if (IS_ERR_OR_NULL(queue))
> return;
>
> @@ -934,6 +947,11 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>
> panthor_queue_put_syncwait_obj(queue);
>
> + list_for_each_entry_safe(evt, tmp, &queue->events, link) {
> + list_del(&evt->link);
> + kfree(evt);
> + }
> +
> panthor_kernel_bo_destroy(queue->ringbuf);
> panthor_kernel_bo_destroy(queue->iface.mem);
> panthor_kernel_bo_destroy(queue->profiling.slots);
> @@ -1476,6 +1494,69 @@ csg_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 priority)
> return 0;
> }
>
> +static struct panthor_queue_event *
> +panthor_queue_create_event(unsigned long event_type, u32 cs_id, u32 exception)
> +{
> + struct panthor_queue_event *event;
> +
> + event = kzalloc(sizeof(*event), GFP_KERNEL);
This is called from the dma-fence signalling path, where we can't
allocate with GFP_KERNEL (the allocation may enter reclaim, which can
itself end up waiting on fences). I think it'd be preferable to
pre-allocate a fixed-size event array at queue creation time (the size
can be an extra param passed to GROUP_CREATE), and report overflows
when we run out of slots.
> + if (!event)
> + return ERR_PTR(-ENOMEM);
> +
> + event->event = (struct drm_panthor_queue_event){
> + .queue_id = cs_id,
> + .event_type = event_type,
> + .exception_type = CS_EXCEPTION_TYPE(exception),
> + .exception_data = CS_EXCEPTION_DATA(exception),
> + };
> + INIT_LIST_HEAD(&event->link);
> +
> + return event;
> +}
> +
> +#define PANTHOR_DEFINE_EVENT_INFO(__type, __msg, __event) \
> +static u32 panthor_queue_set_ ## __type ## _info(struct panthor_device *ptdev, \
> + struct panthor_group *group, \
> + u32 csg_id, u32 cs_id) \
> +{ \
> + struct panthor_scheduler *sched = ptdev->scheduler; \
> + struct panthor_fw_cs_iface *iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id); \
> + struct panthor_queue *queue = group && cs_id < group->queue_count ? \
> + group->queues[cs_id] : NULL; \
> + struct panthor_queue_event *event; \
> + \
> + lockdep_assert_held(&sched->lock); \
> + \
> + if (!iface || !queue) \
> + return 0; \
> + \
> + const u32 exception = iface->output->__type; \
> + const u64 info = iface->output->__type ## _info; \
> + \
> + event = panthor_queue_create_event((__event), cs_id, exception); \
> + \
> + if (!IS_ERR(event)) \
> + list_add_tail(&event->link, &queue->events); \
> + else \
> + drm_err(&ptdev->base, "Could not store fault notification, err = %ld", \
> + PTR_ERR(event)); \
> + \
> + drm_warn(&ptdev->base, \
> + "CSG slot %d CS slot: %d\n" \
> + "CS_" __msg ".EXCEPTION_TYPE: 0x%x (%s)\n" \
> + "CS_" __msg ".EXCEPTION_DATA: 0x%x\n" \
> + "CS_" __msg "_INFO.EXCEPTION_DATA: 0x%llx\n", \
> + csg_id, cs_id, \
> + (unsigned int)CS_EXCEPTION_TYPE(exception), \
> + panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(exception)), \
> + (unsigned int)CS_EXCEPTION_DATA(exception), info); \
> + \
> + return exception; \
> +}
> +
> +PANTHOR_DEFINE_EVENT_INFO(fatal, "FATAL", DRM_PANTHOR_GROUP_STATE_FATAL_FAULT);
> +PANTHOR_DEFINE_EVENT_INFO(fault, "FAULT", DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT);
> +
> static void
> cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
> u32 csg_id, u32 cs_id)
> @@ -1483,15 +1564,11 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
> struct panthor_scheduler *sched = ptdev->scheduler;
> struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
> struct panthor_group *group = csg_slot->group;
> - struct panthor_fw_cs_iface *cs_iface;
> u32 fatal;
> - u64 info;
>
> lockdep_assert_held(&sched->lock);
>
> - cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
> - fatal = cs_iface->output->fatal;
> - info = cs_iface->output->fatal_info;
> + fatal = panthor_queue_set_fatal_info(ptdev, group, csg_id, cs_id);
>
> if (group) {
> drm_warn(&ptdev->base, "CS_FATAL: pid=%d, comm=%s\n",
> @@ -1509,17 +1586,6 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
> } else {
> sched_queue_delayed_work(sched, tick, 0);
> }
> -
> - drm_warn(&ptdev->base,
> - "CSG slot %d CS slot: %d\n"
> - "CS_FATAL.EXCEPTION_TYPE: 0x%x (%s)\n"
> - "CS_FATAL.EXCEPTION_DATA: 0x%x\n"
> - "CS_FATAL_INFO.EXCEPTION_DATA: 0x%llx\n",
> - csg_id, cs_id,
> - (unsigned int)CS_EXCEPTION_TYPE(fatal),
> - panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
> - (unsigned int)CS_EXCEPTION_DATA(fatal),
> - info);
> }
>
> static void
> @@ -1531,15 +1597,10 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
> struct panthor_group *group = csg_slot->group;
> struct panthor_queue *queue = group && cs_id < group->queue_count ?
> group->queues[cs_id] : NULL;
> - struct panthor_fw_cs_iface *cs_iface;
> - u32 fault;
> - u64 info;
>
> lockdep_assert_held(&sched->lock);
>
> - cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
> - fault = cs_iface->output->fault;
> - info = cs_iface->output->fault_info;
> + panthor_queue_set_fault_info(ptdev, group, csg_id, cs_id);
>
> if (queue) {
> u64 cs_extract = queue->iface.output->extract;
> @@ -1564,17 +1625,6 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>
> group->fault_queues |= BIT(cs_id);
> }
> -
> - drm_warn(&ptdev->base,
> - "CSG slot %d CS slot: %d\n"
> - "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n"
> - "CS_FAULT.EXCEPTION_DATA: 0x%x\n"
> - "CS_FAULT_INFO.EXCEPTION_DATA: 0x%llx\n",
> - csg_id, cs_id,
> - (unsigned int)CS_EXCEPTION_TYPE(fault),
> - panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
> - (unsigned int)CS_EXCEPTION_DATA(fault),
> - info);
> }
>
> static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index 77262d2b9672..083a02418d28 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -974,6 +974,23 @@ enum drm_panthor_group_state_flags {
> DRM_PANTHOR_GROUP_STATE_QUEUE_FAULT = 1 << 3,
> };
>
> +/**
> + * struct drm_panthor_queue_event - Fault or fatal event occurring on a single queue.
> + */
> +struct drm_panthor_queue_event {
> + /** @queue_id: The ID of the queue that faulted. */
> + __u32 queue_id;
> +
> + /** @event_type: What kind of event is being propagated. */
> + __u32 event_type;
> +
> + /** @exception_type: The type of exception that caused the fault. */
> + __u32 exception_type;
> +
> + /** @exception_data: Exception-specific data. */
> + __u32 exception_data;
> +};
> +
> /**
> * struct drm_panthor_group_get_state - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_GET_STATE
> *
next prev parent reply other threads:[~2025-12-15 12:11 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-12-15 11:54 [PATCH v1 0/5] drm/panthor: Implement fault information propagation Lukas Zapolskas
2025-12-15 11:54 ` [PATCH v1 1/5] drm/panthor: Implement CS_FAULT propagation to userspace Lukas Zapolskas
2025-12-15 12:03 ` Boris Brezillon
2025-12-15 11:54 ` [PATCH v1 2/5] drm/panthor: Store queue fault and fatal information Lukas Zapolskas
2025-12-15 12:11 ` Boris Brezillon [this message]
2025-12-17 11:37 ` Steven Price
2025-12-15 11:54 ` [PATCH v1 3/5] drm/panthor: Track VM faults Lukas Zapolskas
2025-12-15 12:37 ` Boris Brezillon
2025-12-15 11:54 ` [PATCH v1 4/5] drm/panthor: Propagate VM-level faults to groups Lukas Zapolskas
2025-12-15 12:41 ` Boris Brezillon
2025-12-15 12:46 ` Boris Brezillon
2025-12-15 11:54 ` [PATCH v1 5/5] drm/panthor: Use GROUP_GET_STATE to provide group and queue errors Lukas Zapolskas
2025-12-15 17:31 ` Boris Brezillon
2025-12-16 5:45 ` kernel test robot
2025-12-16 7:52 ` Marcin Ślusarz
2025-12-16 9:29 ` kernel test robot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251215131142.34bf5d74@fedora \
--to=boris.brezillon@collabora.com \
--cc=airlied@gmail.com \
--cc=dri-devel@lists.freedesktop.org \
--cc=linux-kernel@vger.kernel.org \
--cc=liviu.dudau@arm.com \
--cc=lukas.zapolskas@arm.com \
--cc=maarten.lankhorst@linux.intel.com \
--cc=mripard@kernel.org \
--cc=nd@arm.com \
--cc=simona@ffwll.ch \
--cc=tzimmermann@suse.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox