* [PATCH 01/11] drm/panthor: Always use the IRQ-safe variant when acquiring the fence lock
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-25 12:51 ` sashiko-bot
2026-06-26 9:11 ` Liviu Dudau
2026-06-25 12:40 ` [PATCH 02/11] drm/panthor: Keep the reset work disabled until everything is initialized Boris Brezillon
` (9 subsequent siblings)
10 siblings, 2 replies; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon, sashiko-bot
Since dma_fence objects can be shared with other subsystems, they may be
accessed from hardirq context in those drivers, and we have to take
that into account by also using the IRQ-safe variant when acquiring
the lock.
While at it, switch to the guard model.
Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
Reported-by: sashiko-bot@kernel.org
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=11
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_sched.c | 83 ++++++++++++++++-----------------
1 file changed, 39 insertions(+), 44 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 5b34032deff8..e97f29469d28 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -1151,15 +1151,14 @@ queue_suspend_timeout_locked(struct panthor_queue *queue)
static void
queue_suspend_timeout(struct panthor_queue *queue)
{
- spin_lock(&queue->fence_ctx.lock);
+ guard(spinlock_irqsave)(&queue->fence_ctx.lock);
queue_suspend_timeout_locked(queue);
- spin_unlock(&queue->fence_ctx.lock);
}
static void
queue_resume_timeout(struct panthor_queue *queue)
{
- spin_lock(&queue->fence_ctx.lock);
+ guard(spinlock_irqsave)(&queue->fence_ctx.lock);
if (queue_timeout_is_suspended(queue)) {
mod_delayed_work(queue->scheduler.timeout_wq,
@@ -1168,8 +1167,6 @@ queue_resume_timeout(struct panthor_queue *queue)
queue->timeout.remaining = MAX_SCHEDULE_TIMEOUT;
}
-
- spin_unlock(&queue->fence_ctx.lock);
}
/**
@@ -1542,7 +1539,7 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
u64 cs_extract = queue->iface.output->extract;
struct panthor_job *job;
- spin_lock(&queue->fence_ctx.lock);
+ guard(spinlock_irqsave)(&queue->fence_ctx.lock);
list_for_each_entry(job, &queue->fence_ctx.in_flight_jobs, node) {
if (cs_extract >= job->ringbuf.end)
continue;
@@ -1552,7 +1549,6 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
dma_fence_set_error(job->done_fence, -EINVAL);
}
- spin_unlock(&queue->fence_ctx.lock);
}
if (group) {
@@ -2183,13 +2179,13 @@ group_term_post_processing(struct panthor_group *group)
if (!queue)
continue;
- spin_lock(&queue->fence_ctx.lock);
- list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) {
- list_move_tail(&job->node, &faulty_jobs);
- dma_fence_set_error(job->done_fence, err);
- dma_fence_signal_locked(job->done_fence);
+ scoped_guard(spinlock_irqsave, &queue->fence_ctx.lock) {
+ list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) {
+ list_move_tail(&job->node, &faulty_jobs);
+ dma_fence_set_error(job->done_fence, err);
+ dma_fence_signal_locked(job->done_fence);
+ }
}
- spin_unlock(&queue->fence_ctx.lock);
/* Manually update the syncobj seqno to unblock waiters. */
syncobj = group->syncobjs->kmap + (i * sizeof(*syncobj));
@@ -3049,39 +3045,39 @@ static bool queue_check_job_completion(struct panthor_queue *queue)
LIST_HEAD(done_jobs);
cookie = dma_fence_begin_signalling();
- spin_lock(&queue->fence_ctx.lock);
- list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
- if (!syncobj) {
- struct panthor_group *group = job->group;
+ scoped_guard(spinlock_irqsave, &queue->fence_ctx.lock) {
+ list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
+ if (!syncobj) {
+ struct panthor_group *group = job->group;
- syncobj = group->syncobjs->kmap +
- (job->queue_idx * sizeof(*syncobj));
+ syncobj = group->syncobjs->kmap +
+ (job->queue_idx * sizeof(*syncobj));
+ }
+
+ if (syncobj->seqno < job->done_fence->seqno)
+ break;
+
+ list_move_tail(&job->node, &done_jobs);
+ dma_fence_signal_locked(job->done_fence);
}
- if (syncobj->seqno < job->done_fence->seqno)
- break;
+ if (list_empty(&queue->fence_ctx.in_flight_jobs)) {
+ /* If we have no job left, we cancel the timer, and reset remaining
+ * time to its default so it can be restarted next time
+ * queue_resume_timeout() is called.
+ */
+ queue_suspend_timeout_locked(queue);
- list_move_tail(&job->node, &done_jobs);
- dma_fence_signal_locked(job->done_fence);
+ /* If there's no job pending, we consider it progress to avoid a
+ * spurious timeout if the timeout handler and the sync update
+ * handler raced.
+ */
+ progress = true;
+ } else if (!list_empty(&done_jobs)) {
+ queue_reset_timeout_locked(queue);
+ progress = true;
+ }
}
-
- if (list_empty(&queue->fence_ctx.in_flight_jobs)) {
- /* If we have no job left, we cancel the timer, and reset remaining
- * time to its default so it can be restarted next time
- * queue_resume_timeout() is called.
- */
- queue_suspend_timeout_locked(queue);
-
- /* If there's no job pending, we consider it progress to avoid a
- * spurious timeout if the timeout handler and the sync update
- * handler raced.
- */
- progress = true;
- } else if (!list_empty(&done_jobs)) {
- queue_reset_timeout_locked(queue);
- progress = true;
- }
- spin_unlock(&queue->fence_ctx.lock);
dma_fence_end_signalling(cookie);
list_for_each_entry_safe(job, job_tmp, &done_jobs, node) {
@@ -3346,9 +3342,8 @@ queue_run_job(struct drm_sched_job *sched_job)
job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
panthor_job_get(&job->base);
- spin_lock(&queue->fence_ctx.lock);
- list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
- spin_unlock(&queue->fence_ctx.lock);
+ scoped_guard(spinlock_irqsave, &queue->fence_ctx.lock)
+ list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
/* Make sure the ring buffer is updated before the INSERT
* register.
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 01/11] drm/panthor: Always use the IRQ-safe variant when acquiring the fence lock
2026-06-25 12:40 ` [PATCH 01/11] drm/panthor: Always use the IRQ-safe variant when acquiring the fence lock Boris Brezillon
@ 2026-06-25 12:51 ` sashiko-bot
2026-06-26 9:11 ` Liviu Dudau
1 sibling, 0 replies; 31+ messages in thread
From: sashiko-bot @ 2026-06-25 12:51 UTC (permalink / raw)
To: Boris Brezillon; +Cc: dri-devel
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Low] The function queue_run_job mixes goto-based error handling with scope-based cleanup helpers.
--
commit a4621d48f775641c2c35dd7c0b40ff8a402f0f4e
Author: Boris Brezillon <boris.brezillon@collabora.com>
drm/panthor: Always use the IRQ-safe variant when acquiring the fence lock
This commit updates the panthor scheduler to use the IRQ-safe variant of
spinlock when acquiring the fence context lock, as dma_fence objects can be
shared and accessed from hardirq context. It also transitions the code to
use the scope-based guard model.
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=11
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 5b34032deff81..e97f29469d28a 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
[ ... ]
> @@ -3346,9 +3342,8 @@ queue_run_job(struct drm_sched_job *sched_job)
> job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
>
> panthor_job_get(&job->base);
> - spin_lock(&queue->fence_ctx.lock);
> - list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
> - spin_unlock(&queue->fence_ctx.lock);
> + scoped_guard(spinlock_irqsave, &queue->fence_ctx.lock)
> + list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
>
> /* Make sure the ring buffer is updated before the INSERT
> * register.
[Severity: Low]
Does this introduce a mix of goto-based error handling and scope-based
cleanup helpers in the same function?
Looking at queue_run_job(), it retains a goto out_unlock error handling path
earlier in the function while now introducing scoped_guard here. The kernel
cleanup guidelines generally state that for a given routine, we should convert
all resources that need a goto cleanup to scope-based cleanup, or convert none
of them, to avoid confusing ownership semantics.
Could this function be fully converted to use guards for sched->lock as well,
or should this specific lock retain the traditional spin_lock/spin_unlock
pattern to avoid mixing the two styles?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260625-panthor-misc-fixes-v1-0-b67ed973fea6@collabora.com?part=1
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH 01/11] drm/panthor: Always use the IRQ-safe variant when acquiring the fence lock
2026-06-25 12:40 ` [PATCH 01/11] drm/panthor: Always use the IRQ-safe variant when acquiring the fence lock Boris Brezillon
2026-06-25 12:51 ` sashiko-bot
@ 2026-06-26 9:11 ` Liviu Dudau
1 sibling, 0 replies; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 9:11 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Thu, Jun 25, 2026 at 02:40:27PM +0200, Boris Brezillon wrote:
> Since dma_fence objects can be shared with other subsystems, they may be
> accessed from hardirq context in those drivers, and we have to take
> that into account by also using the IRQ-safe variant when acquiring
> the lock.
>
> While at it, switch to the guard model.
>
> Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> Reported-by: sashiko-bot@kernel.org
> Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=11
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
> ---
> drivers/gpu/drm/panthor/panthor_sched.c | 83 ++++++++++++++++-----------------
> 1 file changed, 39 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 5b34032deff8..e97f29469d28 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -1151,15 +1151,14 @@ queue_suspend_timeout_locked(struct panthor_queue *queue)
> static void
> queue_suspend_timeout(struct panthor_queue *queue)
> {
> - spin_lock(&queue->fence_ctx.lock);
> + guard(spinlock_irqsave)(&queue->fence_ctx.lock);
> queue_suspend_timeout_locked(queue);
> - spin_unlock(&queue->fence_ctx.lock);
> }
>
> static void
> queue_resume_timeout(struct panthor_queue *queue)
> {
> - spin_lock(&queue->fence_ctx.lock);
> + guard(spinlock_irqsave)(&queue->fence_ctx.lock);
>
> if (queue_timeout_is_suspended(queue)) {
> mod_delayed_work(queue->scheduler.timeout_wq,
> @@ -1168,8 +1167,6 @@ queue_resume_timeout(struct panthor_queue *queue)
>
> queue->timeout.remaining = MAX_SCHEDULE_TIMEOUT;
> }
> -
> - spin_unlock(&queue->fence_ctx.lock);
> }
>
> /**
> @@ -1542,7 +1539,7 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
> u64 cs_extract = queue->iface.output->extract;
> struct panthor_job *job;
>
> - spin_lock(&queue->fence_ctx.lock);
> + guard(spinlock_irqsave)(&queue->fence_ctx.lock);
> list_for_each_entry(job, &queue->fence_ctx.in_flight_jobs, node) {
> if (cs_extract >= job->ringbuf.end)
> continue;
> @@ -1552,7 +1549,6 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
>
> dma_fence_set_error(job->done_fence, -EINVAL);
> }
> - spin_unlock(&queue->fence_ctx.lock);
> }
>
> if (group) {
> @@ -2183,13 +2179,13 @@ group_term_post_processing(struct panthor_group *group)
> if (!queue)
> continue;
>
> - spin_lock(&queue->fence_ctx.lock);
> - list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) {
> - list_move_tail(&job->node, &faulty_jobs);
> - dma_fence_set_error(job->done_fence, err);
> - dma_fence_signal_locked(job->done_fence);
> + scoped_guard(spinlock_irqsave, &queue->fence_ctx.lock) {
> + list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) {
> + list_move_tail(&job->node, &faulty_jobs);
> + dma_fence_set_error(job->done_fence, err);
> + dma_fence_signal_locked(job->done_fence);
> + }
> }
> - spin_unlock(&queue->fence_ctx.lock);
>
> /* Manually update the syncobj seqno to unblock waiters. */
> syncobj = group->syncobjs->kmap + (i * sizeof(*syncobj));
> @@ -3049,39 +3045,39 @@ static bool queue_check_job_completion(struct panthor_queue *queue)
> LIST_HEAD(done_jobs);
>
> cookie = dma_fence_begin_signalling();
> - spin_lock(&queue->fence_ctx.lock);
> - list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
> - if (!syncobj) {
> - struct panthor_group *group = job->group;
> + scoped_guard(spinlock_irqsave, &queue->fence_ctx.lock) {
> + list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
> + if (!syncobj) {
> + struct panthor_group *group = job->group;
>
> - syncobj = group->syncobjs->kmap +
> - (job->queue_idx * sizeof(*syncobj));
> + syncobj = group->syncobjs->kmap +
> + (job->queue_idx * sizeof(*syncobj));
> + }
> +
> + if (syncobj->seqno < job->done_fence->seqno)
> + break;
> +
> + list_move_tail(&job->node, &done_jobs);
> + dma_fence_signal_locked(job->done_fence);
> }
>
> - if (syncobj->seqno < job->done_fence->seqno)
> - break;
> + if (list_empty(&queue->fence_ctx.in_flight_jobs)) {
> + /* If we have no job left, we cancel the timer, and reset remaining
> + * time to its default so it can be restarted next time
> + * queue_resume_timeout() is called.
> + */
> + queue_suspend_timeout_locked(queue);
>
> - list_move_tail(&job->node, &done_jobs);
> - dma_fence_signal_locked(job->done_fence);
> + /* If there's no job pending, we consider it progress to avoid a
> + * spurious timeout if the timeout handler and the sync update
> + * handler raced.
> + */
> + progress = true;
> + } else if (!list_empty(&done_jobs)) {
> + queue_reset_timeout_locked(queue);
> + progress = true;
> + }
> }
> -
> - if (list_empty(&queue->fence_ctx.in_flight_jobs)) {
> - /* If we have no job left, we cancel the timer, and reset remaining
> - * time to its default so it can be restarted next time
> - * queue_resume_timeout() is called.
> - */
> - queue_suspend_timeout_locked(queue);
> -
> - /* If there's no job pending, we consider it progress to avoid a
> - * spurious timeout if the timeout handler and the sync update
> - * handler raced.
> - */
> - progress = true;
> - } else if (!list_empty(&done_jobs)) {
> - queue_reset_timeout_locked(queue);
> - progress = true;
> - }
> - spin_unlock(&queue->fence_ctx.lock);
> dma_fence_end_signalling(cookie);
>
> list_for_each_entry_safe(job, job_tmp, &done_jobs, node) {
> @@ -3346,9 +3342,8 @@ queue_run_job(struct drm_sched_job *sched_job)
> job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
>
> panthor_job_get(&job->base);
> - spin_lock(&queue->fence_ctx.lock);
> - list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
> - spin_unlock(&queue->fence_ctx.lock);
> + scoped_guard(spinlock_irqsave, &queue->fence_ctx.lock)
> + list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
>
> /* Make sure the ring buffer is updated before the INSERT
> * register.
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 02/11] drm/panthor: Keep the reset work disabled until everything is initialized
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
2026-06-25 12:40 ` [PATCH 01/11] drm/panthor: Always use the IRQ-safe variant when acquiring the fence lock Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-25 12:59 ` sashiko-bot
2026-06-26 9:13 ` Liviu Dudau
2026-06-25 12:40 ` [PATCH 03/11] drm/panthor: Fix UAF on works queued to panthor_cleanup_wq Boris Brezillon
` (8 subsequent siblings)
10 siblings, 2 replies; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon, sashiko-bot
The reset work will sub-component reset helpers, which might not be
ready if the reset happens during initialization, leading to NULL
pointer dereferences or worse.
Avoid that by keeping the reset work disabled while we're initializing
those sub-components.
Fixes: 5fe909cae118 ("drm/panthor: Add the device logical block")
Reported-by: sashiko-bot@kernel.org
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=4
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_device.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
index bd417d6ae8c0..0b25abebb803 100644
--- a/drivers/gpu/drm/panthor/panthor_device.c
+++ b/drivers/gpu/drm/panthor/panthor_device.c
@@ -207,6 +207,7 @@ int panthor_device_init(struct panthor_device *ptdev)
*dummy_page_virt = 1;
INIT_WORK(&ptdev->reset.work, panthor_device_reset_work);
+ disable_work(&ptdev->reset.work);
ptdev->reset.wq = alloc_ordered_workqueue("panthor-reset-wq", 0);
if (!ptdev->reset.wq)
return -ENOMEM;
@@ -285,6 +286,9 @@ int panthor_device_init(struct panthor_device *ptdev)
panthor_gem_init(ptdev);
+ /* Now that everything is initialized, we can enable the reset work. */
+ enable_work(&ptdev->reset.work);
+
/* ~3 frames */
pm_runtime_set_autosuspend_delay(ptdev->base.dev, 50);
pm_runtime_use_autosuspend(ptdev->base.dev);
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 02/11] drm/panthor: Keep the reset work disabled until everything is initialized
2026-06-25 12:40 ` [PATCH 02/11] drm/panthor: Keep the reset work disabled until everything is initialized Boris Brezillon
@ 2026-06-25 12:59 ` sashiko-bot
2026-06-26 9:13 ` Liviu Dudau
1 sibling, 0 replies; 31+ messages in thread
From: sashiko-bot @ 2026-06-25 12:59 UTC (permalink / raw)
To: Boris Brezillon; +Cc: dri-devel
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
New issues:
- [High] Disabling the reset work during initialization causes asynchronous reset requests to be silently dropped, permanently wedging the GPU by leaving the pending flag stuck at 1.
Pre-existing issues:
- [High] The reset work remains enabled during the error teardown path if `drm_dev_register` fails, allowing concurrent reset work to access sub-components as they are destroyed.
--
commit d88d89caab294e4f6ec4a893f2e535ca95fb0890
Author: Boris Brezillon <boris.brezillon@collabora.com>
drm/panthor: Keep the reset work disabled until everything is initialized
The reset work will sub-component reset helpers, which might not be
ready if the reset happens during initialization, leading to NULL
pointer dereferences or worse. Avoid that by keeping the reset work
disabled while we're initializing those sub-components.
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=4
> diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
> --- a/drivers/gpu/drm/panthor/panthor_device.c
> +++ b/drivers/gpu/drm/panthor/panthor_device.c
> @@ -207,6 +207,7 @@ int panthor_device_init(struct panthor_device *ptdev)
> *dummy_page_virt = 1;
>
> INIT_WORK(&ptdev->reset.work, panthor_device_reset_work);
> + disable_work(&ptdev->reset.work);
> ptdev->reset.wq = alloc_ordered_workqueue("panthor-reset-wq", 0);
> if (!ptdev->reset.wq)
> return -ENOMEM;
[ ... ]
> @@ -285,6 +286,9 @@ int panthor_device_init(struct panthor_device *ptdev)
>
> panthor_gem_init(ptdev);
>
> + /* Now that everything is initialized, we can enable the reset work. */
> + enable_work(&ptdev->reset.work);
> +
> /* ~3 frames */
> pm_runtime_set_autosuspend_delay(ptdev->base.dev, 50);
> pm_runtime_use_autosuspend(ptdev->base.dev);
[Severity: High]
If a reset is requested while the work is disabled (e.g., due to a
firmware timeout or early MMU fault during driver probe), does the device
become permanently wedged?
Looking at panthor_device_schedule_reset(), it relies on queue_work()
succeeding after atomic_cmpxchg() sets the pending flag:
panthor_device_schedule_reset() {
if (!atomic_cmpxchg(&ptdev->reset.pending, 0, 1) &&
atomic_read(&ptdev->pm.state) == PANTHOR_DEVICE_PM_STATE_ACTIVE) {
queue_work(ptdev->reset.wq, &ptdev->reset.work);
}
}
Because the work is disabled via disable_work() earlier, queue_work()
fails and the work is never queued.
When enable_work() is called here, it decrements the disable count but
does not requeue the dropped work or clear the pending flag. Because
ptdev->reset.pending remains 1, will all future reset requests fail the
atomic_cmpxchg() check, leaving the GPU unable to recover?
[Severity: High]
This isn't a bug introduced by this patch, but if drm_dev_register()
fails shortly after this, does the reset work remain enabled during the
teardown sequence?
If drm_dev_register() fails, the driver jumps to err_disable_autosuspend
to destroy sub-components sequentially. Because drm_dev_unplug() is not
called in this explicit error path, dev->unplugged remains false.
If a concurrent FW timeout or fault triggers during this teardown sequence,
any concurrently scheduled reset work will pass the drm_dev_enter() check
in panthor_device_reset_work():
panthor_device_reset_work() {
if (!drm_dev_enter(&ptdev->base, &cookie))
return;
panthor_sched_pre_reset(ptdev);
panthor_fw_pre_reset(ptdev, true);
...
}
Could this cause the reset work to interact with sub-components that are
actively being destroyed, leading to use-after-free or NULL pointer
dereferences?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260625-panthor-misc-fixes-v1-0-b67ed973fea6@collabora.com?part=2
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 02/11] drm/panthor: Keep the reset work disabled until everything is initialized
2026-06-25 12:40 ` [PATCH 02/11] drm/panthor: Keep the reset work disabled until everything is initialized Boris Brezillon
2026-06-25 12:59 ` sashiko-bot
@ 2026-06-26 9:13 ` Liviu Dudau
1 sibling, 0 replies; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 9:13 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Thu, Jun 25, 2026 at 02:40:28PM +0200, Boris Brezillon wrote:
> The reset work will sub-component reset helpers, which might not be
> ready if the reset happens during initialization, leading to NULL
> pointer dereferences or worse.
>
> Avoid that by keeping the reset work disabled while we're initializing
> those sub-components.
>
> Fixes: 5fe909cae118 ("drm/panthor: Add the device logical block")
> Reported-by: sashiko-bot@kernel.org
> Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=4
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
> ---
> drivers/gpu/drm/panthor/panthor_device.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
> index bd417d6ae8c0..0b25abebb803 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.c
> +++ b/drivers/gpu/drm/panthor/panthor_device.c
> @@ -207,6 +207,7 @@ int panthor_device_init(struct panthor_device *ptdev)
> *dummy_page_virt = 1;
>
> INIT_WORK(&ptdev->reset.work, panthor_device_reset_work);
> + disable_work(&ptdev->reset.work);
> ptdev->reset.wq = alloc_ordered_workqueue("panthor-reset-wq", 0);
> if (!ptdev->reset.wq)
> return -ENOMEM;
> @@ -285,6 +286,9 @@ int panthor_device_init(struct panthor_device *ptdev)
>
> panthor_gem_init(ptdev);
>
> + /* Now that everything is initialized, we can enable the reset work. */
> + enable_work(&ptdev->reset.work);
> +
> /* ~3 frames */
> pm_runtime_set_autosuspend_delay(ptdev->base.dev, 50);
> pm_runtime_use_autosuspend(ptdev->base.dev);
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 03/11] drm/panthor: Fix UAF on works queued to panthor_cleanup_wq
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
2026-06-25 12:40 ` [PATCH 01/11] drm/panthor: Always use the IRQ-safe variant when acquiring the fence lock Boris Brezillon
2026-06-25 12:40 ` [PATCH 02/11] drm/panthor: Keep the reset work disabled until everything is initialized Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-25 12:56 ` sashiko-bot
2026-06-25 14:20 ` Boris Brezillon
2026-06-25 12:40 ` [PATCH 04/11] drm/panthor: Fix potential invalid pointer deref in group_process_tiler_oom() Boris Brezillon
` (7 subsequent siblings)
10 siblings, 2 replies; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon, sashiko-bot
If we don't drain the panthor_cleanup_wq before
unregistering/destroying the panthor_device, we might end up with
cleanup works that are executed after the device is gone.
Add a drain_workqueue() call in panthor_device_unplug() to prevent
that.
Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
Fixes: 647810ec2476 ("drm/panthor: Add the MMU/VM logical block")
Reported-by: sashiko-bot@kernel.org
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=2
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_device.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
index 0b25abebb803..a0774e28aa94 100644
--- a/drivers/gpu/drm/panthor/panthor_device.c
+++ b/drivers/gpu/drm/panthor/panthor_device.c
@@ -101,6 +101,11 @@ void panthor_device_unplug(struct panthor_device *ptdev)
panthor_gpu_unplug(ptdev);
panthor_pwr_unplug(ptdev);
+ /* Make sure works queued to panthor_cleanup_wq are executed
+ * before the device is destroyed.
+ */
+ drain_workqueue(panthor_cleanup_wq);
+
pm_runtime_dont_use_autosuspend(ptdev->base.dev);
pm_runtime_put_sync_suspend(ptdev->base.dev);
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 03/11] drm/panthor: Fix UAF on works queued to panthor_cleanup_wq
2026-06-25 12:40 ` [PATCH 03/11] drm/panthor: Fix UAF on works queued to panthor_cleanup_wq Boris Brezillon
@ 2026-06-25 12:56 ` sashiko-bot
2026-06-25 14:20 ` Boris Brezillon
1 sibling, 0 replies; 31+ messages in thread
From: sashiko-bot @ 2026-06-25 12:56 UTC (permalink / raw)
To: Boris Brezillon; +Cc: dri-devel
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
New issues:
- [High] Calling `drain_workqueue()` on a globally shared workqueue drops non-chained works from all other devices, leading to resource leaks.
Pre-existing issues:
- [High] The patch fails to prevent the Use-After-Free because `drain_workqueue()` does not permanently prevent new works from being queued after unplug.
--
commit a8d1c4b2d56a32fc02ddadc07ead22f7dc20b168
Author: Boris Brezillon <boris.brezillon@collabora.com>
drm/panthor: Fix UAF on works queued to panthor_cleanup_wq
This patch adds a drain_workqueue() call in panthor_device_unplug() to
ensure cleanup works are executed before the panthor device is destroyed.
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=2
> diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
> index 0b25abebb803c..a0774e28aa94d 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.c
> +++ b/drivers/gpu/drm/panthor/panthor_device.c
> @@ -101,6 +101,11 @@ void panthor_device_unplug(struct panthor_device *ptdev)
> panthor_gpu_unplug(ptdev);
> panthor_pwr_unplug(ptdev);
>
> + /* Make sure works queued to panthor_cleanup_wq are executed
> + * before the device is destroyed.
> + */
> + drain_workqueue(panthor_cleanup_wq);
[Severity: High]
Since panthor_cleanup_wq is a global workqueue shared by all panthor devices
(allocated once in panthor_init()), does calling drain_workqueue() here
cause issues for other active panthor devices?
drain_workqueue() temporarily sets the __WQ_DRAINING flag on the workqueue.
If any other active panthor device attempts to queue work while this flag
is set, __queue_work() will hit a WARN_ONCE and silently discard the
non-chained work.
Will this break cleanup for all other panthor devices system-wide, causing
resource leaks (e.g., VMs, BOs, groups) that can lead to system degradation?
[Severity: High]
This is a pre-existing issue, but does this actually prevent the
Use-After-Free?
drain_workqueue() only waits for currently queued works to finish and then
clears the __WQ_DRAINING flag before returning.
If a userspace process holds an open DRM file descriptor during a device
hot-unplug, the ptdev isn't freed immediately. When the process later closes
the FD, drm_release() will destroy the group pool and call group_release().
At this point:
drivers/gpu/drm/panthor/panthor_sched.c:group_release() {
...
queue_work(panthor_cleanup_wq, &group->release_work);
...
}
This successfully queues new asynchronous cleanup works because __WQ_DRAINING
has already been cleared.
When drm_release() then drops the final device reference, synchronously
freeing ptdev, the newly queued group_release_work will execute
asynchronously and access the freed ptdev, triggering the exact
Use-After-Free this was intended to fix.
> +
> pm_runtime_dont_use_autosuspend(ptdev->base.dev);
> pm_runtime_put_sync_suspend(ptdev->base.dev);
>
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260625-panthor-misc-fixes-v1-0-b67ed973fea6@collabora.com?part=3
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 03/11] drm/panthor: Fix UAF on works queued to panthor_cleanup_wq
2026-06-25 12:40 ` [PATCH 03/11] drm/panthor: Fix UAF on works queued to panthor_cleanup_wq Boris Brezillon
2026-06-25 12:56 ` sashiko-bot
@ 2026-06-25 14:20 ` Boris Brezillon
1 sibling, 0 replies; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 14:20 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, sashiko-bot
On Thu, 25 Jun 2026 14:40:29 +0200
Boris Brezillon <boris.brezillon@collabora.com> wrote:
> If we don't drain the panthor_cleanup_wq before
> unregistering/destroying the panthor_device, we might end up with
> cleanup works that are executed after the device is gone.
>
> Add a drain_workqueue() call in panthor_device_unplug() to prevent
> that.
>
> Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> Fixes: 647810ec2476 ("drm/panthor: Add the MMU/VM logical block")
> Reported-by: sashiko-bot@kernel.org
> Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=2
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
> drivers/gpu/drm/panthor/panthor_device.c | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
> index 0b25abebb803..a0774e28aa94 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.c
> +++ b/drivers/gpu/drm/panthor/panthor_device.c
> @@ -101,6 +101,11 @@ void panthor_device_unplug(struct panthor_device *ptdev)
> panthor_gpu_unplug(ptdev);
> panthor_pwr_unplug(ptdev);
>
> + /* Make sure works queued to panthor_cleanup_wq are executed
> + * before the device is destroyed.
> + */
> + drain_workqueue(panthor_cleanup_wq);
As shashiko pointed out[1], this doesn't work. __flush_workqueue()
could work, but I'm actually considering moving those work to the
panthor_mmu/panthor_scheduler, so we can easily call
disable_work_sync() when the MMU/scheduler components are torn down.
> +
> pm_runtime_dont_use_autosuspend(ptdev->base.dev);
> pm_runtime_put_sync_suspend(ptdev->base.dev);
>
>
[1]https://sashiko.dev/#/patchset/20260625-panthor-misc-fixes-v1-0-b67ed973fea6@collabora.com?part=3
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 04/11] drm/panthor: Fix potential invalid pointer deref in group_process_tiler_oom()
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
` (2 preceding siblings ...)
2026-06-25 12:40 ` [PATCH 03/11] drm/panthor: Fix UAF on works queued to panthor_cleanup_wq Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-25 12:54 ` sashiko-bot
2026-06-26 9:14 ` Liviu Dudau
2026-06-25 12:40 ` [PATCH 05/11] drm/panthor: Fix theoretical IOMEM access in suspended state Boris Brezillon
` (6 subsequent siblings)
10 siblings, 2 replies; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon, sashiko-bot
If heaps is an ERR_PTR(), panthor_heap_pool_put() will deref an invalid
pointer. Make sure we set it to NULL in that case.
Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
Reported-by: sashiko-bot@kernel.org
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=2
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_sched.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index e97f29469d28..8fd4d97b062e 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -1600,7 +1600,10 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
if (unlikely(csg_id < 0))
return 0;
- if (IS_ERR(heaps) || frag_end > vt_end || vt_end >= vt_start) {
+ if (IS_ERR(heaps)) {
+ ret = -EINVAL;
+ heaps = NULL;
+ } else if (frag_end > vt_end || vt_end >= vt_start) {
ret = -EINVAL;
} else {
/* We do the allocation without holding the scheduler lock to avoid
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 04/11] drm/panthor: Fix potential invalid pointer deref in group_process_tiler_oom()
2026-06-25 12:40 ` [PATCH 04/11] drm/panthor: Fix potential invalid pointer deref in group_process_tiler_oom() Boris Brezillon
@ 2026-06-25 12:54 ` sashiko-bot
2026-06-26 9:14 ` Liviu Dudau
1 sibling, 0 replies; 31+ messages in thread
From: sashiko-bot @ 2026-06-25 12:54 UTC (permalink / raw)
To: Boris Brezillon; +Cc: dri-devel
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
Pre-existing issues:
- [High] `group->fatal_queues` is modified without any locking in `group_process_tiler_oom()`, leading to a data race with other contexts that update it, and `group->fatal_lock` is completely unused.
- [High] Firmware render pass counter validation check is vulnerable to 32-bit integer wrap-around, which can erroneously reject valid memory allocations.
--
commit 849043d96182c6a00588e13d1652d042b8e7a787
Author: Boris Brezillon <boris.brezillon@collabora.com>
drm/panthor: Fix potential invalid pointer deref in group_process_tiler_oom()
This commit prevents a potential invalid pointer dereference when
panthor_heap_pool_put() is called with an error pointer by ensuring heaps is
set to NULL on failure.
Reported-by: sashiko-bot@kernel.org
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=2
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index e97f29469d28a..8fd4d97b062e0 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -1600,7 +1600,10 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
> if (unlikely(csg_id < 0))
> return 0;
>
> - if (IS_ERR(heaps) || frag_end > vt_end || vt_end >= vt_start) {
> + if (IS_ERR(heaps)) {
> + ret = -EINVAL;
> + heaps = NULL;
> + } else if (frag_end > vt_end || vt_end >= vt_start) {
^^^^^^^^^^^^^^^^^^
[Severity: High]
This isn't a bug introduced by this patch, but does this condition incorrectly
handle 32-bit unsigned wrap-arounds for the FW pass counters?
If a long-running graphical application issues enough render passes, the 32-bit
counters can wrap around. For example, if vt_start = 0x00000001 and
vt_end = 0xFFFFFFFF, the direct comparison vt_end >= vt_start evaluates to
true even though vt_start is logically ahead.
This causes valid tiler OOM requests to fail with -EINVAL, marking the
affected queue as fatal.
Should sequence arithmetic like (s32)(vt_end - vt_start) >= 0 be used here
instead to safely compare wrapping sequence numbers?
> ret = -EINVAL;
> } else {
> /* We do the allocation without holding the scheduler lock to avoid
[Severity: High]
This is a pre-existing issue, but is there a data race when updating
group->fatal_queues later in this same function?
In group_process_tiler_oom(), which executes in the heap_alloc_wq workqueue,
the fatal_queues bitmask is updated without holding any locks:
drivers/gpu/drm/panthor/panthor_sched.c:group_process_tiler_oom() {
...
if (ret && ret != -ENOMEM) {
drm_warn(&ptdev->base, "Failed to extend the tiler heap\n");
group->fatal_queues |= BIT(cs_id);
...
}
Concurrently, cs_slot_process_fatal_event_locked() updates this same field
from process_fw_events_work in the sched->wq workqueue. Because |= is a
non-atomic read-modify-write operation, concurrent execution could result
in lost updates, potentially leaving a queue that experienced a fatal error
incorrectly marked as healthy.
Additionally, panthor_group defines spinlock_t fatal_lock to protect access
to fatal fields, but this lock appears to be completely unused.
Could fatal_lock be used here to prevent the race?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260625-panthor-misc-fixes-v1-0-b67ed973fea6@collabora.com?part=4
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 04/11] drm/panthor: Fix potential invalid pointer deref in group_process_tiler_oom()
2026-06-25 12:40 ` [PATCH 04/11] drm/panthor: Fix potential invalid pointer deref in group_process_tiler_oom() Boris Brezillon
2026-06-25 12:54 ` sashiko-bot
@ 2026-06-26 9:14 ` Liviu Dudau
1 sibling, 0 replies; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 9:14 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Thu, Jun 25, 2026 at 02:40:30PM +0200, Boris Brezillon wrote:
> If heaps is an ERR_PTR(), panthor_heap_pool_put() will deref an invalid
> pointer. Make sure we set it to NULL in that case.
>
> Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> Reported-by: sashiko-bot@kernel.org
> Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v5-0-8836a74e0ef9@collabora.com?part=2
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
> ---
> drivers/gpu/drm/panthor/panthor_sched.c | 5 ++++-
> 1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index e97f29469d28..8fd4d97b062e 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -1600,7 +1600,10 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
> if (unlikely(csg_id < 0))
> return 0;
>
> - if (IS_ERR(heaps) || frag_end > vt_end || vt_end >= vt_start) {
> + if (IS_ERR(heaps)) {
> + ret = -EINVAL;
> + heaps = NULL;
> + } else if (frag_end > vt_end || vt_end >= vt_start) {
> ret = -EINVAL;
> } else {
> /* We do the allocation without holding the scheduler lock to avoid
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 05/11] drm/panthor: Fix theoretical IOMEM access in suspended state
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
` (3 preceding siblings ...)
2026-06-25 12:40 ` [PATCH 04/11] drm/panthor: Fix potential invalid pointer deref in group_process_tiler_oom() Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-26 9:29 ` Liviu Dudau
2026-06-25 12:40 ` [PATCH 06/11] drm/panthor: Don't overrule pending immediate ticks in sched_resume_tick() Boris Brezillon
` (5 subsequent siblings)
10 siblings, 1 reply; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon, sashiko-bot
In theory, our hardirq handler can be called while the device (and
thus the panthor_irq) is suspended, because the IRQ line is shared.
In practice though, in all the designs we've seen, the line is only
shared within the GPU, and because sub-component suspend state is
consistent (all-suspended or all-resumed), we shouldn't end up with
an interrupt triggered while we're suspended.
Fix the problem anyway, if nothing else, for our sanity.
Fixes: 0b2d86670a84 ("drm/panthor: Rework panthor_irq::suspended into panthor_irq::state")
Reported-by: sashiko-bot@kernel.org
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v4-0-3d2908912afa@collabora.com?part=1
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_device.h | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 35679bfa1f3a..a39386bd6382 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -512,9 +512,6 @@ static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)
struct panthor_irq *pirq = data; \
enum panthor_irq_state old_state; \
\
- if (!gpu_read(pirq->iomem, INT_STAT)) \
- return IRQ_NONE; \
- \
guard(spinlock_irqsave)(&pirq->mask_lock); \
old_state = atomic_cmpxchg(&pirq->state, \
PANTHOR_IRQ_STATE_ACTIVE, \
@@ -522,6 +519,13 @@ static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)
if (old_state != PANTHOR_IRQ_STATE_ACTIVE) \
return IRQ_NONE; \
\
+ if (!gpu_read(pirq->iomem, INT_STAT)) { \
+ atomic_cmpxchg(&pirq->state, \
+ PANTHOR_IRQ_STATE_PROCESSING, \
+ PANTHOR_IRQ_STATE_ACTIVE); \
+ return IRQ_NONE; \
+ } \
+ \
gpu_write(pirq->iomem, INT_MASK, 0); \
return IRQ_WAKE_THREAD; \
} \
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 05/11] drm/panthor: Fix theoretical IOMEM access in suspended state
2026-06-25 12:40 ` [PATCH 05/11] drm/panthor: Fix theoretical IOMEM access in suspended state Boris Brezillon
@ 2026-06-26 9:29 ` Liviu Dudau
2026-06-26 11:40 ` Boris Brezillon
0 siblings, 1 reply; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 9:29 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Thu, Jun 25, 2026 at 02:40:31PM +0200, Boris Brezillon wrote:
> In theory, our hardirq handler can be called while the device (and
> thus the panthor_irq) is suspended, because the IRQ line is shared.
> In practice though, in all the designs we've seen, the line is only
> shared within the GPU, and because sub-component suspend state is
> consistent (all-suspended or all-resumed), we shouldn't end up with
> an interrupt triggered while we're suspended.
>
> Fix the problem anyway, if nothing else, for our sanity.
>
> Fixes: 0b2d86670a84 ("drm/panthor: Rework panthor_irq::suspended into panthor_irq::state")
> Reported-by: sashiko-bot@kernel.org
> Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v4-0-3d2908912afa@collabora.com?part=1
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
> drivers/gpu/drm/panthor/panthor_device.h | 10 +++++++---
> 1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> index 35679bfa1f3a..a39386bd6382 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.h
> +++ b/drivers/gpu/drm/panthor/panthor_device.h
> @@ -512,9 +512,6 @@ static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)
> struct panthor_irq *pirq = data; \
> enum panthor_irq_state old_state; \
> \
> - if (!gpu_read(pirq->iomem, INT_STAT)) \
> - return IRQ_NONE; \
> - \
> guard(spinlock_irqsave)(&pirq->mask_lock); \
> old_state = atomic_cmpxchg(&pirq->state, \
> PANTHOR_IRQ_STATE_ACTIVE, \
> @@ -522,6 +519,13 @@ static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)
> if (old_state != PANTHOR_IRQ_STATE_ACTIVE) \
> return IRQ_NONE; \
> \
> + if (!gpu_read(pirq->iomem, INT_STAT)) { \
> + atomic_cmpxchg(&pirq->state, \
> + PANTHOR_IRQ_STATE_PROCESSING, \
> + PANTHOR_IRQ_STATE_ACTIVE); \
> + return IRQ_NONE; \
> + } \
Hmm,
I get it that you're trying to revert the effect of the previous atomic_cmpxchg() here but it feels
like a better option would be to not do the swap at all if the state is not ACTIVE.
Best regards,
Liviu
> + \
> gpu_write(pirq->iomem, INT_MASK, 0); \
> return IRQ_WAKE_THREAD; \
> } \
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 05/11] drm/panthor: Fix theoretical IOMEM access in suspended state
2026-06-26 9:29 ` Liviu Dudau
@ 2026-06-26 11:40 ` Boris Brezillon
2026-06-26 13:13 ` Liviu Dudau
0 siblings, 1 reply; 31+ messages in thread
From: Boris Brezillon @ 2026-06-26 11:40 UTC (permalink / raw)
To: Liviu Dudau
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Fri, 26 Jun 2026 10:29:54 +0100
Liviu Dudau <liviu.dudau@arm.com> wrote:
> On Thu, Jun 25, 2026 at 02:40:31PM +0200, Boris Brezillon wrote:
> > In theory, our hardirq handler can be called while the device (and
> > thus the panthor_irq) is suspended, because the IRQ line is shared.
> > In practice though, in all the designs we've seen, the line is only
> > shared within the GPU, and because sub-component suspend state is
> > consistent (all-suspended or all-resumed), we shouldn't end up with
> > an interrupt triggered while we're suspended.
> >
> > Fix the problem anyway, if nothing else, for our sanity.
> >
> > Fixes: 0b2d86670a84 ("drm/panthor: Rework panthor_irq::suspended into panthor_irq::state")
> > Reported-by: sashiko-bot@kernel.org
> > Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v4-0-3d2908912afa@collabora.com?part=1
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > ---
> > drivers/gpu/drm/panthor/panthor_device.h | 10 +++++++---
> > 1 file changed, 7 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> > index 35679bfa1f3a..a39386bd6382 100644
> > --- a/drivers/gpu/drm/panthor/panthor_device.h
> > +++ b/drivers/gpu/drm/panthor/panthor_device.h
> > @@ -512,9 +512,6 @@ static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)
> > struct panthor_irq *pirq = data; \
> > enum panthor_irq_state old_state; \
> > \
> > - if (!gpu_read(pirq->iomem, INT_STAT)) \
> > - return IRQ_NONE; \
> > - \
> > guard(spinlock_irqsave)(&pirq->mask_lock); \
> > old_state = atomic_cmpxchg(&pirq->state, \
> > PANTHOR_IRQ_STATE_ACTIVE, \
> > @@ -522,6 +519,13 @@ static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)
> > if (old_state != PANTHOR_IRQ_STATE_ACTIVE) \
> > return IRQ_NONE; \
> > \
> > + if (!gpu_read(pirq->iomem, INT_STAT)) { \
> > + atomic_cmpxchg(&pirq->state, \
> > + PANTHOR_IRQ_STATE_PROCESSING, \
> > + PANTHOR_IRQ_STATE_ACTIVE); \
> > + return IRQ_NONE; \
> > + } \
>
> Hmm,
>
> I get it that you're trying to revert the effect of the previous atomic_cmpxchg() here but it feels
> like a better option would be to not do the swap at all if the state is not ACTIVE.
That's what [1] does, but it's not possible until we've made this
atomic -> lock-based transition, and I want a fix that's not dependent
on non-Fixes patches so we can backport it.
[1]https://lore.kernel.org/dri-devel/20260625-panthor-signal-from-irq-v5-8-8836a74e0ef9@collabora.com/
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 05/11] drm/panthor: Fix theoretical IOMEM access in suspended state
2026-06-26 11:40 ` Boris Brezillon
@ 2026-06-26 13:13 ` Liviu Dudau
0 siblings, 0 replies; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 13:13 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Fri, Jun 26, 2026 at 01:40:28PM +0200, Boris Brezillon wrote:
> On Fri, 26 Jun 2026 10:29:54 +0100
> Liviu Dudau <liviu.dudau@arm.com> wrote:
>
> > On Thu, Jun 25, 2026 at 02:40:31PM +0200, Boris Brezillon wrote:
> > > In theory, our hardirq handler can be called while the device (and
> > > thus the panthor_irq) is suspended, because the IRQ line is shared.
> > > In practice though, in all the designs we've seen, the line is only
> > > shared within the GPU, and because sub-component suspend state is
> > > consistent (all-suspended or all-resumed), we shouldn't end up with
> > > an interrupt triggered while we're suspended.
> > >
> > > Fix the problem anyway, if nothing else, for our sanity.
> > >
> > > Fixes: 0b2d86670a84 ("drm/panthor: Rework panthor_irq::suspended into panthor_irq::state")
> > > Reported-by: sashiko-bot@kernel.org
> > > Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v4-0-3d2908912afa@collabora.com?part=1
> > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > > ---
> > > drivers/gpu/drm/panthor/panthor_device.h | 10 +++++++---
> > > 1 file changed, 7 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> > > index 35679bfa1f3a..a39386bd6382 100644
> > > --- a/drivers/gpu/drm/panthor/panthor_device.h
> > > +++ b/drivers/gpu/drm/panthor/panthor_device.h
> > > @@ -512,9 +512,6 @@ static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)
> > > struct panthor_irq *pirq = data; \
> > > enum panthor_irq_state old_state; \
> > > \
> > > - if (!gpu_read(pirq->iomem, INT_STAT)) \
> > > - return IRQ_NONE; \
> > > - \
> > > guard(spinlock_irqsave)(&pirq->mask_lock); \
> > > old_state = atomic_cmpxchg(&pirq->state, \
> > > PANTHOR_IRQ_STATE_ACTIVE, \
> > > @@ -522,6 +519,13 @@ static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)
> > > if (old_state != PANTHOR_IRQ_STATE_ACTIVE) \
> > > return IRQ_NONE; \
> > > \
> > > + if (!gpu_read(pirq->iomem, INT_STAT)) { \
> > > + atomic_cmpxchg(&pirq->state, \
> > > + PANTHOR_IRQ_STATE_PROCESSING, \
> > > + PANTHOR_IRQ_STATE_ACTIVE); \
> > > + return IRQ_NONE; \
> > > + } \
> >
> > Hmm,
> >
> > I get it that you're trying to revert the effect of the previous atomic_cmpxchg() here but it feels
> > like a better option would be to not do the swap at all if the state is not ACTIVE.
>
> That's what [1] does, but it's not possible until we've made this
> atomic -> lock-based transition, and I want a fix that's not dependent
> on non-Fixes patches so we can backport it.
>
> [1]https://lore.kernel.org/dri-devel/20260625-panthor-signal-from-irq-v5-8-8836a74e0ef9@collabora.com/
I see.
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 06/11] drm/panthor: Don't overrule pending immediate ticks in sched_resume_tick()
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
` (4 preceding siblings ...)
2026-06-25 12:40 ` [PATCH 05/11] drm/panthor: Fix theoretical IOMEM access in suspended state Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-26 12:45 ` Liviu Dudau
2026-06-25 12:40 ` [PATCH 07/11] drm/panthor: Fix panthor_pwr_unplug() Boris Brezillon
` (4 subsequent siblings)
10 siblings, 1 reply; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon, sashiko-bot
We schedule immediate ticks when we need to process events on CSGs,
but those immediate ticks don't change the resched_target because we
want the other groups to stay scheduled for the remaining of the GPU
timeslot they were given. Make sure these immediate ticks don't get
overruled by a sched_queue_delayed_work() that would delay the tick
execution.
Fixes: 99820b4b7e50 ("drm/panthor: Make sure we resume the tick when new jobs are submitted")
Reported-by: sashiko-bot@kernel.org
Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v4-0-3d2908912afa@collabora.com?part=9
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_sched.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 8fd4d97b062e..ab3e13e44a26 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -2667,7 +2667,14 @@ static void sched_resume_tick(struct panthor_device *ptdev)
else
delay_jiffies = 0;
- sched_queue_delayed_work(sched, tick, delay_jiffies);
+ /* We schedule immediate ticks when we need to process events on CSGs,
+ * but those don't change the resched_target because we want the other
+ * groups to stay scheduled for the remaining of the GPU timeslot they
+ * were given. Make sure those immediate ticks don't get overruled by
+ * a sched_queue_delayed_work() that would delay the tick execution.
+ */
+ if (!delayed_work_pending(&sched->tick_work))
+ sched_queue_delayed_work(sched, tick, delay_jiffies);
}
static void group_schedule_locked(struct panthor_group *group, u32 queue_mask)
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 06/11] drm/panthor: Don't overrule pending immediate ticks in sched_resume_tick()
2026-06-25 12:40 ` [PATCH 06/11] drm/panthor: Don't overrule pending immediate ticks in sched_resume_tick() Boris Brezillon
@ 2026-06-26 12:45 ` Liviu Dudau
2026-06-26 13:19 ` Boris Brezillon
0 siblings, 1 reply; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 12:45 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Thu, Jun 25, 2026 at 02:40:32PM +0200, Boris Brezillon wrote:
> We schedule immediate ticks when we need to process events on CSGs,
> but those immediate ticks don't change the resched_target because we
> want the other groups to stay scheduled for the remaining of the GPU
> timeslot they were given. Make sure these immediate ticks don't get
> overruled by a sched_queue_delayed_work() that would delay the tick
> execution.
>
> Fixes: 99820b4b7e50 ("drm/panthor: Make sure we resume the tick when new jobs are submitted")
> Reported-by: sashiko-bot@kernel.org
> Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v4-0-3d2908912afa@collabora.com?part=9
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
> drivers/gpu/drm/panthor/panthor_sched.c | 9 ++++++++-
> 1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 8fd4d97b062e..ab3e13e44a26 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -2667,7 +2667,14 @@ static void sched_resume_tick(struct panthor_device *ptdev)
> else
> delay_jiffies = 0;
>
> - sched_queue_delayed_work(sched, tick, delay_jiffies);
> + /* We schedule immediate ticks when we need to process events on CSGs,
> + * but those don't change the resched_target because we want the other
> + * groups to stay scheduled for the remaining of the GPU timeslot they
> + * were given. Make sure those immediate ticks don't get overruled by
> + * a sched_queue_delayed_work() that would delay the tick execution.
> + */
> + if (!delayed_work_pending(&sched->tick_work))
> + sched_queue_delayed_work(sched, tick, delay_jiffies);
Maybe I'm having a Friday heat brain freeze, but it feels like the comment and the code
are going in a different direction. It doesn't help that the commit message copies the
comment so I can't tell if I'm misreading the comment or there was a different intent.
Best regards,
Liviu
> }
>
> static void group_schedule_locked(struct panthor_group *group, u32 queue_mask)
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 06/11] drm/panthor: Don't overrule pending immediate ticks in sched_resume_tick()
2026-06-26 12:45 ` Liviu Dudau
@ 2026-06-26 13:19 ` Boris Brezillon
0 siblings, 0 replies; 31+ messages in thread
From: Boris Brezillon @ 2026-06-26 13:19 UTC (permalink / raw)
To: Liviu Dudau
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Fri, 26 Jun 2026 13:45:38 +0100
Liviu Dudau <liviu.dudau@arm.com> wrote:
> On Thu, Jun 25, 2026 at 02:40:32PM +0200, Boris Brezillon wrote:
> > We schedule immediate ticks when we need to process events on CSGs,
> > but those immediate ticks don't change the resched_target because we
> > want the other groups to stay scheduled for the remaining of the GPU
> > timeslot they were given. Make sure these immediate ticks don't get
> > overruled by a sched_queue_delayed_work() that would delay the tick
> > execution.
> >
> > Fixes: 99820b4b7e50 ("drm/panthor: Make sure we resume the tick when new jobs are submitted")
> > Reported-by: sashiko-bot@kernel.org
> > Closes: https://sashiko.dev/#/patchset/20260625-panthor-signal-from-irq-v4-0-3d2908912afa@collabora.com?part=9
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > ---
> > drivers/gpu/drm/panthor/panthor_sched.c | 9 ++++++++-
> > 1 file changed, 8 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > index 8fd4d97b062e..ab3e13e44a26 100644
> > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > @@ -2667,7 +2667,14 @@ static void sched_resume_tick(struct panthor_device *ptdev)
> > else
> > delay_jiffies = 0;
> >
> > - sched_queue_delayed_work(sched, tick, delay_jiffies);
> > + /* We schedule immediate ticks when we need to process events on CSGs,
> > + * but those don't change the resched_target because we want the other
> > + * groups to stay scheduled for the remaining of the GPU timeslot they
> > + * were given. Make sure those immediate ticks don't get overruled by
> > + * a sched_queue_delayed_work() that would delay the tick execution.
> > + */
> > + if (!delayed_work_pending(&sched->tick_work))
> > + sched_queue_delayed_work(sched, tick, delay_jiffies);
>
> Maybe I'm having a Friday heat brain freeze, but it feels like the comment and the code
> are going in a different direction. It doesn't help that the commit message copies the
> comment so I can't tell if I'm misreading the comment or there was a different intent.
There's basically two kind of ticks:
1. The periodic ones that serve as a way to rotate groups on the slots
and give everyone a chance to get a GPU slice
2. The immediate ones which are there to process events coming from an
interrupt, or to re-evaluate groups to schedule because a new group
became active
To detect the kind of tick, we use ::resched_target. If current time is
before this target, this is an event, and resident groups shouldn't be
evicted (I'm intentionally eluding RT groups to keep things simple).
The problem we have with sched_resume_tick() is that it's
unconditionally calling sched_queue_delayed_work()
(mod_delayed_work() internally). So, if we have an immediate tick
pending (one that didn't adjust ::resched_target), we end up
rescheduling the work to a later point thus delaying the processing of
this asynchronous event for no good reason. What this commit does is
skip the sched_queue_delayed_work() if the tick_work is pending.
I'd be happy to change the wording if you have something to propose.
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 07/11] drm/panthor: Fix panthor_pwr_unplug()
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
` (5 preceding siblings ...)
2026-06-25 12:40 ` [PATCH 06/11] drm/panthor: Don't overrule pending immediate ticks in sched_resume_tick() Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-26 12:42 ` Liviu Dudau
2026-06-25 12:40 ` [PATCH 08/11] drm/panthor: Drop a needless check in panthor_fw_unplug() Boris Brezillon
` (3 subsequent siblings)
10 siblings, 1 reply; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon
We can't call panthor_pwr_irq_suspend() if the device is suspended,
or this leads to a hang when the IOMEM region is accessed while the
clks are disabled. Do what other sub-components do and conditionally
call panthor_pwr_irq_suspend() if we know the PWR regbank block is
accessible.
Fixes: c27787f2b77f ("drm/panthor: Introduce panthor_pwr API and power control framework")
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_pwr.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/panthor/panthor_pwr.c b/drivers/gpu/drm/panthor/panthor_pwr.c
index 7c7f424a1436..090362bd700b 100644
--- a/drivers/gpu/drm/panthor/panthor_pwr.c
+++ b/drivers/gpu/drm/panthor/panthor_pwr.c
@@ -453,7 +453,8 @@ void panthor_pwr_unplug(struct panthor_device *ptdev)
return;
/* Make sure the IRQ handler is not running after that point. */
- panthor_pwr_irq_suspend(&ptdev->pwr->irq);
+ if (!IS_ENABLED(CONFIG_PM) || pm_runtime_active(ptdev->base.dev))
+ panthor_pwr_irq_suspend(&ptdev->pwr->irq);
/* Wake-up all waiters. */
spin_lock_irqsave(&ptdev->pwr->reqs_lock, flags);
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 07/11] drm/panthor: Fix panthor_pwr_unplug()
2026-06-25 12:40 ` [PATCH 07/11] drm/panthor: Fix panthor_pwr_unplug() Boris Brezillon
@ 2026-06-26 12:42 ` Liviu Dudau
0 siblings, 0 replies; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 12:42 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel
On Thu, Jun 25, 2026 at 02:40:33PM +0200, Boris Brezillon wrote:
> We can't call panthor_pwr_irq_suspend() if the device is suspended,
> or this leads to a hang when the IOMEM region is accessed while the
> clks are disabled. Do what other sub-components do and conditionally
> call panthor_pwr_irq_suspend() if we know the PWR regbank block is
> accessible.
>
> Fixes: c27787f2b77f ("drm/panthor: Introduce panthor_pwr API and power control framework")
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
> ---
> drivers/gpu/drm/panthor/panthor_pwr.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_pwr.c b/drivers/gpu/drm/panthor/panthor_pwr.c
> index 7c7f424a1436..090362bd700b 100644
> --- a/drivers/gpu/drm/panthor/panthor_pwr.c
> +++ b/drivers/gpu/drm/panthor/panthor_pwr.c
> @@ -453,7 +453,8 @@ void panthor_pwr_unplug(struct panthor_device *ptdev)
> return;
>
> /* Make sure the IRQ handler is not running after that point. */
> - panthor_pwr_irq_suspend(&ptdev->pwr->irq);
> + if (!IS_ENABLED(CONFIG_PM) || pm_runtime_active(ptdev->base.dev))
> + panthor_pwr_irq_suspend(&ptdev->pwr->irq);
>
> /* Wake-up all waiters. */
> spin_lock_irqsave(&ptdev->pwr->reqs_lock, flags);
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 08/11] drm/panthor: Drop a needless check in panthor_fw_unplug()
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
` (6 preceding siblings ...)
2026-06-25 12:40 ` [PATCH 07/11] drm/panthor: Fix panthor_pwr_unplug() Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-25 12:53 ` sashiko-bot
2026-06-26 13:11 ` Liviu Dudau
2026-06-25 12:40 ` [PATCH 09/11] drm/panthor: Fix a leak when a group is evicted before the tiler OOM is serviced Boris Brezillon
` (2 subsequent siblings)
10 siblings, 2 replies; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon
panthor_fw_unplug() is only called if we at least managed to initialize
the IRQ, so it's safe to drop the "is IRQ initialized" check.
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_fw.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
index 986151681b24..4fbddb9e18c8 100644
--- a/drivers/gpu/drm/panthor/panthor_fw.c
+++ b/drivers/gpu/drm/panthor/panthor_fw.c
@@ -1279,9 +1279,7 @@ void panthor_fw_unplug(struct panthor_device *ptdev)
if (!IS_ENABLED(CONFIG_PM) || pm_runtime_active(ptdev->base.dev)) {
/* Make sure the IRQ handler cannot be called after that point. */
- if (ptdev->fw->irq.irq)
- panthor_job_irq_suspend(&ptdev->fw->irq);
-
+ panthor_job_irq_suspend(&ptdev->fw->irq);
panthor_fw_stop(ptdev);
}
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 08/11] drm/panthor: Drop a needless check in panthor_fw_unplug()
2026-06-25 12:40 ` [PATCH 08/11] drm/panthor: Drop a needless check in panthor_fw_unplug() Boris Brezillon
@ 2026-06-25 12:53 ` sashiko-bot
2026-06-26 13:11 ` Liviu Dudau
1 sibling, 0 replies; 31+ messages in thread
From: sashiko-bot @ 2026-06-25 12:53 UTC (permalink / raw)
To: Boris Brezillon; +Cc: dri-devel
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
Pre-existing issues:
- [High] Integer overflow in `panthor_fw_read_build_info` leads to an out-of-bounds read when parsing untrusted firmware.
--
commit 36848cd8008e907cd2d324339568a4a41f683911
Author: Boris Brezillon <boris.brezillon@collabora.com>
drm/panthor: Drop a needless check in panthor_fw_unplug()
panthor_fw_unplug() is only called if we at least managed to initialize
the IRQ, so it's safe to drop the "is IRQ initialized" check.
> diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
> index 986151681b246..4fbddb9e18c8b 100644
> --- a/drivers/gpu/drm/panthor/panthor_fw.c
> +++ b/drivers/gpu/drm/panthor/panthor_fw.c
[ ... ]
drivers/gpu/drm/panthor/panthor_fw.c:panthor_fw_read_build_info() {
...
if (hdr.meta_start > fw->size ||
hdr.meta_start + hdr.meta_size > fw->size) {
drm_err(&ptdev->base, "Firmware build info corrupt\n");
/* We don't need the build info, so continue */
return 0;
}
if (memcmp(git_sha_header, fw->data + hdr.meta_start, header_len)) {
/* Not the expected header, this isn't metadata we understand */
return 0;
}
/* Check that the git SHA is NULL terminated as expected */
if (fw->data[hdr.meta_start + hdr.meta_size - 1] != '\0') {
...
}
[Severity: High]
This is a pre-existing issue, but does this bounds check use 32-bit
arithmetic that can wrap around before the promotion to size_t?
If an attacker supplies a large meta_size (e.g., 0xFFFFFFFF) and a carefully
chosen meta_start (e.g., 0) such that their sum wraps around, could this
bypass the check since 0 > fw->size evaluates to false?
Execution would then call memcmp() on the attacker-controlled meta_start
offset. If the string matches "git_sha: ", execution proceeds to:
fw->data[hdr.meta_start + hdr.meta_size - 1]
Could this evaluate to fw->data[0xFFFFFFFF] (a 4GB out-of-bounds offset)
and cause a page fault or oops when parsing untrusted firmware?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260625-panthor-misc-fixes-v1-0-b67ed973fea6@collabora.com?part=8
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH 08/11] drm/panthor: Drop a needless check in panthor_fw_unplug()
2026-06-25 12:40 ` [PATCH 08/11] drm/panthor: Drop a needless check in panthor_fw_unplug() Boris Brezillon
2026-06-25 12:53 ` sashiko-bot
@ 2026-06-26 13:11 ` Liviu Dudau
1 sibling, 0 replies; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 13:11 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel
On Thu, Jun 25, 2026 at 02:40:34PM +0200, Boris Brezillon wrote:
> panthor_fw_unplug() is only called if we at least managed to initialize
> the IRQ, so it's safe to drop the "is IRQ initialized" check.
>
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
> ---
> drivers/gpu/drm/panthor/panthor_fw.c | 4 +---
> 1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
> index 986151681b24..4fbddb9e18c8 100644
> --- a/drivers/gpu/drm/panthor/panthor_fw.c
> +++ b/drivers/gpu/drm/panthor/panthor_fw.c
> @@ -1279,9 +1279,7 @@ void panthor_fw_unplug(struct panthor_device *ptdev)
>
> if (!IS_ENABLED(CONFIG_PM) || pm_runtime_active(ptdev->base.dev)) {
> /* Make sure the IRQ handler cannot be called after that point. */
> - if (ptdev->fw->irq.irq)
> - panthor_job_irq_suspend(&ptdev->fw->irq);
> -
> + panthor_job_irq_suspend(&ptdev->fw->irq);
> panthor_fw_stop(ptdev);
> }
>
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 09/11] drm/panthor: Fix a leak when a group is evicted before the tiler OOM is serviced
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
` (7 preceding siblings ...)
2026-06-25 12:40 ` [PATCH 08/11] drm/panthor: Drop a needless check in panthor_fw_unplug() Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-26 13:12 ` Liviu Dudau
2026-06-25 12:40 ` [PATCH 10/11] drm/panthor: Interrupt group start/resumption if group_bind_locked() fails Boris Brezillon
2026-06-25 12:40 ` [PATCH 11/11] drm/panthor: Keep interrupts masked until they are needed Boris Brezillon
10 siblings, 1 reply; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon, sashiko-bot
A group ref is tied to the pending tiler_oom_work, so we need to release
it if the cancel was effective.
Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
Reported-by: sashiko-bot@kernel.org
Closes: https://sashiko.dev/#/patchset/20260623-panthor-signal-from-irq-v3-0-2ece396f8ee0@collabora.com?part=7
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_sched.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index ab3e13e44a26..5fe95d03f23e 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -1057,7 +1057,8 @@ group_unbind_locked(struct panthor_group *group)
/* Tiler OOM events will be re-issued next time the group is scheduled. */
atomic_set(&group->tiler_oom, 0);
- cancel_work(&group->tiler_oom_work);
+ if (cancel_work(&group->tiler_oom_work))
+ group_put(group);
for (u32 i = 0; i < group->queue_count; i++)
group->queues[i]->doorbell_id = -1;
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 09/11] drm/panthor: Fix a leak when a group is evicted before the tiler OOM is serviced
2026-06-25 12:40 ` [PATCH 09/11] drm/panthor: Fix a leak when a group is evicted before the tiler OOM is serviced Boris Brezillon
@ 2026-06-26 13:12 ` Liviu Dudau
0 siblings, 0 replies; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 13:12 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Thu, Jun 25, 2026 at 02:40:35PM +0200, Boris Brezillon wrote:
> A group ref is tied to the pending tiler_oom_work, so we need to release
> it if the cancel was effective.
>
> Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> Reported-by: sashiko-bot@kernel.org
> Closes: https://sashiko.dev/#/patchset/20260623-panthor-signal-from-irq-v3-0-2ece396f8ee0@collabora.com?part=7
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
> ---
> drivers/gpu/drm/panthor/panthor_sched.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index ab3e13e44a26..5fe95d03f23e 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -1057,7 +1057,8 @@ group_unbind_locked(struct panthor_group *group)
>
> /* Tiler OOM events will be re-issued next time the group is scheduled. */
> atomic_set(&group->tiler_oom, 0);
> - cancel_work(&group->tiler_oom_work);
> + if (cancel_work(&group->tiler_oom_work))
> + group_put(group);
>
> for (u32 i = 0; i < group->queue_count; i++)
> group->queues[i]->doorbell_id = -1;
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 10/11] drm/panthor: Interrupt group start/resumption if group_bind_locked() fails
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
` (8 preceding siblings ...)
2026-06-25 12:40 ` [PATCH 09/11] drm/panthor: Fix a leak when a group is evicted before the tiler OOM is serviced Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-26 13:14 ` Liviu Dudau
2026-06-25 12:40 ` [PATCH 11/11] drm/panthor: Keep interrupts masked until they are needed Boris Brezillon
10 siblings, 1 reply; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon, sashiko-bot
group_bind_locked() can fail if the MMU block is stuck. This is normally
a reset situation, but by the time we reset the GPU, we might have
tried to resume a group that's not resident, which will probably trip
out the FW. So let's avoid that by bailing out when group_bind_locked()
returns an error. We don't even try to start more groups because the
GPU will be reset anyway.
Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
Reported-by: sashiko-bot@kernel.org
Closes: https://sashiko.dev/#/patchset/20260623-panthor-signal-from-irq-v3-0-2ece396f8ee0@collabora.com?part=7
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_sched.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 5fe95d03f23e..298b046c95ed 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -2368,7 +2368,13 @@ tick_ctx_apply(struct panthor_scheduler *sched, struct panthor_sched_tick_ctx *c
csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
csg_slot = &sched->csg_slots[csg_id];
- group_bind_locked(group, csg_id);
+ ret = group_bind_locked(group, csg_id);
+ if (ret) {
+ panthor_device_schedule_reset(ptdev);
+ ctx->csg_upd_failed_mask |= BIT(csg_id);
+ return;
+ }
+
csg_slot_prog_locked(ptdev, csg_id, new_csg_prio--);
csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
group->state == PANTHOR_CS_GROUP_SUSPENDED ?
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 10/11] drm/panthor: Interrupt group start/resumption if group_bind_locked() fails
2026-06-25 12:40 ` [PATCH 10/11] drm/panthor: Interrupt group start/resumption if group_bind_locked() fails Boris Brezillon
@ 2026-06-26 13:14 ` Liviu Dudau
0 siblings, 0 replies; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 13:14 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, sashiko-bot
On Thu, Jun 25, 2026 at 02:40:36PM +0200, Boris Brezillon wrote:
> group_bind_locked() can fail if the MMU block is stuck. This is normally
> a reset situation, but by the time we reset the GPU, we might have
> tried to resume a group that's not resident, which will probably trip
> out the FW. So let's avoid that by bailing out when group_bind_locked()
> returns an error. We don't even try to start more groups because the
> GPU will be reset anyway.
>
> Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block")
> Reported-by: sashiko-bot@kernel.org
> Closes: https://sashiko.dev/#/patchset/20260623-panthor-signal-from-irq-v3-0-2ece396f8ee0@collabora.com?part=7
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
> ---
> drivers/gpu/drm/panthor/panthor_sched.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 5fe95d03f23e..298b046c95ed 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -2368,7 +2368,13 @@ tick_ctx_apply(struct panthor_scheduler *sched, struct panthor_sched_tick_ctx *c
>
> csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id);
> csg_slot = &sched->csg_slots[csg_id];
> - group_bind_locked(group, csg_id);
> + ret = group_bind_locked(group, csg_id);
> + if (ret) {
> + panthor_device_schedule_reset(ptdev);
> + ctx->csg_upd_failed_mask |= BIT(csg_id);
> + return;
> + }
> +
> csg_slot_prog_locked(ptdev, csg_id, new_csg_prio--);
> csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id,
> group->state == PANTHOR_CS_GROUP_SUSPENDED ?
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH 11/11] drm/panthor: Keep interrupts masked until they are needed
2026-06-25 12:40 [PATCH 00/11] drm/panthor: Misc fixes for bugs found by shashiko Boris Brezillon
` (9 preceding siblings ...)
2026-06-25 12:40 ` [PATCH 10/11] drm/panthor: Interrupt group start/resumption if group_bind_locked() fails Boris Brezillon
@ 2026-06-25 12:40 ` Boris Brezillon
2026-06-26 13:18 ` Liviu Dudau
10 siblings, 1 reply; 31+ messages in thread
From: Boris Brezillon @ 2026-06-25 12:40 UTC (permalink / raw)
To: Steven Price, Liviu Dudau
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Nicolas Frattaroli, Chia-I Wu, Karunika Choo,
dri-devel, linux-kernel, Boris Brezillon, Shashiko
The autogenerated panthor_request_xx_irq() helpers unmask Mali
interrupts before we're sure we'll have a handler registered. For
non-shared IRQ lines, that's fine, but for shared ones, it might cause
an interrupt flood if the HW block raises an interrupt for any reason.
We could reworking the calls in panthor_request_xx_irq(), but it's just
simpler to let the caller decide when they are ready to handle interrupts
and call panthor_pwr_irq_resume() themselves. While at it, rework the
prototype to let users call panthor_pwr_irq_enable_events() explicitly
instead of passing an initial mask to panthor_request_pwr_irq().
Fixes: 5fe909cae118 ("drm/panthor: Add the device logical block")
Reported-by: Shashiko <sashiko-bot@kernel.org>
Closes: https://sashiko.dev/#/patchset/20260623-panthor-signal-from-irq-v3-0-2ece396f8ee0@collabora.com?part=3
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
drivers/gpu/drm/panthor/panthor_device.h | 7 ++++---
drivers/gpu/drm/panthor/panthor_fw.c | 2 +-
drivers/gpu/drm/panthor/panthor_gpu.c | 3 ++-
drivers/gpu/drm/panthor/panthor_mmu.c | 9 +++++++--
drivers/gpu/drm/panthor/panthor_pwr.c | 7 ++++---
5 files changed, 18 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index a39386bd6382..0fda64fbe5f2 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -588,14 +588,15 @@ static inline void panthor_ ## __name ## _irq_resume(struct panthor_irq *pirq)
\
static int panthor_request_ ## __name ## _irq(struct panthor_device *ptdev, \
struct panthor_irq *pirq, \
- int irq, u32 mask, void __iomem *iomem) \
+ int irq, void __iomem *iomem) \
{ \
pirq->ptdev = ptdev; \
pirq->irq = irq; \
- pirq->mask = mask; \
+ pirq->mask = 0; \
pirq->iomem = iomem; \
spin_lock_init(&pirq->mask_lock); \
- panthor_ ## __name ## _irq_resume(pirq); \
+ atomic_set(&pirq->state, PANTHOR_IRQ_STATE_SUSPENDED); \
+ gpu_write(pirq->iomem, INT_MASK, 0); \
\
return devm_request_threaded_irq(ptdev->base.dev, irq, \
panthor_ ## __name ## _irq_raw_handler, \
diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
index 4fbddb9e18c8..de8e6689a869 100644
--- a/drivers/gpu/drm/panthor/panthor_fw.c
+++ b/drivers/gpu/drm/panthor/panthor_fw.c
@@ -1474,7 +1474,7 @@ int panthor_fw_init(struct panthor_device *ptdev)
if (irq <= 0)
return -ENODEV;
- ret = panthor_request_job_irq(ptdev, &fw->irq, irq, 0,
+ ret = panthor_request_job_irq(ptdev, &fw->irq, irq,
ptdev->iomem + JOB_INT_BASE);
if (ret) {
drm_err(&ptdev->base, "failed to request job irq");
diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
index e52c5675981f..c013d6bf9a59 100644
--- a/drivers/gpu/drm/panthor/panthor_gpu.c
+++ b/drivers/gpu/drm/panthor/panthor_gpu.c
@@ -170,11 +170,12 @@ int panthor_gpu_init(struct panthor_device *ptdev)
return irq;
ret = panthor_request_gpu_irq(ptdev, &ptdev->gpu->irq, irq,
- GPU_INTERRUPTS_MASK,
ptdev->iomem + GPU_INT_BASE);
if (ret)
return ret;
+ panthor_gpu_irq_enable_events(&ptdev->gpu->irq, GPU_INTERRUPTS_MASK);
+ panthor_gpu_irq_resume(&ptdev->gpu->irq);
return 0;
}
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 31cc57029c12..1fef3c5c1b50 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -3406,7 +3406,6 @@ int panthor_mmu_init(struct panthor_device *ptdev)
return -ENODEV;
ret = panthor_request_mmu_irq(ptdev, &mmu->irq, irq,
- panthor_mmu_fault_mask(ptdev, ~0),
ptdev->iomem + MMU_INT_BASE);
if (ret)
return ret;
@@ -3424,7 +3423,13 @@ int panthor_mmu_init(struct panthor_device *ptdev)
ptdev->gpu_info.mmu_features |= BITS_PER_LONG;
}
- return drmm_add_action_or_reset(&ptdev->base, panthor_mmu_release_wq, mmu->vm.wq);
+ ret = drmm_add_action_or_reset(&ptdev->base, panthor_mmu_release_wq, mmu->vm.wq);
+ if (ret)
+ return ret;
+
+ panthor_mmu_irq_enable_events(&mmu->irq, panthor_mmu_fault_mask(ptdev, ~0));
+ panthor_mmu_irq_resume(&mmu->irq);
+ return 0;
}
#ifdef CONFIG_DEBUG_FS
diff --git a/drivers/gpu/drm/panthor/panthor_pwr.c b/drivers/gpu/drm/panthor/panthor_pwr.c
index 090362bd700b..f2c2c3000590 100644
--- a/drivers/gpu/drm/panthor/panthor_pwr.c
+++ b/drivers/gpu/drm/panthor/panthor_pwr.c
@@ -484,12 +484,13 @@ int panthor_pwr_init(struct panthor_device *ptdev)
if (irq < 0)
return irq;
- err = panthor_request_pwr_irq(
- ptdev, &pwr->irq, irq, PWR_INTERRUPTS_MASK,
- pwr->iomem + PWR_INT_BASE);
+ err = panthor_request_pwr_irq(ptdev, &pwr->irq, irq,
+ pwr->iomem + PWR_INT_BASE);
if (err)
return err;
+ panthor_pwr_irq_enable_events(&pwr->irq, PWR_INTERRUPTS_MASK);
+ panthor_pwr_irq_resume(&pwr->irq);
return 0;
}
--
2.54.0
^ permalink raw reply related [flat|nested] 31+ messages in thread* Re: [PATCH 11/11] drm/panthor: Keep interrupts masked until they are needed
2026-06-25 12:40 ` [PATCH 11/11] drm/panthor: Keep interrupts masked until they are needed Boris Brezillon
@ 2026-06-26 13:18 ` Liviu Dudau
0 siblings, 0 replies; 31+ messages in thread
From: Liviu Dudau @ 2026-06-26 13:18 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
David Airlie, Simona Vetter, Nicolas Frattaroli, Chia-I Wu,
Karunika Choo, dri-devel, linux-kernel, Shashiko
On Thu, Jun 25, 2026 at 02:40:37PM +0200, Boris Brezillon wrote:
> The autogenerated panthor_request_xx_irq() helpers unmask Mali
> interrupts before we're sure we'll have a handler registered. For
> non-shared IRQ lines, that's fine, but for shared ones, it might cause
> an interrupt flood if the HW block raises an interrupt for any reason.
>
> We could reworking the calls in panthor_request_xx_irq(), but it's just
^ rework
> simpler to let the caller decide when they are ready to handle interrupts
> and call panthor_pwr_irq_resume() themselves. While at it, rework the
> prototype to let users call panthor_pwr_irq_enable_events() explicitly
> instead of passing an initial mask to panthor_request_pwr_irq().
>
> Fixes: 5fe909cae118 ("drm/panthor: Add the device logical block")
> Reported-by: Shashiko <sashiko-bot@kernel.org>
> Closes: https://sashiko.dev/#/patchset/20260623-panthor-signal-from-irq-v3-0-2ece396f8ee0@collabora.com?part=3
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Looks good to me!
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
> ---
> drivers/gpu/drm/panthor/panthor_device.h | 7 ++++---
> drivers/gpu/drm/panthor/panthor_fw.c | 2 +-
> drivers/gpu/drm/panthor/panthor_gpu.c | 3 ++-
> drivers/gpu/drm/panthor/panthor_mmu.c | 9 +++++++--
> drivers/gpu/drm/panthor/panthor_pwr.c | 7 ++++---
> 5 files changed, 18 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> index a39386bd6382..0fda64fbe5f2 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.h
> +++ b/drivers/gpu/drm/panthor/panthor_device.h
> @@ -588,14 +588,15 @@ static inline void panthor_ ## __name ## _irq_resume(struct panthor_irq *pirq)
> \
> static int panthor_request_ ## __name ## _irq(struct panthor_device *ptdev, \
> struct panthor_irq *pirq, \
> - int irq, u32 mask, void __iomem *iomem) \
> + int irq, void __iomem *iomem) \
> { \
> pirq->ptdev = ptdev; \
> pirq->irq = irq; \
> - pirq->mask = mask; \
> + pirq->mask = 0; \
> pirq->iomem = iomem; \
> spin_lock_init(&pirq->mask_lock); \
> - panthor_ ## __name ## _irq_resume(pirq); \
> + atomic_set(&pirq->state, PANTHOR_IRQ_STATE_SUSPENDED); \
> + gpu_write(pirq->iomem, INT_MASK, 0); \
> \
> return devm_request_threaded_irq(ptdev->base.dev, irq, \
> panthor_ ## __name ## _irq_raw_handler, \
> diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
> index 4fbddb9e18c8..de8e6689a869 100644
> --- a/drivers/gpu/drm/panthor/panthor_fw.c
> +++ b/drivers/gpu/drm/panthor/panthor_fw.c
> @@ -1474,7 +1474,7 @@ int panthor_fw_init(struct panthor_device *ptdev)
> if (irq <= 0)
> return -ENODEV;
>
> - ret = panthor_request_job_irq(ptdev, &fw->irq, irq, 0,
> + ret = panthor_request_job_irq(ptdev, &fw->irq, irq,
> ptdev->iomem + JOB_INT_BASE);
> if (ret) {
> drm_err(&ptdev->base, "failed to request job irq");
> diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
> index e52c5675981f..c013d6bf9a59 100644
> --- a/drivers/gpu/drm/panthor/panthor_gpu.c
> +++ b/drivers/gpu/drm/panthor/panthor_gpu.c
> @@ -170,11 +170,12 @@ int panthor_gpu_init(struct panthor_device *ptdev)
> return irq;
>
> ret = panthor_request_gpu_irq(ptdev, &ptdev->gpu->irq, irq,
> - GPU_INTERRUPTS_MASK,
> ptdev->iomem + GPU_INT_BASE);
> if (ret)
> return ret;
>
> + panthor_gpu_irq_enable_events(&ptdev->gpu->irq, GPU_INTERRUPTS_MASK);
> + panthor_gpu_irq_resume(&ptdev->gpu->irq);
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index 31cc57029c12..1fef3c5c1b50 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -3406,7 +3406,6 @@ int panthor_mmu_init(struct panthor_device *ptdev)
> return -ENODEV;
>
> ret = panthor_request_mmu_irq(ptdev, &mmu->irq, irq,
> - panthor_mmu_fault_mask(ptdev, ~0),
> ptdev->iomem + MMU_INT_BASE);
> if (ret)
> return ret;
> @@ -3424,7 +3423,13 @@ int panthor_mmu_init(struct panthor_device *ptdev)
> ptdev->gpu_info.mmu_features |= BITS_PER_LONG;
> }
>
> - return drmm_add_action_or_reset(&ptdev->base, panthor_mmu_release_wq, mmu->vm.wq);
> + ret = drmm_add_action_or_reset(&ptdev->base, panthor_mmu_release_wq, mmu->vm.wq);
> + if (ret)
> + return ret;
> +
> + panthor_mmu_irq_enable_events(&mmu->irq, panthor_mmu_fault_mask(ptdev, ~0));
> + panthor_mmu_irq_resume(&mmu->irq);
> + return 0;
> }
>
> #ifdef CONFIG_DEBUG_FS
> diff --git a/drivers/gpu/drm/panthor/panthor_pwr.c b/drivers/gpu/drm/panthor/panthor_pwr.c
> index 090362bd700b..f2c2c3000590 100644
> --- a/drivers/gpu/drm/panthor/panthor_pwr.c
> +++ b/drivers/gpu/drm/panthor/panthor_pwr.c
> @@ -484,12 +484,13 @@ int panthor_pwr_init(struct panthor_device *ptdev)
> if (irq < 0)
> return irq;
>
> - err = panthor_request_pwr_irq(
> - ptdev, &pwr->irq, irq, PWR_INTERRUPTS_MASK,
> - pwr->iomem + PWR_INT_BASE);
> + err = panthor_request_pwr_irq(ptdev, &pwr->irq, irq,
> + pwr->iomem + PWR_INT_BASE);
> if (err)
> return err;
>
> + panthor_pwr_irq_enable_events(&pwr->irq, PWR_INTERRUPTS_MASK);
> + panthor_pwr_irq_resume(&pwr->irq);
> return 0;
> }
>
>
> --
> 2.54.0
>
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 31+ messages in thread