All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/sched: Protect entity->last_scheduled with spinlock
@ 2026-06-24 12:46 Philipp Stanner
  2026-06-24 13:03 ` sashiko-bot
  0 siblings, 1 reply; 2+ messages in thread
From: Philipp Stanner @ 2026-06-24 12:46 UTC (permalink / raw)
  To: Matthew Brost, Danilo Krummrich, Philipp Stanner,
	Christian König, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, Sumit Semwal
  Cc: dri-devel, linux-kernel, linux-media, linaro-mm-sig

The entity->last_scheduled field has always been set and read with
special RCU functions in addition to memory barriers. There is no
obvious reason for that, since the entity lock is available and taken in all
places that evaluate the last_scheduled field. The only exception is
drm_sched_entity_error(), which is not performance critical in any way.

Improve robustness, readability and maintainability by replacing RCU and
barriers with the lock.

As a preparatory step, while at it, also guard spsc_queue_pop() with
the lock, since spsc_queue is deprecated and supposed to be replaced
with a locked list.

Signed-off-by: Philipp Stanner <phasta@kernel.org>
---
Tested with drm_sched unit tests, which all ran fine.
---
 drivers/gpu/drm/scheduler/sched_entity.c | 49 +++++++++++-------------
 include/drm/gpu_scheduler.h              |  9 ++---
 2 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index c51101ec70c1..95b2c48a604a 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -135,7 +135,6 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
 	entity->num_sched_list = num_sched_list;
 	entity->sched_list = num_sched_list > 1 ? sched_list : NULL;
 	entity->rq = &sched_list[0]->rq;
-	RCU_INIT_POINTER(entity->last_scheduled, NULL);
 	RB_CLEAR_NODE(&entity->rb_tree_node);
 	init_completion(&entity->entity_idle);
 
@@ -201,10 +200,10 @@ int drm_sched_entity_error(struct drm_sched_entity *entity)
 	struct dma_fence *fence;
 	int r;
 
-	rcu_read_lock();
-	fence = rcu_dereference(entity->last_scheduled);
+	spin_lock(&entity->lock);
+	fence = entity->last_scheduled;
 	r = fence ? fence->error : 0;
-	rcu_read_unlock();
+	spin_unlock(&entity->lock);
 
 	return r;
 }
@@ -288,8 +287,10 @@ void drm_sched_entity_kill(struct drm_sched_entity *entity)
 	wait_for_completion(&entity->entity_idle);
 
 	/* The entity is guaranteed to not be used by the scheduler */
-	prev = rcu_dereference_check(entity->last_scheduled, true);
+	spin_lock(&entity->lock);
+	prev = entity->last_scheduled;
 	dma_fence_get(prev);
+	spin_unlock(&entity->lock);
 	while ((job = drm_sched_entity_queue_pop(entity))) {
 		struct drm_sched_fence *s_fence = job->s_fence;
 
@@ -381,8 +382,8 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
 		entity->dependency = NULL;
 	}
 
-	dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
-	RCU_INIT_POINTER(entity->last_scheduled, NULL);
+	dma_fence_put(entity->last_scheduled);
+	WRITE_ONCE(entity->last_scheduled, NULL);
 	drm_sched_entity_stats_put(entity->stats);
 }
 EXPORT_SYMBOL(drm_sched_entity_fini);
@@ -523,18 +524,18 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
 	if (entity->guilty && atomic_read(entity->guilty))
 		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
 
-	dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
-	rcu_assign_pointer(entity->last_scheduled,
-			   dma_fence_get(&sched_job->s_fence->finished));
+	spin_lock(&entity->lock);
+	dma_fence_put(entity->last_scheduled);
+	entity->last_scheduled = dma_fence_get(&sched_job->s_fence->finished);
 
-	/*
-	 * If the queue is empty we allow drm_sched_entity_select_rq() to
-	 * locklessly access ->last_scheduled. This only works if we set the
-	 * pointer before we dequeue and if we a write barrier here.
+	/* A recent rework required taking the spinlock above. Since spsc_queue
+	 * is scheduled for removal as per the DRM-TODO-list, we access it here
+	 * locked already to prepare for that cleanup.
+	 *
+	 * TODO: Fully replace spsc_queue with a locked (h)list.
 	 */
-	smp_wmb();
-
 	spsc_queue_pop(&entity->job_queue);
+	spin_unlock(&entity->lock);
 
 	drm_sched_rq_pop_entity(entity);
 
@@ -561,21 +562,15 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
 	if (spsc_queue_count(&entity->job_queue))
 		return;
 
-	/*
-	 * Only when the queue is empty are we guaranteed that
-	 * drm_sched_run_job_work() cannot change entity->last_scheduled. To
-	 * enforce ordering we need a read barrier here. See
-	 * drm_sched_entity_pop_job() for the other side.
-	 */
-	smp_rmb();
-
-	fence = rcu_dereference_check(entity->last_scheduled, true);
+	spin_lock(&entity->lock);
+	fence = entity->last_scheduled;
 
 	/* stay on the same engine if the previous job hasn't finished */
-	if (fence && !dma_fence_is_signaled(fence))
+	if (fence && !dma_fence_is_signaled(fence)) {
+		spin_unlock(&entity->lock);
 		return;
+	}
 
-	spin_lock(&entity->lock);
 	sched = drm_sched_pick_best(entity->sched_list, entity->num_sched_list);
 	rq = sched ? &sched->rq : NULL;
 	if (rq != entity->rq) {
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index d61c19e78182..176ff1f936cd 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -100,7 +100,8 @@ struct drm_sched_entity {
 	 * @lock:
 	 *
 	 * Lock protecting the run-queue (@rq) to which this entity belongs,
-	 * @priority and the list of schedulers (@sched_list, @num_sched_list).
+	 * @priority, @last_scheduled and the list of schedulers (@sched_list,
+	 * @num_sched_list).
 	 */
 	spinlock_t			lock;
 
@@ -202,11 +203,9 @@ struct drm_sched_entity {
 	/**
 	 * @last_scheduled:
 	 *
-	 * Points to the finished fence of the last scheduled job. Only written
-	 * by drm_sched_entity_pop_job(). Can be accessed locklessly from
-	 * drm_sched_job_arm() if the queue is empty.
+	 * Points to the finished fence of the last scheduled job.
 	 */
-	struct dma_fence __rcu		*last_scheduled;
+	struct dma_fence		*last_scheduled;
 
 	/**
 	 * @last_user: last group leader pushing a job into the entity.

base-commit: 60b5fa6edfef867322fce7c8306e5c4b46211be7
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] drm/sched: Protect entity->last_scheduled with spinlock
  2026-06-24 12:46 [PATCH] drm/sched: Protect entity->last_scheduled with spinlock Philipp Stanner
@ 2026-06-24 13:03 ` sashiko-bot
  0 siblings, 0 replies; 2+ messages in thread
From: sashiko-bot @ 2026-06-24 13:03 UTC (permalink / raw)
  To: Philipp Stanner; +Cc: dri-devel

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] Calling dma_fence_put() while holding the entity->lock spinlock introduces a risk of AB-BA lock inversion and deadlock with driver locks.
--

commit 21ecb5d3d71a1baa4ff6c29fb3ddc0deff8299e3
Author: Philipp Stanner <phasta@kernel.org>

drm/sched: Protect entity->last_scheduled with spinlock

This commit replaces the RCU protection and memory barriers used for
entity->last_scheduled with the existing entity->lock spinlock. It also
guards the spsc_queue_pop() call with the same lock as a preparatory step
for future cleanups.

> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index c51101ec70c16..95b2c48a604a6 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
[ ... ]
> @@ -523,18 +524,18 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>  	if (entity->guilty && atomic_read(entity->guilty))
>  		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
>  
> -	dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
> -	rcu_assign_pointer(entity->last_scheduled,
> -			   dma_fence_get(&sched_job->s_fence->finished));
> -
> -	/*
> -	 * If the queue is empty we allow drm_sched_entity_select_rq() to
> -	 * locklessly access ->last_scheduled. This only works if we set the
> -	 * pointer before we dequeue and if we a write barrier here.
> +	spin_lock(&entity->lock);
> +	dma_fence_put(entity->last_scheduled);

[Severity: High]
Could calling dma_fence_put() while holding entity->lock introduce an AB-BA
lock inversion risk here?

If dropping the final reference to the scheduled fence invokes its release
callback, this can synchronously drop the reference to the underlying driver
hardware fence (fence->parent). When the hardware fence's reference count
drops to zero, the driver-specific release callback is invoked synchronously
while entity->lock is held.

Drivers commonly acquire their own internal locks (e.g., submission queue
locks) in their fence release callbacks. However, during job submission,
drivers typically hold these same internal locks before calling
drm_sched_entity_push_job(), which subsequently acquires entity->lock.
This reverse lock ordering might create a circular lock dependency.

Would it be safer to grab a local reference to entity->last_scheduled,
update the pointer, drop the spinlock, and then call dma_fence_put() on
the local reference outside the critical section?

> +	entity->last_scheduled = dma_fence_get(&sched_job->s_fence->finished);
> +
> +	/* A recent rework required taking the spinlock above. Since spsc_queue
> +	 * is scheduled for removal as per the DRM-TODO-list, we access it here
> +	 * locked already to prepare for that cleanup.
> +	 *
> +	 * TODO: Fully replace spsc_queue with a locked (h)list.
>  	 */
> -	smp_wmb();
> -
>  	spsc_queue_pop(&entity->job_queue);
> +	spin_unlock(&entity->lock);
>  
>  	drm_sched_rq_pop_entity(entity);
>

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260624124651.1414814-2-phasta@kernel.org?part=1

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-06-24 13:03 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2026-06-24 12:46 [PATCH] drm/sched: Protect entity->last_scheduled with spinlock Philipp Stanner
2026-06-24 13:03 ` sashiko-bot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.