* [PATCH v2 1/6] drm/sched: Fix teardown leaks with waitqueue
2025-04-24 9:55 [PATCH v2 0/6] drm/sched: Fix memory leaks in drm_sched_fini() Philipp Stanner
@ 2025-04-24 9:55 ` Philipp Stanner
2025-05-16 9:19 ` Tvrtko Ursulin
2025-04-24 9:55 ` [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long Philipp Stanner
` (4 subsequent siblings)
5 siblings, 1 reply; 24+ messages in thread
From: Philipp Stanner @ 2025-04-24 9:55 UTC (permalink / raw)
To: Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Philipp Stanner, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
Tvrtko Ursulin
Cc: dri-devel, nouveau, linux-kernel, Philipp Stanner
From: Philipp Stanner <pstanner@redhat.com>
The GPU scheduler currently does not ensure that its pending_list is
empty before performing various other teardown tasks in
drm_sched_fini().
If there are still jobs in the pending_list, this is problematic because
after scheduler teardown, no one will call backend_ops.free_job()
anymore. This would, consequently, result in memory leaks.
One way to solve this is to implement a waitqueue that drm_sched_fini()
blocks on until the pending_list has become empty.
Add a waitqueue to struct drm_gpu_scheduler. Wake up waiters once the
pending_list becomes empty. Wait in drm_sched_fini() for that to happen.
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
---
drivers/gpu/drm/scheduler/sched_main.c | 72 ++++++++++++++++++++------
include/drm/gpu_scheduler.h | 6 +++
2 files changed, 63 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 829579c41c6b..ea82e69a72a8 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -367,7 +367,7 @@ static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
*/
static void __drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
{
- if (!READ_ONCE(sched->pause_submit))
+ if (!READ_ONCE(sched->pause_free))
queue_work(sched->submit_wq, &sched->work_free_job);
}
@@ -556,6 +556,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
* is parked at which point it's safe.
*/
list_del_init(&job->list);
+
spin_unlock(&sched->job_list_lock);
status = job->sched->ops->timedout_job(job);
@@ -1119,6 +1120,12 @@ drm_sched_get_finished_job(struct drm_gpu_scheduler *sched)
/* remove job from pending_list */
list_del_init(&job->list);
+ /*
+ * Inform tasks blocking in drm_sched_fini() that it's now safe to proceed.
+ */
+ if (list_empty(&sched->pending_list))
+ wake_up(&sched->pending_list_waitque);
+
/* cancel this job's TO timer */
cancel_delayed_work(&sched->work_tdr);
/* make the scheduled timestamp more accurate */
@@ -1324,6 +1331,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
init_waitqueue_head(&sched->job_scheduled);
INIT_LIST_HEAD(&sched->pending_list);
spin_lock_init(&sched->job_list_lock);
+ init_waitqueue_head(&sched->pending_list_waitque);
atomic_set(&sched->credit_count, 0);
INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
@@ -1331,6 +1339,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
atomic_set(&sched->_score, 0);
atomic64_set(&sched->job_id_count, 0);
sched->pause_submit = false;
+ sched->pause_free = false;
sched->ready = true;
return 0;
@@ -1348,6 +1357,39 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
}
EXPORT_SYMBOL(drm_sched_init);
+/**
+ * drm_sched_submission_and_timeout_stop - stop everything except for free_job()
+ * @sched: scheduler instance
+ *
+ * Only needed to cleanly tear down the scheduler in drm_sched_fini().
+ */
+static inline void
+drm_sched_submission_and_timeout_stop(struct drm_gpu_scheduler *sched)
+{
+ WRITE_ONCE(sched->pause_submit, true);
+ cancel_work_sync(&sched->work_run_job);
+ cancel_delayed_work_sync(&sched->work_tdr);
+}
+
+static inline void
+drm_sched_free_stop(struct drm_gpu_scheduler *sched)
+{
+ WRITE_ONCE(sched->pause_free, true);
+ cancel_work_sync(&sched->work_free_job);
+}
+
+static inline bool
+drm_sched_no_jobs_pending(struct drm_gpu_scheduler *sched)
+{
+ bool empty;
+
+ spin_lock(&sched->job_list_lock);
+ empty = list_empty(&sched->pending_list);
+ spin_unlock(&sched->job_list_lock);
+
+ return empty;
+}
+
/**
* drm_sched_fini - Destroy a gpu scheduler
*
@@ -1355,26 +1397,24 @@ EXPORT_SYMBOL(drm_sched_init);
*
* Tears down and cleans up the scheduler.
*
- * This stops submission of new jobs to the hardware through
- * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job()
- * will not be called for all jobs still in drm_gpu_scheduler.pending_list.
- * There is no solution for this currently. Thus, it is up to the driver to make
- * sure that:
- *
- * a) drm_sched_fini() is only called after for all submitted jobs
- * drm_sched_backend_ops.free_job() has been called or that
- * b) the jobs for which drm_sched_backend_ops.free_job() has not been called
- * after drm_sched_fini() ran are freed manually.
- *
- * FIXME: Take care of the above problem and prevent this function from leaking
- * the jobs in drm_gpu_scheduler.pending_list under any circumstances.
+ * Note that this function blocks until all the fences returned by
+ * &struct drm_sched_backend_ops.run_job have been signalled.
*/
void drm_sched_fini(struct drm_gpu_scheduler *sched)
{
struct drm_sched_entity *s_entity;
int i;
- drm_sched_wqueue_stop(sched);
+ /*
+ * Jobs that have neither been scheduled or which have timed out are
+ * gone by now, but jobs that have been submitted through
+ * backend_ops.run_job() and have not yet terminated are still pending.
+ *
+ * Wait for the pending_list to become empty to avoid leaking those jobs.
+ */
+ drm_sched_submission_and_timeout_stop(sched);
+ wait_event(sched->pending_list_waitque, drm_sched_no_jobs_pending(sched));
+ drm_sched_free_stop(sched);
for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
struct drm_sched_rq *rq = sched->sched_rq[i];
@@ -1471,6 +1511,7 @@ EXPORT_SYMBOL(drm_sched_wqueue_ready);
void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
{
WRITE_ONCE(sched->pause_submit, true);
+ WRITE_ONCE(sched->pause_free, true);
cancel_work_sync(&sched->work_run_job);
cancel_work_sync(&sched->work_free_job);
}
@@ -1488,6 +1529,7 @@ EXPORT_SYMBOL(drm_sched_wqueue_stop);
void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
{
WRITE_ONCE(sched->pause_submit, false);
+ WRITE_ONCE(sched->pause_free, false);
queue_work(sched->submit_wq, &sched->work_run_job);
queue_work(sched->submit_wq, &sched->work_free_job);
}
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 1a7e377d4cbb..d0b1f416b4d9 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -29,6 +29,7 @@
#include <linux/completion.h>
#include <linux/xarray.h>
#include <linux/workqueue.h>
+#include <linux/wait.h>
#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000)
@@ -533,6 +534,8 @@ struct drm_sched_backend_ops {
* timeout interval is over.
* @pending_list: the list of jobs which are currently in the job queue.
* @job_list_lock: lock to protect the pending_list.
+ * @pending_list_waitque: a waitqueue for drm_sched_fini() to block on until all
+ * pending jobs have been finished.
* @hang_limit: once the hangs by a job crosses this limit then it is marked
* guilty and it will no longer be considered for scheduling.
* @score: score to help loadbalancer pick a idle sched
@@ -540,6 +543,7 @@ struct drm_sched_backend_ops {
* @ready: marks if the underlying HW is ready to work
* @free_guilty: A hit to time out handler to free the guilty job.
* @pause_submit: pause queuing of @work_run_job on @submit_wq
+ * @pause_free: pause queuing of @work_free_job on @submit_wq
* @own_submit_wq: scheduler owns allocation of @submit_wq
* @dev: system &struct device
*
@@ -562,12 +566,14 @@ struct drm_gpu_scheduler {
struct delayed_work work_tdr;
struct list_head pending_list;
spinlock_t job_list_lock;
+ wait_queue_head_t pending_list_waitque;
int hang_limit;
atomic_t *score;
atomic_t _score;
bool ready;
bool free_guilty;
bool pause_submit;
+ bool pause_free;
bool own_submit_wq;
struct device *dev;
};
--
2.48.1
^ permalink raw reply related [flat|nested] 24+ messages in thread* Re: [PATCH v2 1/6] drm/sched: Fix teardown leaks with waitqueue
2025-04-24 9:55 ` [PATCH v2 1/6] drm/sched: Fix teardown leaks with waitqueue Philipp Stanner
@ 2025-05-16 9:19 ` Tvrtko Ursulin
0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-16 9:19 UTC (permalink / raw)
To: Philipp Stanner, Lyude Paul, Danilo Krummrich, David Airlie,
Simona Vetter, Matthew Brost, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel, Philipp Stanner
Hi,
Some review comments below, with a caveat that does not imply support
for the approach just yet.
On 24/04/2025 10:55, Philipp Stanner wrote:
> From: Philipp Stanner <pstanner@redhat.com>
>
> The GPU scheduler currently does not ensure that its pending_list is
> empty before performing various other teardown tasks in
> drm_sched_fini().
>
> If there are still jobs in the pending_list, this is problematic because
> after scheduler teardown, no one will call backend_ops.free_job()
> anymore. This would, consequently, result in memory leaks.
>
> One way to solve this is to implement a waitqueue that drm_sched_fini()
> blocks on until the pending_list has become empty.
>
> Add a waitqueue to struct drm_gpu_scheduler. Wake up waiters once the
> pending_list becomes empty. Wait in drm_sched_fini() for that to happen.
>
> Signed-off-by: Philipp Stanner <pstanner@redhat.com>
> ---
> drivers/gpu/drm/scheduler/sched_main.c | 72 ++++++++++++++++++++------
> include/drm/gpu_scheduler.h | 6 +++
> 2 files changed, 63 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 829579c41c6b..ea82e69a72a8 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -367,7 +367,7 @@ static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
> */
> static void __drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
> {
> - if (!READ_ONCE(sched->pause_submit))
> + if (!READ_ONCE(sched->pause_free))
> queue_work(sched->submit_wq, &sched->work_free_job);
> }
>
> @@ -556,6 +556,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
> * is parked at which point it's safe.
> */
> list_del_init(&job->list);
> +
> spin_unlock(&sched->job_list_lock);
>
> status = job->sched->ops->timedout_job(job);
> @@ -1119,6 +1120,12 @@ drm_sched_get_finished_job(struct drm_gpu_scheduler *sched)
> /* remove job from pending_list */
> list_del_init(&job->list);
>
> + /*
> + * Inform tasks blocking in drm_sched_fini() that it's now safe to proceed.
> + */
> + if (list_empty(&sched->pending_list))
> + wake_up(&sched->pending_list_waitque);
Typo in queue.
wake_up could go on the else branch of the if (next) block below.
Also, another micro-advantage to the ->cancel_job() approach is that it
doesn't add executing stuff at runtime which is only relevant for the
teardown.
> +
> /* cancel this job's TO timer */
> cancel_delayed_work(&sched->work_tdr);
> /* make the scheduled timestamp more accurate */
> @@ -1324,6 +1331,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
> init_waitqueue_head(&sched->job_scheduled);
> INIT_LIST_HEAD(&sched->pending_list);
> spin_lock_init(&sched->job_list_lock);
> + init_waitqueue_head(&sched->pending_list_waitque);
> atomic_set(&sched->credit_count, 0);
> INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
> @@ -1331,6 +1339,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
> atomic_set(&sched->_score, 0);
> atomic64_set(&sched->job_id_count, 0);
> sched->pause_submit = false;
> + sched->pause_free = false;
>
> sched->ready = true;
> return 0;
> @@ -1348,6 +1357,39 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
> }
> EXPORT_SYMBOL(drm_sched_init);
>
> +/**
> + * drm_sched_submission_and_timeout_stop - stop everything except for free_job()
> + * @sched: scheduler instance
> + *
> + * Only needed to cleanly tear down the scheduler in drm_sched_fini().
> + */
> +static inline void
> +drm_sched_submission_and_timeout_stop(struct drm_gpu_scheduler *sched)
> +{
> + WRITE_ONCE(sched->pause_submit, true);
> + cancel_work_sync(&sched->work_run_job);
> + cancel_delayed_work_sync(&sched->work_tdr);
> +}
This one and the two below it: I think best known practice is to leave
out the inline keyword and let the compiler decide.
> +
> +static inline void
> +drm_sched_free_stop(struct drm_gpu_scheduler *sched)
> +{
> + WRITE_ONCE(sched->pause_free, true);
> + cancel_work_sync(&sched->work_free_job);
> +}
> +
> +static inline bool
> +drm_sched_no_jobs_pending(struct drm_gpu_scheduler *sched)
> +{
> + bool empty;
> +
> + spin_lock(&sched->job_list_lock);
> + empty = list_empty(&sched->pending_list);
> + spin_unlock(&sched->job_list_lock);
> +
> + return empty;
> +}
> +
> /**
> * drm_sched_fini - Destroy a gpu scheduler
> *
> @@ -1355,26 +1397,24 @@ EXPORT_SYMBOL(drm_sched_init);
> *
> * Tears down and cleans up the scheduler.
> *
> - * This stops submission of new jobs to the hardware through
> - * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job()
> - * will not be called for all jobs still in drm_gpu_scheduler.pending_list.
> - * There is no solution for this currently. Thus, it is up to the driver to make
> - * sure that:
> - *
> - * a) drm_sched_fini() is only called after for all submitted jobs
> - * drm_sched_backend_ops.free_job() has been called or that
> - * b) the jobs for which drm_sched_backend_ops.free_job() has not been called
> - * after drm_sched_fini() ran are freed manually.
> - *
> - * FIXME: Take care of the above problem and prevent this function from leaking
> - * the jobs in drm_gpu_scheduler.pending_list under any circumstances.
> + * Note that this function blocks until all the fences returned by
> + * &struct drm_sched_backend_ops.run_job have been signalled.
> */
> void drm_sched_fini(struct drm_gpu_scheduler *sched)
> {
> struct drm_sched_entity *s_entity;
> int i;
>
> - drm_sched_wqueue_stop(sched);
> + /*
> + * Jobs that have neither been scheduled or which have timed out are
> + * gone by now, but jobs that have been submitted through
> + * backend_ops.run_job() and have not yet terminated are still pending.
> + *
> + * Wait for the pending_list to become empty to avoid leaking those jobs.
> + */
> + drm_sched_submission_and_timeout_stop(sched);
> + wait_event(sched->pending_list_waitque, drm_sched_no_jobs_pending(sched));
I think this is a patch ordering issue. To avoid blocking indefinitely
after patch 1, you probably should swap patches 1 and 2, or maybe just
squash them.
Regards,
Tvrtko
> + drm_sched_free_stop(sched);
>
> for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
> struct drm_sched_rq *rq = sched->sched_rq[i];
> @@ -1471,6 +1511,7 @@ EXPORT_SYMBOL(drm_sched_wqueue_ready);
> void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
> {
> WRITE_ONCE(sched->pause_submit, true);
> + WRITE_ONCE(sched->pause_free, true);
> cancel_work_sync(&sched->work_run_job);
> cancel_work_sync(&sched->work_free_job);
> }
> @@ -1488,6 +1529,7 @@ EXPORT_SYMBOL(drm_sched_wqueue_stop);
> void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
> {
> WRITE_ONCE(sched->pause_submit, false);
> + WRITE_ONCE(sched->pause_free, false);
> queue_work(sched->submit_wq, &sched->work_run_job);
> queue_work(sched->submit_wq, &sched->work_free_job);
> }
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 1a7e377d4cbb..d0b1f416b4d9 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -29,6 +29,7 @@
> #include <linux/completion.h>
> #include <linux/xarray.h>
> #include <linux/workqueue.h>
> +#include <linux/wait.h>
>
> #define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000)
>
> @@ -533,6 +534,8 @@ struct drm_sched_backend_ops {
> * timeout interval is over.
> * @pending_list: the list of jobs which are currently in the job queue.
> * @job_list_lock: lock to protect the pending_list.
> + * @pending_list_waitque: a waitqueue for drm_sched_fini() to block on until all
> + * pending jobs have been finished.
> * @hang_limit: once the hangs by a job crosses this limit then it is marked
> * guilty and it will no longer be considered for scheduling.
> * @score: score to help loadbalancer pick a idle sched
> @@ -540,6 +543,7 @@ struct drm_sched_backend_ops {
> * @ready: marks if the underlying HW is ready to work
> * @free_guilty: A hit to time out handler to free the guilty job.
> * @pause_submit: pause queuing of @work_run_job on @submit_wq
> + * @pause_free: pause queuing of @work_free_job on @submit_wq
> * @own_submit_wq: scheduler owns allocation of @submit_wq
> * @dev: system &struct device
> *
> @@ -562,12 +566,14 @@ struct drm_gpu_scheduler {
> struct delayed_work work_tdr;
> struct list_head pending_list;
> spinlock_t job_list_lock;
> + wait_queue_head_t pending_list_waitque;
> int hang_limit;
> atomic_t *score;
> atomic_t _score;
> bool ready;
> bool free_guilty;
> bool pause_submit;
> + bool pause_free;
> bool own_submit_wq;
> struct device *dev;
> };
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-04-24 9:55 [PATCH v2 0/6] drm/sched: Fix memory leaks in drm_sched_fini() Philipp Stanner
2025-04-24 9:55 ` [PATCH v2 1/6] drm/sched: Fix teardown leaks with waitqueue Philipp Stanner
@ 2025-04-24 9:55 ` Philipp Stanner
2025-05-16 9:33 ` Tvrtko Ursulin
2025-04-24 9:55 ` [PATCH v2 3/6] drm/sched: Warn if pending list is not empty Philipp Stanner
` (3 subsequent siblings)
5 siblings, 1 reply; 24+ messages in thread
From: Philipp Stanner @ 2025-04-24 9:55 UTC (permalink / raw)
To: Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Philipp Stanner, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
Tvrtko Ursulin
Cc: dri-devel, nouveau, linux-kernel, Danilo Krummrich
The waitqueue that ensures that drm_sched_fini() blocks until the
pending_list has become empty could theoretically cause that function to
block for a very long time. That, ultimately, could block userspace
procesess and prevent them from being killable through SIGKILL.
When a driver calls drm_sched_fini(), it is safe to assume that all
still pending jobs are not needed anymore anyways. Thus, they can be
cancelled and thereby it can be ensured that drm_sched_fini() will
return relatively quickly.
Implement a new helper to stop all work items / submission except for
the drm_sched_backend_ops.run_job().
Implement a driver callback, kill_fence_context(), that instructs the
driver to kill the fence context associated with this scheduler, thereby
causing all pending hardware fences to be signalled.
Call those new routines in drm_sched_fini() and ensure backwards
compatibility if the new callback is not implemented.
Suggested-by: Danilo Krummrich <dakr@redhat.com>
Signed-off-by: Philipp Stanner <phasta@kernel.org>
---
drivers/gpu/drm/scheduler/sched_main.c | 47 +++++++++++++++++---------
include/drm/gpu_scheduler.h | 11 ++++++
2 files changed, 42 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index ea82e69a72a8..c2ad6c70bfb6 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -1390,31 +1390,46 @@ drm_sched_no_jobs_pending(struct drm_gpu_scheduler *sched)
return empty;
}
+/**
+ * drm_sched_cancel_jobs_and_wait - trigger freeing of all pending jobs
+ * @sched: scheduler instance
+ *
+ * Must only be called if &struct drm_sched_backend_ops.kill_fence_context is
+ * implemented.
+ *
+ * Instructs the driver to kill the fence context associated with this scheduler,
+ * thereby signalling all pending fences. This, in turn, will trigger
+ * &struct drm_sched_backend_ops.free_job to be called for all pending jobs.
+ * The function then blocks until all pending jobs have been freed.
+ */
+static inline void
+drm_sched_cancel_jobs_and_wait(struct drm_gpu_scheduler *sched)
+{
+ sched->ops->kill_fence_context(sched);
+ wait_event(sched->pending_list_waitque, drm_sched_no_jobs_pending(sched));
+}
+
/**
* drm_sched_fini - Destroy a gpu scheduler
*
* @sched: scheduler instance
*
- * Tears down and cleans up the scheduler.
- *
- * Note that this function blocks until all the fences returned by
- * &struct drm_sched_backend_ops.run_job have been signalled.
+ * Tears down and cleans up the scheduler. Might leak memory if
+ * &struct drm_sched_backend_ops.kill_fence_context is not implemented.
*/
void drm_sched_fini(struct drm_gpu_scheduler *sched)
{
struct drm_sched_entity *s_entity;
int i;
- /*
- * Jobs that have neither been scheduled or which have timed out are
- * gone by now, but jobs that have been submitted through
- * backend_ops.run_job() and have not yet terminated are still pending.
- *
- * Wait for the pending_list to become empty to avoid leaking those jobs.
- */
- drm_sched_submission_and_timeout_stop(sched);
- wait_event(sched->pending_list_waitque, drm_sched_no_jobs_pending(sched));
- drm_sched_free_stop(sched);
+ if (sched->ops->kill_fence_context) {
+ drm_sched_submission_and_timeout_stop(sched);
+ drm_sched_cancel_jobs_and_wait(sched);
+ drm_sched_free_stop(sched);
+ } else {
+ /* We're in "legacy free-mode" and ignore potential mem leaks */
+ drm_sched_wqueue_stop(sched);
+ }
for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
struct drm_sched_rq *rq = sched->sched_rq[i];
@@ -1502,7 +1517,7 @@ bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched)
EXPORT_SYMBOL(drm_sched_wqueue_ready);
/**
- * drm_sched_wqueue_stop - stop scheduler submission
+ * drm_sched_wqueue_stop - stop scheduler submission and freeing
* @sched: scheduler instance
*
* Stops the scheduler from pulling new jobs from entities. It also stops
@@ -1518,7 +1533,7 @@ void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
EXPORT_SYMBOL(drm_sched_wqueue_stop);
/**
- * drm_sched_wqueue_start - start scheduler submission
+ * drm_sched_wqueue_start - start scheduler submission and freeing
* @sched: scheduler instance
*
* Restarts the scheduler after drm_sched_wqueue_stop() has stopped it.
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index d0b1f416b4d9..8630b4a26f10 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -509,6 +509,17 @@ struct drm_sched_backend_ops {
* and it's time to clean it up.
*/
void (*free_job)(struct drm_sched_job *sched_job);
+
+ /**
+ * @kill_fence_context: kill the fence context belonging to this scheduler
+ *
+ * Needed to cleanly tear the scheduler down in drm_sched_fini(). This
+ * callback will cause all hardware fences to be signalled by the driver,
+ * which, ultimately, ensures that all jobs get freed before teardown.
+ *
+ * This callback is optional, but it is highly recommended to implement it.
+ */
+ void (*kill_fence_context)(struct drm_gpu_scheduler *sched);
};
/**
--
2.48.1
^ permalink raw reply related [flat|nested] 24+ messages in thread* Re: [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-04-24 9:55 ` [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long Philipp Stanner
@ 2025-05-16 9:33 ` Tvrtko Ursulin
2025-05-16 9:53 ` Danilo Krummrich
2025-05-16 9:54 ` Philipp Stanner
0 siblings, 2 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-16 9:33 UTC (permalink / raw)
To: Philipp Stanner, Lyude Paul, Danilo Krummrich, David Airlie,
Simona Vetter, Matthew Brost, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel, Danilo Krummrich
On 24/04/2025 10:55, Philipp Stanner wrote:
> The waitqueue that ensures that drm_sched_fini() blocks until the
> pending_list has become empty could theoretically cause that function to
> block for a very long time. That, ultimately, could block userspace
> procesess and prevent them from being killable through SIGKILL.
>
> When a driver calls drm_sched_fini(), it is safe to assume that all
> still pending jobs are not needed anymore anyways. Thus, they can be
> cancelled and thereby it can be ensured that drm_sched_fini() will
> return relatively quickly.
>
> Implement a new helper to stop all work items / submission except for
> the drm_sched_backend_ops.run_job().
>
> Implement a driver callback, kill_fence_context(), that instructs the
> driver to kill the fence context associated with this scheduler, thereby
> causing all pending hardware fences to be signalled.
>
> Call those new routines in drm_sched_fini() and ensure backwards
> compatibility if the new callback is not implemented.
>
> Suggested-by: Danilo Krummrich <dakr@redhat.com>
> Signed-off-by: Philipp Stanner <phasta@kernel.org>
> ---
> drivers/gpu/drm/scheduler/sched_main.c | 47 +++++++++++++++++---------
> include/drm/gpu_scheduler.h | 11 ++++++
> 2 files changed, 42 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index ea82e69a72a8..c2ad6c70bfb6 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -1390,31 +1390,46 @@ drm_sched_no_jobs_pending(struct drm_gpu_scheduler *sched)
> return empty;
> }
>
> +/**
> + * drm_sched_cancel_jobs_and_wait - trigger freeing of all pending jobs
> + * @sched: scheduler instance
> + *
> + * Must only be called if &struct drm_sched_backend_ops.kill_fence_context is
> + * implemented.
> + *
> + * Instructs the driver to kill the fence context associated with this scheduler,
> + * thereby signalling all pending fences. This, in turn, will trigger
> + * &struct drm_sched_backend_ops.free_job to be called for all pending jobs.
> + * The function then blocks until all pending jobs have been freed.
> + */
> +static inline void
> +drm_sched_cancel_jobs_and_wait(struct drm_gpu_scheduler *sched)
> +{
> + sched->ops->kill_fence_context(sched);
> + wait_event(sched->pending_list_waitque, drm_sched_no_jobs_pending(sched));
> +}
> +
> /**
> * drm_sched_fini - Destroy a gpu scheduler
> *
> * @sched: scheduler instance
> *
> - * Tears down and cleans up the scheduler.
> - *
> - * Note that this function blocks until all the fences returned by
> - * &struct drm_sched_backend_ops.run_job have been signalled.
> + * Tears down and cleans up the scheduler. Might leak memory if
> + * &struct drm_sched_backend_ops.kill_fence_context is not implemented.
> */
> void drm_sched_fini(struct drm_gpu_scheduler *sched)
> {
> struct drm_sched_entity *s_entity;
> int i;
>
> - /*
> - * Jobs that have neither been scheduled or which have timed out are
> - * gone by now, but jobs that have been submitted through
> - * backend_ops.run_job() and have not yet terminated are still pending.
> - *
> - * Wait for the pending_list to become empty to avoid leaking those jobs.
> - */
> - drm_sched_submission_and_timeout_stop(sched);
> - wait_event(sched->pending_list_waitque, drm_sched_no_jobs_pending(sched));
> - drm_sched_free_stop(sched);
> + if (sched->ops->kill_fence_context) {
> + drm_sched_submission_and_timeout_stop(sched);
> + drm_sched_cancel_jobs_and_wait(sched);
> + drm_sched_free_stop(sched);
> + } else {
> + /* We're in "legacy free-mode" and ignore potential mem leaks */
> + drm_sched_wqueue_stop(sched);
> + }
>
> for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
> struct drm_sched_rq *rq = sched->sched_rq[i];
> @@ -1502,7 +1517,7 @@ bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched)
> EXPORT_SYMBOL(drm_sched_wqueue_ready);
>
> /**
> - * drm_sched_wqueue_stop - stop scheduler submission
> + * drm_sched_wqueue_stop - stop scheduler submission and freeing
Looks like the kerneldoc corrections (below too) belong to the previous
patch. Irrelevant if you decide to squash them though.
> * @sched: scheduler instance
> *
> * Stops the scheduler from pulling new jobs from entities. It also stops
> @@ -1518,7 +1533,7 @@ void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
> EXPORT_SYMBOL(drm_sched_wqueue_stop);
>
> /**
> - * drm_sched_wqueue_start - start scheduler submission
> + * drm_sched_wqueue_start - start scheduler submission and freeing
> * @sched: scheduler instance
> *
> * Restarts the scheduler after drm_sched_wqueue_stop() has stopped it.
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index d0b1f416b4d9..8630b4a26f10 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -509,6 +509,17 @@ struct drm_sched_backend_ops {
> * and it's time to clean it up.
> */
> void (*free_job)(struct drm_sched_job *sched_job);
> +
> + /**
> + * @kill_fence_context: kill the fence context belonging to this scheduler
Which fence context would that be? ;)
Also, "fence context" would be a new terminology in gpu_scheduler.h API
level. You could call it ->sched_fini() or similar to signify at which
point in the API it gets called and then the fact it takes sched as
parameter would be natural.
We also probably want some commentary on the topic of indefinite (or
very long at least) blocking a thread exit / SIGINT/TERM/KILL time.
Cover letter touches upon that problem but I don't see you address it.
Is the idea to let drivers shoot themselves in the foot or what?
Regards,
Tvrtko
> + *
> + * Needed to cleanly tear the scheduler down in drm_sched_fini(). This
> + * callback will cause all hardware fences to be signalled by the driver,
> + * which, ultimately, ensures that all jobs get freed before teardown.
> + *
> + * This callback is optional, but it is highly recommended to implement it.
> + */
> + void (*kill_fence_context)(struct drm_gpu_scheduler *sched);
> };
>
> /**
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-05-16 9:33 ` Tvrtko Ursulin
@ 2025-05-16 9:53 ` Danilo Krummrich
2025-05-16 10:19 ` Tvrtko Ursulin
2025-05-16 9:54 ` Philipp Stanner
1 sibling, 1 reply; 24+ messages in thread
From: Danilo Krummrich @ 2025-05-16 9:53 UTC (permalink / raw)
To: Tvrtko Ursulin
Cc: Philipp Stanner, Lyude Paul, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann, dri-devel, nouveau,
linux-kernel, Danilo Krummrich
On Fri, May 16, 2025 at 10:33:30AM +0100, Tvrtko Ursulin wrote:
> On 24/04/2025 10:55, Philipp Stanner wrote:
> > + * @kill_fence_context: kill the fence context belonging to this scheduler
>
> Which fence context would that be? ;)
There's one one per ring and a scheduler instance represents a single ring. So,
what should be specified here?
> Also, "fence context" would be a new terminology in gpu_scheduler.h API
> level. You could call it ->sched_fini() or similar to signify at which point
> in the API it gets called and then the fact it takes sched as parameter
> would be natural.
The driver should tear down the fence context in this callback, not the while
scheduler. ->sched_fini() would hence be misleading.
> We also probably want some commentary on the topic of indefinite (or very
> long at least) blocking a thread exit / SIGINT/TERM/KILL time.
You mean in case the driver does implement the callback, but does *not* properly
tear down the fence context? So, you ask for describing potential consequences
of drivers having bugs in the implementation of the callback? Or something else?
> Is the idea to let drivers shoot themselves in the foot or what?
Please abstain from such rhetorical questions, that's not a good way of having
technical discussions.
- Danilo
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-05-16 9:53 ` Danilo Krummrich
@ 2025-05-16 10:19 ` Tvrtko Ursulin
2025-05-16 10:54 ` Danilo Krummrich
0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-16 10:19 UTC (permalink / raw)
To: Danilo Krummrich
Cc: Philipp Stanner, Lyude Paul, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann, dri-devel, nouveau,
linux-kernel, Danilo Krummrich
On 16/05/2025 10:53, Danilo Krummrich wrote:
> On Fri, May 16, 2025 at 10:33:30AM +0100, Tvrtko Ursulin wrote:
>> On 24/04/2025 10:55, Philipp Stanner wrote:
>>> + * @kill_fence_context: kill the fence context belonging to this scheduler
>>
>> Which fence context would that be? ;)
>
> There's one one per ring and a scheduler instance represents a single ring. So,
> what should be specified here?
I was pointing out the fact not all drivers are 1:1 sched:entity. So
plural at least. Thought it would be obvious from the ";)".
>> Also, "fence context" would be a new terminology in gpu_scheduler.h API
>> level. You could call it ->sched_fini() or similar to signify at which point
>> in the API it gets called and then the fact it takes sched as parameter
>> would be natural.
>
> The driver should tear down the fence context in this callback, not the while
> scheduler. ->sched_fini() would hence be misleading.
Not the while what? Not while drm_sched_fini()? Could call it
sched_kill() or anything. My point is that we dont' have "fence context"
in the API but entities so adding a new term sounds sub-optimal.
>> We also probably want some commentary on the topic of indefinite (or very
>> long at least) blocking a thread exit / SIGINT/TERM/KILL time.
>
> You mean in case the driver does implement the callback, but does *not* properly
> tear down the fence context? So, you ask for describing potential consequences
> of drivers having bugs in the implementation of the callback? Or something else?
I was proposing the kerneldoc for the vfunc should document the callback
must not block, or if blocking is unavoidable, either document a
guideline on how long is acceptable. Maybe even enforce a limit in the
scheduler core itself.
Regards,
Tvrtko
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-05-16 10:19 ` Tvrtko Ursulin
@ 2025-05-16 10:54 ` Danilo Krummrich
2025-05-16 11:35 ` Tvrtko Ursulin
0 siblings, 1 reply; 24+ messages in thread
From: Danilo Krummrich @ 2025-05-16 10:54 UTC (permalink / raw)
To: Tvrtko Ursulin
Cc: Philipp Stanner, Lyude Paul, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann, dri-devel, nouveau,
linux-kernel, Danilo Krummrich
On Fri, May 16, 2025 at 11:19:50AM +0100, Tvrtko Ursulin wrote:
>
> On 16/05/2025 10:53, Danilo Krummrich wrote:
> > On Fri, May 16, 2025 at 10:33:30AM +0100, Tvrtko Ursulin wrote:
> > > On 24/04/2025 10:55, Philipp Stanner wrote:
> > > > + * @kill_fence_context: kill the fence context belonging to this scheduler
> > >
> > > Which fence context would that be? ;)
> >
> > There's one one per ring and a scheduler instance represents a single ring. So,
> > what should be specified here?
>
> I was pointing out the fact not all drivers are 1:1 sched:entity.
I'm well aware, but how is that relevant? Entities don't have an associated
fence context, but a GPU Ring (either hardware or software) has, which a
scheduler instance represents.
> Thought it would be obvious from the ";)".
I should read from ";)" that you refer to a 1:N-sched:entity relationship (which
doesn't seem to be related)?
> > > Also, "fence context" would be a new terminology in gpu_scheduler.h API
> > > level. You could call it ->sched_fini() or similar to signify at which point
> > > in the API it gets called and then the fact it takes sched as parameter
> > > would be natural.
> >
> > The driver should tear down the fence context in this callback, not the while
> > scheduler. ->sched_fini() would hence be misleading.
>
> Not the while what? Not while drm_sched_fini()?
*whole
> Could call it sched_kill()
> or anything. My point is that we dont' have "fence context" in the API but
> entities so adding a new term sounds sub-optimal.
In the callback the driver should neither tear down an entity, nor the whole
scheduler, hence we shouldn't call it like that. sched_kill() is therefore
misleading as well.
It should be named after what it actually does (or should do). Feel free to
propose a different name that conforms with that.
> > > We also probably want some commentary on the topic of indefinite (or very
> > > long at least) blocking a thread exit / SIGINT/TERM/KILL time.
> >
> > You mean in case the driver does implement the callback, but does *not* properly
> > tear down the fence context? So, you ask for describing potential consequences
> > of drivers having bugs in the implementation of the callback? Or something else?
>
> I was proposing the kerneldoc for the vfunc should document the callback
> must not block, or if blocking is unavoidable, either document a guideline
> on how long is acceptable. Maybe even enforce a limit in the scheduler core
> itself.
Killing the fence context shouldn't block.
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-05-16 10:54 ` Danilo Krummrich
@ 2025-05-16 11:35 ` Tvrtko Ursulin
2025-05-16 12:00 ` Danilo Krummrich
0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-16 11:35 UTC (permalink / raw)
To: Danilo Krummrich
Cc: Philipp Stanner, Lyude Paul, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann, dri-devel, nouveau,
linux-kernel, Danilo Krummrich
On 16/05/2025 11:54, Danilo Krummrich wrote:
> On Fri, May 16, 2025 at 11:19:50AM +0100, Tvrtko Ursulin wrote:
>>
>> On 16/05/2025 10:53, Danilo Krummrich wrote:
>>> On Fri, May 16, 2025 at 10:33:30AM +0100, Tvrtko Ursulin wrote:
>>>> On 24/04/2025 10:55, Philipp Stanner wrote:
>>>>> + * @kill_fence_context: kill the fence context belonging to this scheduler
>>>>
>>>> Which fence context would that be? ;)
>>>
>>> There's one one per ring and a scheduler instance represents a single ring. So,
>>> what should be specified here?
>>
>> I was pointing out the fact not all drivers are 1:1 sched:entity.
>
> I'm well aware, but how is that relevant? Entities don't have an associated
> fence context, but a GPU Ring (either hardware or software) has, which a
> scheduler instance represents.
Aha! Well.. how it is relevant and do entities not have an associated
fence context? Well, entity->fence_context.. that was my first
association this whole time. Never it crossed my mind this is talking
about the hardware fence context. Proof in the pudding naming should be
improved.
But I also don't think there is a requirement for fences returned from
->run_job() to have a single context. Which again makes it not the best
naming.
>> Thought it would be obvious from the ";)".
>
> I should read from ";)" that you refer to a 1:N-sched:entity relationship (which
> doesn't seem to be related)?
>
>>>> Also, "fence context" would be a new terminology in gpu_scheduler.h API
>>>> level. You could call it ->sched_fini() or similar to signify at which point
>>>> in the API it gets called and then the fact it takes sched as parameter
>>>> would be natural.
>>>
>>> The driver should tear down the fence context in this callback, not the while
>>> scheduler. ->sched_fini() would hence be misleading.
>>
>> Not the while what? Not while drm_sched_fini()?
>
> *whole
>
>> Could call it sched_kill()
>> or anything. My point is that we dont' have "fence context" in the API but
>> entities so adding a new term sounds sub-optimal.
>
> In the callback the driver should neither tear down an entity, nor the whole
> scheduler, hence we shouldn't call it like that. sched_kill() is therefore
> misleading as well.
->sched_exit()? ->sched_stop()? ->sched_cleanup()?
> It should be named after what it actually does (or should do). Feel free to
> propose a different name that conforms with that.
>
>>>> We also probably want some commentary on the topic of indefinite (or very
>>>> long at least) blocking a thread exit / SIGINT/TERM/KILL time.
>>>
>>> You mean in case the driver does implement the callback, but does *not* properly
>>> tear down the fence context? So, you ask for describing potential consequences
>>> of drivers having bugs in the implementation of the callback? Or something else?
>>
>> I was proposing the kerneldoc for the vfunc should document the callback
>> must not block, or if blocking is unavoidable, either document a guideline
>> on how long is acceptable. Maybe even enforce a limit in the scheduler core
>> itself.
>
> Killing the fence context shouldn't block.
Cool. And maybe convert the wait_event to wait_event_timeout with a
warning to be robust.
Mind you, I still think a solution which does not add new state
machinery should be preferred.
Regards,
Tvrtko
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-05-16 11:35 ` Tvrtko Ursulin
@ 2025-05-16 12:00 ` Danilo Krummrich
2025-05-16 15:48 ` Tvrtko Ursulin
0 siblings, 1 reply; 24+ messages in thread
From: Danilo Krummrich @ 2025-05-16 12:00 UTC (permalink / raw)
To: Tvrtko Ursulin
Cc: Philipp Stanner, Lyude Paul, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann, dri-devel, nouveau,
linux-kernel, Danilo Krummrich
On Fri, May 16, 2025 at 12:35:52PM +0100, Tvrtko Ursulin wrote:
>
> On 16/05/2025 11:54, Danilo Krummrich wrote:
> > On Fri, May 16, 2025 at 11:19:50AM +0100, Tvrtko Ursulin wrote:
> > >
> > > On 16/05/2025 10:53, Danilo Krummrich wrote:
> > > > On Fri, May 16, 2025 at 10:33:30AM +0100, Tvrtko Ursulin wrote:
> > > > > On 24/04/2025 10:55, Philipp Stanner wrote:
> > > > > > + * @kill_fence_context: kill the fence context belonging to this scheduler
> > > > >
> > > > > Which fence context would that be? ;)
> > > >
> > > > There's one one per ring and a scheduler instance represents a single ring. So,
> > > > what should be specified here?
> > >
> > > I was pointing out the fact not all drivers are 1:1 sched:entity.
> >
> > I'm well aware, but how is that relevant? Entities don't have an associated
> > fence context, but a GPU Ring (either hardware or software) has, which a
> > scheduler instance represents.
>
> Aha! Well.. how it is relevant and do entities not have an associated fence
> context? Well, entity->fence_context.. that was my first association this
> whole time. Never it crossed my mind this is talking about the hardware
> fence context. Proof in the pudding naming should be improved.
It says "fence context belonging to this scheduler", which should be
unambiguous, however I agree that we could mark out the difference even more.
> But I also don't think there is a requirement for fences returned from
> ->run_job() to have a single context. Which again makes it not the best
> naming.
It's implied by the fact that a scheduler instance represents a ring. Having
multiple fence contexts per ring doesn't make any sense.
But it's indeed not written down -- we should do that then.
> > In the callback the driver should neither tear down an entity, nor the whole
> > scheduler, hence we shouldn't call it like that. sched_kill() is therefore
> > misleading as well.
>
> ->sched_exit()? ->sched_stop()? ->sched_cleanup()?
I think this all would throw up questions like "What does {exit,stop,cleanup}
mean in this context?". And the answer would be "kill the fence context of the
ring represented by the scheduler".
I think we want a name that represents that without an indirection that we have
to define.
> > It should be named after what it actually does (or should do). Feel free to
> > propose a different name that conforms with that.
> >
> > > > > We also probably want some commentary on the topic of indefinite (or very
> > > > > long at least) blocking a thread exit / SIGINT/TERM/KILL time.
> > > >
> > > > You mean in case the driver does implement the callback, but does *not* properly
> > > > tear down the fence context? So, you ask for describing potential consequences
> > > > of drivers having bugs in the implementation of the callback? Or something else?
> > >
> > > I was proposing the kerneldoc for the vfunc should document the callback
> > > must not block, or if blocking is unavoidable, either document a guideline
> > > on how long is acceptable. Maybe even enforce a limit in the scheduler core
> > > itself.
> >
> > Killing the fence context shouldn't block.
>
> Cool. And maybe convert the wait_event to wait_event_timeout with a warning
> to be robust.
That would make sense if it could deadlock, but even if the driver does nothing
it should terminate eventually. The rule that we always rely on is that we
guarantee throughout the kernel that fences are signalled eventually.
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-05-16 12:00 ` Danilo Krummrich
@ 2025-05-16 15:48 ` Tvrtko Ursulin
0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-16 15:48 UTC (permalink / raw)
To: Danilo Krummrich
Cc: Philipp Stanner, Lyude Paul, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann, dri-devel, nouveau,
linux-kernel, Danilo Krummrich
On 16/05/2025 13:00, Danilo Krummrich wrote:
> On Fri, May 16, 2025 at 12:35:52PM +0100, Tvrtko Ursulin wrote:
>>
>> On 16/05/2025 11:54, Danilo Krummrich wrote:
>>> On Fri, May 16, 2025 at 11:19:50AM +0100, Tvrtko Ursulin wrote:
>>>>
>>>> On 16/05/2025 10:53, Danilo Krummrich wrote:
>>>>> On Fri, May 16, 2025 at 10:33:30AM +0100, Tvrtko Ursulin wrote:
>>>>>> On 24/04/2025 10:55, Philipp Stanner wrote:
>>>>>>> + * @kill_fence_context: kill the fence context belonging to this scheduler
>>>>>>
>>>>>> Which fence context would that be? ;)
>>>>>
>>>>> There's one one per ring and a scheduler instance represents a single ring. So,
>>>>> what should be specified here?
>>>>
>>>> I was pointing out the fact not all drivers are 1:1 sched:entity.
>>>
>>> I'm well aware, but how is that relevant? Entities don't have an associated
>>> fence context, but a GPU Ring (either hardware or software) has, which a
>>> scheduler instance represents.
>>
>> Aha! Well.. how it is relevant and do entities not have an associated fence
>> context? Well, entity->fence_context.. that was my first association this
>> whole time. Never it crossed my mind this is talking about the hardware
>> fence context. Proof in the pudding naming should be improved.
>
> It says "fence context belonging to this scheduler", which should be
> unambiguous, however I agree that we could mark out the difference even more.
Cool, I had tunnel vision due working with entity->fence_context a lot
and this just had misfortune to re-use the same name.
>> But I also don't think there is a requirement for fences returned from
>> ->run_job() to have a single context. Which again makes it not the best
>> naming.
>
> It's implied by the fact that a scheduler instance represents a ring. Having
> multiple fence contexts per ring doesn't make any sense.
>
> But it's indeed not written down -- we should do that then.
Would you do it in code or just in docs? I don't see a real benefit to
it to be honest, since nothing depends on anything apart that it is a
fence which will signal when job is done. But I am not aware of anything
where it would be a problem either. One to run past driver maintainers I
guess.
>>> In the callback the driver should neither tear down an entity, nor the whole
>>> scheduler, hence we shouldn't call it like that. sched_kill() is therefore
>>> misleading as well.
>>
>> ->sched_exit()? ->sched_stop()? ->sched_cleanup()?
>
> I think this all would throw up questions like "What does {exit,stop,cleanup}
> mean in this context?". And the answer would be "kill the fence context of the
> ring represented by the scheduler".
>
> I think we want a name that represents that without an indirection that we have
> to define.
Well fence_kill_context wasn't self-descriptive to me neither so there
is that too. In other words some kerneldoc will be needed anyway and a
callback to call while tearing something down is pretty standard stuff.
So I don't think it is a big deal to explain what that callback should do.
>>> It should be named after what it actually does (or should do). Feel free to
>>> propose a different name that conforms with that.
>>>
>>>>>> We also probably want some commentary on the topic of indefinite (or very
>>>>>> long at least) blocking a thread exit / SIGINT/TERM/KILL time.
>>>>>
>>>>> You mean in case the driver does implement the callback, but does *not* properly
>>>>> tear down the fence context? So, you ask for describing potential consequences
>>>>> of drivers having bugs in the implementation of the callback? Or something else?
>>>>
>>>> I was proposing the kerneldoc for the vfunc should document the callback
>>>> must not block, or if blocking is unavoidable, either document a guideline
>>>> on how long is acceptable. Maybe even enforce a limit in the scheduler core
>>>> itself.
>>>
>>> Killing the fence context shouldn't block.
>>
>> Cool. And maybe convert the wait_event to wait_event_timeout with a warning
>> to be robust.
>
> That would make sense if it could deadlock, but even if the driver does nothing
> it should terminate eventually. The rule that we always rely on is that we
> guarantee throughout the kernel that fences are signalled eventually.
Given it is an opt-in fair enough. (Some drivers don't have automatic
fence expiration, but then again they don't have this callback either.
And once they start adding it, there will be kerneldoc to say callback
must not block.)
Regards,
Tvrtko
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-05-16 9:33 ` Tvrtko Ursulin
2025-05-16 9:53 ` Danilo Krummrich
@ 2025-05-16 9:54 ` Philipp Stanner
2025-05-16 10:34 ` Tvrtko Ursulin
1 sibling, 1 reply; 24+ messages in thread
From: Philipp Stanner @ 2025-05-16 9:54 UTC (permalink / raw)
To: Tvrtko Ursulin, Philipp Stanner, Lyude Paul, Danilo Krummrich,
David Airlie, Simona Vetter, Matthew Brost, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel, Danilo Krummrich
On Fri, 2025-05-16 at 10:33 +0100, Tvrtko Ursulin wrote:
>
> On 24/04/2025 10:55, Philipp Stanner wrote:
> > The waitqueue that ensures that drm_sched_fini() blocks until the
> > pending_list has become empty could theoretically cause that
> > function to
> > block for a very long time. That, ultimately, could block userspace
> > procesess and prevent them from being killable through SIGKILL.
> >
> > When a driver calls drm_sched_fini(), it is safe to assume that all
> > still pending jobs are not needed anymore anyways. Thus, they can
> > be
> > cancelled and thereby it can be ensured that drm_sched_fini() will
> > return relatively quickly.
> >
> > Implement a new helper to stop all work items / submission except
> > for
> > the drm_sched_backend_ops.run_job().
> >
> > Implement a driver callback, kill_fence_context(), that instructs
> > the
> > driver to kill the fence context associated with this scheduler,
> > thereby
> > causing all pending hardware fences to be signalled.
> >
> > Call those new routines in drm_sched_fini() and ensure backwards
> > compatibility if the new callback is not implemented.
> >
> > Suggested-by: Danilo Krummrich <dakr@redhat.com>
> > Signed-off-by: Philipp Stanner <phasta@kernel.org>
> > ---
> > drivers/gpu/drm/scheduler/sched_main.c | 47 +++++++++++++++++----
> > -----
> > include/drm/gpu_scheduler.h | 11 ++++++
> > 2 files changed, 42 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c
> > b/drivers/gpu/drm/scheduler/sched_main.c
> > index ea82e69a72a8..c2ad6c70bfb6 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -1390,31 +1390,46 @@ drm_sched_no_jobs_pending(struct
> > drm_gpu_scheduler *sched)
> > return empty;
> > }
> >
> > +/**
> > + * drm_sched_cancel_jobs_and_wait - trigger freeing of all pending
> > jobs
> > + * @sched: scheduler instance
> > + *
> > + * Must only be called if &struct
> > drm_sched_backend_ops.kill_fence_context is
> > + * implemented.
> > + *
> > + * Instructs the driver to kill the fence context associated with
> > this scheduler,
> > + * thereby signalling all pending fences. This, in turn, will
> > trigger
> > + * &struct drm_sched_backend_ops.free_job to be called for all
> > pending jobs.
> > + * The function then blocks until all pending jobs have been
> > freed.
> > + */
> > +static inline void
> > +drm_sched_cancel_jobs_and_wait(struct drm_gpu_scheduler *sched)
> > +{
> > + sched->ops->kill_fence_context(sched);
> > + wait_event(sched->pending_list_waitque,
> > drm_sched_no_jobs_pending(sched));
> > +}
> > +
> > /**
> > * drm_sched_fini - Destroy a gpu scheduler
> > *
> > * @sched: scheduler instance
> > *
> > - * Tears down and cleans up the scheduler.
> > - *
> > - * Note that this function blocks until all the fences returned by
> > - * &struct drm_sched_backend_ops.run_job have been signalled.
> > + * Tears down and cleans up the scheduler. Might leak memory if
> > + * &struct drm_sched_backend_ops.kill_fence_context is not
> > implemented.
> > */
> > void drm_sched_fini(struct drm_gpu_scheduler *sched)
> > {
> > struct drm_sched_entity *s_entity;
> > int i;
> >
> > - /*
> > - * Jobs that have neither been scheduled or which have
> > timed out are
> > - * gone by now, but jobs that have been submitted through
> > - * backend_ops.run_job() and have not yet terminated are
> > still pending.
> > - *
> > - * Wait for the pending_list to become empty to avoid
> > leaking those jobs.
> > - */
> > - drm_sched_submission_and_timeout_stop(sched);
> > - wait_event(sched->pending_list_waitque,
> > drm_sched_no_jobs_pending(sched));
> > - drm_sched_free_stop(sched);
> > + if (sched->ops->kill_fence_context) {
> > + drm_sched_submission_and_timeout_stop(sched);
> > + drm_sched_cancel_jobs_and_wait(sched);
> > + drm_sched_free_stop(sched);
> > + } else {
> > + /* We're in "legacy free-mode" and ignore
> > potential mem leaks */
> > + drm_sched_wqueue_stop(sched);
> > + }
> >
> > for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs;
> > i++) {
> > struct drm_sched_rq *rq = sched->sched_rq[i];
> > @@ -1502,7 +1517,7 @@ bool drm_sched_wqueue_ready(struct
> > drm_gpu_scheduler *sched)
> > EXPORT_SYMBOL(drm_sched_wqueue_ready);
> >
> > /**
> > - * drm_sched_wqueue_stop - stop scheduler submission
> > + * drm_sched_wqueue_stop - stop scheduler submission and freeing
>
> Looks like the kerneldoc corrections (below too) belong to the
> previous
> patch. Irrelevant if you decide to squash them though.
>
> > * @sched: scheduler instance
> > *
> > * Stops the scheduler from pulling new jobs from entities. It
> > also stops
> > @@ -1518,7 +1533,7 @@ void drm_sched_wqueue_stop(struct
> > drm_gpu_scheduler *sched)
> > EXPORT_SYMBOL(drm_sched_wqueue_stop);
> >
> > /**
> > - * drm_sched_wqueue_start - start scheduler submission
> > + * drm_sched_wqueue_start - start scheduler submission and freeing
> > * @sched: scheduler instance
> > *
> > * Restarts the scheduler after drm_sched_wqueue_stop() has
> > stopped it.
> > diff --git a/include/drm/gpu_scheduler.h
> > b/include/drm/gpu_scheduler.h
> > index d0b1f416b4d9..8630b4a26f10 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -509,6 +509,17 @@ struct drm_sched_backend_ops {
> > * and it's time to clean it up.
> > */
> > void (*free_job)(struct drm_sched_job *sched_job);
> > +
> > + /**
> > + * @kill_fence_context: kill the fence context belonging
> > to this scheduler
>
> Which fence context would that be? ;)
>
> Also, "fence context" would be a new terminology in gpu_scheduler.h
> API
> level. You could call it ->sched_fini() or similar to signify at
> which
> point in the API it gets called and then the fact it takes sched as
> parameter would be natural.
>
> We also probably want some commentary on the topic of indefinite (or
> very long at least) blocking a thread exit / SIGINT/TERM/KILL time.
> Cover letter touches upon that problem but I don't see you address
> it.
> Is the idea to let drivers shoot themselves in the foot or what?
You didn't seriously just write that, did you?
Yes, my intention clearly has been since September to make things
worse, more ambiguous and destroy drivers. That's why I'm probably the
only individual after Sima (who added some of the FIXMEs) who ever
bothered documenting all this mess here, and warning people about the
five hundred pitfalls, like three different start and stop functions,
implicit refcount rules and God knows which other insane hacks that
simply move driver problems into common infrastructure.
Reconsider your attitude.
P.
>
> Regards,
>
> Tvrtko
>
> > + *
> > + * Needed to cleanly tear the scheduler down in
> > drm_sched_fini(). This
> > + * callback will cause all hardware fences to be signalled
> > by the driver,
> > + * which, ultimately, ensures that all jobs get freed
> > before teardown.
> > + *
> > + * This callback is optional, but it is highly recommended
> > to implement it.
> > + */
> > + void (*kill_fence_context)(struct drm_gpu_scheduler
> > *sched);
> > };
> >
> > /**
>
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long
2025-05-16 9:54 ` Philipp Stanner
@ 2025-05-16 10:34 ` Tvrtko Ursulin
0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-16 10:34 UTC (permalink / raw)
To: phasta, Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel, Danilo Krummrich
On 16/05/2025 10:54, Philipp Stanner wrote:
> On Fri, 2025-05-16 at 10:33 +0100, Tvrtko Ursulin wrote:
>>
>> On 24/04/2025 10:55, Philipp Stanner wrote:
>>> The waitqueue that ensures that drm_sched_fini() blocks until the
>>> pending_list has become empty could theoretically cause that
>>> function to
>>> block for a very long time. That, ultimately, could block userspace
>>> procesess and prevent them from being killable through SIGKILL.
>>>
>>> When a driver calls drm_sched_fini(), it is safe to assume that all
>>> still pending jobs are not needed anymore anyways. Thus, they can
>>> be
>>> cancelled and thereby it can be ensured that drm_sched_fini() will
>>> return relatively quickly.
>>>
>>> Implement a new helper to stop all work items / submission except
>>> for
>>> the drm_sched_backend_ops.run_job().
>>>
>>> Implement a driver callback, kill_fence_context(), that instructs
>>> the
>>> driver to kill the fence context associated with this scheduler,
>>> thereby
>>> causing all pending hardware fences to be signalled.
>>>
>>> Call those new routines in drm_sched_fini() and ensure backwards
>>> compatibility if the new callback is not implemented.
>>>
>>> Suggested-by: Danilo Krummrich <dakr@redhat.com>
>>> Signed-off-by: Philipp Stanner <phasta@kernel.org>
>>> ---
>>> drivers/gpu/drm/scheduler/sched_main.c | 47 +++++++++++++++++----
>>> -----
>>> include/drm/gpu_scheduler.h | 11 ++++++
>>> 2 files changed, 42 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>> index ea82e69a72a8..c2ad6c70bfb6 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -1390,31 +1390,46 @@ drm_sched_no_jobs_pending(struct
>>> drm_gpu_scheduler *sched)
>>> return empty;
>>> }
>>>
>>> +/**
>>> + * drm_sched_cancel_jobs_and_wait - trigger freeing of all pending
>>> jobs
>>> + * @sched: scheduler instance
>>> + *
>>> + * Must only be called if &struct
>>> drm_sched_backend_ops.kill_fence_context is
>>> + * implemented.
>>> + *
>>> + * Instructs the driver to kill the fence context associated with
>>> this scheduler,
>>> + * thereby signalling all pending fences. This, in turn, will
>>> trigger
>>> + * &struct drm_sched_backend_ops.free_job to be called for all
>>> pending jobs.
>>> + * The function then blocks until all pending jobs have been
>>> freed.
>>> + */
>>> +static inline void
>>> +drm_sched_cancel_jobs_and_wait(struct drm_gpu_scheduler *sched)
>>> +{
>>> + sched->ops->kill_fence_context(sched);
>>> + wait_event(sched->pending_list_waitque,
>>> drm_sched_no_jobs_pending(sched));
>>> +}
>>> +
>>> /**
>>> * drm_sched_fini - Destroy a gpu scheduler
>>> *
>>> * @sched: scheduler instance
>>> *
>>> - * Tears down and cleans up the scheduler.
>>> - *
>>> - * Note that this function blocks until all the fences returned by
>>> - * &struct drm_sched_backend_ops.run_job have been signalled.
>>> + * Tears down and cleans up the scheduler. Might leak memory if
>>> + * &struct drm_sched_backend_ops.kill_fence_context is not
>>> implemented.
>>> */
>>> void drm_sched_fini(struct drm_gpu_scheduler *sched)
>>> {
>>> struct drm_sched_entity *s_entity;
>>> int i;
>>>
>>> - /*
>>> - * Jobs that have neither been scheduled or which have
>>> timed out are
>>> - * gone by now, but jobs that have been submitted through
>>> - * backend_ops.run_job() and have not yet terminated are
>>> still pending.
>>> - *
>>> - * Wait for the pending_list to become empty to avoid
>>> leaking those jobs.
>>> - */
>>> - drm_sched_submission_and_timeout_stop(sched);
>>> - wait_event(sched->pending_list_waitque,
>>> drm_sched_no_jobs_pending(sched));
>>> - drm_sched_free_stop(sched);
>>> + if (sched->ops->kill_fence_context) {
>>> + drm_sched_submission_and_timeout_stop(sched);
>>> + drm_sched_cancel_jobs_and_wait(sched);
>>> + drm_sched_free_stop(sched);
>>> + } else {
>>> + /* We're in "legacy free-mode" and ignore
>>> potential mem leaks */
>>> + drm_sched_wqueue_stop(sched);
>>> + }
>>>
>>> for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs;
>>> i++) {
>>> struct drm_sched_rq *rq = sched->sched_rq[i];
>>> @@ -1502,7 +1517,7 @@ bool drm_sched_wqueue_ready(struct
>>> drm_gpu_scheduler *sched)
>>> EXPORT_SYMBOL(drm_sched_wqueue_ready);
>>>
>>> /**
>>> - * drm_sched_wqueue_stop - stop scheduler submission
>>> + * drm_sched_wqueue_stop - stop scheduler submission and freeing
>>
>> Looks like the kerneldoc corrections (below too) belong to the
>> previous
>> patch. Irrelevant if you decide to squash them though.
>>
>>> * @sched: scheduler instance
>>> *
>>> * Stops the scheduler from pulling new jobs from entities. It
>>> also stops
>>> @@ -1518,7 +1533,7 @@ void drm_sched_wqueue_stop(struct
>>> drm_gpu_scheduler *sched)
>>> EXPORT_SYMBOL(drm_sched_wqueue_stop);
>>>
>>> /**
>>> - * drm_sched_wqueue_start - start scheduler submission
>>> + * drm_sched_wqueue_start - start scheduler submission and freeing
>>> * @sched: scheduler instance
>>> *
>>> * Restarts the scheduler after drm_sched_wqueue_stop() has
>>> stopped it.
>>> diff --git a/include/drm/gpu_scheduler.h
>>> b/include/drm/gpu_scheduler.h
>>> index d0b1f416b4d9..8630b4a26f10 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -509,6 +509,17 @@ struct drm_sched_backend_ops {
>>> * and it's time to clean it up.
>>> */
>>> void (*free_job)(struct drm_sched_job *sched_job);
>>> +
>>> + /**
>>> + * @kill_fence_context: kill the fence context belonging
>>> to this scheduler
>>
>> Which fence context would that be? ;)
>>
>> Also, "fence context" would be a new terminology in gpu_scheduler.h
>> API
>> level. You could call it ->sched_fini() or similar to signify at
>> which
>> point in the API it gets called and then the fact it takes sched as
>> parameter would be natural.
>>
>> We also probably want some commentary on the topic of indefinite (or
>> very long at least) blocking a thread exit / SIGINT/TERM/KILL time.
>> Cover letter touches upon that problem but I don't see you address
>> it.
>> Is the idea to let drivers shoot themselves in the foot or what?
>
> You didn't seriously just write that, did you?
>
> Yes, my intention clearly has been since September to make things
> worse, more ambiguous and destroy drivers. That's why I'm probably the
> only individual after Sima (who added some of the FIXMEs) who ever
> bothered documenting all this mess here, and warning people about the
> five hundred pitfalls, like three different start and stop functions,
> implicit refcount rules and God knows which other insane hacks that
> simply move driver problems into common infrastructure.
>
> Reconsider your attitude.
I don't know what kind of attitude you think I was expressing. If the
last sentence was a hyperbole too much for you then sorry. Point was in
the paragraph ending with that - to document callback must not block, or
if it has to to discuss for how long. And to perhaps discuss if
scheduler code should impose a limit. Otherwise the indefinite block on
thread exit from the cover letter can still happen. I don't see raising
those point is a criticism of your overall work. (Fact that we don't see
eye to eye regarding more state machine vs the ->cancel_job()
alternative aside.)
Regards,
Tvrtko
>
> P.
>
>>
>> Regards,
>>
>> Tvrtko
>>
>>> + *
>>> + * Needed to cleanly tear the scheduler down in
>>> drm_sched_fini(). This
>>> + * callback will cause all hardware fences to be signalled
>>> by the driver,
>>> + * which, ultimately, ensures that all jobs get freed
>>> before teardown.
>>> + *
>>> + * This callback is optional, but it is highly recommended
>>> to implement it.
>>> + */
>>> + void (*kill_fence_context)(struct drm_gpu_scheduler
>>> *sched);
>>> };
>>>
>>> /**
>>
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v2 3/6] drm/sched: Warn if pending list is not empty
2025-04-24 9:55 [PATCH v2 0/6] drm/sched: Fix memory leaks in drm_sched_fini() Philipp Stanner
2025-04-24 9:55 ` [PATCH v2 1/6] drm/sched: Fix teardown leaks with waitqueue Philipp Stanner
2025-04-24 9:55 ` [PATCH v2 2/6] drm/sched: Prevent teardown waitque from blocking too long Philipp Stanner
@ 2025-04-24 9:55 ` Philipp Stanner
2025-05-16 9:40 ` Tvrtko Ursulin
2025-04-24 9:55 ` [PATCH v2 4/6] drm/nouveau: Add new callback for scheduler teardown Philipp Stanner
` (2 subsequent siblings)
5 siblings, 1 reply; 24+ messages in thread
From: Philipp Stanner @ 2025-04-24 9:55 UTC (permalink / raw)
To: Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Philipp Stanner, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
Tvrtko Ursulin
Cc: dri-devel, nouveau, linux-kernel
drm_sched_fini() can leak jobs under certain circumstances.
Warn if that happens.
Signed-off-by: Philipp Stanner <phasta@kernel.org>
---
drivers/gpu/drm/scheduler/sched_main.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index c2ad6c70bfb6..0c56b85c574f 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -1457,6 +1457,9 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
sched->ready = false;
kfree(sched->sched_rq);
sched->sched_rq = NULL;
+
+ if (!list_empty(&sched->pending_list))
+ dev_err(sched->dev, "Tearing down scheduler while jobs are pending!\n");
}
EXPORT_SYMBOL(drm_sched_fini);
--
2.48.1
^ permalink raw reply related [flat|nested] 24+ messages in thread* Re: [PATCH v2 3/6] drm/sched: Warn if pending list is not empty
2025-04-24 9:55 ` [PATCH v2 3/6] drm/sched: Warn if pending list is not empty Philipp Stanner
@ 2025-05-16 9:40 ` Tvrtko Ursulin
0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-16 9:40 UTC (permalink / raw)
To: Philipp Stanner, Lyude Paul, Danilo Krummrich, David Airlie,
Simona Vetter, Matthew Brost, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel
On 24/04/2025 10:55, Philipp Stanner wrote:
> drm_sched_fini() can leak jobs under certain circumstances.
>
> Warn if that happens.
>
> Signed-off-by: Philipp Stanner <phasta@kernel.org>
> ---
> drivers/gpu/drm/scheduler/sched_main.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index c2ad6c70bfb6..0c56b85c574f 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -1457,6 +1457,9 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
> sched->ready = false;
> kfree(sched->sched_rq);
> sched->sched_rq = NULL;
> +
> + if (!list_empty(&sched->pending_list))
> + dev_err(sched->dev, "Tearing down scheduler while jobs are pending!\n");
Is this expected to trigger for many drivers? In that case I am not sure
if it helps anyone, apart from generating bug tracker entries across the
world, for the issue we know about and could work
"quietly" on addressing it? Only if you think we don't really know who
leaks and who doesn't and this will help figure out.
Hm wait, at this point in the series it would fire incorrectly for the
mock scheduler. It should go last in the series at minimum.
Regards,
Tvrtko
> }
> EXPORT_SYMBOL(drm_sched_fini);
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v2 4/6] drm/nouveau: Add new callback for scheduler teardown
2025-04-24 9:55 [PATCH v2 0/6] drm/sched: Fix memory leaks in drm_sched_fini() Philipp Stanner
` (2 preceding siblings ...)
2025-04-24 9:55 ` [PATCH v2 3/6] drm/sched: Warn if pending list is not empty Philipp Stanner
@ 2025-04-24 9:55 ` Philipp Stanner
2025-04-24 9:55 ` [PATCH v2 5/6] drm/nouveau: Remove waitque for sched teardown Philipp Stanner
2025-04-24 9:55 ` [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design Philipp Stanner
5 siblings, 0 replies; 24+ messages in thread
From: Philipp Stanner @ 2025-04-24 9:55 UTC (permalink / raw)
To: Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Philipp Stanner, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
Tvrtko Ursulin
Cc: dri-devel, nouveau, linux-kernel
There is a new callback for always tearing the scheduler down in a
leak-free, deadlock-free manner.
Port Nouveau as its first user by providing the scheduler with a
callback that ensures the fence context gets killed in drm_sched_fini().
Signed-off-by: Philipp Stanner <phasta@kernel.org>
---
drivers/gpu/drm/nouveau/nouveau_abi16.c | 4 ++--
drivers/gpu/drm/nouveau/nouveau_drm.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_sched.c | 19 +++++++++++++++++--
drivers/gpu/drm/nouveau/nouveau_sched.h | 3 +++
4 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c
index 2a0617e5fe2a..53b8a85a8adb 100644
--- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
+++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
@@ -415,8 +415,8 @@ nouveau_abi16_ioctl_channel_alloc(ABI16_IOCTL_ARGS)
* The client lock is already acquired by nouveau_abi16_get().
*/
if (nouveau_cli_uvmm(cli)) {
- ret = nouveau_sched_create(&chan->sched, drm, drm->sched_wq,
- chan->chan->dma.ib_max);
+ ret = nouveau_sched_create(&chan->sched, drm, chan->chan,
+ drm->sched_wq, chan->chan->dma.ib_max);
if (ret)
goto done;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index c69139701056..2a2b319dca5f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -313,7 +313,7 @@ nouveau_cli_init(struct nouveau_drm *drm, const char *sname,
* locks which indirectly or directly are held for allocations
* elsewhere.
*/
- ret = nouveau_sched_create(&cli->sched, drm, NULL, 1);
+ ret = nouveau_sched_create(&cli->sched, drm, NULL, NULL, 1);
if (ret)
goto done;
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
index d326e55d2d24..3659ac78bb3e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -11,6 +11,7 @@
#include "nouveau_exec.h"
#include "nouveau_abi16.h"
#include "nouveau_sched.h"
+#include "nouveau_chan.h"
#define NOUVEAU_SCHED_JOB_TIMEOUT_MS 10000
@@ -392,10 +393,22 @@ nouveau_sched_free_job(struct drm_sched_job *sched_job)
nouveau_job_fini(job);
}
+static void
+nouveau_sched_fence_context_kill(struct drm_gpu_scheduler *sched)
+{
+ struct nouveau_sched *nsched;
+
+ nsched = container_of(sched, struct nouveau_sched, base);
+
+ if (nsched->chan)
+ nouveau_channel_kill(nsched->chan);
+}
+
static const struct drm_sched_backend_ops nouveau_sched_ops = {
.run_job = nouveau_sched_run_job,
.timedout_job = nouveau_sched_timedout_job,
.free_job = nouveau_sched_free_job,
+ .kill_fence_context = nouveau_sched_fence_context_kill,
};
static int
@@ -461,7 +474,8 @@ nouveau_sched_init(struct nouveau_sched *sched, struct nouveau_drm *drm,
int
nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
- struct workqueue_struct *wq, u32 credit_limit)
+ struct nouveau_channel *chan, struct workqueue_struct *wq,
+ u32 credit_limit)
{
struct nouveau_sched *sched;
int ret;
@@ -470,6 +484,8 @@ nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
if (!sched)
return -ENOMEM;
+ sched->chan = chan;
+
ret = nouveau_sched_init(sched, drm, wq, credit_limit);
if (ret) {
kfree(sched);
@@ -481,7 +497,6 @@ nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
return 0;
}
-
static void
nouveau_sched_fini(struct nouveau_sched *sched)
{
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h
index 20cd1da8db73..e6e2016a3569 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.h
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
@@ -9,6 +9,7 @@
#include <drm/gpu_scheduler.h>
#include "nouveau_drv.h"
+#include "nouveau_chan.h"
#define to_nouveau_job(sched_job) \
container_of((sched_job), struct nouveau_job, base)
@@ -101,6 +102,7 @@ struct nouveau_sched {
struct drm_sched_entity entity;
struct workqueue_struct *wq;
struct mutex mutex;
+ struct nouveau_channel *chan;
struct {
struct {
@@ -112,6 +114,7 @@ struct nouveau_sched {
};
int nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
+ struct nouveau_channel *chan,
struct workqueue_struct *wq, u32 credit_limit);
void nouveau_sched_destroy(struct nouveau_sched **psched);
--
2.48.1
^ permalink raw reply related [flat|nested] 24+ messages in thread* [PATCH v2 5/6] drm/nouveau: Remove waitque for sched teardown
2025-04-24 9:55 [PATCH v2 0/6] drm/sched: Fix memory leaks in drm_sched_fini() Philipp Stanner
` (3 preceding siblings ...)
2025-04-24 9:55 ` [PATCH v2 4/6] drm/nouveau: Add new callback for scheduler teardown Philipp Stanner
@ 2025-04-24 9:55 ` Philipp Stanner
2025-04-24 9:55 ` [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design Philipp Stanner
5 siblings, 0 replies; 24+ messages in thread
From: Philipp Stanner @ 2025-04-24 9:55 UTC (permalink / raw)
To: Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Philipp Stanner, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
Tvrtko Ursulin
Cc: dri-devel, nouveau, linux-kernel
struct nouveau_sched contains a waitque needed to prevent
drm_sched_fini() from being called while there are still jobs pending.
Doing so so far would have caused memory leaks.
With the new memleak-free mode of operation switched on in
drm_sched_fini() by providing the callback
nouveau_sched_fence_context_kill() the waitque is not necessary anymore.
Remove the waitque.
Signed-off-by: Philipp Stanner <phasta@kernel.org>
---
drivers/gpu/drm/nouveau/nouveau_sched.c | 20 +++++++-------------
drivers/gpu/drm/nouveau/nouveau_sched.h | 9 +++------
drivers/gpu/drm/nouveau/nouveau_uvmm.c | 8 ++++----
3 files changed, 14 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
index 3659ac78bb3e..d9ac76198616 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -121,11 +121,9 @@ nouveau_job_done(struct nouveau_job *job)
{
struct nouveau_sched *sched = job->sched;
- spin_lock(&sched->job.list.lock);
+ spin_lock(&sched->job_list.lock);
list_del(&job->entry);
- spin_unlock(&sched->job.list.lock);
-
- wake_up(&sched->job.wq);
+ spin_unlock(&sched->job_list.lock);
}
void
@@ -306,9 +304,9 @@ nouveau_job_submit(struct nouveau_job *job)
}
/* Submit was successful; add the job to the schedulers job list. */
- spin_lock(&sched->job.list.lock);
- list_add(&job->entry, &sched->job.list.head);
- spin_unlock(&sched->job.list.lock);
+ spin_lock(&sched->job_list.lock);
+ list_add(&job->entry, &sched->job_list.head);
+ spin_unlock(&sched->job_list.lock);
drm_sched_job_arm(&job->base);
job->done_fence = dma_fence_get(&job->base.s_fence->finished);
@@ -458,9 +456,8 @@ nouveau_sched_init(struct nouveau_sched *sched, struct nouveau_drm *drm,
goto fail_sched;
mutex_init(&sched->mutex);
- spin_lock_init(&sched->job.list.lock);
- INIT_LIST_HEAD(&sched->job.list.head);
- init_waitqueue_head(&sched->job.wq);
+ spin_lock_init(&sched->job_list.lock);
+ INIT_LIST_HEAD(&sched->job_list.head);
return 0;
@@ -503,9 +500,6 @@ nouveau_sched_fini(struct nouveau_sched *sched)
struct drm_gpu_scheduler *drm_sched = &sched->base;
struct drm_sched_entity *entity = &sched->entity;
- rmb(); /* for list_empty to work without lock */
- wait_event(sched->job.wq, list_empty(&sched->job.list.head));
-
drm_sched_entity_fini(entity);
drm_sched_fini(drm_sched);
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h
index e6e2016a3569..339a14563fbb 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.h
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
@@ -105,12 +105,9 @@ struct nouveau_sched {
struct nouveau_channel *chan;
struct {
- struct {
- struct list_head head;
- spinlock_t lock;
- } list;
- struct wait_queue_head wq;
- } job;
+ struct list_head head;
+ spinlock_t lock;
+ } job_list;
};
int nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
index 48f105239f42..ddfc46bc1b3e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
@@ -1019,8 +1019,8 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
u64 end = addr + range;
again:
- spin_lock(&sched->job.list.lock);
- list_for_each_entry(__job, &sched->job.list.head, entry) {
+ spin_lock(&sched->job_list.lock);
+ list_for_each_entry(__job, &sched->job_list.head, entry) {
struct nouveau_uvmm_bind_job *bind_job = to_uvmm_bind_job(__job);
list_for_each_op(op, &bind_job->ops) {
@@ -1030,7 +1030,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
if (!(end <= op_addr || addr >= op_end)) {
nouveau_uvmm_bind_job_get(bind_job);
- spin_unlock(&sched->job.list.lock);
+ spin_unlock(&sched->job_list.lock);
wait_for_completion(&bind_job->complete);
nouveau_uvmm_bind_job_put(bind_job);
goto again;
@@ -1038,7 +1038,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
}
}
}
- spin_unlock(&sched->job.list.lock);
+ spin_unlock(&sched->job_list.lock);
}
static int
--
2.48.1
^ permalink raw reply related [flat|nested] 24+ messages in thread* [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design
2025-04-24 9:55 [PATCH v2 0/6] drm/sched: Fix memory leaks in drm_sched_fini() Philipp Stanner
` (4 preceding siblings ...)
2025-04-24 9:55 ` [PATCH v2 5/6] drm/nouveau: Remove waitque for sched teardown Philipp Stanner
@ 2025-04-24 9:55 ` Philipp Stanner
2025-05-08 11:03 ` Philipp Stanner
5 siblings, 1 reply; 24+ messages in thread
From: Philipp Stanner @ 2025-04-24 9:55 UTC (permalink / raw)
To: Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Philipp Stanner, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
Tvrtko Ursulin
Cc: dri-devel, nouveau, linux-kernel
The unit tests so far took care manually of avoiding memory leaks that
might have occurred when calling drm_sched_fini().
The scheduler now takes care by itself of avoiding memory leaks if the
driver provides the callback drm_sched_backend_ops.kill_fence_context().
Implement that callback for the unit tests. Remove the manual cleanup
code.
Signed-off-by: Philipp Stanner <phasta@kernel.org>
---
.../gpu/drm/scheduler/tests/mock_scheduler.c | 34 ++++++++++++-------
1 file changed, 21 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
index f999c8859cf7..a72d26ca8262 100644
--- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
+++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
@@ -228,10 +228,30 @@ static void mock_sched_free_job(struct drm_sched_job *sched_job)
/* Mock job itself is freed by the kunit framework. */
}
+static void mock_sched_fence_context_kill(struct drm_gpu_scheduler *gpu_sched)
+{
+ struct drm_mock_scheduler *sched = drm_sched_to_mock_sched(gpu_sched);
+ struct drm_mock_sched_job *job;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sched->lock, flags);
+ list_for_each_entry(job, &sched->job_list, link) {
+ spin_lock(&job->lock);
+ if (!dma_fence_is_signaled_locked(&job->hw_fence)) {
+ dma_fence_set_error(&job->hw_fence, -ECANCELED);
+ dma_fence_signal_locked(&job->hw_fence);
+ }
+ complete(&job->done);
+ spin_unlock(&job->lock);
+ }
+ spin_unlock_irqrestore(&sched->lock, flags);
+}
+
static const struct drm_sched_backend_ops drm_mock_scheduler_ops = {
.run_job = mock_sched_run_job,
.timedout_job = mock_sched_timedout_job,
- .free_job = mock_sched_free_job
+ .free_job = mock_sched_free_job,
+ .kill_fence_context = mock_sched_fence_context_kill,
};
/**
@@ -300,18 +320,6 @@ void drm_mock_sched_fini(struct drm_mock_scheduler *sched)
drm_mock_sched_job_complete(job);
spin_unlock_irqrestore(&sched->lock, flags);
- /*
- * Free completed jobs and jobs not yet processed by the DRM scheduler
- * free worker.
- */
- spin_lock_irqsave(&sched->lock, flags);
- list_for_each_entry_safe(job, next, &sched->done_list, link)
- list_move_tail(&job->link, &list);
- spin_unlock_irqrestore(&sched->lock, flags);
-
- list_for_each_entry_safe(job, next, &list, link)
- mock_sched_free_job(&job->base);
-
drm_sched_fini(&sched->base);
}
--
2.48.1
^ permalink raw reply related [flat|nested] 24+ messages in thread* Re: [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design
2025-04-24 9:55 ` [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design Philipp Stanner
@ 2025-05-08 11:03 ` Philipp Stanner
2025-05-08 12:51 ` Tvrtko Ursulin
0 siblings, 1 reply; 24+ messages in thread
From: Philipp Stanner @ 2025-05-08 11:03 UTC (permalink / raw)
To: Philipp Stanner, Lyude Paul, Danilo Krummrich, David Airlie,
Simona Vetter, Matthew Brost, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
Tvrtko Ursulin
Cc: dri-devel, nouveau, linux-kernel
On Thu, 2025-04-24 at 11:55 +0200, Philipp Stanner wrote:
> The unit tests so far took care manually of avoiding memory leaks
> that
> might have occurred when calling drm_sched_fini().
>
> The scheduler now takes care by itself of avoiding memory leaks if
> the
> driver provides the callback
> drm_sched_backend_ops.kill_fence_context().
>
> Implement that callback for the unit tests. Remove the manual cleanup
> code.
@Tvrtko: On a scale from 1-10, how much do you love this patch? :)
P.
>
> Signed-off-by: Philipp Stanner <phasta@kernel.org>
> ---
> .../gpu/drm/scheduler/tests/mock_scheduler.c | 34 ++++++++++++-----
> --
> 1 file changed, 21 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> index f999c8859cf7..a72d26ca8262 100644
> --- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> +++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> @@ -228,10 +228,30 @@ static void mock_sched_free_job(struct
> drm_sched_job *sched_job)
> /* Mock job itself is freed by the kunit framework. */
> }
>
> +static void mock_sched_fence_context_kill(struct drm_gpu_scheduler
> *gpu_sched)
> +{
> + struct drm_mock_scheduler *sched =
> drm_sched_to_mock_sched(gpu_sched);
> + struct drm_mock_sched_job *job;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&sched->lock, flags);
> + list_for_each_entry(job, &sched->job_list, link) {
> + spin_lock(&job->lock);
> + if (!dma_fence_is_signaled_locked(&job->hw_fence)) {
> + dma_fence_set_error(&job->hw_fence, -
> ECANCELED);
> + dma_fence_signal_locked(&job->hw_fence);
> + }
> + complete(&job->done);
> + spin_unlock(&job->lock);
> + }
> + spin_unlock_irqrestore(&sched->lock, flags);
> +}
> +
> static const struct drm_sched_backend_ops drm_mock_scheduler_ops = {
> .run_job = mock_sched_run_job,
> .timedout_job = mock_sched_timedout_job,
> - .free_job = mock_sched_free_job
> + .free_job = mock_sched_free_job,
> + .kill_fence_context = mock_sched_fence_context_kill,
> };
>
> /**
> @@ -300,18 +320,6 @@ void drm_mock_sched_fini(struct
> drm_mock_scheduler *sched)
> drm_mock_sched_job_complete(job);
> spin_unlock_irqrestore(&sched->lock, flags);
>
> - /*
> - * Free completed jobs and jobs not yet processed by the DRM
> scheduler
> - * free worker.
> - */
> - spin_lock_irqsave(&sched->lock, flags);
> - list_for_each_entry_safe(job, next, &sched->done_list, link)
> - list_move_tail(&job->link, &list);
> - spin_unlock_irqrestore(&sched->lock, flags);
> -
> - list_for_each_entry_safe(job, next, &list, link)
> - mock_sched_free_job(&job->base);
> -
> drm_sched_fini(&sched->base);
> }
>
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design
2025-05-08 11:03 ` Philipp Stanner
@ 2025-05-08 12:51 ` Tvrtko Ursulin
2025-05-12 8:00 ` Philipp Stanner
0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-08 12:51 UTC (permalink / raw)
To: phasta, Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel
Hi Philipp,
On 08/05/2025 12:03, Philipp Stanner wrote:
> On Thu, 2025-04-24 at 11:55 +0200, Philipp Stanner wrote:
>> The unit tests so far took care manually of avoiding memory leaks
>> that
>> might have occurred when calling drm_sched_fini().
>>
>> The scheduler now takes care by itself of avoiding memory leaks if
>> the
>> driver provides the callback
>> drm_sched_backend_ops.kill_fence_context().
>>
>> Implement that callback for the unit tests. Remove the manual cleanup
>> code.
>
> @Tvrtko: On a scale from 1-10, how much do you love this patch? :)
Specific patch aside, it is the series as a whole I would like to be
sure there isn't a more elegant way to achieve the same end result.
Like that sketch of a counter proposal I sent for the reasons listed
with it. Which were, AFAIR, to avoid needing to add more state machine,
to avoid mandating drivers have to keep an internal list, and to align
better with the existing prototypes in the sched ops table (where
everything operates on jobs).
Regards,
Tvrtko
>> Signed-off-by: Philipp Stanner <phasta@kernel.org>
>> ---
>> .../gpu/drm/scheduler/tests/mock_scheduler.c | 34 ++++++++++++-----
>> --
>> 1 file changed, 21 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>> b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>> index f999c8859cf7..a72d26ca8262 100644
>> --- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>> +++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>> @@ -228,10 +228,30 @@ static void mock_sched_free_job(struct
>> drm_sched_job *sched_job)
>> /* Mock job itself is freed by the kunit framework. */
>> }
>>
>> +static void mock_sched_fence_context_kill(struct drm_gpu_scheduler
>> *gpu_sched)
>> +{
>> + struct drm_mock_scheduler *sched =
>> drm_sched_to_mock_sched(gpu_sched);
>> + struct drm_mock_sched_job *job;
>> + unsigned long flags;
>> +
>> + spin_lock_irqsave(&sched->lock, flags);
>> + list_for_each_entry(job, &sched->job_list, link) {
>> + spin_lock(&job->lock);
>> + if (!dma_fence_is_signaled_locked(&job->hw_fence)) {
>> + dma_fence_set_error(&job->hw_fence, -
>> ECANCELED);
>> + dma_fence_signal_locked(&job->hw_fence);
>> + }
>> + complete(&job->done);
>> + spin_unlock(&job->lock);
>> + }
>> + spin_unlock_irqrestore(&sched->lock, flags);
>> +}
>> +
>> static const struct drm_sched_backend_ops drm_mock_scheduler_ops = {
>> .run_job = mock_sched_run_job,
>> .timedout_job = mock_sched_timedout_job,
>> - .free_job = mock_sched_free_job
>> + .free_job = mock_sched_free_job,
>> + .kill_fence_context = mock_sched_fence_context_kill,
>> };
>>
>> /**
>> @@ -300,18 +320,6 @@ void drm_mock_sched_fini(struct
>> drm_mock_scheduler *sched)
>> drm_mock_sched_job_complete(job);
>> spin_unlock_irqrestore(&sched->lock, flags);
>>
>> - /*
>> - * Free completed jobs and jobs not yet processed by the DRM
>> scheduler
>> - * free worker.
>> - */
>> - spin_lock_irqsave(&sched->lock, flags);
>> - list_for_each_entry_safe(job, next, &sched->done_list, link)
>> - list_move_tail(&job->link, &list);
>> - spin_unlock_irqrestore(&sched->lock, flags);
>> -
>> - list_for_each_entry_safe(job, next, &list, link)
>> - mock_sched_free_job(&job->base);
>> -
>> drm_sched_fini(&sched->base);
>> }
>>
>
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design
2025-05-08 12:51 ` Tvrtko Ursulin
@ 2025-05-12 8:00 ` Philipp Stanner
2025-05-14 8:30 ` Tvrtko Ursulin
0 siblings, 1 reply; 24+ messages in thread
From: Philipp Stanner @ 2025-05-12 8:00 UTC (permalink / raw)
To: Tvrtko Ursulin, phasta, Lyude Paul, Danilo Krummrich,
David Airlie, Simona Vetter, Matthew Brost, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel
On Thu, 2025-05-08 at 13:51 +0100, Tvrtko Ursulin wrote:
>
> Hi Philipp,
>
> On 08/05/2025 12:03, Philipp Stanner wrote:
> > On Thu, 2025-04-24 at 11:55 +0200, Philipp Stanner wrote:
> > > The unit tests so far took care manually of avoiding memory leaks
> > > that
> > > might have occurred when calling drm_sched_fini().
> > >
> > > The scheduler now takes care by itself of avoiding memory leaks
> > > if
> > > the
> > > driver provides the callback
> > > drm_sched_backend_ops.kill_fence_context().
> > >
> > > Implement that callback for the unit tests. Remove the manual
> > > cleanup
> > > code.
> >
> > @Tvrtko: On a scale from 1-10, how much do you love this patch? :)
>
> Specific patch aside, it is the series as a whole I would like to be
> sure there isn't a more elegant way to achieve the same end result.
I count this as a 9/10 \o/
But jokes aside:
>
> Like that sketch of a counter proposal I sent for the reasons listed
> with it. Which were, AFAIR, to avoid needing to add more state
> machine,
Well the state machine added is basically just the waitqueue. The
WRITE_ONCE booleans are currently just for correctness and clarity.
I've looked at them and want to remove them all in an other patch,
because I think they're not needed (workqueue handles that)
But yes, the added state is > 0
> to avoid mandating drivers have to keep an internal list,
That's not mandated by the scheduler, but by logic itself. All drivers
need to have a list of on-flight fences. Otherwise the drivers would
have no chance of signaling those fences once their GPU tells them to
do so.
I have now provided two users of the new API, nouveau and the unit
tests. Can you think of a party for which the suggested approach
wouldn't work?
Don't get me wrong, your approach does work and it definitely has its
charm. However, I think what I propose here is syntactically a bit
cleaner because the classical order of a fence first being signaled in
the driver and then the associated job being freed as usual by the
scheduler is guaranteed. IOW, we primarily rely on the signaling path.
Either way, neither your nor my approach would have worked out of the
box in Nouveau without that driver exploding.
> and to align
> better with the existing prototypes in the sched ops table (where
> everything operates on jobs).
That's not a hard criteria IMO. Those are sched_backend_ops, not
sched_job_backend_ops, and prepare_job() already takes a parameter
other than a job.
Cheers,
P.
>
> Regards,
>
> Tvrtko
>
> > > Signed-off-by: Philipp Stanner <phasta@kernel.org>
> > > ---
> > > .../gpu/drm/scheduler/tests/mock_scheduler.c | 34
> > > ++++++++++++-----
> > > --
> > > 1 file changed, 21 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > index f999c8859cf7..a72d26ca8262 100644
> > > --- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > +++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > @@ -228,10 +228,30 @@ static void mock_sched_free_job(struct
> > > drm_sched_job *sched_job)
> > > /* Mock job itself is freed by the kunit framework. */
> > > }
> > >
> > > +static void mock_sched_fence_context_kill(struct
> > > drm_gpu_scheduler
> > > *gpu_sched)
> > > +{
> > > + struct drm_mock_scheduler *sched =
> > > drm_sched_to_mock_sched(gpu_sched);
> > > + struct drm_mock_sched_job *job;
> > > + unsigned long flags;
> > > +
> > > + spin_lock_irqsave(&sched->lock, flags);
> > > + list_for_each_entry(job, &sched->job_list, link) {
> > > + spin_lock(&job->lock);
> > > + if (!dma_fence_is_signaled_locked(&job-
> > > >hw_fence)) {
> > > + dma_fence_set_error(&job->hw_fence, -
> > > ECANCELED);
> > > + dma_fence_signal_locked(&job->hw_fence);
> > > + }
> > > + complete(&job->done);
> > > + spin_unlock(&job->lock);
> > > + }
> > > + spin_unlock_irqrestore(&sched->lock, flags);
> > > +}
> > > +
> > > static const struct drm_sched_backend_ops
> > > drm_mock_scheduler_ops = {
> > > .run_job = mock_sched_run_job,
> > > .timedout_job = mock_sched_timedout_job,
> > > - .free_job = mock_sched_free_job
> > > + .free_job = mock_sched_free_job,
> > > + .kill_fence_context = mock_sched_fence_context_kill,
> > > };
> > >
> > > /**
> > > @@ -300,18 +320,6 @@ void drm_mock_sched_fini(struct
> > > drm_mock_scheduler *sched)
> > > drm_mock_sched_job_complete(job);
> > > spin_unlock_irqrestore(&sched->lock, flags);
> > >
> > > - /*
> > > - * Free completed jobs and jobs not yet processed by the
> > > DRM
> > > scheduler
> > > - * free worker.
> > > - */
> > > - spin_lock_irqsave(&sched->lock, flags);
> > > - list_for_each_entry_safe(job, next, &sched->done_list,
> > > link)
> > > - list_move_tail(&job->link, &list);
> > > - spin_unlock_irqrestore(&sched->lock, flags);
> > > -
> > > - list_for_each_entry_safe(job, next, &list, link)
> > > - mock_sched_free_job(&job->base);
> > > -
> > > drm_sched_fini(&sched->base);
> > > }
> > >
> >
>
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design
2025-05-12 8:00 ` Philipp Stanner
@ 2025-05-14 8:30 ` Tvrtko Ursulin
2025-05-14 9:19 ` Philipp Stanner
0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-14 8:30 UTC (permalink / raw)
To: phasta, Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel
On 12/05/2025 09:00, Philipp Stanner wrote:
> On Thu, 2025-05-08 at 13:51 +0100, Tvrtko Ursulin wrote:
>>
>> Hi Philipp,
>>
>> On 08/05/2025 12:03, Philipp Stanner wrote:
>>> On Thu, 2025-04-24 at 11:55 +0200, Philipp Stanner wrote:
>>>> The unit tests so far took care manually of avoiding memory leaks
>>>> that
>>>> might have occurred when calling drm_sched_fini().
>>>>
>>>> The scheduler now takes care by itself of avoiding memory leaks
>>>> if
>>>> the
>>>> driver provides the callback
>>>> drm_sched_backend_ops.kill_fence_context().
>>>>
>>>> Implement that callback for the unit tests. Remove the manual
>>>> cleanup
>>>> code.
>>>
>>> @Tvrtko: On a scale from 1-10, how much do you love this patch? :)
>>
>> Specific patch aside, it is the series as a whole I would like to be
>> sure there isn't a more elegant way to achieve the same end result.
>
> I count this as a 9/10 \o/
:) Yes, sorry, it would a bit lower than that, at least until someone
can point out a fatal flaw in my alternative. :)
> But jokes aside:
>
>>
>> Like that sketch of a counter proposal I sent for the reasons listed
>> with it. Which were, AFAIR, to avoid needing to add more state
>> machine,
>
> Well the state machine added is basically just the waitqueue. The
> WRITE_ONCE booleans are currently just for correctness and clarity.
> I've looked at them and want to remove them all in an other patch,
> because I think they're not needed (workqueue handles that)
>
> But yes, the added state is > 0
>
>> to avoid mandating drivers have to keep an internal list,
>
> That's not mandated by the scheduler, but by logic itself. All drivers
> need to have a list of on-flight fences. Otherwise the drivers would
> have no chance of signaling those fences once their GPU tells them to
> do so.
Probably it would be hard to signal without tracking of some sort yes,
although it wouldn't have to be indexed by fence context, or looked up
by it so maybe still simpler.
More importantly I think with this comment I was thinking about the fact
that with ops->cancel_job() approach I was able to remove the _done_
list tracking from the mock scheduler.
> I have now provided two users of the new API, nouveau and the unit
> tests. Can you think of a party for which the suggested approach
> wouldn't work?
I did not think along those lines yet so don't know. I just thought it
was too much code to implement a relatively simple thing and that also a
few things in the design bothered me.
If you look at the diffstat from my proposal and ignore kerneldoc and
unit test stats, it literally adds 8 lines to drm_sched_fini() and a
single line to gpu_scheduler.h:
+ void (*cancel_job)(struct drm_sched_job *sched_job);
And in the former after it stops the workers:
+ if (sched->ops->cancel_job) {
+ struct drm_sched_job *job;
+
+ list_for_each_entry_reverse(job, &sched->pending_list,
list) {
+ sched->ops->cancel_job(job);
+ sched->ops->free_job(job);
+ }
+ }
To me this looks quite clean. Unless, I say this again, I am missing
some fatal flaw why it doesn't work.
> Don't get me wrong, your approach does work and it definitely has its
> charm. However, I think what I propose here is syntactically a bit
> cleaner because the classical order of a fence first being signaled in
> the driver and then the associated job being freed as usual by the
> scheduler is guaranteed. IOW, we primarily rely on the signaling path.
>
> Either way, neither your nor my approach would have worked out of the
> box in Nouveau without that driver exploding.
What do you mean by this - the latest version of your series does or
does not work for nouveau?
Regards,
Tvrtko
>
>> and to align
>> better with the existing prototypes in the sched ops table (where
>> everything operates on jobs).
>
> That's not a hard criteria IMO. Those are sched_backend_ops, not
> sched_job_backend_ops, and prepare_job() already takes a parameter
> other than a job.
>
>
> Cheers,
> P.
>
>>
>> Regards,
>>
>> Tvrtko
>>
>>>> Signed-off-by: Philipp Stanner <phasta@kernel.org>
>>>> ---
>>>> .../gpu/drm/scheduler/tests/mock_scheduler.c | 34
>>>> ++++++++++++-----
>>>> --
>>>> 1 file changed, 21 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>>>> b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>>>> index f999c8859cf7..a72d26ca8262 100644
>>>> --- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>>>> +++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>>>> @@ -228,10 +228,30 @@ static void mock_sched_free_job(struct
>>>> drm_sched_job *sched_job)
>>>> /* Mock job itself is freed by the kunit framework. */
>>>> }
>>>>
>>>> +static void mock_sched_fence_context_kill(struct
>>>> drm_gpu_scheduler
>>>> *gpu_sched)
>>>> +{
>>>> + struct drm_mock_scheduler *sched =
>>>> drm_sched_to_mock_sched(gpu_sched);
>>>> + struct drm_mock_sched_job *job;
>>>> + unsigned long flags;
>>>> +
>>>> + spin_lock_irqsave(&sched->lock, flags);
>>>> + list_for_each_entry(job, &sched->job_list, link) {
>>>> + spin_lock(&job->lock);
>>>> + if (!dma_fence_is_signaled_locked(&job-
>>>>> hw_fence)) {
>>>> + dma_fence_set_error(&job->hw_fence, -
>>>> ECANCELED);
>>>> + dma_fence_signal_locked(&job->hw_fence);
>>>> + }
>>>> + complete(&job->done);
>>>> + spin_unlock(&job->lock);
>>>> + }
>>>> + spin_unlock_irqrestore(&sched->lock, flags);
>>>> +}
>>>> +
>>>> static const struct drm_sched_backend_ops
>>>> drm_mock_scheduler_ops = {
>>>> .run_job = mock_sched_run_job,
>>>> .timedout_job = mock_sched_timedout_job,
>>>> - .free_job = mock_sched_free_job
>>>> + .free_job = mock_sched_free_job,
>>>> + .kill_fence_context = mock_sched_fence_context_kill,
>>>> };
>>>>
>>>> /**
>>>> @@ -300,18 +320,6 @@ void drm_mock_sched_fini(struct
>>>> drm_mock_scheduler *sched)
>>>> drm_mock_sched_job_complete(job);
>>>> spin_unlock_irqrestore(&sched->lock, flags);
>>>>
>>>> - /*
>>>> - * Free completed jobs and jobs not yet processed by the
>>>> DRM
>>>> scheduler
>>>> - * free worker.
>>>> - */
>>>> - spin_lock_irqsave(&sched->lock, flags);
>>>> - list_for_each_entry_safe(job, next, &sched->done_list,
>>>> link)
>>>> - list_move_tail(&job->link, &list);
>>>> - spin_unlock_irqrestore(&sched->lock, flags);
>>>> -
>>>> - list_for_each_entry_safe(job, next, &list, link)
>>>> - mock_sched_free_job(&job->base);
>>>> -
>>>> drm_sched_fini(&sched->base);
>>>> }
>>>>
>>>
>>
>
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design
2025-05-14 8:30 ` Tvrtko Ursulin
@ 2025-05-14 9:19 ` Philipp Stanner
2025-05-16 9:00 ` Tvrtko Ursulin
0 siblings, 1 reply; 24+ messages in thread
From: Philipp Stanner @ 2025-05-14 9:19 UTC (permalink / raw)
To: Tvrtko Ursulin, phasta, Lyude Paul, Danilo Krummrich,
David Airlie, Simona Vetter, Matthew Brost, Christian König,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel
On Wed, 2025-05-14 at 09:30 +0100, Tvrtko Ursulin wrote:
>
> On 12/05/2025 09:00, Philipp Stanner wrote:
> > On Thu, 2025-05-08 at 13:51 +0100, Tvrtko Ursulin wrote:
> > >
> > > Hi Philipp,
> > >
> > > On 08/05/2025 12:03, Philipp Stanner wrote:
> > > > On Thu, 2025-04-24 at 11:55 +0200, Philipp Stanner wrote:
> > > > > The unit tests so far took care manually of avoiding memory
> > > > > leaks
> > > > > that
> > > > > might have occurred when calling drm_sched_fini().
> > > > >
> > > > > The scheduler now takes care by itself of avoiding memory
> > > > > leaks
> > > > > if
> > > > > the
> > > > > driver provides the callback
> > > > > drm_sched_backend_ops.kill_fence_context().
> > > > >
> > > > > Implement that callback for the unit tests. Remove the manual
> > > > > cleanup
> > > > > code.
> > > >
> > > > @Tvrtko: On a scale from 1-10, how much do you love this patch?
> > > > :)
> > >
> > > Specific patch aside, it is the series as a whole I would like to
> > > be
> > > sure there isn't a more elegant way to achieve the same end
> > > result.
> >
> > I count this as a 9/10 \o/
>
> :) Yes, sorry, it would a bit lower than that, at least until someone
> can point out a fatal flaw in my alternative. :)
There's no fatal flaw in my approach either :)
>
> > But jokes aside:
> >
> > >
> > > Like that sketch of a counter proposal I sent for the reasons
> > > listed
> > > with it. Which were, AFAIR, to avoid needing to add more state
> > > machine,
> >
> > Well the state machine added is basically just the waitqueue. The
> > WRITE_ONCE booleans are currently just for correctness and clarity.
> > I've looked at them and want to remove them all in an other patch,
> > because I think they're not needed (workqueue handles that)
> >
> > But yes, the added state is > 0
> >
> > > to avoid mandating drivers have to keep an internal list,
> >
> > That's not mandated by the scheduler, but by logic itself. All
> > drivers
> > need to have a list of on-flight fences. Otherwise the drivers
> > would
> > have no chance of signaling those fences once their GPU tells them
> > to
> > do so.
>
> Probably it would be hard to signal without tracking of some sort
> yes,
> although it wouldn't have to be indexed by fence context, or looked
> up
> by it so maybe still simpler.
Well, the decisive point remains that all drivers must know all on-air
fences, so they all are able to signal those fences.
>
> More importantly I think with this comment I was thinking about the
> fact
> that with ops->cancel_job() approach I was able to remove the _done_
> list tracking from the mock scheduler.
If the done_list is just about avoiding memory leaks, then I should
also be able to remove it completely, shouldn't I? In this patch here I
already remove the leak-related code in drm_mock_sched_fini(), but
hadn't looked too deeply into what else the done_list does. Is it just
about the leaks?
>
> > I have now provided two users of the new API, nouveau and the unit
> > tests. Can you think of a party for which the suggested approach
> > wouldn't work?
>
> I did not think along those lines yet so don't know. I just thought
> it
> was too much code to implement a relatively simple thing and that
> also a
> few things in the design bothered me.
>
> If you look at the diffstat from my proposal and ignore kerneldoc and
> unit test stats, it literally adds 8 lines to drm_sched_fini() and a
> single line to gpu_scheduler.h:
>
> + void (*cancel_job)(struct drm_sched_job *sched_job);
>
> And in the former after it stops the workers:
>
> + if (sched->ops->cancel_job) {
> + struct drm_sched_job *job;
> +
> + list_for_each_entry_reverse(job, &sched-
> >pending_list,
> list) {
> + sched->ops->cancel_job(job);
> + sched->ops->free_job(job);
> + }
> + }
>
> To me this looks quite clean. Unless, I say this again, I am missing
> some fatal flaw why it doesn't work.
It does work. I've stated that before. But mine works, too. And mine
doesn't have a hard blocker either, as far as I can see.
So let's focus on the main differences:
Your version adds fewer lines of code, that's correct.
I think cleaning up through just having the driver signal the fences
all at once is better because
1. The very same code path both for "legacy-fini" and "new-fini" is
responsible for cleaning up the jobs. Notably, the free_job()
callback is only invoked by the same parties as before, primarily
the work items. We don't add a new, only sometimes running, code
path *that free's jobs*.
2. The scheduler's already fractured design doesn't fracture
further: the free_job work item remains responsible for calling
free_job(). Having just one party being responsible for one thing
is a desirable design goal.
3. Considering that many drivers generously (ab)use API internals of
the scheduler, and considering that there is already ambiguity of
who's responsible for handling job lifetimes, I believe it is
safer not to add an additional code path that can free jobs, but
keep that one path.
4. It reads more cleanly: "We're not canceling a single job here,
we're killing all associated jobs now all at once", raising
awareness for driver programmers that this is a significant
event: we're tearing down while not all of your jobs have
finished on your GPU.
>
> > Don't get me wrong, your approach does work and it definitely has
> > its
> > charm. However, I think what I propose here is syntactically a bit
> > cleaner because the classical order of a fence first being signaled
> > in
> > the driver and then the associated job being freed as usual by the
> > scheduler is guaranteed. IOW, we primarily rely on the signaling
> > path.
> >
> > Either way, neither your nor my approach would have worked out of
> > the
> > box in Nouveau without that driver exploding.
>
> What do you mean by this - the latest version of your series does or
> does not work for nouveau?
Mine works with Nouveau, but revealed a bug in Nouveau [1]. Yours would
have ran into that bug, too.
My point is just that your job-by-job approach wouldn't have been
superior to my approach in practice, i.e., when implementing it in the
first "beta tester", Nouveau. Would have been the same problem in
different color.
So just because a cancel_job() callback would result in fewer lines of
code wouldn't mean it's superior in practice. I expect the same to be
the case for other drivers, especially those who use scheduler
internals.
So, summarizing:
* My approach works. Your approach works.
* It works for all drivers, because they all have a list of fences.
* It communicates more clearly to the driver what this is all about.
* It keeps the scheduler's design more consistent regarding
responsibility / code paths for job life times.
P.
[1] https://lore.kernel.org/dri-devel/20250415121900.55719-2-phasta@kernel.org/
>
> Regards,
>
> Tvrtko
>
> >
> > > and to align
> > > better with the existing prototypes in the sched ops table (where
> > > everything operates on jobs).
> >
> > That's not a hard criteria IMO. Those are sched_backend_ops, not
> > sched_job_backend_ops, and prepare_job() already takes a parameter
> > other than a job.
> >
> >
> > Cheers,
> > P.
> >
> > >
> > > Regards,
> > >
> > > Tvrtko
> > >
> > > > > Signed-off-by: Philipp Stanner <phasta@kernel.org>
> > > > > ---
> > > > > .../gpu/drm/scheduler/tests/mock_scheduler.c | 34
> > > > > ++++++++++++-----
> > > > > --
> > > > > 1 file changed, 21 insertions(+), 13 deletions(-)
> > > > >
> > > > > diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > > > b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > > > index f999c8859cf7..a72d26ca8262 100644
> > > > > --- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > > > +++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > > > @@ -228,10 +228,30 @@ static void mock_sched_free_job(struct
> > > > > drm_sched_job *sched_job)
> > > > > /* Mock job itself is freed by the kunit framework. */
> > > > > }
> > > > >
> > > > > +static void mock_sched_fence_context_kill(struct
> > > > > drm_gpu_scheduler
> > > > > *gpu_sched)
> > > > > +{
> > > > > + struct drm_mock_scheduler *sched =
> > > > > drm_sched_to_mock_sched(gpu_sched);
> > > > > + struct drm_mock_sched_job *job;
> > > > > + unsigned long flags;
> > > > > +
> > > > > + spin_lock_irqsave(&sched->lock, flags);
> > > > > + list_for_each_entry(job, &sched->job_list, link) {
> > > > > + spin_lock(&job->lock);
> > > > > + if (!dma_fence_is_signaled_locked(&job-
> > > > > > hw_fence)) {
> > > > > + dma_fence_set_error(&job->hw_fence, -
> > > > > ECANCELED);
> > > > > + dma_fence_signal_locked(&job->hw_fence);
> > > > > + }
> > > > > + complete(&job->done);
> > > > > + spin_unlock(&job->lock);
> > > > > + }
> > > > > + spin_unlock_irqrestore(&sched->lock, flags);
> > > > > +}
> > > > > +
> > > > > static const struct drm_sched_backend_ops
> > > > > drm_mock_scheduler_ops = {
> > > > > .run_job = mock_sched_run_job,
> > > > > .timedout_job = mock_sched_timedout_job,
> > > > > - .free_job = mock_sched_free_job
> > > > > + .free_job = mock_sched_free_job,
> > > > > + .kill_fence_context = mock_sched_fence_context_kill,
> > > > > };
> > > > >
> > > > > /**
> > > > > @@ -300,18 +320,6 @@ void drm_mock_sched_fini(struct
> > > > > drm_mock_scheduler *sched)
> > > > > drm_mock_sched_job_complete(job);
> > > > > spin_unlock_irqrestore(&sched->lock, flags);
> > > > >
> > > > > - /*
> > > > > - * Free completed jobs and jobs not yet processed by the
> > > > > DRM
> > > > > scheduler
> > > > > - * free worker.
> > > > > - */
> > > > > - spin_lock_irqsave(&sched->lock, flags);
> > > > > - list_for_each_entry_safe(job, next, &sched->done_list,
> > > > > link)
> > > > > - list_move_tail(&job->link, &list);
> > > > > - spin_unlock_irqrestore(&sched->lock, flags);
> > > > > -
> > > > > - list_for_each_entry_safe(job, next, &list, link)
> > > > > - mock_sched_free_job(&job->base);
> > > > > -
> > > > > drm_sched_fini(&sched->base);
> > > > > }
> > > > >
> > > >
> > >
> >
>
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: [PATCH v2 6/6] drm/sched: Port unit tests to new cleanup design
2025-05-14 9:19 ` Philipp Stanner
@ 2025-05-16 9:00 ` Tvrtko Ursulin
0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-05-16 9:00 UTC (permalink / raw)
To: phasta, Lyude Paul, Danilo Krummrich, David Airlie, Simona Vetter,
Matthew Brost, Christian König, Maarten Lankhorst,
Maxime Ripard, Thomas Zimmermann
Cc: dri-devel, nouveau, linux-kernel
On 14/05/2025 10:19, Philipp Stanner wrote:
> On Wed, 2025-05-14 at 09:30 +0100, Tvrtko Ursulin wrote:
>>
>> On 12/05/2025 09:00, Philipp Stanner wrote:
>>> On Thu, 2025-05-08 at 13:51 +0100, Tvrtko Ursulin wrote:
>>>>
>>>> Hi Philipp,
>>>>
>>>> On 08/05/2025 12:03, Philipp Stanner wrote:
>>>>> On Thu, 2025-04-24 at 11:55 +0200, Philipp Stanner wrote:
>>>>>> The unit tests so far took care manually of avoiding memory
>>>>>> leaks
>>>>>> that
>>>>>> might have occurred when calling drm_sched_fini().
>>>>>>
>>>>>> The scheduler now takes care by itself of avoiding memory
>>>>>> leaks
>>>>>> if
>>>>>> the
>>>>>> driver provides the callback
>>>>>> drm_sched_backend_ops.kill_fence_context().
>>>>>>
>>>>>> Implement that callback for the unit tests. Remove the manual
>>>>>> cleanup
>>>>>> code.
>>>>>
>>>>> @Tvrtko: On a scale from 1-10, how much do you love this patch?
>>>>> :)
>>>>
>>>> Specific patch aside, it is the series as a whole I would like to
>>>> be
>>>> sure there isn't a more elegant way to achieve the same end
>>>> result.
>>>
>>> I count this as a 9/10 \o/
>>
>> :) Yes, sorry, it would a bit lower than that, at least until someone
>> can point out a fatal flaw in my alternative. :)
>
> There's no fatal flaw in my approach either :)
>
>>
>>> But jokes aside:
>>>
>>>>
>>>> Like that sketch of a counter proposal I sent for the reasons
>>>> listed
>>>> with it. Which were, AFAIR, to avoid needing to add more state
>>>> machine,
>>>
>>> Well the state machine added is basically just the waitqueue. The
>>> WRITE_ONCE booleans are currently just for correctness and clarity.
>>> I've looked at them and want to remove them all in an other patch,
>>> because I think they're not needed (workqueue handles that)
>>>
>>> But yes, the added state is > 0
>>>
>>>> to avoid mandating drivers have to keep an internal list,
>>>
>>> That's not mandated by the scheduler, but by logic itself. All
>>> drivers
>>> need to have a list of on-flight fences. Otherwise the drivers
>>> would
>>> have no chance of signaling those fences once their GPU tells them
>>> to
>>> do so.
>>
>> Probably it would be hard to signal without tracking of some sort
>> yes,
>> although it wouldn't have to be indexed by fence context, or looked
>> up
>> by it so maybe still simpler.
>
> Well, the decisive point remains that all drivers must know all on-air
> fences, so they all are able to signal those fences.
>
>>
>> More importantly I think with this comment I was thinking about the
>> fact
>> that with ops->cancel_job() approach I was able to remove the _done_
>> list tracking from the mock scheduler.
>
> If the done_list is just about avoiding memory leaks, then I should
> also be able to remove it completely, shouldn't I? In this patch here I
> already remove the leak-related code in drm_mock_sched_fini(), but
> hadn't looked too deeply into what else the done_list does. Is it just
> about the leaks?
Yes, I added it specifically to enable freeing completed jobs before
calling drm_sched_fini (so mock scheduler does not have to look at the
scheduler internal sched->job_list). On a glance you should be able to
remove it too and it looks it should just work. You can peek at my patch
for how to remove it completely if it helps.
>>> I have now provided two users of the new API, nouveau and the unit
>>> tests. Can you think of a party for which the suggested approach
>>> wouldn't work?
>>
>> I did not think along those lines yet so don't know. I just thought
>> it
>> was too much code to implement a relatively simple thing and that
>> also a
>> few things in the design bothered me.
>>
>> If you look at the diffstat from my proposal and ignore kerneldoc and
>> unit test stats, it literally adds 8 lines to drm_sched_fini() and a
>> single line to gpu_scheduler.h:
>>
>> + void (*cancel_job)(struct drm_sched_job *sched_job);
>>
>> And in the former after it stops the workers:
>>
>> + if (sched->ops->cancel_job) {
>> + struct drm_sched_job *job;
>> +
>> + list_for_each_entry_reverse(job, &sched-
>>> pending_list,
>> list) {
>> + sched->ops->cancel_job(job);
>> + sched->ops->free_job(job);
>> + }
>> + }
>>
>> To me this looks quite clean. Unless, I say this again, I am missing
>> some fatal flaw why it doesn't work.
>
> It does work. I've stated that before. But mine works, too. And mine
> doesn't have a hard blocker either, as far as I can see.
>
> So let's focus on the main differences:
>
> Your version adds fewer lines of code, that's correct.
>
> I think cleaning up through just having the driver signal the fences
> all at once is better because
> 1. The very same code path both for "legacy-fini" and "new-fini" is
> responsible for cleaning up the jobs. Notably, the free_job()
> callback is only invoked by the same parties as before, primarily
> the work items. We don't add a new, only sometimes running, code
> path *that free's jobs*.
> 2. The scheduler's already fractured design doesn't fracture
> further: the free_job work item remains responsible for calling
> free_job(). Having just one party being responsible for one thing
> is a desirable design goal.
> 3. Considering that many drivers generously (ab)use API internals of
> the scheduler, and considering that there is already ambiguity of
> who's responsible for handling job lifetimes, I believe it is
> safer not to add an additional code path that can free jobs, but
> keep that one path.
All three look the same root argument. I would not be too concerned
since it is still "one party" - the scheduler internals.
> 4. It reads more cleanly: "We're not canceling a single job here,
> we're killing all associated jobs now all at once", raising
> awareness for driver programmers that this is a significant
> event: we're tearing down while not all of your jobs have
> finished on your GPU.
It looks we simply may end up not agreeing.
I don't see most of the points above as significant, or even valid as a
differentiator between the two approaches, while what I object to the
most is adding of more state machine stuff. Because that is what I think
makes the scheduler harder to maintain. More flags, more stopping some
workers while leaving some run, etc.
A new callback to cancel a single job is IMO completely clean.
Equivalent to how we already have ->timedout_job(), which, in the
context of the recent discussions, really has the semantics of "cancel
if it hasn't completed in the meantime". So I even wonder if there could
be scope to share something with ->cancel_job(). The design or the
implementation.
Regards,
Tvrtko
>>
>>> Don't get me wrong, your approach does work and it definitely has
>>> its
>>> charm. However, I think what I propose here is syntactically a bit
>>> cleaner because the classical order of a fence first being signaled
>>> in
>>> the driver and then the associated job being freed as usual by the
>>> scheduler is guaranteed. IOW, we primarily rely on the signaling
>>> path.
>>>
>>> Either way, neither your nor my approach would have worked out of
>>> the
>>> box in Nouveau without that driver exploding.
>>
>> What do you mean by this - the latest version of your series does or
>> does not work for nouveau?
>
> Mine works with Nouveau, but revealed a bug in Nouveau [1]. Yours would
> have ran into that bug, too.
>
> My point is just that your job-by-job approach wouldn't have been
> superior to my approach in practice, i.e., when implementing it in the
> first "beta tester", Nouveau. Would have been the same problem in
> different color.
>
> So just because a cancel_job() callback would result in fewer lines of
> code wouldn't mean it's superior in practice. I expect the same to be
> the case for other drivers, especially those who use scheduler
> internals.
>
> So, summarizing:
> * My approach works. Your approach works.
> * It works for all drivers, because they all have a list of fences.
> * It communicates more clearly to the driver what this is all about.
> * It keeps the scheduler's design more consistent regarding
> responsibility / code paths for job life times.
>
> P.
>
> [1] https://lore.kernel.org/dri-devel/20250415121900.55719-2-phasta@kernel.org/
>
>>
>> Regards,
>>
>> Tvrtko
>>
>>>
>>>> and to align
>>>> better with the existing prototypes in the sched ops table (where
>>>> everything operates on jobs).
>>>
>>> That's not a hard criteria IMO. Those are sched_backend_ops, not
>>> sched_job_backend_ops, and prepare_job() already takes a parameter
>>> other than a job.
>>>
>>>
>>> Cheers,
>>> P.
>>>
>>>>
>>>> Regards,
>>>>
>>>> Tvrtko
>>>>
>>>>>> Signed-off-by: Philipp Stanner <phasta@kernel.org>
>>>>>> ---
>>>>>> .../gpu/drm/scheduler/tests/mock_scheduler.c | 34
>>>>>> ++++++++++++-----
>>>>>> --
>>>>>> 1 file changed, 21 insertions(+), 13 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>>>>>> b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>>>>>> index f999c8859cf7..a72d26ca8262 100644
>>>>>> --- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>>>>>> +++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
>>>>>> @@ -228,10 +228,30 @@ static void mock_sched_free_job(struct
>>>>>> drm_sched_job *sched_job)
>>>>>> /* Mock job itself is freed by the kunit framework. */
>>>>>> }
>>>>>>
>>>>>> +static void mock_sched_fence_context_kill(struct
>>>>>> drm_gpu_scheduler
>>>>>> *gpu_sched)
>>>>>> +{
>>>>>> + struct drm_mock_scheduler *sched =
>>>>>> drm_sched_to_mock_sched(gpu_sched);
>>>>>> + struct drm_mock_sched_job *job;
>>>>>> + unsigned long flags;
>>>>>> +
>>>>>> + spin_lock_irqsave(&sched->lock, flags);
>>>>>> + list_for_each_entry(job, &sched->job_list, link) {
>>>>>> + spin_lock(&job->lock);
>>>>>> + if (!dma_fence_is_signaled_locked(&job-
>>>>>>> hw_fence)) {
>>>>>> + dma_fence_set_error(&job->hw_fence, -
>>>>>> ECANCELED);
>>>>>> + dma_fence_signal_locked(&job->hw_fence);
>>>>>> + }
>>>>>> + complete(&job->done);
>>>>>> + spin_unlock(&job->lock);
>>>>>> + }
>>>>>> + spin_unlock_irqrestore(&sched->lock, flags);
>>>>>> +}
>>>>>> +
>>>>>> static const struct drm_sched_backend_ops
>>>>>> drm_mock_scheduler_ops = {
>>>>>> .run_job = mock_sched_run_job,
>>>>>> .timedout_job = mock_sched_timedout_job,
>>>>>> - .free_job = mock_sched_free_job
>>>>>> + .free_job = mock_sched_free_job,
>>>>>> + .kill_fence_context = mock_sched_fence_context_kill,
>>>>>> };
>>>>>>
>>>>>> /**
>>>>>> @@ -300,18 +320,6 @@ void drm_mock_sched_fini(struct
>>>>>> drm_mock_scheduler *sched)
>>>>>> drm_mock_sched_job_complete(job);
>>>>>> spin_unlock_irqrestore(&sched->lock, flags);
>>>>>>
>>>>>> - /*
>>>>>> - * Free completed jobs and jobs not yet processed by the
>>>>>> DRM
>>>>>> scheduler
>>>>>> - * free worker.
>>>>>> - */
>>>>>> - spin_lock_irqsave(&sched->lock, flags);
>>>>>> - list_for_each_entry_safe(job, next, &sched->done_list,
>>>>>> link)
>>>>>> - list_move_tail(&job->link, &list);
>>>>>> - spin_unlock_irqrestore(&sched->lock, flags);
>>>>>> -
>>>>>> - list_for_each_entry_safe(job, next, &list, link)
>>>>>> - mock_sched_free_job(&job->base);
>>>>>> -
>>>>>> drm_sched_fini(&sched->base);
>>>>>> }
>>>>>>
>>>>>
>>>>
>>>
>>
>
^ permalink raw reply [flat|nested] 24+ messages in thread