[RFC PATCH] drm/sched: Fix a UAF on drm_sched

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
@ 2024-08-29 17:12 Boris Brezillon
  2024-08-30  8:14 ` Christian König
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Boris Brezillon @ 2024-08-29 17:12 UTC (permalink / raw)
  To: dri-devel
  Cc: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe,
	kernel, Luben Tuikov, Matthew Brost, Christian König,
	Danilo Krummrich

dma_fence objects created by an entity might outlive the
drm_gpu_scheduler this entity was bound to if those fences are retained
by other objects, like a dma_buf resv. This means that
drm_sched_fence::sched might be invalid when the resv is walked, which
in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.

This probably went unnoticed so far, because the drm_gpu_scheduler had
the lifetime of the drm_device, so, unless you were removing the device,
there were no reasons for the scheduler to be gone before its fences.

With the introduction of a new model where each entity has its own
drm_gpu_scheduler instance, this situation is likely to happen every time
a GPU context is destroyed and some of its fences remain attached to
dma_buf objects still owned by other drivers/processes.

In order to make drm_sched_fence_get_timeline_name() safe, we need to
copy the scheduler name into our own refcounted object that's only
destroyed when both the scheduler and all its fences are gone.

The fact drm_sched_fence might have a reference to the drm_gpu_scheduler
even after it's been released is worrisome, but I'd rather
discuss that with everyone than come up with a solution that's likely
to end up being rejected.

Note that the bug was found while repeatedly reading dma_buf's debugfs
file, which, at some point, calls dma_resv_describe() on a resv that
contains signalled fences coming from a destroyed GPU context.
AFAIK, there's nothing invalid there.

Cc: Luben Tuikov <ltuikov89@gmail.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: "Christian König" <christian.koenig@amd.com>
Cc: Danilo Krummrich <dakr@redhat.com>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/scheduler/sched_fence.c |  8 +++-
 drivers/gpu/drm/scheduler/sched_main.c  | 18 ++++++++-
 include/drm/gpu_scheduler.h             | 52 +++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
index 0f35f009b9d3..efa2a71d98eb 100644
--- a/drivers/gpu/drm/scheduler/sched_fence.c
+++ b/drivers/gpu/drm/scheduler/sched_fence.c
@@ -90,7 +90,7 @@ static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
 static const char *drm_sched_fence_get_timeline_name(struct dma_fence *f)
 {
 	struct drm_sched_fence *fence = to_drm_sched_fence(f);
-	return (const char *)fence->sched->name;
+	return (const char *)fence->timeline->name;
 }
 
 static void drm_sched_fence_free_rcu(struct rcu_head *rcu)
@@ -112,8 +112,10 @@ static void drm_sched_fence_free_rcu(struct rcu_head *rcu)
  */
 void drm_sched_fence_free(struct drm_sched_fence *fence)
 {
+	drm_sched_fence_timeline_put(fence->timeline);
+
 	/* This function should not be called if the fence has been initialized. */
-	if (!WARN_ON_ONCE(fence->sched))
+	if (!WARN_ON_ONCE(fence->timeline))
 		kmem_cache_free(sched_fence_slab, fence);
 }
 
@@ -224,6 +226,8 @@ void drm_sched_fence_init(struct drm_sched_fence *fence,
 	unsigned seq;
 
 	fence->sched = entity->rq->sched;
+	fence->timeline = fence->sched->fence_timeline;
+	drm_sched_fence_timeline_get(fence->timeline);
 	seq = atomic_inc_return(&entity->fence_seq);
 	dma_fence_init(&fence->scheduled, &drm_sched_fence_ops_scheduled,
 		       &fence->lock, entity->fence_context, seq);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 7e90c9f95611..1e31a9c8ce15 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -1288,10 +1288,21 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
 		sched->own_submit_wq = true;
 	}
 
+	sched->fence_timeline = kzalloc(sizeof(*sched->fence_timeline), GFP_KERNEL);
+	if (!sched->fence_timeline)
+		goto Out_check_own;
+
+	sched->fence_timeline->name = kasprintf(GFP_KERNEL, "%s", sched->name);
+	if (!sched->fence_timeline->name)
+		goto Out_free_fence_timeline;
+
+	kref_init(&sched->fence_timeline->kref);
+
 	sched->sched_rq = kmalloc_array(num_rqs, sizeof(*sched->sched_rq),
 					GFP_KERNEL | __GFP_ZERO);
 	if (!sched->sched_rq)
-		goto Out_check_own;
+		goto Out_free_fence_timeline;
+
 	sched->num_rqs = num_rqs;
 	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
 		sched->sched_rq[i] = kzalloc(sizeof(*sched->sched_rq[i]), GFP_KERNEL);
@@ -1319,6 +1330,10 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
 
 	kfree(sched->sched_rq);
 	sched->sched_rq = NULL;
+Out_free_fence_timeline:
+	if (sched->fence_timeline)
+		kfree(sched->fence_timeline->name);
+	kfree(sched->fence_timeline);
 Out_check_own:
 	if (sched->own_submit_wq)
 		destroy_workqueue(sched->submit_wq);
@@ -1367,6 +1382,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 	sched->ready = false;
 	kfree(sched->sched_rq);
 	sched->sched_rq = NULL;
+	drm_sched_fence_timeline_put(sched->fence_timeline);
 }
 EXPORT_SYMBOL(drm_sched_fini);
 
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 5acc64954a88..615ca89f77dc 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -261,6 +261,52 @@ struct drm_sched_rq {
 	struct rb_root_cached		rb_tree_root;
 };
 
+/**
+ * struct drm_sched_fence_timeline - Wrapper around the timeline name
+ *
+ * This is needed to cope with the fact dma_fence objects created by
+ * an entity might outlive the drm_gpu_scheduler this entity was bound
+ * to, making drm_sched_fence::sched invalid and leading to a UAF when
+ * dma_fence_ops::get_timeline_name() is called.
+ */
+struct drm_sched_fence_timeline {
+	/** @kref: Reference count of this timeline object. */
+	struct kref			kref;
+
+	/**
+	 * @name: Name of the timeline.
+	 *
+	 * This is currently a copy of drm_gpu_scheduler::name.
+	 */
+	const char			*name;
+};
+
+static inline void
+drm_sched_fence_timeline_release(struct kref *kref)
+{
+	struct drm_sched_fence_timeline *tl =
+		container_of(kref, struct drm_sched_fence_timeline, kref);
+
+	kfree(tl->name);
+	kfree(tl);
+}
+
+static inline void
+drm_sched_fence_timeline_put(struct drm_sched_fence_timeline *tl)
+{
+	if (tl)
+		kref_put(&tl->kref, drm_sched_fence_timeline_release);
+}
+
+static inline struct drm_sched_fence_timeline *
+drm_sched_fence_timeline_get(struct drm_sched_fence_timeline *tl)
+{
+	if (tl)
+		kref_get(&tl->kref);
+
+	return tl;
+}
+
 /**
  * struct drm_sched_fence - fences corresponding to the scheduling of a job.
  */
@@ -289,6 +335,9 @@ struct drm_sched_fence {
 	 */
 	ktime_t				deadline;
 
+        /** @timeline: the timeline this fence belongs to. */
+	struct drm_sched_fence_timeline	*timeline;
+
         /**
          * @parent: the fence returned by &drm_sched_backend_ops.run_job
          * when scheduling the job on hardware. We signal the
@@ -488,6 +537,8 @@ struct drm_sched_backend_ops {
  * @credit_count: the current credit count of this scheduler
  * @timeout: the time after which a job is removed from the scheduler.
  * @name: name of the ring for which this scheduler is being used.
+ * @fence_timeline: fence timeline that will be passed to fences created by
+ *                  an entity that's bound to this scheduler.
  * @num_rqs: Number of run-queues. This is at most DRM_SCHED_PRIORITY_COUNT,
  *           as there's usually one run-queue per priority, but could be less.
  * @sched_rq: An allocated array of run-queues of size @num_rqs;
@@ -521,6 +572,7 @@ struct drm_gpu_scheduler {
 	atomic_t			credit_count;
 	long				timeout;
 	const char			*name;
+	struct drm_sched_fence_timeline	*fence_timeline;
 	u32                             num_rqs;
 	struct drm_sched_rq             **sched_rq;
 	wait_queue_head_t		job_scheduled;
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-08-29 17:12 [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched Boris Brezillon
@ 2024-08-30  8:14 ` Christian König
  2024-08-30  9:37   ` Boris Brezillon
  2024-08-30 21:43   ` Matthew Brost
  2024-09-01 22:39 ` kernel test robot
  2024-09-02  3:14 ` kernel test robot
  2 siblings, 2 replies; 18+ messages in thread
From: Christian König @ 2024-08-30  8:14 UTC (permalink / raw)
  To: Boris Brezillon, dri-devel
  Cc: Steven Price, Liviu Dudau, Adrián Larumbe, kernel,
	Luben Tuikov, Matthew Brost, Danilo Krummrich

Am 29.08.24 um 19:12 schrieb Boris Brezillon:
> dma_fence objects created by an entity might outlive the
> drm_gpu_scheduler this entity was bound to if those fences are retained
> by other other objects, like a dma_buf resv. This means that
> drm_sched_fence::sched might be invalid when the resv is walked, which
> in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.
>
> This probably went unnoticed so far, because the drm_gpu_scheduler had
> the lifetime of the drm_device, so, unless you were removing the device,
> there were no reasons for the scheduler to be gone before its fences.

Nope, that is intentional design. get_timeline_name() is not safe to be
called after the fence has signaled because that would cause circular
dependency problems.

E.g. when you have hardware fences it can happen that fences reference a 
driver module (for the function printing the name) and the module in 
turn keeps fences around.

So you easily end up with a module you can never unload.


> With the introduction of a new model where each entity has its own
> drm_gpu_scheduler instance, this situation is likely to happen every time
> a GPU context is destroyed and some of its fences remain attached to
> dma_buf objects still owned by other drivers/processes.
>
> In order to make drm_sched_fence_get_timeline_name() safe, we need to
> copy the scheduler name into our own refcounted object that's only
> destroyed when both the scheduler and all its fences are gone.
>
> The fact drm_sched_fence might have a reference to the drm_gpu_scheduler
> even after it's been released is worrisome though, but I'd rather
> discuss that with everyone than come up with a solution that's likely
> to end up being rejected.
>
> Note that the bug was found while repeatedly reading dma_buf's debugfs
> file, which, at some point, calls dma_resv_describe() on a resv that
> contains signalled fences coming from a destroyed GPU context.
> AFAIK, there's nothing invalid there.

Yeah, but reading debugfs is not guaranteed not to crash the kernel.

On the other hand the approach with a kref'ed string looks rather sane 
to me. One comment on this below.

>
> Cc: Luben Tuikov <ltuikov89@gmail.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: "Christian König" <christian.koenig@amd.com>
> Cc: Danilo Krummrich <dakr@redhat.com>
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>   drivers/gpu/drm/scheduler/sched_fence.c |  8 +++-
>   drivers/gpu/drm/scheduler/sched_main.c  | 18 ++++++++-
>   include/drm/gpu_scheduler.h             | 52 +++++++++++++++++++++++++
>   3 files changed, 75 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
> index 0f35f009b9d3..efa2a71d98eb 100644
> --- a/drivers/gpu/drm/scheduler/sched_fence.c
> +++ b/drivers/gpu/drm/scheduler/sched_fence.c
> @@ -90,7 +90,7 @@ static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
>   static const char *drm_sched_fence_get_timeline_name(struct dma_fence *f)
>   {
>   	struct drm_sched_fence *fence = to_drm_sched_fence(f);
> -	return (const char *)fence->sched->name;
> +	return (const char *)fence->timeline->name;
>   }
>   
>   static void drm_sched_fence_free_rcu(struct rcu_head *rcu)
> @@ -112,8 +112,10 @@ static void drm_sched_fence_free_rcu(struct rcu_head *rcu)
>    */
>   void drm_sched_fence_free(struct drm_sched_fence *fence)
>   {
> +	drm_sched_fence_timeline_put(fence->timeline);
> +
>   	/* This function should not be called if the fence has been initialized. */
> -	if (!WARN_ON_ONCE(fence->sched))
> +	if (!WARN_ON_ONCE(fence->timeline))
>   		kmem_cache_free(sched_fence_slab, fence);
>   }
>   
> @@ -224,6 +226,8 @@ void drm_sched_fence_init(struct drm_sched_fence *fence,
>   	unsigned seq;
>   
>   	fence->sched = entity->rq->sched;
> +	fence->timeline = fence->sched->fence_timeline;
> +	drm_sched_fence_timeline_get(fence->timeline);
>   	seq = atomic_inc_return(&entity->fence_seq);
>   	dma_fence_init(&fence->scheduled, &drm_sched_fence_ops_scheduled,
>   		       &fence->lock, entity->fence_context, seq);
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 7e90c9f95611..1e31a9c8ce15 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -1288,10 +1288,21 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		sched->own_submit_wq = true;
>   	}
>   
> +	sched->fence_timeline = kzalloc(sizeof(*sched->fence_timeline), GFP_KERNEL);
> +	if (!sched->fence_timeline)
> +		goto Out_check_own;
> +
> +	sched->fence_timeline->name = kasprintf(GFP_KERNEL, "%s", sched->name);
> +	if (!sched->fence_timeline->name)
> +		goto Out_free_fence_timeline;
> +
> +	kref_init(&sched->fence_timeline->kref);
> +
>   	sched->sched_rq = kmalloc_array(num_rqs, sizeof(*sched->sched_rq),
>   					GFP_KERNEL | __GFP_ZERO);
>   	if (!sched->sched_rq)
> -		goto Out_check_own;
> +		goto Out_free_fence_timeline;
> +
>   	sched->num_rqs = num_rqs;
>   	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
>   		sched->sched_rq[i] = kzalloc(sizeof(*sched->sched_rq[i]), GFP_KERNEL);
> @@ -1319,6 +1330,10 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   
>   	kfree(sched->sched_rq);
>   	sched->sched_rq = NULL;
> +Out_free_fence_timeline:
> +	if (sched->fence_timeline)
> +		kfree(sched->fence_timeline->name);
> +	kfree(sched->fence_timeline);
>   Out_check_own:
>   	if (sched->own_submit_wq)
>   		destroy_workqueue(sched->submit_wq);
> @@ -1367,6 +1382,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
>   	sched->ready = false;
>   	kfree(sched->sched_rq);
>   	sched->sched_rq = NULL;
> +	drm_sched_fence_timeline_put(sched->fence_timeline);
>   }
>   EXPORT_SYMBOL(drm_sched_fini);
>   
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 5acc64954a88..615ca89f77dc 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -261,6 +261,52 @@ struct drm_sched_rq {
>   	struct rb_root_cached		rb_tree_root;
>   };
>   
> +/**
> + * struct drm_sched_fence_timeline - Wrapped around the timeline name
> + *
> + * This is needed to cope with the fact dma_fence objects created by
> + * an entity might outlive the drm_gpu_scheduler this entity was bound
> + * to, making drm_sched_fence::sched invalid and leading to a UAF when
> + * dma_fence_ops::get_timeline_name() is called.
> + */
> +struct drm_sched_fence_timeline {
> +	/** @kref: Reference count of this timeline object. */
> +	struct kref			kref;
> +
> +	/**
> +	 * @name: Name of the timeline.
> +	 *
> +	 * This is currently a copy of drm_gpu_scheduler::name.
> +	 */
> +	const char			*name;

Make that a char name[] and embed the name into the structure. The macro 
struct_size() can be used to calculate the size.

> +};
> +
> +static inline void
> +drm_sched_fence_timeline_release(struct kref *kref)
> +{
> +	struct drm_sched_fence_timeline *tl =
> +		container_of(kref, struct drm_sched_fence_timeline, kref);
> +
> +	kfree(tl->name);
> +	kfree(tl);

This avoids having two allocations for the timeline name.

Regards,
Christian.

> +}
> +
> +static inline void
> +drm_sched_fence_timeline_put(struct drm_sched_fence_timeline *tl)
> +{
> +	if (tl)
> +		kref_put(&tl->kref, drm_sched_fence_timeline_release);
> +}
> +
> +static inline struct drm_sched_fence_timeline *
> +drm_sched_fence_timeline_get(struct drm_sched_fence_timeline *tl)
> +{
> +	if (tl)
> +		kref_get(&tl->kref);
> +
> +	return tl;
> +}
> +
>   /**
>    * struct drm_sched_fence - fences corresponding to the scheduling of a job.
>    */
> @@ -289,6 +335,9 @@ struct drm_sched_fence {
>   	 */
>   	ktime_t				deadline;
>   
> +        /** @timeline: the timeline this fence belongs to. */
> +	struct drm_sched_fence_timeline	*timeline;
> +
>           /**
>            * @parent: the fence returned by &drm_sched_backend_ops.run_job
>            * when scheduling the job on hardware. We signal the
> @@ -488,6 +537,8 @@ struct drm_sched_backend_ops {
>    * @credit_count: the current credit count of this scheduler
>    * @timeout: the time after which a job is removed from the scheduler.
>    * @name: name of the ring for which this scheduler is being used.
> + * @fence_timeline: fence timeline that will be passed to fences created by
> + *                  and entity that's bound to this scheduler.
>    * @num_rqs: Number of run-queues. This is at most DRM_SCHED_PRIORITY_COUNT,
>    *           as there's usually one run-queue per priority, but could be less.
>    * @sched_rq: An allocated array of run-queues of size @num_rqs;
> @@ -521,6 +572,7 @@ struct drm_gpu_scheduler {
>   	atomic_t			credit_count;
>   	long				timeout;
>   	const char			*name;
> +	struct drm_sched_fence_timeline	*fence_timeline;
>   	u32                             num_rqs;
>   	struct drm_sched_rq             **sched_rq;
>   	wait_queue_head_t		job_scheduled;


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-08-30  8:14 ` Christian König
@ 2024-08-30  9:37   ` Boris Brezillon
  2024-08-30 10:44     ` Boris Brezillon
  2024-08-30 12:57     ` Christian König
  2024-08-30 21:43   ` Matthew Brost
  1 sibling, 2 replies; 18+ messages in thread
From: Boris Brezillon @ 2024-08-30  9:37 UTC (permalink / raw)
  To: Christian König
  Cc: dri-devel, Steven Price, Liviu Dudau, Adrián Larumbe, kernel,
	Luben Tuikov, Matthew Brost, Danilo Krummrich

Hi Christian,

On Fri, 30 Aug 2024 10:14:18 +0200
Christian König <christian.koenig@amd.com> wrote:

> Am 29.08.24 um 19:12 schrieb Boris Brezillon:
> > dma_fence objects created by an entity might outlive the
> > drm_gpu_scheduler this entity was bound to if those fences are retained
> > by other other objects, like a dma_buf resv. This means that
> > drm_sched_fence::sched might be invalid when the resv is walked, which
> > in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.
> >
> > This probably went unnoticed so far, because the drm_gpu_scheduler had
> > the lifetime of the drm_device, so, unless you were removing the device,
> > there were no reasons for the scheduler to be gone before its fences.  
> 
> Nope, that is intentional design. get_timeline_name() is not safe to be 
> called after the fence signaled because that would causes circular 
> dependency problems.

Do you mean the dma_fence layer should not call get_timeline_name()
after it's been signalled (looking at the code/doc, it doesn't seem to
be the case), or do you mean the drm_sched implementation of the fence
interface is wrong and should assume the fence can live longer than its
creator?

> 
> E.g. when you have hardware fences it can happen that fences reference a 
> driver module (for the function printing the name) and the module in 
> turn keeps fences around.
> 
> So you easily end up with a module you can never unload.

On the other hand, I think preventing the module from being unloaded is
the right thing to do, because otherwise the dma_fence_ops might be
gone when they get dereferenced in the release path. That's also a
problem I noticed when I started working on the initial panthor driver
without drm_sched. To solve that I ended up retaining a module ref for
each fence created, and releasing this ref in the
dma_fence_ops::release() function.

drm_sched adds an indirection that allows drivers to not care, but
that's still a problem if you end up unloading drm_sched while some of
its drm_sched_fence fences are owned by external components.

> 
> 
> > With the introduction of a new model where each entity has its own
> > drm_gpu_scheduler instance, this situation is likely to happen every time
> > a GPU context is destroyed and some of its fences remain attached to
> > dma_buf objects still owned by other drivers/processes.
> >
> > In order to make drm_sched_fence_get_timeline_name() safe, we need to
> > copy the scheduler name into our own refcounted object that's only
> > destroyed when both the scheduler and all its fences are gone.
> >
> > The fact drm_sched_fence might have a reference to the drm_gpu_scheduler
> > even after it's been released is worrisome though, but I'd rather
> > discuss that with everyone than come up with a solution that's likely
> > to end up being rejected.
> >
> > Note that the bug was found while repeatedly reading dma_buf's debugfs
> > file, which, at some point, calls dma_resv_describe() on a resv that
> > contains signalled fences coming from a destroyed GPU context.
> > AFAIK, there's nothing invalid there.  
> 
> Yeah but reading debugfs is not guaranteed to crash the kernel.
> 
> On the other hand the approach with a kref'ed string looks rather sane 
> to me. One comment on this below.

There's still the problem I mentioned above (unloading drm_sched can
make things crash). Are there any plans to fix that? The simple option
would be to prevent compiling drm_sched as a module, but that's not an
option because it depends on DRM which is a tristate too. Maybe we
could have drm_sched_fence.o linked statically, just like dma-fence.c
is linked statically to prevent the stub ops from disappearing.
Not sure if drm_sched_fence.c depends on symbols defined in
sched_{main,entity}.c or other parts of the DRM subsystem though.

> > +/**
> > + * struct drm_sched_fence_timeline - Wrapped around the timeline name
> > + *
> > + * This is needed to cope with the fact dma_fence objects created by
> > + * an entity might outlive the drm_gpu_scheduler this entity was bound
> > + * to, making drm_sched_fence::sched invalid and leading to a UAF when
> > + * dma_fence_ops::get_timeline_name() is called.
> > + */
> > +struct drm_sched_fence_timeline {
> > +	/** @kref: Reference count of this timeline object. */
> > +	struct kref			kref;
> > +
> > +	/**
> > +	 * @name: Name of the timeline.
> > +	 *
> > +	 * This is currently a copy of drm_gpu_scheduler::name.
> > +	 */
> > +	const char			*name;  
> 
> Make that a char name[] and embed the name into the structure. The macro 
> struct_size() can be used to calculate the size.

Sure I can do that.

Regards,

Boris

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-08-30  9:37   ` Boris Brezillon
@ 2024-08-30 10:44     ` Boris Brezillon
  2024-08-30 12:57     ` Christian König
  1 sibling, 0 replies; 18+ messages in thread
From: Boris Brezillon @ 2024-08-30 10:44 UTC (permalink / raw)
  To: Christian König
  Cc: dri-devel, Steven Price, Liviu Dudau, Adrián Larumbe, kernel,
	Luben Tuikov, Matthew Brost, Danilo Krummrich

On Fri, 30 Aug 2024 11:37:21 +0200
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> > > With the introduction of a new model where each entity has its own
> > > drm_gpu_scheduler instance, this situation is likely to happen every time
> > > a GPU context is destroyed and some of its fences remain attached to
> > > dma_buf objects still owned by other drivers/processes.
> > >
> > > In order to make drm_sched_fence_get_timeline_name() safe, we need to
> > > copy the scheduler name into our own refcounted object that's only
> > > destroyed when both the scheduler and all its fences are gone.
> > >
> > > The fact drm_sched_fence might have a reference to the drm_gpu_scheduler
> > > even after it's been released is worrisome though, but I'd rather
> > > discuss that with everyone than come up with a solution that's likely
> > > to end up being rejected.
> > >
> > > Note that the bug was found while repeatedly reading dma_buf's debugfs
> > > file, which, at some point, calls dma_resv_describe() on a resv that
> > > contains signalled fences coming from a destroyed GPU context.
> > > AFAIK, there's nothing invalid there.    
> > 
> > Yeah but reading debugfs is not guaranteed to crash the kernel.
> > 
> > On the other hand the approach with a kref'ed string looks rather sane 
> > to me. One comment on this below.  
> 
> There's still the problem I mentioned above (unloading drm_sched can
> make things crash). Are there any plans to fix that? The simple option
> would be to prevent compiling drm_sched as a module, but that's not an
> option because it depends on DRM which is a tristate too. Maybe we
> could have drm_sched_fence.o linked statically, just like dma-fence.c
> is linked statically to prevent the stub ops from disappearing.
> Not sure if drm_sched_fence.c depends on symbols defined in
> sched_{main,entity}.c or other parts of the DRM subsystem though.

For the record, I gave it a try, and linking drm_sched_fence.o
statically while leaving the rest of drm_sched as a module seems to
work. I just sent an RFC for that [1].

[1]https://lore.kernel.org/dri-devel/20240830104057.1479321-1-boris.brezillon@collabora.com/T/#u

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-08-30  9:37   ` Boris Brezillon
  2024-08-30 10:44     ` Boris Brezillon
@ 2024-08-30 12:57     ` Christian König
  1 sibling, 0 replies; 18+ messages in thread
From: Christian König @ 2024-08-30 12:57 UTC (permalink / raw)
  To: Boris Brezillon
  Cc: dri-devel, Steven Price, Liviu Dudau, Adrián Larumbe, kernel,
	Luben Tuikov, Matthew Brost, Danilo Krummrich

[-- Attachment #1: Type: text/plain, Size: 6020 bytes --]

Am 30.08.24 um 11:37 schrieb Boris Brezillon:
> Hi Christian,
>
> On Fri, 30 Aug 2024 10:14:18 +0200
> Christian König<christian.koenig@amd.com>  wrote:
>
>> Am 29.08.24 um 19:12 schrieb Boris Brezillon:
>>> dma_fence objects created by an entity might outlive the
>>> drm_gpu_scheduler this entity was bound to if those fences are retained
>>> by other other objects, like a dma_buf resv. This means that
>>> drm_sched_fence::sched might be invalid when the resv is walked, which
>>> in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.
>>>
>>> This probably went unnoticed so far, because the drm_gpu_scheduler had
>>> the lifetime of the drm_device, so, unless you were removing the device,
>>> there were no reasons for the scheduler to be gone before its fences.
>> Nope, that is intentional design. get_timeline_name() is not safe to be
>> called after the fence signaled because that would causes circular
>> dependency problems.
> Do you mean the dma_fence layer should not call get_timeline_name()
> after it's been signalled (looking at the code/doc, it doesn't seem to
> be the case), or do you mean the drm_sched implementation of the fence
> interface is wrong and should assume the fence can live longer than its
> creator?

Neither; crashing in a debugfs corner use case is simply acceptable
behavior.

The problem is rather that when you start to create schedulers on demand
this isn't a rare corner use case any more, but rather a problem that is
much easier to trigger.

On the other hand the kernel has tons (and I would guess thousands) of 
debugfs files which can crash the kernel trivially. Quite a bunch of 
them don't take all the necessary locks and look into internal data 
structures without any guarantee that those won't go away in the middle 
of a sprintf()...

>> E.g. when you have hardware fences it can happen that fences reference a
>> driver module (for the function printing the name) and the module in
>> turn keeps fences around.
>>
>> So you easily end up with a module you can never unload.
> On the other hand, I think preventing the module from being unloaded is
> the right thing to do, because otherwise the dma_fence_ops might be
> gone when they get dereferenced in the release path. That's also a
> problem I noticed when I started working on the initial panthor driver
> without drm_sched. To solve that I ended up retaining a module ref for
> each fence created, and releasing this ref in the
> dma_fence_ops::release() function.

Yeah that was what other drivers initially did as well, but that was 
reverted at some point and nobody really looked much into it.

The takeaway was that it's better to potentially crash on unload than to 
not allow unloading at all.

> drm_sched adds an indirection that allows drivers to not care, but
> that's still a problem if you end up unloading drm_sched while some of
> its drm_sched_fence fences are owned by external components.

And you're not the first one to report this. It's just that your 
solution looks better than what I've seen before.

>>> With the introduction of a new model where each entity has its own
>>> drm_gpu_scheduler instance, this situation is likely to happen every time
>>> a GPU context is destroyed and some of its fences remain attached to
>>> dma_buf objects still owned by other drivers/processes.
>>>
>>> In order to make drm_sched_fence_get_timeline_name() safe, we need to
>>> copy the scheduler name into our own refcounted object that's only
>>> destroyed when both the scheduler and all its fences are gone.
>>>
>>> The fact drm_sched_fence might have a reference to the drm_gpu_scheduler
>>> even after it's been released is worrisome though, but I'd rather
>>> discuss that with everyone than come up with a solution that's likely
>>> to end up being rejected.
>>>
>>> Note that the bug was found while repeatedly reading dma_buf's debugfs
>>> file, which, at some point, calls dma_resv_describe() on a resv that
>>> contains signalled fences coming from a destroyed GPU context.
>>> AFAIK, there's nothing invalid there.
>> Yeah but reading debugfs is not guaranteed to crash the kernel.
>>
>> On the other hand the approach with a kref'ed string looks rather sane
>> to me. One comment on this below.
> There's still the problem I mentioned above (unloading drm_sched can
> make things crash). Are there any plans to fix that?

At least not at the moment, but your patch here looks like it makes this 
a possibility.

Depending on the uapi, taking a module reference for each created
scheduler fence might even result in overflowing the reference count, but
if you grab a module reference for each drm_sched_fence_timeline object
then that will probably work quite fine.

Regards,
Christian.

> The simple option
> would be to prevent compiling drm_sched as a module, but that's not an
> option because it depends on DRM which is a tristate too. Maybe we
> could have drm_sched_fence.o linked statically, just like dma-fence.c
> is linked statically to prevent the stub ops from disappearing.
> Not sure if drm_sched_fence.c depends on symbols defined in
> sched_{main,entity}.c or other parts of the DRM subsystem though.
>
>>> +/**
>>> + * struct drm_sched_fence_timeline - Wrapped around the timeline name
>>> + *
>>> + * This is needed to cope with the fact dma_fence objects created by
>>> + * an entity might outlive the drm_gpu_scheduler this entity was bound
>>> + * to, making drm_sched_fence::sched invalid and leading to a UAF when
>>> + * dma_fence_ops::get_timeline_name() is called.
>>> + */
>>> +struct drm_sched_fence_timeline {
>>> +	/** @kref: Reference count of this timeline object. */
>>> +	struct kref			kref;
>>> +
>>> +	/**
>>> +	 * @name: Name of the timeline.
>>> +	 *
>>> +	 * This is currently a copy of drm_gpu_scheduler::name.
>>> +	 */
>>> +	const char			*name;
>> Make that a char name[] and embed the name into the structure. The macro
>> struct_size() can be used to calculate the size.
> Sure I can do that.
>
> Regards,
>
> Boris

[-- Attachment #2: Type: text/html, Size: 7943 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-08-30  8:14 ` Christian König
  2024-08-30  9:37   ` Boris Brezillon
@ 2024-08-30 21:43   ` Matthew Brost
  2024-08-31  7:25     ` Boris Brezillon
  2024-09-02 10:43     ` Christian König
  1 sibling, 2 replies; 18+ messages in thread
From: Matthew Brost @ 2024-08-30 21:43 UTC (permalink / raw)
  To: Christian König
  Cc: Boris Brezillon, dri-devel, Steven Price, Liviu Dudau,
	Adrián Larumbe, kernel, Luben Tuikov, Danilo Krummrich

On Fri, Aug 30, 2024 at 10:14:18AM +0200, Christian König wrote:
> Am 29.08.24 um 19:12 schrieb Boris Brezillon:
> > dma_fence objects created by an entity might outlive the
> > drm_gpu_scheduler this entity was bound to if those fences are retained
> > by other other objects, like a dma_buf resv. This means that
> > drm_sched_fence::sched might be invalid when the resv is walked, which
> > in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.
> > 
> > This probably went unnoticed so far, because the drm_gpu_scheduler had
> > the lifetime of the drm_device, so, unless you were removing the device,
> > there were no reasons for the scheduler to be gone before its fences.
> 
> Nope, that is intentional design. get_timeline_name() is not safe to be
> called after the fence signaled because that would causes circular
> dependency problems.
>

I'm quite sure this happens; ftrace, for example, can and will call
get_timeline_name in trace_dma_fence_destroy, which is certainly after
the fence is signaled. There are likely other cases too - this just
quickly came to mind.
 
> E.g. when you have hardware fences it can happen that fences reference a
> driver module (for the function printing the name) and the module in turn
> keeps fences around.
> 

I am almost positive that, without this patch, this is problematic in Xe or
any driver in which schedulers are tied to IOCTLs rather than the kernel module.

In Xe 'fence->sched' maps to an xe_exec_queue which can be freed once
the destroy exec queue IOCTL is called and all jobs are free'd (i.e.
'fence' signals). The fence could live on afterwards in dma-resv objects,
drm syncobjs, etc...

I know this issue has been raised before and basically NACK'd but I have
a strong opinion this is valid and in fact required.

Matt

> So you easily end up with a module you can never unload.
> 
> 
> > With the introduction of a new model where each entity has its own
> > drm_gpu_scheduler instance, this situation is likely to happen every time
> > a GPU context is destroyed and some of its fences remain attached to
> > dma_buf objects still owned by other drivers/processes.
> > 
> > In order to make drm_sched_fence_get_timeline_name() safe, we need to
> > copy the scheduler name into our own refcounted object that's only
> > destroyed when both the scheduler and all its fences are gone.
> > 
> > The fact drm_sched_fence might have a reference to the drm_gpu_scheduler
> > even after it's been released is worrisome though, but I'd rather
> > discuss that with everyone than come up with a solution that's likely
> > to end up being rejected.
> > 
> > Note that the bug was found while repeatedly reading dma_buf's debugfs
> > file, which, at some point, calls dma_resv_describe() on a resv that
> > contains signalled fences coming from a destroyed GPU context.
> > AFAIK, there's nothing invalid there.
> 
> Yeah but reading debugfs is not guaranteed to crash the kernel.
> 
> On the other hand the approach with a kref'ed string looks rather sane to
> me. One comment on this below.
> 
> > 
> > Cc: Luben Tuikov <ltuikov89@gmail.com>
> > Cc: Matthew Brost <matthew.brost@intel.com>
> > Cc: "Christian König" <christian.koenig@amd.com>
> > Cc: Danilo Krummrich <dakr@redhat.com>
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > ---
> >   drivers/gpu/drm/scheduler/sched_fence.c |  8 +++-
> >   drivers/gpu/drm/scheduler/sched_main.c  | 18 ++++++++-
> >   include/drm/gpu_scheduler.h             | 52 +++++++++++++++++++++++++
> >   3 files changed, 75 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
> > index 0f35f009b9d3..efa2a71d98eb 100644
> > --- a/drivers/gpu/drm/scheduler/sched_fence.c
> > +++ b/drivers/gpu/drm/scheduler/sched_fence.c
> > @@ -90,7 +90,7 @@ static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
> >   static const char *drm_sched_fence_get_timeline_name(struct dma_fence *f)
> >   {
> >   	struct drm_sched_fence *fence = to_drm_sched_fence(f);
> > -	return (const char *)fence->sched->name;
> > +	return (const char *)fence->timeline->name;
> >   }
> >   static void drm_sched_fence_free_rcu(struct rcu_head *rcu)
> > @@ -112,8 +112,10 @@ static void drm_sched_fence_free_rcu(struct rcu_head *rcu)
> >    */
> >   void drm_sched_fence_free(struct drm_sched_fence *fence)
> >   {
> > +	drm_sched_fence_timeline_put(fence->timeline);
> > +
> >   	/* This function should not be called if the fence has been initialized. */
> > -	if (!WARN_ON_ONCE(fence->sched))
> > +	if (!WARN_ON_ONCE(fence->timeline))
> >   		kmem_cache_free(sched_fence_slab, fence);
> >   }
> > @@ -224,6 +226,8 @@ void drm_sched_fence_init(struct drm_sched_fence *fence,
> >   	unsigned seq;
> >   	fence->sched = entity->rq->sched;
> > +	fence->timeline = fence->sched->fence_timeline;
> > +	drm_sched_fence_timeline_get(fence->timeline);
> >   	seq = atomic_inc_return(&entity->fence_seq);
> >   	dma_fence_init(&fence->scheduled, &drm_sched_fence_ops_scheduled,
> >   		       &fence->lock, entity->fence_context, seq);
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> > index 7e90c9f95611..1e31a9c8ce15 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -1288,10 +1288,21 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> >   		sched->own_submit_wq = true;
> >   	}
> > +	sched->fence_timeline = kzalloc(sizeof(*sched->fence_timeline), GFP_KERNEL);
> > +	if (!sched->fence_timeline)
> > +		goto Out_check_own;
> > +
> > +	sched->fence_timeline->name = kasprintf(GFP_KERNEL, "%s", sched->name);
> > +	if (!sched->fence_timeline->name)
> > +		goto Out_free_fence_timeline;
> > +
> > +	kref_init(&sched->fence_timeline->kref);
> > +
> >   	sched->sched_rq = kmalloc_array(num_rqs, sizeof(*sched->sched_rq),
> >   					GFP_KERNEL | __GFP_ZERO);
> >   	if (!sched->sched_rq)
> > -		goto Out_check_own;
> > +		goto Out_free_fence_timeline;
> > +
> >   	sched->num_rqs = num_rqs;
> >   	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
> >   		sched->sched_rq[i] = kzalloc(sizeof(*sched->sched_rq[i]), GFP_KERNEL);
> > @@ -1319,6 +1330,10 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> >   	kfree(sched->sched_rq);
> >   	sched->sched_rq = NULL;
> > +Out_free_fence_timeline:
> > +	if (sched->fence_timeline)
> > +		kfree(sched->fence_timeline->name);
> > +	kfree(sched->fence_timeline);
> >   Out_check_own:
> >   	if (sched->own_submit_wq)
> >   		destroy_workqueue(sched->submit_wq);
> > @@ -1367,6 +1382,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
> >   	sched->ready = false;
> >   	kfree(sched->sched_rq);
> >   	sched->sched_rq = NULL;
> > +	drm_sched_fence_timeline_put(sched->fence_timeline);
> >   }
> >   EXPORT_SYMBOL(drm_sched_fini);
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index 5acc64954a88..615ca89f77dc 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -261,6 +261,52 @@ struct drm_sched_rq {
> >   	struct rb_root_cached		rb_tree_root;
> >   };
> > +/**
> > + * struct drm_sched_fence_timeline - Wrapped around the timeline name
> > + *
> > + * This is needed to cope with the fact dma_fence objects created by
> > + * an entity might outlive the drm_gpu_scheduler this entity was bound
> > + * to, making drm_sched_fence::sched invalid and leading to a UAF when
> > + * dma_fence_ops::get_timeline_name() is called.
> > + */
> > +struct drm_sched_fence_timeline {
> > +	/** @kref: Reference count of this timeline object. */
> > +	struct kref			kref;
> > +
> > +	/**
> > +	 * @name: Name of the timeline.
> > +	 *
> > +	 * This is currently a copy of drm_gpu_scheduler::name.
> > +	 */
> > +	const char			*name;
> 
> Make that a char name[] and embed the name into the structure. The macro
> struct_size() can be used to calculate the size.
> 
> > +};
> > +
> > +static inline void
> > +drm_sched_fence_timeline_release(struct kref *kref)
> > +{
> > +	struct drm_sched_fence_timeline *tl =
> > +		container_of(kref, struct drm_sched_fence_timeline, kref);
> > +
> > +	kfree(tl->name);
> > +	kfree(tl);
> 
> This avoids having two allocations for the timeline name.
> 
> Regards,
> Christian.
> 
> > +}
> > +
> > +static inline void
> > +drm_sched_fence_timeline_put(struct drm_sched_fence_timeline *tl)
> > +{
> > +	if (tl)
> > +		kref_put(&tl->kref, drm_sched_fence_timeline_release);
> > +}
> > +
> > +static inline struct drm_sched_fence_timeline *
> > +drm_sched_fence_timeline_get(struct drm_sched_fence_timeline *tl)
> > +{
> > +	if (tl)
> > +		kref_get(&tl->kref);
> > +
> > +	return tl;
> > +}
> > +
> >   /**
> >    * struct drm_sched_fence - fences corresponding to the scheduling of a job.
> >    */
> > @@ -289,6 +335,9 @@ struct drm_sched_fence {
> >   	 */
> >   	ktime_t				deadline;
> > +        /** @timeline: the timeline this fence belongs to. */
> > +	struct drm_sched_fence_timeline	*timeline;
> > +
> >           /**
> >            * @parent: the fence returned by &drm_sched_backend_ops.run_job
> >            * when scheduling the job on hardware. We signal the
> > @@ -488,6 +537,8 @@ struct drm_sched_backend_ops {
> >    * @credit_count: the current credit count of this scheduler
> >    * @timeout: the time after which a job is removed from the scheduler.
> >    * @name: name of the ring for which this scheduler is being used.
> > + * @fence_timeline: fence timeline that will be passed to fences created by
> > + *                  and entity that's bound to this scheduler.
> >    * @num_rqs: Number of run-queues. This is at most DRM_SCHED_PRIORITY_COUNT,
> >    *           as there's usually one run-queue per priority, but could be less.
> >    * @sched_rq: An allocated array of run-queues of size @num_rqs;
> > @@ -521,6 +572,7 @@ struct drm_gpu_scheduler {
> >   	atomic_t			credit_count;
> >   	long				timeout;
> >   	const char			*name;
> > +	struct drm_sched_fence_timeline	*fence_timeline;
> >   	u32                             num_rqs;
> >   	struct drm_sched_rq             **sched_rq;
> >   	wait_queue_head_t		job_scheduled;
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-08-30 21:43   ` Matthew Brost
@ 2024-08-31  7:25     ` Boris Brezillon
  2024-09-02 10:43     ` Christian König
  1 sibling, 0 replies; 18+ messages in thread
From: Boris Brezillon @ 2024-08-31  7:25 UTC (permalink / raw)
  To: Matthew Brost
  Cc: Christian König, dri-devel, Steven Price, Liviu Dudau,
	Adrián Larumbe, kernel, Luben Tuikov, Danilo Krummrich

On Fri, 30 Aug 2024 21:43:44 +0000
Matthew Brost <matthew.brost@intel.com> wrote:

> On Fri, Aug 30, 2024 at 10:14:18AM +0200, Christian König wrote:
> > Am 29.08.24 um 19:12 schrieb Boris Brezillon:  
> > > dma_fence objects created by an entity might outlive the
> > > drm_gpu_scheduler this entity was bound to if those fences are retained
> > > by other other objects, like a dma_buf resv. This means that
> > > drm_sched_fence::sched might be invalid when the resv is walked, which
> > > in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.
> > > 
> > > This probably went unnoticed so far, because the drm_gpu_scheduler had
> > > the lifetime of the drm_device, so, unless you were removing the device,
> > > there were no reasons for the scheduler to be gone before its fences.  
> > 
> > Nope, that is intentional design. get_timeline_name() is not safe to be
> > called after the fence signaled because that would causes circular
> > dependency problems.
> >  
> 
> I'm quite sure happens, ftrace for example can and will call
> get_timeline_name in trace_dma_fence_destroy which is certainly after
> the fence is signaled. There are likely other cases too - this just
> quickly came to mind.
>  
> > E.g. when you have hardware fences it can happen that fences reference a
> > driver module (for the function printing the name) and the module in turn
> > keeps fences around.
> >   
> 
> I am almost positive without this patch this problematic in Xe or any
> driver in which schedulers are tied to IOCTLs rather than kernel module.
> 
> In Xe 'fence->sched' maps to an xe_exec_queue which can be freed once
> the destroy exec queue IOCTL is called and all jobs are free'd (i.e.
> 'fence' signals). The fence could be live on after in dma-resv objects,
> drm syncobjs, etc...
> 
> I know this issue has been raised before and basically NACK'd but I have
> a strong opinion this is valid and in fact required.

IIUC, Christian recognized that it's more problematic now that
schedulers lifetime is no longer bound to the device lifetime but
instead the GPU context lifetime. So I think we all agree that this
needs fixing :-).

How about I send a v2 of this patch, as it seems Christian was more or
less happy with the approach, barring the allocation scheme? Then
we can discuss how we want to fix the module-unload issue.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-08-30 21:43   ` Matthew Brost
  2024-08-31  7:25     ` Boris Brezillon
@ 2024-09-02 10:43     ` Christian König
  2024-09-02 13:23       ` Daniel Vetter
  1 sibling, 1 reply; 18+ messages in thread
From: Christian König @ 2024-09-02 10:43 UTC (permalink / raw)
  To: Matthew Brost
  Cc: Boris Brezillon, dri-devel, Steven Price, Liviu Dudau,
	Adrián Larumbe, kernel, Luben Tuikov, Danilo Krummrich

[-- Attachment #1: Type: text/plain, Size: 10850 bytes --]

Am 30.08.24 um 23:43 schrieb Matthew Brost:
> On Fri, Aug 30, 2024 at 10:14:18AM +0200, Christian König wrote:
>> Am 29.08.24 um 19:12 schrieb Boris Brezillon:
>>> dma_fence objects created by an entity might outlive the
>>> drm_gpu_scheduler this entity was bound to if those fences are retained
>>> by other other objects, like a dma_buf resv. This means that
>>> drm_sched_fence::sched might be invalid when the resv is walked, which
>>> in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.
>>>
>>> This probably went unnoticed so far, because the drm_gpu_scheduler had
>>> the lifetime of the drm_device, so, unless you were removing the device,
>>> there were no reasons for the scheduler to be gone before its fences.
>> Nope, that is intentional design. get_timeline_name() is not safe to be
>> called after the fence signaled because that would causes circular
>> dependency problems.
>>
> I'm quite sure happens, ftrace for example can and will call
> get_timeline_name in trace_dma_fence_destroy which is certainly after
> the fence is signaled. There are likely other cases too - this just
> quickly came to mind.

Good point, completely forgotten about ftrace.

>> E.g. when you have hardware fences it can happen that fences reference a
>> driver module (for the function printing the name) and the module in turn
>> keeps fences around.
>>
> I am almost positive without this patch this problematic in Xe or any
> driver in which schedulers are tied to IOCTLs rather than kernel module.
>
> In Xe 'fence->sched' maps to an xe_exec_queue which can be freed once
> the destroy exec queue IOCTL is called and all jobs are free'd (i.e.
> 'fence' signals). The fence could be live on after in dma-resv objects,
> drm syncobjs, etc...
>
> I know this issue has been raised before and basically NACK'd but I have
> a strong opinion this is valid and in fact required.

I've NACK'd automatically signaling pending fences on destruction of the 
scheduler (that reminds me that I wanted to add a warning for that) and 
copying the name into every scheduler fence.

As long as we don't do any of that I'm perfectly fine fixing this issue. 
The approach of creating a reference counted object for the name looks 
rather valid to me.

Especially since we then pretty much get the module references correct 
for free as well.

Christian.

>
> Matt
>
>> So you easily end up with a module you can never unload.
>>
>>
>>> With the introduction of a new model where each entity has its own
>>> drm_gpu_scheduler instance, this situation is likely to happen every time
>>> a GPU context is destroyed and some of its fences remain attached to
>>> dma_buf objects still owned by other drivers/processes.
>>>
>>> In order to make drm_sched_fence_get_timeline_name() safe, we need to
>>> copy the scheduler name into our own refcounted object that's only
>>> destroyed when both the scheduler and all its fences are gone.
>>>
>>> The fact drm_sched_fence might have a reference to the drm_gpu_scheduler
>>> even after it's been released is worrisome though, but I'd rather
>>> discuss that with everyone than come up with a solution that's likely
>>> to end up being rejected.
>>>
>>> Note that the bug was found while repeatedly reading dma_buf's debugfs
>>> file, which, at some point, calls dma_resv_describe() on a resv that
>>> contains signalled fences coming from a destroyed GPU context.
>>> AFAIK, there's nothing invalid there.
>> Yeah but reading debugfs is not guaranteed to crash the kernel.
>>
>> On the other hand the approach with a kref'ed string looks rather sane to
>> me. One comment on this below.
>>
>>> Cc: Luben Tuikov<ltuikov89@gmail.com>
>>> Cc: Matthew Brost<matthew.brost@intel.com>
>>> Cc: "Christian König"<christian.koenig@amd.com>
>>> Cc: Danilo Krummrich<dakr@redhat.com>
>>> Signed-off-by: Boris Brezillon<boris.brezillon@collabora.com>
>>> ---
>>>    drivers/gpu/drm/scheduler/sched_fence.c |  8 +++-
>>>    drivers/gpu/drm/scheduler/sched_main.c  | 18 ++++++++-
>>>    include/drm/gpu_scheduler.h             | 52 +++++++++++++++++++++++++
>>>    3 files changed, 75 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
>>> index 0f35f009b9d3..efa2a71d98eb 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_fence.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_fence.c
>>> @@ -90,7 +90,7 @@ static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
>>>    static const char *drm_sched_fence_get_timeline_name(struct dma_fence *f)
>>>    {
>>>    	struct drm_sched_fence *fence = to_drm_sched_fence(f);
>>> -	return (const char *)fence->sched->name;
>>> +	return (const char *)fence->timeline->name;
>>>    }
>>>    static void drm_sched_fence_free_rcu(struct rcu_head *rcu)
>>> @@ -112,8 +112,10 @@ static void drm_sched_fence_free_rcu(struct rcu_head *rcu)
>>>     */
>>>    void drm_sched_fence_free(struct drm_sched_fence *fence)
>>>    {
>>> +	drm_sched_fence_timeline_put(fence->timeline);
>>> +
>>>    	/* This function should not be called if the fence has been initialized. */
>>> -	if (!WARN_ON_ONCE(fence->sched))
>>> +	if (!WARN_ON_ONCE(fence->timeline))
>>>    		kmem_cache_free(sched_fence_slab, fence);
>>>    }
>>> @@ -224,6 +226,8 @@ void drm_sched_fence_init(struct drm_sched_fence *fence,
>>>    	unsigned seq;
>>>    	fence->sched = entity->rq->sched;
>>> +	fence->timeline = fence->sched->fence_timeline;
>>> +	drm_sched_fence_timeline_get(fence->timeline);
>>>    	seq = atomic_inc_return(&entity->fence_seq);
>>>    	dma_fence_init(&fence->scheduled, &drm_sched_fence_ops_scheduled,
>>>    		       &fence->lock, entity->fence_context, seq);
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index 7e90c9f95611..1e31a9c8ce15 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -1288,10 +1288,21 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>    		sched->own_submit_wq = true;
>>>    	}
>>> +	sched->fence_timeline = kzalloc(sizeof(*sched->fence_timeline), GFP_KERNEL);
>>> +	if (!sched->fence_timeline)
>>> +		goto Out_check_own;
>>> +
>>> +	sched->fence_timeline->name = kasprintf(GFP_KERNEL, "%s", sched->name);
>>> +	if (!sched->fence_timeline->name)
>>> +		goto Out_free_fence_timeline;
>>> +
>>> +	kref_init(&sched->fence_timeline->kref);
>>> +
>>>    	sched->sched_rq = kmalloc_array(num_rqs, sizeof(*sched->sched_rq),
>>>    					GFP_KERNEL | __GFP_ZERO);
>>>    	if (!sched->sched_rq)
>>> -		goto Out_check_own;
>>> +		goto Out_free_fence_timeline;
>>> +
>>>    	sched->num_rqs = num_rqs;
>>>    	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
>>>    		sched->sched_rq[i] = kzalloc(sizeof(*sched->sched_rq[i]), GFP_KERNEL);
>>> @@ -1319,6 +1330,10 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>    	kfree(sched->sched_rq);
>>>    	sched->sched_rq = NULL;
>>> +Out_free_fence_timeline:
>>> +	if (sched->fence_timeline)
>>> +		kfree(sched->fence_timeline->name);
>>> +	kfree(sched->fence_timeline);
>>>    Out_check_own:
>>>    	if (sched->own_submit_wq)
>>>    		destroy_workqueue(sched->submit_wq);
>>> @@ -1367,6 +1382,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
>>>    	sched->ready = false;
>>>    	kfree(sched->sched_rq);
>>>    	sched->sched_rq = NULL;
>>> +	drm_sched_fence_timeline_put(sched->fence_timeline);
>>>    }
>>>    EXPORT_SYMBOL(drm_sched_fini);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index 5acc64954a88..615ca89f77dc 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -261,6 +261,52 @@ struct drm_sched_rq {
>>>    	struct rb_root_cached		rb_tree_root;
>>>    };
>>> +/**
>>> + * struct drm_sched_fence_timeline - Wrapped around the timeline name
>>> + *
>>> + * This is needed to cope with the fact dma_fence objects created by
>>> + * an entity might outlive the drm_gpu_scheduler this entity was bound
>>> + * to, making drm_sched_fence::sched invalid and leading to a UAF when
>>> + * dma_fence_ops::get_timeline_name() is called.
>>> + */
>>> +struct drm_sched_fence_timeline {
>>> +	/** @kref: Reference count of this timeline object. */
>>> +	struct kref			kref;
>>> +
>>> +	/**
>>> +	 * @name: Name of the timeline.
>>> +	 *
>>> +	 * This is currently a copy of drm_gpu_scheduler::name.
>>> +	 */
>>> +	const char			*name;
>> Make that a char name[] and embed the name into the structure. The macro
>> struct_size() can be used to calculate the size.
>>
>>> +};
>>> +
>>> +static inline void
>>> +drm_sched_fence_timeline_release(struct kref *kref)
>>> +{
>>> +	struct drm_sched_fence_timeline *tl =
>>> +		container_of(kref, struct drm_sched_fence_timeline, kref);
>>> +
>>> +	kfree(tl->name);
>>> +	kfree(tl);
>> This avoids having two allocations for the timeline name.
>>
>> Regards,
>> Christian.
>>
>>> +}
>>> +
>>> +static inline void
>>> +drm_sched_fence_timeline_put(struct drm_sched_fence_timeline *tl)
>>> +{
>>> +	if (tl)
>>> +		kref_put(&tl->kref, drm_sched_fence_timeline_release);
>>> +}
>>> +
>>> +static inline struct drm_sched_fence_timeline *
>>> +drm_sched_fence_timeline_get(struct drm_sched_fence_timeline *tl)
>>> +{
>>> +	if (tl)
>>> +		kref_get(&tl->kref);
>>> +
>>> +	return tl;
>>> +}
>>> +
>>>    /**
>>>     * struct drm_sched_fence - fences corresponding to the scheduling of a job.
>>>     */
>>> @@ -289,6 +335,9 @@ struct drm_sched_fence {
>>>    	 */
>>>    	ktime_t				deadline;
>>> +        /** @timeline: the timeline this fence belongs to. */
>>> +	struct drm_sched_fence_timeline	*timeline;
>>> +
>>>            /**
>>>             * @parent: the fence returned by &drm_sched_backend_ops.run_job
>>>             * when scheduling the job on hardware. We signal the
>>> @@ -488,6 +537,8 @@ struct drm_sched_backend_ops {
>>>     * @credit_count: the current credit count of this scheduler
>>>     * @timeout: the time after which a job is removed from the scheduler.
>>>     * @name: name of the ring for which this scheduler is being used.
>>> + * @fence_timeline: fence timeline that will be passed to fences created by
>>> + *                  and entity that's bound to this scheduler.
>>>     * @num_rqs: Number of run-queues. This is at most DRM_SCHED_PRIORITY_COUNT,
>>>     *           as there's usually one run-queue per priority, but could be less.
>>>     * @sched_rq: An allocated array of run-queues of size @num_rqs;
>>> @@ -521,6 +572,7 @@ struct drm_gpu_scheduler {
>>>    	atomic_t			credit_count;
>>>    	long				timeout;
>>>    	const char			*name;
>>> +	struct drm_sched_fence_timeline	*fence_timeline;
>>>    	u32                             num_rqs;
>>>    	struct drm_sched_rq             **sched_rq;
>>>    	wait_queue_head_t		job_scheduled;

[-- Attachment #2: Type: text/html, Size: 12440 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-09-02 10:43     ` Christian König
@ 2024-09-02 13:23       ` Daniel Vetter
  2024-09-02 14:18         ` Christian König
  0 siblings, 1 reply; 18+ messages in thread
From: Daniel Vetter @ 2024-09-02 13:23 UTC (permalink / raw)
  To: Christian König
  Cc: Matthew Brost, Boris Brezillon, dri-devel, Steven Price,
	Liviu Dudau, Adrián Larumbe, kernel, Luben Tuikov,
	Danilo Krummrich

On Mon, Sep 02, 2024 at 12:43:45PM +0200, Christian König wrote:
> Am 30.08.24 um 23:43 schrieb Matthew Brost:
> > On Fri, Aug 30, 2024 at 10:14:18AM +0200, Christian König wrote:
> > > Am 29.08.24 um 19:12 schrieb Boris Brezillon:
> > > > dma_fence objects created by an entity might outlive the
> > > > drm_gpu_scheduler this entity was bound to if those fences are retained
> > > > by other other objects, like a dma_buf resv. This means that
> > > > drm_sched_fence::sched might be invalid when the resv is walked, which
> > > > in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.
> > > > 
> > > > This probably went unnoticed so far, because the drm_gpu_scheduler had
> > > > the lifetime of the drm_device, so, unless you were removing the device,
> > > > there were no reasons for the scheduler to be gone before its fences.
> > > Nope, that is intentional design. get_timeline_name() is not safe to be
> > > called after the fence signaled because that would causes circular
> > > dependency problems.

So I don't think knowingly crashing in debugfs is ok. debugfs can break
stuff like secure boot, and if you go about things very wrongly it can
upset the kernel (like touching pci mappings from userspace can). But just
going boom due to a race essentially means debugfs is unusable. Because
there's no way to avoid the boom with dma_fence:

- they're guaranteed to signal in finite time (unless driver bugs)

- the moment they've signalled looking too closely at them is undefined
  behaviour.

> > I'm quite sure happens, ftrace for example can and will call
> > get_timeline_name in trace_dma_fence_destroy which is certainly after
> > the fence is signaled. There are likely other cases too - this just
> > quickly came to mind.
> 
> Good point, completely forgotten about ftrace.
> 
> > > E.g. when you have hardware fences it can happen that fences reference a
> > > driver module (for the function printing the name) and the module in turn
> > > keeps fences around.
> > > 
> > I am almost positive without this patch this problematic in Xe or any
> > driver in which schedulers are tied to IOCTLs rather than kernel module.
> > 
> > In Xe 'fence->sched' maps to an xe_exec_queue which can be freed once
> > the destroy exec queue IOCTL is called and all jobs are free'd (i.e.
> > 'fence' signals). The fence could be live on after in dma-resv objects,
> > drm syncobjs, etc...
> > 
> > I know this issue has been raised before and basically NACK'd but I have
> > a strong opinion this is valid and in fact required.
> 
> I've NACK'd automatically signaling pending fences on destruction of the
> scheduler (that reminds me that I wanted to add a warning for that) and
> copying the name into every scheduler fence.
> 
> As long as we don't do any of that I'm perfectly fine fixing this issue. The
> approach of creating a reference counted object for the name looks rather
> valid to me.
> 
> Especially since we then pretty much get the module references correct for
> free as well.

So I think the issue is much, much bigger, and there's more. And the
issue is I think a fundamental design issue of dma_fence itself, not
individual users. I think at the core it's two constraints:

- dma_fence can stick around practically forever in various container
  objects. We only garbage collect when someone looks, and not even then
  consistently.

- fences are meant to be cheap, so they do not have the big refcount going
  on like other shared objects like dma_buf

Specifically there's also no refcounting on the module itself with the
->owner and try_module_get stuff. So even if we fix all these issues on
the data structure lifetime side of things, you might still oops calling
into dma_fence->ops->release.

Oops.

I think the complete solution is to change all of this code so that the core
dma-fence.c code guarantees to never ever again call into any driver code
after dma_fence_signal has been called, and takes over the final kfree_rcu
itself. That's a gigantic change, but I think it's the only way to
really fix this mess:

- drivers will clean up any of their own references in a timely fashion,
  so no more accidentally lingering gpu context or vms and the bo they
  have mapped lying around.

- there's no lifetime or other use-after-free issues anywhere for fences
  anymore

Downside is that some of the debugging stuff becomes a bit less useful.
But e.g. tracepoints could just dump the timeline once at creation or when
signalling, and so you don't need to dump it anymore when freeing. And a
signalled fence is generally not a problem anymore, so in a compositor
that's also all fine (iirc you can get at some of this stuff through the
sync_file interfaces too).

The other downside is that it's a huge pile of work, but I don't think we
can get to an actually solid design with less headaches and pain ...

Thoughts?

Cheers, Sima
-- 
Simona Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-09-02 13:23       ` Daniel Vetter
@ 2024-09-02 14:18         ` Christian König
  2024-09-03  8:13           ` Simona Vetter
  0 siblings, 1 reply; 18+ messages in thread
From: Christian König @ 2024-09-02 14:18 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Matthew Brost, Boris Brezillon, dri-devel, Steven Price,
	Liviu Dudau, Adrián Larumbe, kernel, Luben Tuikov,
	Danilo Krummrich

Am 02.09.24 um 15:23 schrieb Daniel Vetter:
> On Mon, Sep 02, 2024 at 12:43:45PM +0200, Christian König wrote:
>> Am 30.08.24 um 23:43 schrieb Matthew Brost:
>>> On Fri, Aug 30, 2024 at 10:14:18AM +0200, Christian König wrote:
>>>> Am 29.08.24 um 19:12 schrieb Boris Brezillon:
>>>>> dma_fence objects created by an entity might outlive the
>>>>> drm_gpu_scheduler this entity was bound to if those fences are retained
>>>>> by other other objects, like a dma_buf resv. This means that
>>>>> drm_sched_fence::sched might be invalid when the resv is walked, which
>>>>> in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.
>>>>>
>>>>> This probably went unnoticed so far, because the drm_gpu_scheduler had
>>>>> the lifetime of the drm_device, so, unless you were removing the device,
>>>>> there were no reasons for the scheduler to be gone before its fences.
>>>> Nope, that is intentional design. get_timeline_name() is not safe to be
>>>> called after the fence signaled because that would causes circular
>>>> dependency problems.
> So I don't think knowlingly crashing in debugfs is ok. debugfs can break
> stuff like secure boot, and if you go about things very wrongly it can
> upset the kernel (like touching pci mappings from userspace can). But just
> going boom due to a race essentially means debugfs is unusable. Because
> there's no way to avoid the boom with dma_fence:
>
> - they're guaranteed to signal in finite time (unless driver bugs)
>
> - the moment they've signalled looking too closely at them is undefined
>    behaviour.
>
>>> I'm quite sure happens, ftrace for example can and will call
>>> get_timeline_name in trace_dma_fence_destroy which is certainly after
>>> the fence is signaled. There are likely other cases too - this just
>>> quickly came to mind.
>> Good point, completely forgotten about ftrace.
>>
>>>> E.g. when you have hardware fences it can happen that fences reference a
>>>> driver module (for the function printing the name) and the module in turn
>>>> keeps fences around.
>>>>
>>> I am almost positive without this patch this problematic in Xe or any
>>> driver in which schedulers are tied to IOCTLs rather than kernel module.
>>>
>>> In Xe 'fence->sched' maps to an xe_exec_queue which can be freed once
>>> the destroy exec queue IOCTL is called and all jobs are free'd (i.e.
>>> 'fence' signals). The fence could be live on after in dma-resv objects,
>>> drm syncobjs, etc...
>>>
>>> I know this issue has been raised before and basically NACK'd but I have
>>> a strong opinion this is valid and in fact required.
>> I've NACK'd automatically signaling pending fences on destruction of the
>> scheduler (that reminds me that I wanted to add a warning for that) and
>> copying the name into every scheduler fence.
>>
>> As long as we don't do any of that I'm perfectly fine fixing this issue. The
>> approach of creating a reference counted object for the name looks rather
>> valid to me.
>>
>> Especially since we then pretty much get the module references correct for
>> free as well.
> So I think the issue is much, much bigger, and there's more. And the
> issue is I think a fundamental design issue of dma_fence itself, not
> individual users.

IIRC both Alex and me pointed out this issue on the very first dma_fence 
code and nobody really cared.

>   I think at the core it's two constraints:
>
> - dma_fence can stick around practically forever in varios container
>    objects. We only garbage collect when someone looks, and not even then
>    consistently.
>
> - fences are meant to be cheap, so they do not have the big refcount going
>    on like other shared objects like dma_buf
>
> Specifically there's also no refcounting on the module itself with the
> ->owner and try_module_get stuff. So even if we fix all these issues on
> the data structure lifetime side of things, you might still oops calling
> into dma_fence->ops->release.
>
> Oops.

Yes, exactly that. I'm a bit surprised that you realize that only now :)

We have had this issue for at least 10 years or so and it pops up every now 
and then on my desk because people complain that unloading amdgpu crashes.

> I think the complete solution is if we change this code all so that core
> dma-fence.c code guarantees to never ever again call into any driver code
> after dma_fence_signal has been called, and takes over the final kfree_rcu
> itself. But that's a giantic change. But I think it's the only way to
> really fix this mess:
>
> - drivers will clean up any of their own references in a timely fashion,
>    so no more accidentally lingering gpu context or vms and the bo they
>    have mapped lying around.
>
> - there's no lifetime or other use-after-free issues anywhere for fences
>    anymore
>
> Downside is that some of the debugging stuff becomes a bit less useful.
> But e.g. tracepoints could just dump the timeline once at creation or when
> signalling, and so you don't need to dump it anymore when freeing. And a
> signalled fence is generally not a problem anymore, so in a compositor
> that's also all fine (iirc you can get at some of this stuff through the
> sync_file interfaces too).
>
> The other downside is that it's a huge pile of work, but I don't think we
> can get to an actually solid design with less headaches and pain ...
>
> Thoughts?

The alternative is to use the scheduler fence(s) to decouple hardware 
fences from the containers. That would be rather cheap to implement.

The only downside would be that the scheduler module probably stays 
loaded forever once used. But at least I can live with that.

Regards,
Christian.

>
> Cheers, Sima


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-09-02 14:18         ` Christian König
@ 2024-09-03  8:13           ` Simona Vetter
  2024-09-04  7:40             ` Christian König
  0 siblings, 1 reply; 18+ messages in thread
From: Simona Vetter @ 2024-09-03  8:13 UTC (permalink / raw)
  To: Christian König
  Cc: Daniel Vetter, Matthew Brost, Boris Brezillon, dri-devel,
	Steven Price, Liviu Dudau, Adrián Larumbe, kernel,
	Luben Tuikov, Danilo Krummrich

On Mon, Sep 02, 2024 at 04:18:33PM +0200, Christian König wrote:
> Am 02.09.24 um 15:23 schrieb Daniel Vetter:
> > On Mon, Sep 02, 2024 at 12:43:45PM +0200, Christian König wrote:
> > > Am 30.08.24 um 23:43 schrieb Matthew Brost:
> > > > On Fri, Aug 30, 2024 at 10:14:18AM +0200, Christian König wrote:
> > > > > Am 29.08.24 um 19:12 schrieb Boris Brezillon:
> > > > > > dma_fence objects created by an entity might outlive the
> > > > > > drm_gpu_scheduler this entity was bound to if those fences are retained
> > > > > > by other other objects, like a dma_buf resv. This means that
> > > > > > drm_sched_fence::sched might be invalid when the resv is walked, which
> > > > > > in turn leads to a UAF when dma_fence_ops::get_timeline_name() is called.
> > > > > > 
> > > > > > This probably went unnoticed so far, because the drm_gpu_scheduler had
> > > > > > the lifetime of the drm_device, so, unless you were removing the device,
> > > > > > there were no reasons for the scheduler to be gone before its fences.
> > > > > Nope, that is intentional design. get_timeline_name() is not safe to be
> > > > > called after the fence signaled because that would causes circular
> > > > > dependency problems.
> > So I don't think knowlingly crashing in debugfs is ok. debugfs can break
> > stuff like secure boot, and if you go about things very wrongly it can
> > upset the kernel (like touching pci mappings from userspace can). But just
> > going boom due to a race essentially means debugfs is unusable. Because
> > there's no way to avoid the boom with dma_fence:
> > 
> > - they're guaranteed to signal in finite time (unless driver bugs)
> > 
> > - the moment they've signalled looking too closely at them is undefined
> >    behaviour.
> > 
> > > > I'm quite sure happens, ftrace for example can and will call
> > > > get_timeline_name in trace_dma_fence_destroy which is certainly after
> > > > the fence is signaled. There are likely other cases too - this just
> > > > quickly came to mind.
> > > Good point, completely forgotten about ftrace.
> > > 
> > > > > E.g. when you have hardware fences it can happen that fences reference a
> > > > > driver module (for the function printing the name) and the module in turn
> > > > > keeps fences around.
> > > > > 
> > > > I am almost positive without this patch this problematic in Xe or any
> > > > driver in which schedulers are tied to IOCTLs rather than kernel module.
> > > > 
> > > > In Xe 'fence->sched' maps to an xe_exec_queue which can be freed once
> > > > the destroy exec queue IOCTL is called and all jobs are free'd (i.e.
> > > > 'fence' signals). The fence could be live on after in dma-resv objects,
> > > > drm syncobjs, etc...
> > > > 
> > > > I know this issue has been raised before and basically NACK'd but I have
> > > > a strong opinion this is valid and in fact required.
> > > I've NACK'd automatically signaling pending fences on destruction of the
> > > scheduler (that reminds me that I wanted to add a warning for that) and
> > > copying the name into every scheduler fence.
> > > 
> > > As long as we don't do any of that I'm perfectly fine fixing this issue. The
> > > approach of creating a reference counted object for the name looks rather
> > > valid to me.
> > > 
> > > Especially since we then pretty much get the module references correct for
> > > free as well.
> > So I think the issue is much, much bigger, and there's more. And the
> > issue is I think a fundamental design issue of dma_fence itself, not
> > individual users.
> 
> IIRC both Alex and me pointed out this issue on the very first dma_fence
> code and nobody really cared.

I guess way back then we didn't really sort out any of the hotunplug
issues, and there weren't any fw ctx schedulers at least on our horizons
yet. Thin excuse, I know ...

> >   I think at the core it's two constraints:
> > 
> > - dma_fence can stick around practically forever in varios container
> >    objects. We only garbage collect when someone looks, and not even then
> >    consistently.
> > 
> > - fences are meant to be cheap, so they do not have the big refcount going
> >    on like other shared objects like dma_buf
> > 
> > Specifically there's also no refcounting on the module itself with the
> > ->owner and try_module_get stuff. So even if we fix all these issues on
> > the data structure lifetime side of things, you might still oops calling
> > into dma_fence->ops->release.
> > 
> > Oops.
> 
> Yes, exactly that. I'm a bit surprised that you realize that only now :)
> 
> We have the issue for at least 10 years or so and it pops up every now and
> then on my desk because people complain that unloading amdgpu crashes.

Yeah I knew about the issue. The new idea that popped into my mind is that
I think we cannot plug this properly unless we do it in dma_fence.c for
everyone, and essentially reshape the lifetime rules for that from yolo
to something actually well-defined.

Kinda similar work to how dma_resv locking rules and fence book-keeping
were unified to something that actually works across drivers ...
 
> > I think the complete solution is if we change this code all so that core
> > dma-fence.c code guarantees to never ever again call into any driver code
> > after dma_fence_signal has been called, and takes over the final kfree_rcu
> > itself. But that's a giantic change. But I think it's the only way to
> > really fix this mess:
> > 
> > - drivers will clean up any of their own references in a timely fashion,
> >    so no more accidentally lingering gpu context or vms and the bo they
> >    have mapped lying around.
> > 
> > - there's no lifetime or other use-after-free issues anywhere for fences
> >    anymore
> > 
> > Downside is that some of the debugging stuff becomes a bit less useful.
> > But e.g. tracepoints could just dump the timeline once at creation or when
> > signalling, and so you don't need to dump it anymore when freeing. And a
> > signalled fence is generally not a problem anymore, so in a compositor
> > that's also all fine (iirc you can get at some of this stuff through the
> > sync_file interfaces too).
> > 
> > The other downside is that it's a huge pile of work, but I don't think we
> > can get to an actually solid design with less headaches and pain ...
> > 
> > Thoughts?
> 
> The alternative is to use the scheduler fence(s) to decouple hardware fences
> from the containers. That would be rather cheap to implement.
> 
> The only downside would be that the scheduler module probably keeps loaded
> forever once used. But at least I can live with that.

Yeah I think interim it's an ok stop-gap. But aside from keeping the
scheduler code pinned forever I think there's some more things:

- I'm not sure we can do it, without digging into dma_fence.c locking
  internals too much.

- It de facto means you can use dma_fence that are fence containers and
  drm_sched_job_fence, and nothing else. And drivers will get this wrong
  and do dma_fence ad-hoc for stuff like tlb flushing, or pte writing, and
  whatever else, that won't necessarily go through a drm_sched.

So not great imo, and hence why I've shifted towards that we should fix
this in dma_fence.c code for everyone.
-Sima
-- 
Simona Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-09-03  8:13           ` Simona Vetter
@ 2024-09-04  7:40             ` Christian König
  2024-09-04  9:46               ` Simona Vetter
  0 siblings, 1 reply; 18+ messages in thread
From: Christian König @ 2024-09-04  7:40 UTC (permalink / raw)
  To: Simona Vetter
  Cc: Daniel Vetter, Matthew Brost, Boris Brezillon, dri-devel,
	Steven Price, Liviu Dudau, Adrián Larumbe, kernel,
	Luben Tuikov, Danilo Krummrich

[-- Attachment #1: Type: text/plain, Size: 4277 bytes --]

Am 03.09.24 um 10:13 schrieb Simona Vetter:
> [SNIP]
>>> So I think the issue is much, much bigger, and there's more. And the
>>> issue is I think a fundamental design issue of dma_fence itself, not
>>> individual users.
>> IIRC both Alex and me pointed out this issue on the very first dma_fence
>> code and nobody really cared.
> I guess way back then we didn't really sort out any of the hotunplug
> issues, and there wasn't any fw ctx schedulers at least on our horizons
> yet. Thin excuse, I know ...

Well it's just when you have a bee sting and a broken leg, what do you 
attend to first? :)

>>>    I think at the core it's two constraints:
>>>
>>> - dma_fence can stick around practically forever in varios container
>>>     objects. We only garbage collect when someone looks, and not even then
>>>     consistently.
>>>
>>> - fences are meant to be cheap, so they do not have the big refcount going
>>>     on like other shared objects like dma_buf
>>>
>>> Specifically there's also no refcounting on the module itself with the
>>> ->owner and try_module_get stuff. So even if we fix all these issues on
>>> the data structure lifetime side of things, you might still oops calling
>>> into dma_fence->ops->release.
>>>
>>> Oops.
>> Yes, exactly that. I'm a bit surprised that you realize that only now :)
>>
>> We have the issue for at least 10 years or so and it pops up every now and
>> then on my desk because people complain that unloading amdgpu crashes.
> Yeah I knew about the issue. The new idea that popped into my mind is that
> I think we cannot plug this properly unless we do it in dma_fence.c for
> everyone, and essentially reshape the lifetime rules for that from yolo
> to something actually well-defined.
>
> Kinda similar work to how dma_resv locking rules and fence book-keeping
> were unified to something that actually works across drivers ...

Well sounds like I've just got more items on my TODO list.

I have patches waiting to be sent out going in this direction anyway, 
will try to get them out by the end of the week and then we can discuss 
what's still missing.

Christian.

>   
>>> I think the complete solution is if we change this code all so that core
>>> dma-fence.c code guarantees to never ever again call into any driver code
>>> after dma_fence_signal has been called, and takes over the final kfree_rcu
>>> itself. But that's a giantic change. But I think it's the only way to
>>> really fix this mess:
>>>
>>> - drivers will clean up any of their own references in a timely fashion,
>>>     so no more accidentally lingering gpu context or vms and the bo they
>>>     have mapped lying around.
>>>
>>> - there's no lifetime or other use-after-free issues anywhere for fences
>>>     anymore
>>>
>>> Downside is that some of the debugging stuff becomes a bit less useful.
>>> But e.g. tracepoints could just dump the timeline once at creation or when
>>> signalling, and so you don't need to dump it anymore when freeing. And a
>>> signalled fence is generally not a problem anymore, so in a compositor
>>> that's also all fine (iirc you can get at some of this stuff through the
>>> sync_file interfaces too).
>>>
>>> The other downside is that it's a huge pile of work, but I don't think we
>>> can get to an actually solid design with less headaches and pain ...
>>>
>>> Thoughts?
>> The alternative is to use the scheduler fence(s) to decouple hardware fences
>> from the containers. That would be rather cheap to implement.
>>
>> The only downside would be that the scheduler module probably keeps loaded
>> forever once used. But at least I can live with that.
> Yeah I think interim it's an ok stop-gap. But aside from keeping the
> scheduler code pinned forever I think there's some more things:
>
> - I'm not sure we can do it, without digging into dma_fence.c locking
>    internals too much.
>
> - It defacto means you can use dma_fence that are fence containers and
>    drm_sched_job_fence, and nothing else. And drivers will get this wrong
>    and do dma_fence ad-hoc for stuff like tlb flushing, or pte writing, and
>    whatever else, that won't necessairly go through a drm_sched.
>
> So not great imo, and hence why I've shifted towards that we should fix
> this in dma_fence.c code for everyone.
> -Sima

[-- Attachment #2: Type: text/html, Size: 5470 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-09-04  7:40             ` Christian König
@ 2024-09-04  9:46               ` Simona Vetter
  2024-09-04 10:03                 ` Simona Vetter
  2024-09-04 10:23                 ` Boris Brezillon
  0 siblings, 2 replies; 18+ messages in thread
From: Simona Vetter @ 2024-09-04  9:46 UTC (permalink / raw)
  To: Christian König
  Cc: Simona Vetter, Daniel Vetter, Matthew Brost, Boris Brezillon,
	dri-devel, Steven Price, Liviu Dudau, Adrián Larumbe, kernel,
	Luben Tuikov, Danilo Krummrich

On Wed, Sep 04, 2024 at 09:40:36AM +0200, Christian König wrote:
> Am 03.09.24 um 10:13 schrieb Simona Vetter:
> > [SNIP]
> > > > So I think the issue is much, much bigger, and there's more. And the
> > > > issue is I think a fundamental design issue of dma_fence itself, not
> > > > individual users.
> > > IIRC both Alex and me pointed out this issue on the very first dma_fence
> > > code and nobody really cared.
> > I guess way back then we didn't really sort out any of the hotunplug
> > issues, and there wasn't any fw ctx schedulers at least on our horizons
> > yet. Thin excuse, I know ...
> 
> Well it's just when you have a bee string and a broken leg, what do you
> attend first? :)

Yeah ...

> > > >    I think at the core it's two constraints:
> > > > 
> > > > - dma_fence can stick around practically forever in varios container
> > > >     objects. We only garbage collect when someone looks, and not even then
> > > >     consistently.
> > > > 
> > > > - fences are meant to be cheap, so they do not have the big refcount going
> > > >     on like other shared objects like dma_buf
> > > > 
> > > > Specifically there's also no refcounting on the module itself with the
> > > > ->owner and try_module_get stuff. So even if we fix all these issues on
> > > > the data structure lifetime side of things, you might still oops calling
> > > > into dma_fence->ops->release.
> > > > 
> > > > Oops.
> > > Yes, exactly that. I'm a bit surprised that you realize that only now :)
> > > 
> > > We have the issue for at least 10 years or so and it pops up every now and
> > > then on my desk because people complain that unloading amdgpu crashes.
> > Yeah I knew about the issue. The new idea that popped into my mind is that
> > I think we cannot plug this properly unless we do it in dma_fence.c for
> > everyone, and essentially reshape the lifetime rules for that from yolo
> > to something actually well-defined.
> > 
> > Kinda similar work to how dma_resv locking rules and fence book-keeping
> > were unified to something that actually works across drivers ...
> 
> Well sounds like I've just got more items on my TODO list.
> 
> I have patches waiting to be send out going into this direction anyway, will
> try to get them out by the end of the week and then we can discuss what's
> still missing.

Quick addition, another motivator from the panthor userspace submit
discussion: If the preempt ctx fence concept spreads, that's another
non-drm_sched fence that drivers will need and are pretty much guaranteed
to get wrong.

Also maybe Boris volunteers to help out with some of the work here? Or
perhaps some of the nova folks, it seems to be even more a pain for rust
drivers ...

Cheers, Sima

> 
> Christian.
> 
> > > > I think the complete solution is if we change this code all so that core
> > > > dma-fence.c code guarantees to never ever again call into any driver code
> > > > after dma_fence_signal has been called, and takes over the final kfree_rcu
> > > > itself. But that's a giantic change. But I think it's the only way to
> > > > really fix this mess:
> > > > 
> > > > - drivers will clean up any of their own references in a timely fashion,
> > > >     so no more accidentally lingering gpu context or vms and the bo they
> > > >     have mapped lying around.
> > > > 
> > > > - there's no lifetime or other use-after-free issues anywhere for fences
> > > >     anymore
> > > > 
> > > > Downside is that some of the debugging stuff becomes a bit less useful.
> > > > But e.g. tracepoints could just dump the timeline once at creation or when
> > > > signalling, and so you don't need to dump it anymore when freeing. And a
> > > > signalled fence is generally not a problem anymore, so in a compositor
> > > > that's also all fine (iirc you can get at some of this stuff through the
> > > > sync_file interfaces too).
> > > > 
> > > > The other downside is that it's a huge pile of work, but I don't think we
> > > > can get to an actually solid design with less headaches and pain ...
> > > > 
> > > > Thoughts?
> > > The alternative is to use the scheduler fence(s) to decouple hardware fences
> > > from the containers. That would be rather cheap to implement.
> > > 
> > > The only downside would be that the scheduler module probably keeps loaded
> > > forever once used. But at least I can live with that.
> > Yeah I think interim it's an ok stop-gap. But aside from keeping the
> > scheduler code pinned forever I think there's some more things:
> > 
> > - I'm not sure we can do it, without digging into dma_fence.c locking
> >    internals too much.
> > 
> > - It defacto means you can use dma_fence that are fence containers and
> >    drm_sched_job_fence, and nothing else. And drivers will get this wrong
> >    and do dma_fence ad-hoc for stuff like tlb flushing, or pte writing, and
> >    whatever else, that won't necessairly go through a drm_sched.
> > 
> > So not great imo, and hence why I've shifted towards that we should fix
> > this in dma_fence.c code for everyone.
> > -Sima

-- 
Simona Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-09-04  9:46               ` Simona Vetter
@ 2024-09-04 10:03                 ` Simona Vetter
  2024-09-04 10:26                   ` Boris Brezillon
  2024-09-04 10:23                 ` Boris Brezillon
  1 sibling, 1 reply; 18+ messages in thread
From: Simona Vetter @ 2024-09-04 10:03 UTC (permalink / raw)
  To: Simona Vetter
  Cc: Christian König, Daniel Vetter, Matthew Brost,
	Boris Brezillon, dri-devel, Steven Price, Liviu Dudau,
	Adrián Larumbe, kernel, Luben Tuikov, Danilo Krummrich

On Wed, Sep 04, 2024 at 11:46:54AM +0200, Simona Vetter wrote:
> On Wed, Sep 04, 2024 at 09:40:36AM +0200, Christian König wrote:
> > Am 03.09.24 um 10:13 schrieb Simona Vetter:
> > > [SNIP]
> > > > > So I think the issue is much, much bigger, and there's more. And the
> > > > > issue is I think a fundamental design issue of dma_fence itself, not
> > > > > individual users.
> > > > IIRC both Alex and me pointed out this issue on the very first dma_fence
> > > > code and nobody really cared.
> > > I guess way back then we didn't really sort out any of the hotunplug
> > > issues, and there wasn't any fw ctx schedulers at least on our horizons
> > > yet. Thin excuse, I know ...
> > 
> > Well it's just when you have a bee string and a broken leg, what do you
> > attend first? :)
> 
> Yeah ...
> 
> > > > >    I think at the core it's two constraints:
> > > > > 
> > > > > - dma_fence can stick around practically forever in varios container
> > > > >     objects. We only garbage collect when someone looks, and not even then
> > > > >     consistently.
> > > > > 
> > > > > - fences are meant to be cheap, so they do not have the big refcount going
> > > > >     on like other shared objects like dma_buf
> > > > > 
> > > > > Specifically there's also no refcounting on the module itself with the
> > > > > ->owner and try_module_get stuff. So even if we fix all these issues on
> > > > > the data structure lifetime side of things, you might still oops calling
> > > > > into dma_fence->ops->release.
> > > > > 
> > > > > Oops.
> > > > Yes, exactly that. I'm a bit surprised that you realize that only now :)
> > > > 
> > > > We have the issue for at least 10 years or so and it pops up every now and
> > > > then on my desk because people complain that unloading amdgpu crashes.
> > > Yeah I knew about the issue. The new idea that popped into my mind is that
> > > I think we cannot plug this properly unless we do it in dma_fence.c for
> > > everyone, and essentially reshape the lifetime rules for that from yolo
> > > to something actually well-defined.
> > > 
> > > Kinda similar work to how dma_resv locking rules and fence book-keeping
> > > were unified to something that actually works across drivers ...
> > 
> > Well sounds like I've just got more items on my TODO list.
> > 
> > I have patches waiting to be send out going into this direction anyway, will
> > try to get them out by the end of the week and then we can discuss what's
> > still missing.
> 
> Quick addition, another motivator from the panthor userspace submit
> discussion: If the preempt ctx fence concept spreads, that's another
> non-drm_sched fence that drivers will need and are pretty much guaranteed
> to get wrong.
> 
> Also maybe Boris volunteers to help out with some of the work here? Or
> perhaps some of the nova folks, it seems to be even more a pain for rust
> drivers ...

I forgot to add: I think it'd be really good to record the rough consensus
on the problem and the long term solution we're aiming for in a kerneldoc
or TODO patch. I think recording those design goals helped us a _lot_ in
making the dma_resv_usage/lock and dma_buf api cleanups and cross-driver
consistent semantics happen. Maybe as a WARNING/TODO block in the
dma_fence_ops kerneldoc?

Boris, can you volunteer perhaps? I'm happy to review, but I feel like I'm
so close to and deep in this mess that I'll likely miss some aspect if I
type it myself.

Thanks, Sima
-- 
Simona Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-09-04 10:03                 ` Simona Vetter
@ 2024-09-04 10:26                   ` Boris Brezillon
  0 siblings, 0 replies; 18+ messages in thread
From: Boris Brezillon @ 2024-09-04 10:26 UTC (permalink / raw)
  To: Simona Vetter
  Cc: Christian König, Daniel Vetter, Matthew Brost, dri-devel,
	Steven Price, Liviu Dudau, Adrián Larumbe, kernel,
	Luben Tuikov, Danilo Krummrich

On Wed, 4 Sep 2024 12:03:24 +0200
Simona Vetter <simona.vetter@ffwll.ch> wrote:

> On Wed, Sep 04, 2024 at 11:46:54AM +0200, Simona Vetter wrote:
> > On Wed, Sep 04, 2024 at 09:40:36AM +0200, Christian König wrote:  
> > > Am 03.09.24 um 10:13 schrieb Simona Vetter:  
> > > > [SNIP]  
> > > > > > So I think the issue is much, much bigger, and there's more. And the
> > > > > > issue is I think a fundamental design issue of dma_fence itself, not
> > > > > > individual users.  
> > > > > IIRC both Alex and me pointed out this issue on the very first dma_fence
> > > > > code and nobody really cared.  
> > > > I guess way back then we didn't really sort out any of the hotunplug
> > > > issues, and there wasn't any fw ctx schedulers at least on our horizons
> > > > yet. Thin excuse, I know ...  
> > > 
> > > Well it's just when you have a bee string and a broken leg, what do you
> > > attend first? :)  
> > 
> > Yeah ...
> >   
> > > > > >    I think at the core it's two constraints:
> > > > > > 
> > > > > > - dma_fence can stick around practically forever in varios container
> > > > > >     objects. We only garbage collect when someone looks, and not even then
> > > > > >     consistently.
> > > > > > 
> > > > > > - fences are meant to be cheap, so they do not have the big refcount going
> > > > > >     on like other shared objects like dma_buf
> > > > > > 
> > > > > > Specifically there's also no refcounting on the module itself with the  
> > > > > > ->owner and try_module_get stuff. So even if we fix all these issues on  
> > > > > > the data structure lifetime side of things, you might still oops calling
> > > > > > into dma_fence->ops->release.
> > > > > > 
> > > > > > Oops.  
> > > > > Yes, exactly that. I'm a bit surprised that you realize that only now :)
> > > > > 
> > > > > We have the issue for at least 10 years or so and it pops up every now and
> > > > > then on my desk because people complain that unloading amdgpu crashes.  
> > > > Yeah I knew about the issue. The new idea that popped into my mind is that
> > > > I think we cannot plug this properly unless we do it in dma_fence.c for
> > > > everyone, and essentially reshape the lifetime rules for that from yolo
> > > > to something actually well-defined.
> > > > 
> > > > Kinda similar work to how dma_resv locking rules and fence book-keeping
> > > > were unified to something that actually works across drivers ...  
> > > 
> > > Well sounds like I've just got more items on my TODO list.
> > > 
> > > I have patches waiting to be send out going into this direction anyway, will
> > > try to get them out by the end of the week and then we can discuss what's
> > > still missing.  
> > 
> > Quick addition, another motivator from the panthor userspace submit
> > discussion: If the preempt ctx fence concept spreads, that's another
> > non-drm_sched fence that drivers will need and are pretty much guaranteed
> > to get wrong.
> > 
> > Also maybe Boris volunteers to help out with some of the work here? Or
> > perhaps some of the nova folks, it seems to be even more a pain for rust
> > drivers ...  
> 
> I forgot to add: I think it'd be really good to record the rough consensus
> on the problem and the long term solution we're aiming for an a kerneldoc
> or TODO patch. I think recording those design goals helped us a _lot_ in
> making the dma_resv_usage/lock and dma_buf api cleanups and cross-driver
> consistent semantics happen. Maybe as a WARNING/TODO block in the
> dma_fence_ops kerneldoc?
> 
> Boris, can you volunteer perhaps?

Sure, I won't be able to do that this week though.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-09-04  9:46               ` Simona Vetter
  2024-09-04 10:03                 ` Simona Vetter
@ 2024-09-04 10:23                 ` Boris Brezillon
  1 sibling, 0 replies; 18+ messages in thread
From: Boris Brezillon @ 2024-09-04 10:23 UTC (permalink / raw)
  To: Simona Vetter
  Cc: Christian König, Daniel Vetter, Matthew Brost, dri-devel,
	Steven Price, Liviu Dudau, Adrián Larumbe, kernel,
	Luben Tuikov, Danilo Krummrich

On Wed, 4 Sep 2024 11:46:54 +0200
Simona Vetter <simona.vetter@ffwll.ch> wrote:

> On Wed, Sep 04, 2024 at 09:40:36AM +0200, Christian König wrote:
> > Am 03.09.24 um 10:13 schrieb Simona Vetter:  
> > > [SNIP]  
> > > > > So I think the issue is much, much bigger, and there's more. And the
> > > > > issue is I think a fundamental design issue of dma_fence itself, not
> > > > > individual users.  
> > > > IIRC both Alex and me pointed out this issue on the very first dma_fence
> > > > code and nobody really cared.  
> > > I guess way back then we didn't really sort out any of the hotunplug
> > > issues, and there wasn't any fw ctx schedulers at least on our horizons
> > > yet. Thin excuse, I know ...  
> > 
> > Well it's just when you have a bee sting and a broken leg, what do you
> > attend first? :)  
> 
> Yeah ...
> 
> > > > >    I think at the core it's two constraints:
> > > > > 
> > > > > - dma_fence can stick around practically forever in various container
> > > > >     objects. We only garbage collect when someone looks, and not even then
> > > > >     consistently.
> > > > > 
> > > > > - fences are meant to be cheap, so they do not have the big refcount going
> > > > >     on like other shared objects like dma_buf
> > > > > 
> > > > > Specifically there's also no refcounting on the module itself with the  
> > > > > ->owner and try_module_get stuff. So even if we fix all these issues on  
> > > > > the data structure lifetime side of things, you might still oops calling
> > > > > into dma_fence->ops->release.
> > > > > 
> > > > > Oops.  
> > > > Yes, exactly that. I'm a bit surprised that you realize that only now :)
> > > > 
> > > > We have the issue for at least 10 years or so and it pops up every now and
> > > > then on my desk because people complain that unloading amdgpu crashes.  
> > > Yeah I knew about the issue. The new idea that popped into my mind is that
> > > I think we cannot plug this properly unless we do it in dma_fence.c for
> > > everyone, and essentially reshape the lifetime rules for that from yolo
> > > to something actually well-defined.
> > > 
> > > Kinda similar work to how dma_resv locking rules and fence book-keeping
> > > were unified to something that actually works across drivers ...  
> > 
> > Well sounds like I've just got more items on my TODO list.
> > 
> > I have patches waiting to be sent out going into this direction anyway, will
> > try to get them out by the end of the week and then we can discuss what's
> > still missing.  
> 
> Quick addition, another motivator from the panthor userspace submit
> discussion: If the preempt ctx fence concept spreads, that's another
> non-drm_sched fence that drivers will need and are pretty much guaranteed
> to get wrong.
> 
> Also maybe Boris volunteers to help out with some of the work here?

Sure, I can review/test what Christian comes up with, since he already
seems to have a draft for the new implementation.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-08-29 17:12 [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched Boris Brezillon
  2024-08-30  8:14 ` Christian König
@ 2024-09-01 22:39 ` kernel test robot
  2024-09-02  3:14 ` kernel test robot
  2 siblings, 0 replies; 18+ messages in thread
From: kernel test robot @ 2024-09-01 22:39 UTC (permalink / raw)
  To: Boris Brezillon; +Cc: llvm, oe-kbuild-all

Hi Boris,

[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:

[auto build test ERROR on drm/drm-next]
[also build test ERROR on drm-exynos/exynos-drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-misc/drm-misc-next drm-tip/drm-tip linus/master v6.11-rc6 next-20240830]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Boris-Brezillon/drm-sched-Fix-a-UAF-on-drm_sched_fence-sched/20240830-012038
base:   git://anongit.freedesktop.org/drm/drm drm-next
patch link:    https://lore.kernel.org/r/20240829171238.609481-1-boris.brezillon%40collabora.com
patch subject: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
config: i386-buildonly-randconfig-003-20240902 (https://download.01.org/0day-ci/archive/20240902/202409020603.tc9e1ATi-lkp@intel.com/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240902/202409020603.tc9e1ATi-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409020603.tc9e1ATi-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from drivers/gpu/drm/lima/lima_dlbu.c:7:
   In file included from drivers/gpu/drm/lima/lima_device.h:12:
   In file included from drivers/gpu/drm/lima/lima_sched.h:7:
>> include/drm/gpu_scheduler.h:290:2: error: call to undeclared function 'kfree'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     290 |         kfree(tl->name);
         |         ^
   1 error generated.
--
   In file included from drivers/gpu/drm/lima/lima_trace.c:4:
   In file included from drivers/gpu/drm/lima/lima_sched.h:7:
>> include/drm/gpu_scheduler.h:290:2: error: call to undeclared function 'kfree'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     290 |         kfree(tl->name);
         |         ^
   In file included from drivers/gpu/drm/lima/lima_trace.c:7:
   In file included from drivers/gpu/drm/lima/lima_trace.h:50:
   In file included from include/trace/define_trace.h:102:
   In file included from include/trace/trace_events.h:21:
   In file included from include/linux/trace_events.h:6:
   In file included from include/linux/ring_buffer.h:5:
   In file included from include/linux/mm.h:33:
   include/linux/slab.h:278:6: error: conflicting types for 'kfree'
     278 | void kfree(const void *objp);
         |      ^
   include/drm/gpu_scheduler.h:290:2: note: previous implicit declaration is here
     290 |         kfree(tl->name);
         |         ^
   In file included from drivers/gpu/drm/lima/lima_trace.c:7:
   In file included from drivers/gpu/drm/lima/lima_trace.h:50:
   In file included from include/trace/define_trace.h:102:
   In file included from include/trace/trace_events.h:21:
   In file included from include/linux/trace_events.h:6:
   In file included from include/linux/ring_buffer.h:5:
   In file included from include/linux/mm.h:1127:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:98:11: warning: array index 3 is past the end of the array (that has type 'unsigned long[2]') [-Warray-bounds]
      98 |                 return (set->sig[3] | set->sig[2] |
         |                         ^        ~
   arch/x86/include/asm/signal.h:24:2: note: array 'sig' declared here
      24 |         unsigned long sig[_NSIG_WORDS];
         |         ^
   In file included from drivers/gpu/drm/lima/lima_trace.c:7:
   In file included from drivers/gpu/drm/lima/lima_trace.h:50:
   In file included from include/trace/define_trace.h:102:
   In file included from include/trace/trace_events.h:21:
   In file included from include/linux/trace_events.h:6:
   In file included from include/linux/ring_buffer.h:5:
   In file included from include/linux/mm.h:1127:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:98:25: warning: array index 2 is past the end of the array (that has type 'unsigned long[2]') [-Warray-bounds]
      98 |                 return (set->sig[3] | set->sig[2] |
         |                                       ^        ~
   arch/x86/include/asm/signal.h:24:2: note: array 'sig' declared here
      24 |         unsigned long sig[_NSIG_WORDS];
         |         ^
   In file included from drivers/gpu/drm/lima/lima_trace.c:7:
   In file included from drivers/gpu/drm/lima/lima_trace.h:50:
   In file included from include/trace/define_trace.h:102:
   In file included from include/trace/trace_events.h:21:
   In file included from include/linux/trace_events.h:6:
   In file included from include/linux/ring_buffer.h:5:
   In file included from include/linux/mm.h:1127:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:114:11: warning: array index 3 is past the end of the array (that has type 'const unsigned long[2]') [-Warray-bounds]
     114 |                 return  (set1->sig[3] == set2->sig[3]) &&
         |                          ^         ~
   arch/x86/include/asm/signal.h:24:2: note: array 'sig' declared here
      24 |         unsigned long sig[_NSIG_WORDS];
         |         ^
   In file included from drivers/gpu/drm/lima/lima_trace.c:7:
   In file included from drivers/gpu/drm/lima/lima_trace.h:50:
   In file included from include/trace/define_trace.h:102:
   In file included from include/trace/trace_events.h:21:
   In file included from include/linux/trace_events.h:6:
   In file included from include/linux/ring_buffer.h:5:
   In file included from include/linux/mm.h:1127:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:114:27: warning: array index 3 is past the end of the array (that has type 'const unsigned long[2]') [-Warray-bounds]
     114 |                 return  (set1->sig[3] == set2->sig[3]) &&
         |                                          ^         ~
   arch/x86/include/asm/signal.h:24:2: note: array 'sig' declared here
      24 |         unsigned long sig[_NSIG_WORDS];
         |         ^
   In file included from drivers/gpu/drm/lima/lima_trace.c:7:
   In file included from drivers/gpu/drm/lima/lima_trace.h:50:
   In file included from include/trace/define_trace.h:102:
   In file included from include/trace/trace_events.h:21:
   In file included from include/linux/trace_events.h:6:
   In file included from include/linux/ring_buffer.h:5:
   In file included from include/linux/mm.h:1127:
   In file included from include/linux/huge_mm.h:8:
   In file included from include/linux/fs.h:33:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:115:5: warning: array index 2 is past the end of the array (that has type 'const unsigned long[2]') [-Warray-bounds]


vim +/kfree +290 include/drm/gpu_scheduler.h

   283	
   284	static inline void
   285	drm_sched_fence_timeline_release(struct kref *kref)
   286	{
   287		struct drm_sched_fence_timeline *tl =
   288			container_of(kref, struct drm_sched_fence_timeline, kref);
   289	
 > 290		kfree(tl->name);
   291		kfree(tl);
   292	}
   293	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
  2024-08-29 17:12 [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched Boris Brezillon
  2024-08-30  8:14 ` Christian König
  2024-09-01 22:39 ` kernel test robot
@ 2024-09-02  3:14 ` kernel test robot
  2 siblings, 0 replies; 18+ messages in thread
From: kernel test robot @ 2024-09-02  3:14 UTC (permalink / raw)
  To: Boris Brezillon; +Cc: oe-kbuild-all

Hi Boris,

[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:

[auto build test ERROR on drm/drm-next]
[also build test ERROR on drm-misc/drm-misc-next linus/master v6.11-rc6 next-20240830]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Boris-Brezillon/drm-sched-Fix-a-UAF-on-drm_sched_fence-sched/20240830-012038
base:   git://anongit.freedesktop.org/drm/drm drm-next
patch link:    https://lore.kernel.org/r/20240829171238.609481-1-boris.brezillon%40collabora.com
patch subject: [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched
config: i386-allmodconfig (https://download.01.org/0day-ci/archive/20240902/202409021023.EWUGtkoV-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240902/202409021023.EWUGtkoV-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409021023.EWUGtkoV-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h:28,
                    from drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h:29,
                    from drivers/gpu/drm/amd/amdgpu/amdgpu.h:43,
                    from drivers/gpu/drm/amd/amdgpu/amdgpu_doorbell_mgr.c:25:
   include/drm/gpu_scheduler.h: In function 'drm_sched_fence_timeline_release':
>> include/drm/gpu_scheduler.h:290:9: error: implicit declaration of function 'kfree' [-Werror=implicit-function-declaration]
     290 |         kfree(tl->name);
         |         ^~~~~
   In file included from include/linux/resource_ext.h:11,
                    from include/linux/pci.h:40,
                    from drivers/gpu/drm/amd/amdgpu/amdgpu.h:52:
   include/linux/slab.h: At top level:
   include/linux/slab.h:278:6: warning: conflicting types for 'kfree'; have 'void(const void *)'
     278 | void kfree(const void *objp);
         |      ^~~~~
   include/drm/gpu_scheduler.h:290:9: note: previous implicit declaration of 'kfree' with type 'void(const void *)'
     290 |         kfree(tl->name);
         |         ^~~~~
   cc1: some warnings being treated as errors


vim +/kfree +290 include/drm/gpu_scheduler.h

   283	
   284	static inline void
   285	drm_sched_fence_timeline_release(struct kref *kref)
   286	{
   287		struct drm_sched_fence_timeline *tl =
   288			container_of(kref, struct drm_sched_fence_timeline, kref);
   289	
 > 290		kfree(tl->name);
   291		kfree(tl);
   292	}
   293	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2024-09-04 10:26 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-08-29 17:12 [RFC PATCH] drm/sched: Fix a UAF on drm_sched_fence::sched Boris Brezillon
2024-08-30  8:14 ` Christian König
2024-08-30  9:37   ` Boris Brezillon
2024-08-30 10:44     ` Boris Brezillon
2024-08-30 12:57     ` Christian König
2024-08-30 21:43   ` Matthew Brost
2024-08-31  7:25     ` Boris Brezillon
2024-09-02 10:43     ` Christian König
2024-09-02 13:23       ` Daniel Vetter
2024-09-02 14:18         ` Christian König
2024-09-03  8:13           ` Simona Vetter
2024-09-04  7:40             ` Christian König
2024-09-04  9:46               ` Simona Vetter
2024-09-04 10:03                 ` Simona Vetter
2024-09-04 10:26                   ` Boris Brezillon
2024-09-04 10:23                 ` Boris Brezillon
2024-09-01 22:39 ` kernel test robot
2024-09-02  3:14 ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.