[PATCH] drm/amdgpu/userq: fix access to stale wptr mapping

AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] drm/amdgpu/userq: fix access to stale wptr mapping
@ 2026-05-04 12:56 Sunil Khatri
  2026-05-04 13:05 ` Christian König
  0 siblings, 1 reply; 4+ messages in thread
From: Sunil Khatri @ 2026-05-04 12:56 UTC (permalink / raw)
  To: Alex Deucher, Christian König; +Cc: amd-gfx, Sunil Khatri

Use drm_exec to take both locks i.e vm root bo and
wptr_obj bo to access the mapping data properly.

This fixes the security issue of unmap the wptr_obj while
a queue creation is in progress and passing other
bo at same address.

Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 122 ++++++++++-----------
 1 file changed, 57 insertions(+), 65 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 501e2e10b4a6..3d4f83015488 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -30,34 +30,6 @@
 #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE
 #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE
 
-static int
-mes_userq_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
-{
-	int ret;
-
-	ret = amdgpu_bo_reserve(bo, true);
-	if (ret) {
-		DRM_ERROR("Failed to reserve bo. ret %d\n", ret);
-		goto err_reserve_bo_failed;
-	}
-
-	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
-	if (ret) {
-		DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);
-		goto err_map_bo_gart_failed;
-	}
-
-	amdgpu_bo_unreserve(bo);
-	bo = amdgpu_bo_ref(bo);
-
-	return 0;
-
-err_map_bo_gart_failed:
-	amdgpu_bo_unreserve(bo);
-err_reserve_bo_failed:
-	return ret;
-}
-
 static int
 mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
 			      struct amdgpu_userq_mgr *uq_mgr,
@@ -65,55 +37,75 @@ mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
 			      uint64_t wptr)
 {
 	struct amdgpu_bo_va_mapping *wptr_mapping;
-	struct amdgpu_vm *wptr_vm;
 	struct amdgpu_userq_obj *wptr_obj = &queue->wptr_obj;
+	struct amdgpu_bo *obj;
+	struct amdgpu_vm *vm = queue->vm;
+	struct drm_exec exec;
 	int ret;
 
-	wptr_vm = queue->vm;
-	ret = amdgpu_bo_reserve(wptr_vm->root.bo, false);
-	if (ret)
-		return ret;
-
 	wptr &= AMDGPU_GMC_HOLE_MASK;
-	wptr_mapping = amdgpu_vm_bo_lookup_mapping(wptr_vm, wptr >> PAGE_SHIFT);
-	amdgpu_bo_unreserve(wptr_vm->root.bo);
-	if (!wptr_mapping) {
-		DRM_ERROR("Failed to lookup wptr bo\n");
-		return -EINVAL;
-	}
 
-	wptr_obj->obj = wptr_mapping->bo_va->base.bo;
-	if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
-		DRM_ERROR("Requested GART mapping for wptr bo larger than one page\n");
-		return -EINVAL;
-	}
+	drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
+	drm_exec_until_all_locked(&exec) {
+		ret = amdgpu_vm_lock_pd(vm, &exec, 1);
+		drm_exec_retry_on_contention(&exec);
+		if (unlikely(ret))
+			goto fail_lock;
+
+		wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> PAGE_SHIFT);
+		if (!wptr_mapping) {
+			DRM_ERROR("Failed to lock up wptr bo\n");
+			ret = -EINVAL;
+			goto fail_lock;
+		}
 
-	ret = mes_userq_map_gtt_bo_to_gart(wptr_obj->obj);
-	if (ret) {
-		DRM_ERROR("Failed to map wptr bo to GART\n");
-		return ret;
-	}
+		obj = wptr_mapping->bo_va->base.bo;
+		ret = drm_exec_prepare_obj(&exec, &obj->tbo.base, 1);
+		drm_exec_retry_on_contention(&exec);
+		if (unlikely(ret)) {
+			DRM_ERROR("Failed to prepare wptr bo\n");
+			goto fail_lock;
+		}
 
-	ret = amdgpu_bo_reserve(wptr_obj->obj, true);
-	if (ret) {
-		DRM_ERROR("Failed to reserve wptr bo\n");
-		return ret;
-	}
+		/* mapping now should be stable since both the locks are held */
+		wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> PAGE_SHIFT);
+		if (!wptr_mapping) {
+			DRM_ERROR("Failed to lock up wptr bo\n");
+			ret = -EINVAL;
+			goto fail_lock;
+		}
 
-	/* TODO use eviction fence instead of pinning. */
-	ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);
-	if (ret) {
-		drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin wptr bo\n");
-		goto unresv_bo;
-	}
+		wptr_obj->obj = amdgpu_bo_ref(wptr_mapping->bo_va->base.bo);
+
+		if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
+			DRM_ERROR("Requested wptr bo size is larger than one page\n");
+			ret = -EINVAL;
+			goto fail_map;
+		}
+
+		ret = amdgpu_ttm_alloc_gart(&wptr_obj->obj->tbo);
+		if (ret) {
+			DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);
+			goto fail_map;
+		}
+
+		/* TODO use eviction fence instead of pinning. */
+		ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);
+		if (ret) {
+			DRM_ERROR("Failed to pin wptr bo. ret %d\n", ret);
+			goto fail_map;
+		}
 
-	queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);
-	amdgpu_bo_unreserve(wptr_obj->obj);
+		queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);
+	}
 
+	drm_exec_fini(&exec);
 	return 0;
 
-unresv_bo:
-	amdgpu_bo_unreserve(wptr_obj->obj);
+fail_map:
+	amdgpu_bo_unref(&wptr_obj->obj);
+fail_lock:
+	drm_exec_fini(&exec);
 	return ret;
 
 }
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/amdgpu/userq: fix access to stale wptr mapping
  2026-05-04 12:56 [PATCH] drm/amdgpu/userq: fix access to stale wptr mapping Sunil Khatri
@ 2026-05-04 13:05 ` Christian König
  2026-05-05  7:29   ` Khatri, Sunil
  0 siblings, 1 reply; 4+ messages in thread
From: Christian König @ 2026-05-04 13:05 UTC (permalink / raw)
  To: Sunil Khatri, Alex Deucher; +Cc: amd-gfx

On 5/4/26 14:56, Sunil Khatri wrote:
> Use drm_exec to take both locks i.e vm root bo and
> wptr_obj bo to access the mapping data properly.
> 
> This fixes the security issue of unmap the wptr_obj while
> a queue creation is in progress and passing other
> bo at same address.
> 
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 122 ++++++++++-----------
>  1 file changed, 57 insertions(+), 65 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index 501e2e10b4a6..3d4f83015488 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -30,34 +30,6 @@
>  #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE
>  #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE
>  
> -static int
> -mes_userq_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
> -{
> -	int ret;
> -
> -	ret = amdgpu_bo_reserve(bo, true);
> -	if (ret) {
> -		DRM_ERROR("Failed to reserve bo. ret %d\n", ret);
> -		goto err_reserve_bo_failed;
> -	}
> -
> -	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
> -	if (ret) {
> -		DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);
> -		goto err_map_bo_gart_failed;
> -	}
> -
> -	amdgpu_bo_unreserve(bo);
> -	bo = amdgpu_bo_ref(bo);
> -
> -	return 0;
> -
> -err_map_bo_gart_failed:
> -	amdgpu_bo_unreserve(bo);
> -err_reserve_bo_failed:
> -	return ret;
> -}
> -
>  static int
>  mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
>  			      struct amdgpu_userq_mgr *uq_mgr,
> @@ -65,55 +37,75 @@ mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
>  			      uint64_t wptr)
>  {
>  	struct amdgpu_bo_va_mapping *wptr_mapping;
> -	struct amdgpu_vm *wptr_vm;
>  	struct amdgpu_userq_obj *wptr_obj = &queue->wptr_obj;
> +	struct amdgpu_bo *obj;
> +	struct amdgpu_vm *vm = queue->vm;
> +	struct drm_exec exec;
>  	int ret;
>  
> -	wptr_vm = queue->vm;
> -	ret = amdgpu_bo_reserve(wptr_vm->root.bo, false);
> -	if (ret)
> -		return ret;
> -
>  	wptr &= AMDGPU_GMC_HOLE_MASK;
> -	wptr_mapping = amdgpu_vm_bo_lookup_mapping(wptr_vm, wptr >> PAGE_SHIFT);
> -	amdgpu_bo_unreserve(wptr_vm->root.bo);
> -	if (!wptr_mapping) {
> -		DRM_ERROR("Failed to lookup wptr bo\n");
> -		return -EINVAL;
> -	}
>  
> -	wptr_obj->obj = wptr_mapping->bo_va->base.bo;
> -	if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
> -		DRM_ERROR("Requested GART mapping for wptr bo larger than one page\n");
> -		return -EINVAL;
> -	}
> +	drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);

This should probably be 2 instead of 0.

> +	drm_exec_until_all_locked(&exec) {
> +		ret = amdgpu_vm_lock_pd(vm, &exec, 1);
> +		drm_exec_retry_on_contention(&exec);
> +		if (unlikely(ret))
> +			goto fail_lock;
> +
> +		wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> PAGE_SHIFT);
> +		if (!wptr_mapping) {
> +			DRM_ERROR("Failed to lock up wptr bo\n");

Please drop that error message. It can spam the logs when userspace intentionally gives incorrect values.

> +			ret = -EINVAL;
> +			goto fail_lock;
> +		}
>  
> -	ret = mes_userq_map_gtt_bo_to_gart(wptr_obj->obj);
> -	if (ret) {
> -		DRM_ERROR("Failed to map wptr bo to GART\n");
> -		return ret;
> -	}
> +		obj = wptr_mapping->bo_va->base.bo;
> +		ret = drm_exec_prepare_obj(&exec, &obj->tbo.base, 1);

Using drm_exec_lock_obj() should be sufficient.

We don't need a fence slot for this use case.

> +		drm_exec_retry_on_contention(&exec);
> +		if (unlikely(ret)) {
> +			DRM_ERROR("Failed to prepare wptr bo\n");

Same here, that this can fail is normal handling. The worst case is OOM and that is already printed in the logs.

> +			goto fail_lock;
> +		}

We got all the locks now, so the drm_exec_until_all_locked() loop can be closed here.

>  
> -	ret = amdgpu_bo_reserve(wptr_obj->obj, true);
> -	if (ret) {
> -		DRM_ERROR("Failed to reserve wptr bo\n");
> -		return ret;
> -	}


> +		/* mapping now should be stable since both the locks are held */
> +		wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> PAGE_SHIFT);
> +		if (!wptr_mapping) {
> +			DRM_ERROR("Failed to lock up wptr bo\n");
> +			ret = -EINVAL;
> +			goto fail_lock;
> +		}

Doing that again is unecessary. We are holding the VM lock above while doing the lockup.

>  
> -	/* TODO use eviction fence instead of pinning. */
> -	ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);
> -	if (ret) {
> -		drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin wptr bo\n");
> -		goto unresv_bo;
> -	}
> +		wptr_obj->obj = amdgpu_bo_ref(wptr_mapping->bo_va->base.bo);

That is now unecessary as well.

> +
> +		if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
> +			DRM_ERROR("Requested wptr bo size is larger than one page\n");

Same, please drop that error message.

> +			ret = -EINVAL;
> +			goto fail_map;
> +		}
> +
> +		ret = amdgpu_ttm_alloc_gart(&wptr_obj->obj->tbo);
> +		if (ret) {
> +			DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);

That error message is useful.

> +			goto fail_map;
> +		}
> +
> +		/* TODO use eviction fence instead of pinning. */
> +		ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);

Oh! The piun needs to come before the alloc_gart! That's wrong in the existing code as well.

> +		if (ret) {
> +			DRM_ERROR("Failed to pin wptr bo. ret %d\n", ret);

That error message makes sense.

Regards,
Christian.

> +			goto fail_map;
> +		}
>  
> -	queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);
> -	amdgpu_bo_unreserve(wptr_obj->obj);
> +		queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);
> +	}
>  
> +	drm_exec_fini(&exec);
>  	return 0;
>  
> -unresv_bo:
> -	amdgpu_bo_unreserve(wptr_obj->obj);
> +fail_map:
> +	amdgpu_bo_unref(&wptr_obj->obj);
> +fail_lock:
> +	drm_exec_fini(&exec);
>  	return ret;
>  
>  }


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/amdgpu/userq: fix access to stale wptr mapping
  2026-05-04 13:05 ` Christian König
@ 2026-05-05  7:29   ` Khatri, Sunil
  2026-05-05  7:54     ` Christian König
  0 siblings, 1 reply; 4+ messages in thread
From: Khatri, Sunil @ 2026-05-05  7:29 UTC (permalink / raw)
  To: Christian König, Sunil Khatri, Alex Deucher; +Cc: amd-gfx


On 04-05-2026 06:35 pm, Christian König wrote:
> On 5/4/26 14:56, Sunil Khatri wrote:
>> Use drm_exec to take both locks i.e vm root bo and
>> wptr_obj bo to access the mapping data properly.
>>
>> This fixes the security issue of unmap the wptr_obj while
>> a queue creation is in progress and passing other
>> bo at same address.
>>
>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 122 ++++++++++-----------
>>   1 file changed, 57 insertions(+), 65 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> index 501e2e10b4a6..3d4f83015488 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> @@ -30,34 +30,6 @@
>>   #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE
>>   #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE
>>   
>> -static int
>> -mes_userq_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
>> -{
>> -	int ret;
>> -
>> -	ret = amdgpu_bo_reserve(bo, true);
>> -	if (ret) {
>> -		DRM_ERROR("Failed to reserve bo. ret %d\n", ret);
>> -		goto err_reserve_bo_failed;
>> -	}
>> -
>> -	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
>> -	if (ret) {
>> -		DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);
>> -		goto err_map_bo_gart_failed;
>> -	}
>> -
>> -	amdgpu_bo_unreserve(bo);
>> -	bo = amdgpu_bo_ref(bo);
>> -
>> -	return 0;
>> -
>> -err_map_bo_gart_failed:
>> -	amdgpu_bo_unreserve(bo);
>> -err_reserve_bo_failed:
>> -	return ret;
>> -}
>> -
>>   static int
>>   mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
>>   			      struct amdgpu_userq_mgr *uq_mgr,
>> @@ -65,55 +37,75 @@ mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
>>   			      uint64_t wptr)
>>   {
>>   	struct amdgpu_bo_va_mapping *wptr_mapping;
>> -	struct amdgpu_vm *wptr_vm;
>>   	struct amdgpu_userq_obj *wptr_obj = &queue->wptr_obj;
>> +	struct amdgpu_bo *obj;
>> +	struct amdgpu_vm *vm = queue->vm;
>> +	struct drm_exec exec;
>>   	int ret;
>>   
>> -	wptr_vm = queue->vm;
>> -	ret = amdgpu_bo_reserve(wptr_vm->root.bo, false);
>> -	if (ret)
>> -		return ret;
>> -
>>   	wptr &= AMDGPU_GMC_HOLE_MASK;
>> -	wptr_mapping = amdgpu_vm_bo_lookup_mapping(wptr_vm, wptr >> PAGE_SHIFT);
>> -	amdgpu_bo_unreserve(wptr_vm->root.bo);
>> -	if (!wptr_mapping) {
>> -		DRM_ERROR("Failed to lookup wptr bo\n");
>> -		return -EINVAL;
>> -	}
>>   
>> -	wptr_obj->obj = wptr_mapping->bo_va->base.bo;
>> -	if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
>> -		DRM_ERROR("Requested GART mapping for wptr bo larger than one page\n");
>> -		return -EINVAL;
>> -	}
>> +	drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> This should probably be 2 instead of 0.
Noted
>
>> +	drm_exec_until_all_locked(&exec) {
>> +		ret = amdgpu_vm_lock_pd(vm, &exec, 1);
>> +		drm_exec_retry_on_contention(&exec);
>> +		if (unlikely(ret))
>> +			goto fail_lock;
>> +
>> +		wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> PAGE_SHIFT);
>> +		if (!wptr_mapping) {
>> +			DRM_ERROR("Failed to lock up wptr bo\n");
> Please drop that error message. It can spam the logs when userspace intentionally gives incorrect values.
Noted
>
>> +			ret = -EINVAL;
>> +			goto fail_lock;
>> +		}
>>   
>> -	ret = mes_userq_map_gtt_bo_to_gart(wptr_obj->obj);
>> -	if (ret) {
>> -		DRM_ERROR("Failed to map wptr bo to GART\n");
>> -		return ret;
>> -	}
>> +		obj = wptr_mapping->bo_va->base.bo;
>> +		ret = drm_exec_prepare_obj(&exec, &obj->tbo.base, 1);
> Using drm_exec_lock_obj() should be sufficient.
>
> We don't need a fence slot for this use case.
Sure
>
>> +		drm_exec_retry_on_contention(&exec);
>> +		if (unlikely(ret)) {
>> +			DRM_ERROR("Failed to prepare wptr bo\n");
> Same here, that this can fail is normal handling. The worst case is OOM and that is already printed in the logs.
Noted
>
>> +			goto fail_lock;
>> +		}
> We got all the locks now, so the drm_exec_until_all_locked() loop can be closed here.
True, Noted
>
>>   
>> -	ret = amdgpu_bo_reserve(wptr_obj->obj, true);
>> -	if (ret) {
>> -		DRM_ERROR("Failed to reserve wptr bo\n");
>> -		return ret;
>> -	}
>
>> +		/* mapping now should be stable since both the locks are held */
>> +		wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> PAGE_SHIFT);
>> +		if (!wptr_mapping) {
>> +			DRM_ERROR("Failed to lock up wptr bo\n");
>> +			ret = -EINVAL;
>> +			goto fail_lock;
>> +		}
> Doing that again is unecessary. We are holding the VM lock above while doing the lockup.

First time we had mapping at time we did not had wptr object lock. Isnt 
it possible that either mapping is changed/updated or the bo is freed. 
At this moment we have both the locks and we

should be having correct mapping.

>
>>   
>> -	/* TODO use eviction fence instead of pinning. */
>> -	ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);
>> -	if (ret) {
>> -		drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin wptr bo\n");
>> -		goto unresv_bo;
>> -	}
>> +		wptr_obj->obj = amdgpu_bo_ref(wptr_mapping->bo_va->base.bo);
> That is now unecessary as well.

We are holding the reference in original code and freeing it in destroy. 
Dont we want to hold any reference now, how do we make sure that its not 
being freed in between?

If we dont need a reference then we need to remove from the destroy path 
too. In amdgpu_userq_destroy we are doing unpin and unref both db_obj 
and wptr_obj.

>
>> +
>> +		if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
>> +			DRM_ERROR("Requested wptr bo size is larger than one page\n");
> Same, please drop that error message.
Noted
>
>> +			ret = -EINVAL;
>> +			goto fail_map;
>> +		}
>> +
>> +		ret = amdgpu_ttm_alloc_gart(&wptr_obj->obj->tbo);
>> +		if (ret) {
>> +			DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);
> That error message is useful.
Got it.
>
>> +			goto fail_map;
>> +		}
>> +
>> +		/* TODO use eviction fence instead of pinning. */
>> +		ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);
> Oh! The piun needs to come before the alloc_gart! That's wrong in the existing code as well.
Got it.
>
>> +		if (ret) {
>> +			DRM_ERROR("Failed to pin wptr bo. ret %d\n", ret);
> That error message makes sense.
regards
Sunil Khatri

>
> Regards,
> Christian.
>
>> +			goto fail_map;
>> +		}
>>   
>> -	queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);
>> -	amdgpu_bo_unreserve(wptr_obj->obj);
>> +		queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);
>> +	}
>>   
>> +	drm_exec_fini(&exec);
>>   	return 0;
>>   
>> -unresv_bo:
>> -	amdgpu_bo_unreserve(wptr_obj->obj);
>> +fail_map:
>> +	amdgpu_bo_unref(&wptr_obj->obj);
>> +fail_lock:
>> +	drm_exec_fini(&exec);
>>   	return ret;
>>   
>>   }

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/amdgpu/userq: fix access to stale wptr mapping
  2026-05-05  7:29   ` Khatri, Sunil
@ 2026-05-05  7:54     ` Christian König
  0 siblings, 0 replies; 4+ messages in thread
From: Christian König @ 2026-05-05  7:54 UTC (permalink / raw)
  To: Khatri, Sunil, Sunil Khatri, Alex Deucher; +Cc: amd-gfx

On 5/5/26 09:29, Khatri, Sunil wrote:
> 
> On 04-05-2026 06:35 pm, Christian König wrote:
>> On 5/4/26 14:56, Sunil Khatri wrote:
>>> Use drm_exec to take both locks i.e vm root bo and
>>> wptr_obj bo to access the mapping data properly.
>>>
>>> This fixes the security issue of unmap the wptr_obj while
>>> a queue creation is in progress and passing other
>>> bo at same address.
>>>
>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 122 ++++++++++-----------
>>>   1 file changed, 57 insertions(+), 65 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>>> index 501e2e10b4a6..3d4f83015488 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>>> @@ -30,34 +30,6 @@
>>>   #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE
>>>   #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE
>>>   -static int
>>> -mes_userq_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
>>> -{
>>> -    int ret;
>>> -
>>> -    ret = amdgpu_bo_reserve(bo, true);
>>> -    if (ret) {
>>> -        DRM_ERROR("Failed to reserve bo. ret %d\n", ret);
>>> -        goto err_reserve_bo_failed;
>>> -    }
>>> -
>>> -    ret = amdgpu_ttm_alloc_gart(&bo->tbo);
>>> -    if (ret) {
>>> -        DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);
>>> -        goto err_map_bo_gart_failed;
>>> -    }
>>> -
>>> -    amdgpu_bo_unreserve(bo);
>>> -    bo = amdgpu_bo_ref(bo);
>>> -
>>> -    return 0;
>>> -
>>> -err_map_bo_gart_failed:
>>> -    amdgpu_bo_unreserve(bo);
>>> -err_reserve_bo_failed:
>>> -    return ret;
>>> -}
>>> -
>>>   static int
>>>   mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
>>>                     struct amdgpu_userq_mgr *uq_mgr,
>>> @@ -65,55 +37,75 @@ mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
>>>                     uint64_t wptr)
>>>   {
>>>       struct amdgpu_bo_va_mapping *wptr_mapping;
>>> -    struct amdgpu_vm *wptr_vm;
>>>       struct amdgpu_userq_obj *wptr_obj = &queue->wptr_obj;
>>> +    struct amdgpu_bo *obj;
>>> +    struct amdgpu_vm *vm = queue->vm;
>>> +    struct drm_exec exec;
>>>       int ret;
>>>   -    wptr_vm = queue->vm;
>>> -    ret = amdgpu_bo_reserve(wptr_vm->root.bo, false);
>>> -    if (ret)
>>> -        return ret;
>>> -
>>>       wptr &= AMDGPU_GMC_HOLE_MASK;
>>> -    wptr_mapping = amdgpu_vm_bo_lookup_mapping(wptr_vm, wptr >> PAGE_SHIFT);
>>> -    amdgpu_bo_unreserve(wptr_vm->root.bo);
>>> -    if (!wptr_mapping) {
>>> -        DRM_ERROR("Failed to lookup wptr bo\n");
>>> -        return -EINVAL;
>>> -    }
>>>   -    wptr_obj->obj = wptr_mapping->bo_va->base.bo;
>>> -    if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
>>> -        DRM_ERROR("Requested GART mapping for wptr bo larger than one page\n");
>>> -        return -EINVAL;
>>> -    }
>>> +    drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
>> This should probably be 2 instead of 0.
> Noted
>>
>>> +    drm_exec_until_all_locked(&exec) {
>>> +        ret = amdgpu_vm_lock_pd(vm, &exec, 1);
>>> +        drm_exec_retry_on_contention(&exec);
>>> +        if (unlikely(ret))
>>> +            goto fail_lock;
>>> +
>>> +        wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> PAGE_SHIFT);
>>> +        if (!wptr_mapping) {
>>> +            DRM_ERROR("Failed to lock up wptr bo\n");
>> Please drop that error message. It can spam the logs when userspace intentionally gives incorrect values.
> Noted
>>
>>> +            ret = -EINVAL;
>>> +            goto fail_lock;
>>> +        }
>>>   -    ret = mes_userq_map_gtt_bo_to_gart(wptr_obj->obj);
>>> -    if (ret) {
>>> -        DRM_ERROR("Failed to map wptr bo to GART\n");
>>> -        return ret;
>>> -    }
>>> +        obj = wptr_mapping->bo_va->base.bo;
>>> +        ret = drm_exec_prepare_obj(&exec, &obj->tbo.base, 1);
>> Using drm_exec_lock_obj() should be sufficient.
>>
>> We don't need a fence slot for this use case.
> Sure
>>
>>> +        drm_exec_retry_on_contention(&exec);
>>> +        if (unlikely(ret)) {
>>> +            DRM_ERROR("Failed to prepare wptr bo\n");
>> Same here, that this can fail is normal handling. The worst case is OOM and that is already printed in the logs.
> Noted
>>
>>> +            goto fail_lock;
>>> +        }
>> We got all the locks now, so the drm_exec_until_all_locked() loop can be closed here.
> True, Noted
>>
>>>   -    ret = amdgpu_bo_reserve(wptr_obj->obj, true);
>>> -    if (ret) {
>>> -        DRM_ERROR("Failed to reserve wptr bo\n");
>>> -        return ret;
>>> -    }
>>
>>> +        /* mapping now should be stable since both the locks are held */
>>> +        wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> PAGE_SHIFT);
>>> +        if (!wptr_mapping) {
>>> +            DRM_ERROR("Failed to lock up wptr bo\n");
>>> +            ret = -EINVAL;
>>> +            goto fail_lock;
>>> +        }
>> Doing that again is unecessary. We are holding the VM lock above while doing the lockup.
> 
> First time we had mapping at time we did not had wptr object lock. Isnt it possible that either mapping is changed/updated or the bo is freed. At this moment we have both the locks and we
> 
> should be having correct mapping.

The VM lock is sufficient for that. If somebody tries to unmap or free the BO the code will block on the VM lock to remove all the mappings.

So as soon as you grab the VM lock you know that the mapping wouldn't go away.

>>
>>>   -    /* TODO use eviction fence instead of pinning. */
>>> -    ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);
>>> -    if (ret) {
>>> -        drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin wptr bo\n");
>>> -        goto unresv_bo;
>>> -    }
>>> +        wptr_obj->obj = amdgpu_bo_ref(wptr_mapping->bo_va->base.bo);
>> That is now unecessary as well.
> 
> We are holding the reference in original code and freeing it in destroy. Dont we want to hold any reference now, how do we make sure that its not being freed in between?
> 
> If we dont need a reference then we need to remove from the destroy path too. In amdgpu_userq_destroy we are doing unpin and unref both db_obj and wptr_obj.

Ah! So we keep wptr_obj around and drop that in the destroy path! Yeah then it makes sense to grab the reference.

We can potentially drop that when we stop pinning the wptr BO and use the eviction fence instead.

Thanks for the explanation.

Regards,
Christian.

>>
>>> +
>>> +        if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
>>> +            DRM_ERROR("Requested wptr bo size is larger than one page\n");
>> Same, please drop that error message.
> Noted
>>
>>> +            ret = -EINVAL;
>>> +            goto fail_map;
>>> +        }
>>> +
>>> +        ret = amdgpu_ttm_alloc_gart(&wptr_obj->obj->tbo);
>>> +        if (ret) {
>>> +            DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);
>> That error message is useful.
> Got it.
>>
>>> +            goto fail_map;
>>> +        }
>>> +
>>> +        /* TODO use eviction fence instead of pinning. */
>>> +        ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);
>> Oh! The piun needs to come before the alloc_gart! That's wrong in the existing code as well.
> Got it.
>>
>>> +        if (ret) {
>>> +            DRM_ERROR("Failed to pin wptr bo. ret %d\n", ret);
>> That error message makes sense.
> regards
> Sunil Khatri
> 
>>
>> Regards,
>> Christian.
>>
>>> +            goto fail_map;
>>> +        }
>>>   -    queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);
>>> -    amdgpu_bo_unreserve(wptr_obj->obj);
>>> +        queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);
>>> +    }
>>>   +    drm_exec_fini(&exec);
>>>       return 0;
>>>   -unresv_bo:
>>> -    amdgpu_bo_unreserve(wptr_obj->obj);
>>> +fail_map:
>>> +    amdgpu_bo_unref(&wptr_obj->obj);
>>> +fail_lock:
>>> +    drm_exec_fini(&exec);
>>>       return ret;
>>>     }


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-05-05  7:54 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-04 12:56 [PATCH] drm/amdgpu/userq: fix access to stale wptr mapping Sunil Khatri
2026-05-04 13:05 ` Christian König
2026-05-05  7:29   ` Khatri, Sunil
2026-05-05  7:54     ` Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox