AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Don't clear PT after process killed
@ 2025-10-30 19:01 Philip Yang
  2025-10-31  7:42 ` Christian König
  0 siblings, 1 reply; 3+ messages in thread
From: Philip Yang @ 2025-10-30 19:01 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling, christian.koenig, Gang.Ba, Philip Yang

Move the amdgpu_vm_ready check inside amdgpu_vm_clear_freed. This removes
the duplicated code and also removes the error message "*ERROR* Trying to
push to a killed entity", which is printed when the KFD release wq calls
unmap_bo_from_gpuvm to unmap outstanding BOs while SDMA is used to update
the page tables.

Suggested-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 3 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 9 ++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 2 ++
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index ecdfe6cb36cc..6e1a5b922eb1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1116,9 +1116,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 		}
 	}
 
-	if (!amdgpu_vm_ready(vm))
-		return -EINVAL;
-
 	r = amdgpu_vm_clear_freed(adev, vm, NULL);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index ce073e894584..f6c297d62cfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -373,11 +373,9 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
 
 	amdgpu_vm_bo_del(adev, bo_va);
 	amdgpu_vm_bo_update_shared(bo);
-	if (!amdgpu_vm_ready(vm))
-		goto out_unlock;
 
 	r = amdgpu_vm_clear_freed(adev, vm, &fence);
-	if (unlikely(r < 0))
+	if (unlikely(r < 0 && r != -EINVAL))
 		dev_err(adev->dev, "failed to clear page "
 			"tables on GEM object close (%ld)\n", r);
 	if (r || !fence)
@@ -387,7 +385,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
 	dma_fence_put(fence);
 
 out_unlock:
-	if (r)
+	if (r && r != -EINVAL)
 		dev_err(adev->dev, "leaking bo va (%ld)\n", r);
 	drm_exec_fini(&exec);
 }
@@ -766,9 +764,6 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
 	struct dma_fence *fence = dma_fence_get_stub();
 	int r;
 
-	if (!amdgpu_vm_ready(vm))
-		return fence;
-
 	r = amdgpu_vm_clear_freed(adev, vm, &fence);
 	if (r)
 		goto error;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index db66b4232de0..febdd1b4286c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1543,6 +1543,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
 	struct amdgpu_sync sync;
 	int r;
 
+	if (!amdgpu_vm_ready(vm))
+		return -EINVAL;
 
 	/*
 	 * Implicitly sync to command submissions in the same VM before
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/amdgpu: Don't clear PT after process killed
  2025-10-30 19:01 [PATCH] drm/amdgpu: Don't clear PT after process killed Philip Yang
@ 2025-10-31  7:42 ` Christian König
  2025-10-31 13:33   ` Philip Yang
  0 siblings, 1 reply; 3+ messages in thread
From: Christian König @ 2025-10-31  7:42 UTC (permalink / raw)
  To: Philip Yang, amd-gfx; +Cc: Felix.Kuehling, Gang.Ba

On 10/30/25 20:01, Philip Yang wrote:
> Move amdgpu_vm_ready check to inside amdgpu_vm_clear_freed, this removes
> the duplicate code, also removes the error message "*ERROR* Trying to
> push to a killed entity" when KFD release wq unmap_bo_from_gpuvm to
> unmap outstanding BOs if using SDMA update page table.

In general good idea to have that fixed, but the ready check should stay outside of amdgpu_vm_clear_freed().

Background is that there are more operations covered by this check than only clearing the freed.

Regards,
Christian.

> 
> Suggested-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 3 ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 9 ++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 2 ++
>  3 files changed, 4 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index ecdfe6cb36cc..6e1a5b922eb1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1116,9 +1116,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  		}
>  	}
>  
> -	if (!amdgpu_vm_ready(vm))
> -		return -EINVAL;
> -
>  	r = amdgpu_vm_clear_freed(adev, vm, NULL);
>  	if (r)
>  		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index ce073e894584..f6c297d62cfe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -373,11 +373,9 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
>  
>  	amdgpu_vm_bo_del(adev, bo_va);
>  	amdgpu_vm_bo_update_shared(bo);
> -	if (!amdgpu_vm_ready(vm))
> -		goto out_unlock;
>  
>  	r = amdgpu_vm_clear_freed(adev, vm, &fence);
> -	if (unlikely(r < 0))
> +	if (unlikely(r < 0 && r != -EINVAL))
>  		dev_err(adev->dev, "failed to clear page "
>  			"tables on GEM object close (%ld)\n", r);
>  	if (r || !fence)
> @@ -387,7 +385,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
>  	dma_fence_put(fence);
>  
>  out_unlock:
> -	if (r)
> +	if (r && r != -EINVAL)
>  		dev_err(adev->dev, "leaking bo va (%ld)\n", r);
>  	drm_exec_fini(&exec);
>  }
> @@ -766,9 +764,6 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
>  	struct dma_fence *fence = dma_fence_get_stub();
>  	int r;
>  
> -	if (!amdgpu_vm_ready(vm))
> -		return fence;
> -
>  	r = amdgpu_vm_clear_freed(adev, vm, &fence);
>  	if (r)
>  		goto error;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index db66b4232de0..febdd1b4286c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1543,6 +1543,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>  	struct amdgpu_sync sync;
>  	int r;
>  
> +	if (!amdgpu_vm_ready(vm))
> +		return -EINVAL;
>  
>  	/*
>  	 * Implicitly sync to command submissions in the same VM before


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/amdgpu: Don't clear PT after process killed
  2025-10-31  7:42 ` Christian König
@ 2025-10-31 13:33   ` Philip Yang
  0 siblings, 0 replies; 3+ messages in thread
From: Philip Yang @ 2025-10-31 13:33 UTC (permalink / raw)
  To: Christian König, Philip Yang, amd-gfx; +Cc: Felix.Kuehling, Gang.Ba


On 2025-10-31 03:42, Christian König wrote:
> On 10/30/25 20:01, Philip Yang wrote:
>> Move amdgpu_vm_ready check to inside amdgpu_vm_clear_freed, this removes
>> the duplicate code, also removes the error message "*ERROR* Trying to
>> push to a killed entity" when KFD release wq unmap_bo_from_gpuvm to
>> unmap outstanding BOs if using SDMA update page table.
> In general good idea to have that fixed, but the ready check should stay outside of amdgpu_vm_clear_freed().
>
> Background is that there are more operations than only clearing the freed covered by this check.

OK, then I will add an amdgpu_vm_ready check inside unmap_bo_from_gpuvm to
remove the ERROR message, and leave the possible cleanup to another patch,
as the error path is tricky to handle: amdgpu_dma_buf_move_notify also
calls amdgpu_vm_clear_freed without checking that the VM is ready.

Thanks,

Philip

>
> Regards,
> Christian.
>
>> Suggested-by: Christian König <christian.koenig@amd.com>
>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 3 ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 9 ++-------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 2 ++
>>   3 files changed, 4 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index ecdfe6cb36cc..6e1a5b922eb1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -1116,9 +1116,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>>   		}
>>   	}
>>   
>> -	if (!amdgpu_vm_ready(vm))
>> -		return -EINVAL;
>> -
>>   	r = amdgpu_vm_clear_freed(adev, vm, NULL);
>>   	if (r)
>>   		return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index ce073e894584..f6c297d62cfe 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -373,11 +373,9 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
>>   
>>   	amdgpu_vm_bo_del(adev, bo_va);
>>   	amdgpu_vm_bo_update_shared(bo);
>> -	if (!amdgpu_vm_ready(vm))
>> -		goto out_unlock;
>>   
>>   	r = amdgpu_vm_clear_freed(adev, vm, &fence);
>> -	if (unlikely(r < 0))
>> +	if (unlikely(r < 0 && r != -EINVAL))
>>   		dev_err(adev->dev, "failed to clear page "
>>   			"tables on GEM object close (%ld)\n", r);
>>   	if (r || !fence)
>> @@ -387,7 +385,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
>>   	dma_fence_put(fence);
>>   
>>   out_unlock:
>> -	if (r)
>> +	if (r && r != -EINVAL)
>>   		dev_err(adev->dev, "leaking bo va (%ld)\n", r);
>>   	drm_exec_fini(&exec);
>>   }
>> @@ -766,9 +764,6 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
>>   	struct dma_fence *fence = dma_fence_get_stub();
>>   	int r;
>>   
>> -	if (!amdgpu_vm_ready(vm))
>> -		return fence;
>> -
>>   	r = amdgpu_vm_clear_freed(adev, vm, &fence);
>>   	if (r)
>>   		goto error;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index db66b4232de0..febdd1b4286c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1543,6 +1543,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>>   	struct amdgpu_sync sync;
>>   	int r;
>>   
>> +	if (!amdgpu_vm_ready(vm))
>> +		return -EINVAL;
>>   
>>   	/*
>>   	 * Implicitly sync to command submissions in the same VM before

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-10-31 13:33 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-30 19:01 [PATCH] drm/amdgpu: Don't clear PT after process killed Philip Yang
2025-10-31  7:42 ` Christian König
2025-10-31 13:33   ` Philip Yang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox