* [PATCH] drm/amdgpu: Don't clear PT after process killed
@ 2025-10-30 19:01 Philip Yang
2025-10-31 7:42 ` Christian König
0 siblings, 1 reply; 3+ messages in thread
From: Philip Yang @ 2025-10-30 19:01 UTC (permalink / raw)
To: amd-gfx; +Cc: Felix.Kuehling, christian.koenig, Gang.Ba, Philip Yang
Move the amdgpu_vm_ready check inside amdgpu_vm_clear_freed. This removes
the duplicated code and also removes the error message "*ERROR* Trying to
push to a killed entity", which is printed when the KFD release wq calls
unmap_bo_from_gpuvm to unmap outstanding BOs while SDMA is used to update
the page tables.
Suggested-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 ---
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 9 ++-------
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 ++
3 files changed, 4 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index ecdfe6cb36cc..6e1a5b922eb1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1116,9 +1116,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
}
}
- if (!amdgpu_vm_ready(vm))
- return -EINVAL;
-
r = amdgpu_vm_clear_freed(adev, vm, NULL);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index ce073e894584..f6c297d62cfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -373,11 +373,9 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
amdgpu_vm_bo_del(adev, bo_va);
amdgpu_vm_bo_update_shared(bo);
- if (!amdgpu_vm_ready(vm))
- goto out_unlock;
r = amdgpu_vm_clear_freed(adev, vm, &fence);
- if (unlikely(r < 0))
+ if (unlikely(r < 0 && r != -EINVAL))
dev_err(adev->dev, "failed to clear page "
"tables on GEM object close (%ld)\n", r);
if (r || !fence)
@@ -387,7 +385,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
dma_fence_put(fence);
out_unlock:
- if (r)
+ if (r && r != -EINVAL)
dev_err(adev->dev, "leaking bo va (%ld)\n", r);
drm_exec_fini(&exec);
}
@@ -766,9 +764,6 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
struct dma_fence *fence = dma_fence_get_stub();
int r;
- if (!amdgpu_vm_ready(vm))
- return fence;
-
r = amdgpu_vm_clear_freed(adev, vm, &fence);
if (r)
goto error;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index db66b4232de0..febdd1b4286c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1543,6 +1543,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
struct amdgpu_sync sync;
int r;
+ if (!amdgpu_vm_ready(vm))
+ return -EINVAL;
/*
* Implicitly sync to command submissions in the same VM before
--
2.49.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] drm/amdgpu: Don't clear PT after process killed
2025-10-30 19:01 [PATCH] drm/amdgpu: Don't clear PT after process killed Philip Yang
@ 2025-10-31 7:42 ` Christian König
2025-10-31 13:33 ` Philip Yang
0 siblings, 1 reply; 3+ messages in thread
From: Christian König @ 2025-10-31 7:42 UTC (permalink / raw)
To: Philip Yang, amd-gfx; +Cc: Felix.Kuehling, Gang.Ba
On 10/30/25 20:01, Philip Yang wrote:
> Move amdgpu_vm_ready check to inside amdgpu_vm_clear_freed, this removes
> the duplicate code, also removes the error message "*ERROR* Trying to
> push to a killed entity" when KFD release wq unmap_bo_from_gpuvm to
> unmap outstanding BOs if using SDMA update page table.
In general it is a good idea to have that fixed, but the ready check should stay outside of amdgpu_vm_clear_freed().
The background is that this check covers more operations than just clearing the freed mappings.
Regards,
Christian.
>
> Suggested-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 9 ++-------
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 ++
> 3 files changed, 4 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index ecdfe6cb36cc..6e1a5b922eb1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1116,9 +1116,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> }
> }
>
> - if (!amdgpu_vm_ready(vm))
> - return -EINVAL;
> -
> r = amdgpu_vm_clear_freed(adev, vm, NULL);
> if (r)
> return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index ce073e894584..f6c297d62cfe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -373,11 +373,9 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
>
> amdgpu_vm_bo_del(adev, bo_va);
> amdgpu_vm_bo_update_shared(bo);
> - if (!amdgpu_vm_ready(vm))
> - goto out_unlock;
>
> r = amdgpu_vm_clear_freed(adev, vm, &fence);
> - if (unlikely(r < 0))
> + if (unlikely(r < 0 && r != -EINVAL))
> dev_err(adev->dev, "failed to clear page "
> "tables on GEM object close (%ld)\n", r);
> if (r || !fence)
> @@ -387,7 +385,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
> dma_fence_put(fence);
>
> out_unlock:
> - if (r)
> + if (r && r != -EINVAL)
> dev_err(adev->dev, "leaking bo va (%ld)\n", r);
> drm_exec_fini(&exec);
> }
> @@ -766,9 +764,6 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
> struct dma_fence *fence = dma_fence_get_stub();
> int r;
>
> - if (!amdgpu_vm_ready(vm))
> - return fence;
> -
> r = amdgpu_vm_clear_freed(adev, vm, &fence);
> if (r)
> goto error;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index db66b4232de0..febdd1b4286c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1543,6 +1543,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
> struct amdgpu_sync sync;
> int r;
>
> + if (!amdgpu_vm_ready(vm))
> + return -EINVAL;
>
> /*
> * Implicitly sync to command submissions in the same VM before
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] drm/amdgpu: Don't clear PT after process killed
2025-10-31 7:42 ` Christian König
@ 2025-10-31 13:33 ` Philip Yang
0 siblings, 0 replies; 3+ messages in thread
From: Philip Yang @ 2025-10-31 13:33 UTC (permalink / raw)
To: Christian König, Philip Yang, amd-gfx; +Cc: Felix.Kuehling, Gang.Ba
On 2025-10-31 03:42, Christian König wrote:
> On 10/30/25 20:01, Philip Yang wrote:
>> Move amdgpu_vm_ready check to inside amdgpu_vm_clear_freed, this removes
>> the duplicate code, also removes the error message "*ERROR* Trying to
>> push to a killed entity" when KFD release wq unmap_bo_from_gpuvm to
>> unmap outstanding BOs if using SDMA update page table.
> In general good idea to have that fixed, but the ready check should stay outside of amdgpu_vm_clear_freed().
>
> Background is that there are more operations than only clearing the freed covered by this check.
OK, then I will add an amdgpu_vm_ready check inside unmap_bo_from_gpuvm to
remove the ERROR message, and leave the possible cleanup for another patch,
as the error path is tricky to handle; amdgpu_dma_buf_move_notify also
calls amdgpu_vm_clear_freed without checking whether the VM is ready.
Thanks,
Philip
>
> Regards,
> Christian.
>
>> Suggested-by: Christian König <christian.koenig@amd.com>
>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 9 ++-------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 ++
>> 3 files changed, 4 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index ecdfe6cb36cc..6e1a5b922eb1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -1116,9 +1116,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>> }
>> }
>>
>> - if (!amdgpu_vm_ready(vm))
>> - return -EINVAL;
>> -
>> r = amdgpu_vm_clear_freed(adev, vm, NULL);
>> if (r)
>> return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index ce073e894584..f6c297d62cfe 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -373,11 +373,9 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
>>
>> amdgpu_vm_bo_del(adev, bo_va);
>> amdgpu_vm_bo_update_shared(bo);
>> - if (!amdgpu_vm_ready(vm))
>> - goto out_unlock;
>>
>> r = amdgpu_vm_clear_freed(adev, vm, &fence);
>> - if (unlikely(r < 0))
>> + if (unlikely(r < 0 && r != -EINVAL))
>> dev_err(adev->dev, "failed to clear page "
>> "tables on GEM object close (%ld)\n", r);
>> if (r || !fence)
>> @@ -387,7 +385,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
>> dma_fence_put(fence);
>>
>> out_unlock:
>> - if (r)
>> + if (r && r != -EINVAL)
>> dev_err(adev->dev, "leaking bo va (%ld)\n", r);
>> drm_exec_fini(&exec);
>> }
>> @@ -766,9 +764,6 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
>> struct dma_fence *fence = dma_fence_get_stub();
>> int r;
>>
>> - if (!amdgpu_vm_ready(vm))
>> - return fence;
>> -
>> r = amdgpu_vm_clear_freed(adev, vm, &fence);
>> if (r)
>> goto error;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index db66b4232de0..febdd1b4286c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1543,6 +1543,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>> struct amdgpu_sync sync;
>> int r;
>>
>> + if (!amdgpu_vm_ready(vm))
>> + return -EINVAL;
>>
>> /*
>> * Implicitly sync to command submissions in the same VM before
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2025-10-31 13:33 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-30 19:01 [PATCH] drm/amdgpu: Don't clear PT after process killed Philip Yang
2025-10-31 7:42 ` Christian König
2025-10-31 13:33 ` Philip Yang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox