AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH AUTOSEL 6.1 5/7] drm/amdgpu: Fix size validation for non-exclusive domains (v4)
       [not found] <20221231200439.1748686-1-sashal@kernel.org>
@ 2022-12-31 20:04 ` Sasha Levin
  2023-01-12 16:24   ` Luben Tuikov
  2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 6/7] drm/amdkfd: Fix kfd_process_device_init_vm error handling Sasha Levin
  2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 7/7] drm/amdkfd: Fix double release compute pasid Sasha Levin
  2 siblings, 1 reply; 4+ messages in thread
From: Sasha Levin @ 2022-12-31 20:04 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Philip.Yang, guchun.chen, sunpeng.li, Felix.Kuehling,
	Xinhui.Pan, rajneesh.bhardwaj, AMD Graphics, michael.j.ruhl,
	Luben Tuikov, dri-devel, daniel, Alex Deucher, Yuliang.Shi,
	airlied, Christian König

From: Luben Tuikov <luben.tuikov@amd.com>

[ Upstream commit 7554886daa31eacc8e7fac9e15bbce67d10b8f1f ]

Fix amdgpu_bo_validate_size() to check whether the TTM domain manager for the
requested memory exists, else we get a kernel oops when dereferencing "man".

v2: Make the patch standalone, i.e. not dependent on local patches.
v3: Preserve old behaviour and just check that the manager pointer is not
    NULL.
v4: Complain if GTT domain requested and it is uninitialized--most likely a
    bug.

Cc: Alex Deucher <Alexander.Deucher@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: AMD Graphics <amd-gfx@lists.freedesktop.org>
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 2e8f6cd7a729..33e266433e9b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -446,27 +446,24 @@ static bool amdgpu_bo_validate_size(struct amdgpu_device *adev,
 
 	/*
 	 * If GTT is part of requested domains the check must succeed to
-	 * allow fall back to GTT
+	 * allow fall back to GTT.
 	 */
 	if (domain & AMDGPU_GEM_DOMAIN_GTT) {
 		man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
 
-		if (size < man->size)
+		if (man && size < man->size)
 			return true;
-		else
-			goto fail;
-	}
-
-	if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
+		else if (!man)
+			WARN_ON_ONCE("GTT domain requested but GTT mem manager uninitialized");
+		goto fail;
+	} else if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
 		man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
 
-		if (size < man->size)
+		if (man && size < man->size)
 			return true;
-		else
-			goto fail;
+		goto fail;
 	}
 
-
 	/* TODO add more domains checks, such as AMDGPU_GEM_DOMAIN_CPU */
 	return true;
 
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH AUTOSEL 6.1 6/7] drm/amdkfd: Fix kfd_process_device_init_vm error handling
       [not found] <20221231200439.1748686-1-sashal@kernel.org>
  2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 5/7] drm/amdgpu: Fix size validation for non-exclusive domains (v4) Sasha Levin
@ 2022-12-31 20:04 ` Sasha Levin
  2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 7/7] drm/amdkfd: Fix double release compute pasid Sasha Levin
  2 siblings, 0 replies; 4+ messages in thread
From: Sasha Levin @ 2022-12-31 20:04 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Philip Yang, Felix Kuehling, Xinhui.Pan, amd-gfx,
	dri-devel, daniel, Alex Deucher, airlied, christian.koenig

From: Philip Yang <Philip.Yang@amd.com>

[ Upstream commit 29d48b87db64b6697ddad007548e51d032081c59 ]

Should only destroy the ib_mem and let process cleanup worker to free
the outstanding BOs. Reset the pointer in pdd->qpd structure, to avoid
NULL pointer access in process destroy worker.

 BUG: kernel NULL pointer dereference, address: 0000000000000010
 Call Trace:
  amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel+0x46/0xb0 [amdgpu]
  kfd_process_device_destroy_cwsr_dgpu+0x40/0x70 [amdgpu]
  kfd_process_destroy_pdds+0x71/0x190 [amdgpu]
  kfd_process_wq_release+0x2a2/0x3b0 [amdgpu]
  process_one_work+0x2a1/0x600
  worker_thread+0x39/0x3d0

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 951b63677248..9821fa9268d3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -689,13 +689,13 @@ void kfd_process_destroy_wq(void)
 }
 
 static void kfd_process_free_gpuvm(struct kgd_mem *mem,
-			struct kfd_process_device *pdd, void *kptr)
+			struct kfd_process_device *pdd, void **kptr)
 {
 	struct kfd_dev *dev = pdd->dev;
 
-	if (kptr) {
+	if (kptr && *kptr) {
 		amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(mem);
-		kptr = NULL;
+		*kptr = NULL;
 	}
 
 	amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
@@ -795,7 +795,7 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
 	if (!qpd->ib_kaddr || !qpd->ib_base)
 		return;
 
-	kfd_process_free_gpuvm(qpd->ib_mem, pdd, qpd->ib_kaddr);
+	kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
 }
 
 struct kfd_process *kfd_create_process(struct file *filep)
@@ -1277,7 +1277,7 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
 	if (!dev->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
 		return;
 
-	kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, qpd->cwsr_kaddr);
+	kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
 }
 
 void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
@@ -1598,8 +1598,8 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
 	return 0;
 
 err_init_cwsr:
+	kfd_process_device_destroy_ib_mem(pdd);
 err_reserve_ib_mem:
-	kfd_process_device_free_bos(pdd);
 	pdd->drm_priv = NULL;
 
 	return ret;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH AUTOSEL 6.1 7/7] drm/amdkfd: Fix double release compute pasid
       [not found] <20221231200439.1748686-1-sashal@kernel.org>
  2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 5/7] drm/amdgpu: Fix size validation for non-exclusive domains (v4) Sasha Levin
  2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 6/7] drm/amdkfd: Fix kfd_process_device_init_vm error handling Sasha Levin
@ 2022-12-31 20:04 ` Sasha Levin
  2 siblings, 0 replies; 4+ messages in thread
From: Sasha Levin @ 2022-12-31 20:04 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Philip Yang, Felix Kuehling, Xinhui.Pan, amd-gfx,
	sumit.semwal, linaro-mm-sig, dri-devel, daniel, Alex Deucher,
	airlied, christian.koenig, linux-media

From: Philip Yang <Philip.Yang@amd.com>

[ Upstream commit 1a799c4c190ea9f0e81028e3eb3037ed0ab17ff5 ]

If kfd_process_device_init_vm returns failure after vm is converted to
compute vm and vm->pasid set to compute pasid, KFD will not take
pdd->drm_file reference. As a result, drm close file handler maybe
called to release the compute pasid before KFD process destroy worker to
release the same pasid and set vm->pasid to zero, this generates below
WARNING backtrace and NULL pointer access.

Add helper amdgpu_amdkfd_gpuvm_set_vm_pasid and call it at the last step
of kfd_process_device_init_vm, to ensure vm pasid is the original pasid
if acquiring vm failed or is the compute pasid with pdd->drm_file
reference taken to avoid double release same pasid.

 amdgpu: Failed to create process VM object
 ida_free called for id=32770 which is not allocated.
 WARNING: CPU: 57 PID: 72542 at ../lib/idr.c:522 ida_free+0x96/0x140
 RIP: 0010:ida_free+0x96/0x140
 Call Trace:
  amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu]
  amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu]
  drm_file_free.part.13+0x216/0x270 [drm]
  drm_close_helper.isra.14+0x60/0x70 [drm]
  drm_release+0x6e/0xf0 [drm]
  __fput+0xcc/0x280
  ____fput+0xe/0x20
  task_work_run+0x96/0xc0
  do_exit+0x3d0/0xc10

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 RIP: 0010:ida_free+0x76/0x140
 Call Trace:
  amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu]
  amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu]
  drm_file_free.part.13+0x216/0x270 [drm]
  drm_close_helper.isra.14+0x60/0x70 [drm]
  drm_release+0x6e/0xf0 [drm]
  __fput+0xcc/0x280
  ____fput+0xe/0x20
  task_work_run+0x96/0xc0
  do_exit+0x3d0/0xc10

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  4 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 39 +++++++++++++------
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 12 ++++--
 3 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 647220a8762d..30f145dc8724 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -265,8 +265,10 @@ int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_
 	(&((struct amdgpu_fpriv *)					\
 		((struct drm_file *)(drm_priv))->driver_priv)->vm)
 
+int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
+				     struct file *filp, u32 pasid);
 int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
-					struct file *filp, u32 pasid,
+					struct file *filp,
 					void **process_info,
 					struct dma_fence **ef);
 void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 1f76e27f1a35..18a7cbbd00ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1473,10 +1473,9 @@ static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
 	amdgpu_bo_unreserve(bo);
 }
 
-int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
-					   struct file *filp, u32 pasid,
-					   void **process_info,
-					   struct dma_fence **ef)
+int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
+				     struct file *filp, u32 pasid)
+
 {
 	struct amdgpu_fpriv *drv_priv;
 	struct amdgpu_vm *avm;
@@ -1487,10 +1486,6 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
 		return ret;
 	avm = &drv_priv->vm;
 
-	/* Already a compute VM? */
-	if (avm->process_info)
-		return -EINVAL;
-
 	/* Free the original amdgpu allocated pasid,
 	 * will be replaced with kfd allocated pasid.
 	 */
@@ -1499,14 +1494,36 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
 		amdgpu_vm_set_pasid(adev, avm, 0);
 	}
 
-	/* Convert VM into a compute VM */
-	ret = amdgpu_vm_make_compute(adev, avm);
+	ret = amdgpu_vm_set_pasid(adev, avm, pasid);
 	if (ret)
 		return ret;
 
-	ret = amdgpu_vm_set_pasid(adev, avm, pasid);
+	return 0;
+}
+
+int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
+					   struct file *filp,
+					   void **process_info,
+					   struct dma_fence **ef)
+{
+	struct amdgpu_fpriv *drv_priv;
+	struct amdgpu_vm *avm;
+	int ret;
+
+	ret = amdgpu_file_to_fpriv(filp, &drv_priv);
 	if (ret)
 		return ret;
+	avm = &drv_priv->vm;
+
+	/* Already a compute VM? */
+	if (avm->process_info)
+		return -EINVAL;
+
+	/* Convert VM into a compute VM */
+	ret = amdgpu_vm_make_compute(adev, avm);
+	if (ret)
+		return ret;
+
 	/* Initialize KFD part of the VM and process info */
 	ret = init_kfd_vm(avm, process_info, ef);
 	if (ret)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 9821fa9268d3..dd351105c1bc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1576,9 +1576,9 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
 	p = pdd->process;
 	dev = pdd->dev;
 
-	ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(
-		dev->adev, drm_file, p->pasid,
-		&p->kgd_process_info, &p->ef);
+	ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, drm_file,
+						     &p->kgd_process_info,
+						     &p->ef);
 	if (ret) {
 		pr_err("Failed to create process VM object\n");
 		return ret;
@@ -1593,10 +1593,16 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
 	if (ret)
 		goto err_init_cwsr;
 
+	ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, drm_file, p->pasid);
+	if (ret)
+		goto err_set_pasid;
+
 	pdd->drm_file = drm_file;
 
 	return 0;
 
+err_set_pasid:
+	kfd_process_device_destroy_cwsr_dgpu(pdd);
 err_init_cwsr:
 	kfd_process_device_destroy_ib_mem(pdd);
 err_reserve_ib_mem:
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH AUTOSEL 6.1 5/7] drm/amdgpu: Fix size validation for non-exclusive domains (v4)
  2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 5/7] drm/amdgpu: Fix size validation for non-exclusive domains (v4) Sasha Levin
@ 2023-01-12 16:24   ` Luben Tuikov
  0 siblings, 0 replies; 4+ messages in thread
From: Luben Tuikov @ 2023-01-12 16:24 UTC (permalink / raw)
  To: Sasha Levin, linux-kernel, stable
  Cc: Philip.Yang, guchun.chen, sunpeng.li, Felix.Kuehling, Xinhui.Pan,
	rajneesh.bhardwaj, AMD Graphics, michael.j.ruhl, dri-devel,
	daniel, Alex Deucher, Yuliang.Shi, airlied, Christian König

Hi Sasha,

The patch in the link is a Fixes patch of the quoted patch, and should also go in:

https://lore.kernel.org/all/20230104221935.113400-1-luben.tuikov@amd.com/

Regards,
Luben

On 2022-12-31 15:04, Sasha Levin wrote:
> From: Luben Tuikov <luben.tuikov@amd.com>
> 
> [ Upstream commit 7554886daa31eacc8e7fac9e15bbce67d10b8f1f ]
> 
> Fix amdgpu_bo_validate_size() to check whether the TTM domain manager for the
> requested memory exists, else we get a kernel oops when dereferencing "man".
> 
> v2: Make the patch standalone, i.e. not dependent on local patches.
> v3: Preserve old behaviour and just check that the manager pointer is not
>     NULL.
> v4: Complain if GTT domain requested and it is uninitialized--most likely a
>     bug.
> 
> Cc: Alex Deucher <Alexander.Deucher@amd.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: AMD Graphics <amd-gfx@lists.freedesktop.org>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> Reviewed-by: Christian König <christian.koenig@amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Sasha Levin <sashal@kernel.org>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 19 ++++++++-----------
>  1 file changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 2e8f6cd7a729..33e266433e9b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -446,27 +446,24 @@ static bool amdgpu_bo_validate_size(struct amdgpu_device *adev,
>  
>  	/*
>  	 * If GTT is part of requested domains the check must succeed to
> -	 * allow fall back to GTT
> +	 * allow fall back to GTT.
>  	 */
>  	if (domain & AMDGPU_GEM_DOMAIN_GTT) {
>  		man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>  
> -		if (size < man->size)
> +		if (man && size < man->size)
>  			return true;
> -		else
> -			goto fail;
> -	}
> -
> -	if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
> +		else if (!man)
> +			WARN_ON_ONCE("GTT domain requested but GTT mem manager uninitialized");
> +		goto fail;
> +	} else if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
>  		man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
>  
> -		if (size < man->size)
> +		if (man && size < man->size)
>  			return true;
> -		else
> -			goto fail;
> +		goto fail;
>  	}
>  
> -
>  	/* TODO add more domains checks, such as AMDGPU_GEM_DOMAIN_CPU */
>  	return true;
>  


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-01-12 16:24 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20221231200439.1748686-1-sashal@kernel.org>
2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 5/7] drm/amdgpu: Fix size validation for non-exclusive domains (v4) Sasha Levin
2023-01-12 16:24   ` Luben Tuikov
2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 6/7] drm/amdkfd: Fix kfd_process_device_init_vm error handling Sasha Levin
2022-12-31 20:04 ` [PATCH AUTOSEL 6.1 7/7] drm/amdkfd: Fix double release compute pasid Sasha Levin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox