All of lore.kernel.org
 help / color / mirror / Atom feed
* [v6 0/5] Fix several bugs in error handling during device probe
@ 2025-01-24  5:19 Jiang Liu
  2025-01-24  5:19 ` [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free() Jiang Liu
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24  5:19 UTC (permalink / raw)
  To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
	sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
	xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
  Cc: Jiang Liu

This patchset tries to fix several memory leakages/invalid memory
accesses on error handling path during GPU driver loading/unloading.
They applies to:
https://gitlab.freedesktop.org/agd5f/linux.git amd-staging-drm-next

v6:
1) fix coding style of patch 5

v5:
1) drop first in v4, we have found a reliable way to fix the issue.
2) add patch 3 in v5 to fix a new issue
3) rework patch 5 according to review feedback

v4:
1) drop patch 1 in v3
2) split out amdxcp related change into a dedicated patch
3) use `guard(mutex)` instead of mutex_lock/unlock().
4) move patch 6 in v3 to next patch set

v3:
1) drop first patch of v2
2) rework the 0003/0004 patches of v2 according to review comments
3) add patch 0004 to fix possible resource leakage in amdgpu_pci_probe()

v2:
1) rebased to https://gitlab.freedesktop.org/agd5f/linux.git branch
   amd-staging-drm-next.
2) removed the first patch, which is unnecessary.
3) add amdgpu_xcp_drm_dev_free() in patch 0003 to enhance amdxcp
   driver to better support device remove and error handling.
4) reworked patch 0005 to fix it in amdgpu instead of drm core.

Jiang Liu (5):
  drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free()
  drm/amdgpu: fix use after free bug related to
    amdgpu_driver_release_kms()
  drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini()
  drm/amdgpu: enhance error handling in function amdgpu_pci_probe()
  drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini()

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 33 ++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c     | 14 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   |  9 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c     | 71 +++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h     |  3 +-
 drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c | 63 +++++++++++++++---
 drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h |  1 +
 7 files changed, 152 insertions(+), 42 deletions(-)

-- 
2.43.5


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free()
  2025-01-24  5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
@ 2025-01-24  5:19 ` Jiang Liu
  2025-01-24  5:19 ` [v6 2/5] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms() Jiang Liu
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24  5:19 UTC (permalink / raw)
  To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
	sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
	xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
  Cc: Jiang Liu

Introduce new interface amdgpu_xcp_drm_dev_free() to free a specific
drm_device crreated by amdgpu_xcp_drm_dev_alloc(), which will be used
to do error recovery.

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
 drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c | 63 +++++++++++++++++----
 drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h |  1 +
 2 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c
index faed84172dd4..2ff5377d54cd 100644
--- a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c
+++ b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c
@@ -45,18 +45,26 @@ static const struct drm_driver amdgpu_xcp_driver = {
 
 static int8_t pdev_num;
 static struct xcp_device *xcp_dev[MAX_XCP_PLATFORM_DEVICE];
+static DEFINE_MUTEX(xcp_mutex);
 
 int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev)
 {
 	struct platform_device *pdev;
 	struct xcp_device *pxcp_dev;
 	char dev_name[20];
-	int ret;
+	int ret, index;
+
+	guard(mutex)(&xcp_mutex);
 
 	if (pdev_num >= MAX_XCP_PLATFORM_DEVICE)
 		return -ENODEV;
 
-	snprintf(dev_name, sizeof(dev_name), "amdgpu_xcp_%d", pdev_num);
+	for (index = 0; index < MAX_XCP_PLATFORM_DEVICE; index++) {
+		if (!xcp_dev[index])
+			break;
+	}
+
+	snprintf(dev_name, sizeof(dev_name), "amdgpu_xcp_%d", index);
 	pdev = platform_device_register_simple(dev_name, -1, NULL, 0);
 	if (IS_ERR(pdev))
 		return PTR_ERR(pdev);
@@ -72,8 +80,8 @@ int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev)
 		goto out_devres;
 	}
 
-	xcp_dev[pdev_num] = pxcp_dev;
-	xcp_dev[pdev_num]->pdev = pdev;
+	xcp_dev[index] = pxcp_dev;
+	xcp_dev[index]->pdev = pdev;
 	*ddev = &pxcp_dev->drm;
 	pdev_num++;
 
@@ -88,16 +96,51 @@ int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev)
 }
 EXPORT_SYMBOL(amdgpu_xcp_drm_dev_alloc);
 
+static void __amdgpu_xcp_drm_dev_free(int index)
+{
+	struct platform_device *pdev;
+
+	WARN_ON(!pdev_num);
+	pdev = xcp_dev[index]->pdev;
+	devres_release_group(&pdev->dev, NULL);
+	platform_device_unregister(pdev);
+	xcp_dev[index] = NULL;
+	pdev_num--;
+}
+
+void amdgpu_xcp_drm_dev_free(struct drm_device *ddev)
+{
+	struct xcp_device *pxcp_dev;
+	int index;
+
+	if (ddev == NULL)
+		return;
+
+	guard(mutex)(&xcp_mutex);
+	WARN_ON(!pdev_num);
+
+	pxcp_dev = container_of(ddev, struct xcp_device, drm);
+	for (index = 0; index < MAX_XCP_PLATFORM_DEVICE; index++) {
+		if (xcp_dev[index] == pxcp_dev) {
+			__amdgpu_xcp_drm_dev_free(index);
+			break;
+		}
+	}
+}
+EXPORT_SYMBOL(amdgpu_xcp_drm_dev_free);
+
 void amdgpu_xcp_drv_release(void)
 {
-	for (--pdev_num; pdev_num >= 0; --pdev_num) {
-		struct platform_device *pdev = xcp_dev[pdev_num]->pdev;
+	int index;
 
-		devres_release_group(&pdev->dev, NULL);
-		platform_device_unregister(pdev);
-		xcp_dev[pdev_num] = NULL;
+	guard(mutex)(&xcp_mutex);
+
+	for (index = 0; pdev_num && index < MAX_XCP_PLATFORM_DEVICE; index++) {
+		if (xcp_dev[index])
+			__amdgpu_xcp_drm_dev_free(index);
 	}
-	pdev_num = 0;
+
+	WARN_ON(pdev_num != 0);
 }
 EXPORT_SYMBOL(amdgpu_xcp_drv_release);
 
diff --git a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h
index c1c4b679bf95..580a1602c8e3 100644
--- a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h
+++ b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h
@@ -25,5 +25,6 @@
 #define _AMDGPU_XCP_DRV_H_
 
 int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev);
+void amdgpu_xcp_drm_dev_free(struct drm_device *ddev);
 void amdgpu_xcp_drv_release(void);
 #endif /* _AMDGPU_XCP_DRV_H_ */
-- 
2.43.5


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [v6 2/5] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms()
  2025-01-24  5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
  2025-01-24  5:19 ` [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free() Jiang Liu
@ 2025-01-24  5:19 ` Jiang Liu
  2025-01-24  5:19 ` [v6 3/5] drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini() Jiang Liu
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24  5:19 UTC (permalink / raw)
  To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
	sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
	xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
  Cc: Jiang Liu

If some GPU device failed to probe, `rmmod amdgpu` will trigger a use
after free bug related to amdgpu_driver_release_kms() as:
[16002.085540] BUG: kernel NULL pointer dereference, address: 0000000000000000
[16002.093792] #PF: supervisor read access in kernel mode
[16002.099993] #PF: error_code(0x0000) - not-present page
[16002.106188] PGD 0 P4D 0
[16002.109464] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[16002.115372] CPU: 2 PID: 14375 Comm: rmmod Kdump: loaded Tainted: G        W   E      6.10.0+ #2
[16002.125577] Hardware name: Alibaba Alibaba Cloud ECS/Alibaba Cloud ECS, BIOS 3.0.ES.AL.P.087.05 04/07/2024
[16002.136858] RIP: 0010:drm_sched_fini+0x3f/0xe0 [gpu_sched]
[16002.143463] Code: 60 c6 87 be 00 00 00 01 e8 ce e0 90 d8 48 8d bb 80 00 00 00 e8 c2 e0 90 d8 8b 43 20 85 c0 74 51 45 31 e4 48 8b
8b 2c e8 48 89 ef e8 f5 0e 59 d9 48 8b 45 10 48 8d 55 10 48 39
[16002.164992] RSP: 0018:ffffb570dbbb7da8 EFLAGS: 00010246
[16002.171316] RAX: 0000000000000000 RBX: ffff96b0fdadc878 RCX: 0000000000000000
[16002.179784] RDX: 000fffffffe00000 RSI: 0000000000000000 RDI: ffff96b0fdadc8f8
[16002.188252] RBP: ffff96b0fdadc800 R08: ffff97abbd035040 R09: ffffffff9ac52540
[16002.196722] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[16002.205179] R13: 0000000000000000 R14: ffff96b0fdadfc00 R15: 0000000000000000
[16002.213648] FS:  00007f2737000740(0000) GS:ffff97abbd100000(0000) knlGS:0000000000000000
[16002.223189] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[16002.230103] CR2: 0000000000000000 CR3: 000000011be3a005 CR4: 0000000000f70ef0
[16002.238581] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[16002.247053] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
dxcp]
[16002.337645]  __do_sys_delete_module.constprop.0+0x176/0x310
[16002.344324]  do_syscall_64+0x5d/0x170
[16002.348858]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[16002.354956] RIP: 0033:0x7f2736a620cb-12-26

Fix it by removing xcp drm devices when failed to probe GPU devices.

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
Tested-by: Shuo Liu <shuox.liu@linux.alibaba.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c    | 69 ++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h    |  3 +-
 4 files changed, 62 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0a121aab5c74..ee695e70fb4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4777,6 +4777,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
 	amdgpu_reset_put_reset_domain(adev->reset_domain);
 	adev->reset_domain = NULL;
 
+	amdgpu_xcp_mgr_fini(adev);
+
 	kfree(adev->pci_state);
 
 }
@@ -6677,7 +6679,7 @@ void amdgpu_device_halt(struct amdgpu_device *adev)
 	struct pci_dev *pdev = adev->pdev;
 	struct drm_device *ddev = adev_to_drm(adev);
 
-	amdgpu_xcp_dev_unplug(adev);
+	amdgpu_xcp_dev_deregister(adev);
 	drm_dev_unplug(ddev);
 
 	amdgpu_irq_disable_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index acb9dc3705ac..3f26e850120c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2435,7 +2435,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
 	struct drm_device *dev = pci_get_drvdata(pdev);
 	struct amdgpu_device *adev = drm_to_adev(dev);
 
-	amdgpu_xcp_dev_unplug(adev);
+	amdgpu_xcp_dev_deregister(adev);
 	amdgpu_gmc_prepare_nps_mode_change(adev);
 	drm_dev_unplug(dev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index e209b5e101df..272954f6e476 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -268,21 +268,33 @@ static int amdgpu_xcp_dev_alloc(struct amdgpu_device *adev)
 			return ret;
 		}
 
-		/* Redirect all IOCTLs to the primary device */
-		adev->xcp_mgr->xcp[i].rdev = p_ddev->render->dev;
-		adev->xcp_mgr->xcp[i].pdev = p_ddev->primary->dev;
-		adev->xcp_mgr->xcp[i].driver = (struct drm_driver *)p_ddev->driver;
-		adev->xcp_mgr->xcp[i].vma_offset_manager = p_ddev->vma_offset_manager;
-		p_ddev->render->dev = ddev;
-		p_ddev->primary->dev = ddev;
-		p_ddev->vma_offset_manager = ddev->vma_offset_manager;
-		p_ddev->driver = &amdgpu_partition_driver;
 		adev->xcp_mgr->xcp[i].ddev = p_ddev;
 	}
 
 	return 0;
 }
 
+static void amdgpu_xcp_dev_free(struct amdgpu_device *adev)
+{
+	struct drm_device *p_ddev;
+	int i;
+
+	if (!adev->xcp_mgr)
+		return;
+
+	for (i = 1; i < MAX_XCP; i++) {
+		if (!adev->xcp_mgr->xcp[i].ddev)
+			break;
+
+		p_ddev = adev->xcp_mgr->xcp[i].ddev;
+		adev->xcp_mgr->xcp[i].ddev = NULL;
+
+		amdgpu_xcp_drm_dev_free(p_ddev);
+	}
+
+	adev->xcp_mgr->xcp->ddev = NULL;
+}
+
 int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
 			int init_num_xcps,
 			struct amdgpu_xcp_mgr_funcs *xcp_funcs)
@@ -310,6 +322,13 @@ int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
 	return amdgpu_xcp_dev_alloc(adev);
 }
 
+void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev)
+{
+	amdgpu_xcp_dev_free(adev);
+	kfree(adev->xcp_mgr);
+	adev->xcp_mgr = NULL;
+}
+
 int amdgpu_xcp_get_partition(struct amdgpu_xcp_mgr *xcp_mgr,
 			     enum AMDGPU_XCP_IP_BLOCK ip, int instance)
 {
@@ -348,23 +367,44 @@ int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
 			const struct pci_device_id *ent)
 {
 	int i, ret;
+	struct drm_device *p_ddev;
+	struct drm_device *ddev;
 
 	if (!adev->xcp_mgr)
 		return 0;
 
+	ddev = adev_to_drm(adev);
+
 	for (i = 1; i < MAX_XCP; i++) {
-		if (!adev->xcp_mgr->xcp[i].ddev)
+		if (!adev->xcp_mgr->xcp[i].ddev || adev->xcp_mgr->xcp[i].driver)
 			break;
 
+		/* Redirect all IOCTLs to the primary device */
+		p_ddev = adev->xcp_mgr->xcp[i].ddev;
+		adev->xcp_mgr->xcp[i].rdev = p_ddev->render->dev;
+		adev->xcp_mgr->xcp[i].pdev = p_ddev->primary->dev;
+		adev->xcp_mgr->xcp[i].driver = (struct drm_driver *)p_ddev->driver;
+		adev->xcp_mgr->xcp[i].vma_offset_manager = p_ddev->vma_offset_manager;
+		p_ddev->render->dev = ddev;
+		p_ddev->primary->dev = ddev;
+		p_ddev->driver = &amdgpu_partition_driver;
+		p_ddev->vma_offset_manager = ddev->vma_offset_manager;
+
 		ret = drm_dev_register(adev->xcp_mgr->xcp[i].ddev, ent->driver_data);
-		if (ret)
+		if (ret) {
+			p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
+			p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev;
+			p_ddev->driver =  adev->xcp_mgr->xcp[i].driver;
+			p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;
+			adev->xcp_mgr->xcp[i].driver = NULL;
 			return ret;
+		}
 	}
 
 	return 0;
 }
 
-void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
+void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev)
 {
 	struct drm_device *p_ddev;
 	int i;
@@ -373,15 +413,18 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
 		return;
 
 	for (i = 1; i < MAX_XCP; i++) {
-		if (!adev->xcp_mgr->xcp[i].ddev)
+		if (!adev->xcp_mgr->xcp[i].ddev || !adev->xcp_mgr->xcp[i].driver)
 			break;
 
+		// Restore and free the original drm_device.
 		p_ddev = adev->xcp_mgr->xcp[i].ddev;
 		drm_dev_unplug(p_ddev);
+
 		p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
 		p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev;
 		p_ddev->driver =  adev->xcp_mgr->xcp[i].driver;
 		p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;
+		adev->xcp_mgr->xcp[i].driver = NULL;
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index b63f53242c57..97daf1a9236f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -155,6 +155,7 @@ int amdgpu_xcp_resume(struct amdgpu_xcp_mgr *xcp_mgr, int xcp_id);
 
 int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
 			int init_xcps, struct amdgpu_xcp_mgr_funcs *xcp_funcs);
+void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev);
 int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode);
 int amdgpu_xcp_query_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, u32 flags);
 int amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, int mode);
@@ -168,7 +169,7 @@ int amdgpu_xcp_get_inst_details(struct amdgpu_xcp *xcp,
 
 int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
 				const struct pci_device_id *ent);
-void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev);
+void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev);
 int amdgpu_xcp_open_device(struct amdgpu_device *adev,
 			   struct amdgpu_fpriv *fpriv,
 			   struct drm_file *file_priv);
-- 
2.43.5


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [v6 3/5] drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini()
  2025-01-24  5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
  2025-01-24  5:19 ` [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free() Jiang Liu
  2025-01-24  5:19 ` [v6 2/5] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms() Jiang Liu
@ 2025-01-24  5:19 ` Jiang Liu
  2025-01-24  5:19 ` [v6 4/5] drm/amdgpu: enhance error handling in function amdgpu_pci_probe() Jiang Liu
  2025-01-24  5:19 ` [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini() Jiang Liu
  4 siblings, 0 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24  5:19 UTC (permalink / raw)
  To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
	sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
	xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
  Cc: Jiang Liu

When AMD gpu firmware files are missing, loading the amdgpu driver will
cause following invalid memory access:

[   89.735573] amdgpu 0000:0a:00.0: amdgpu: Fetched VBIOS from platform
[   89.735583] amdgpu: ATOM BIOS: 113-M3080202-101
[   89.735676] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/psp_13_0_14_sos.bin failed with error -2
[   89.735684] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <psp> failed -19
[   89.746649] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/smu_13_0_14.bin failed with error -2
[   89.746657] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <smu> failed -19
[   89.757203] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/gc_9_4_4_rlc.bin failed with error -2
[   89.757209] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <gfx_v9_4_3> failed -19
[   89.768331] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/sdma_4_4_5.bin failed with error -2
[   89.768341] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <sdma_v4_4_2> failed -19
[   89.779385] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/vcn_4_0_3.bin failed with error -2
[   89.779389] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <vcn_v4_0_3> failed -19
[   89.790320] amdgpu 0000:0a:00.0: amdgpu: Fatal error during GPU init
[   89.797498] amdgpu 0000:0a:00.0: amdgpu: amdgpu: finishing device.
[   89.797519] BUG: kernel NULL pointer dereference, address: 0000000000000010
[   89.805377] #PF: supervisor read access in kernel mode
[   89.811639] #PF: error_code(0x0000) - not-present page
[   89.817825] PGD 0
[   89.820455] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[   89.826296] CPU: 0 UID: 0 PID: 8 Comm: kworker/0:0 Not tainted 6.12.0+ #3
[   89.834265] Hardware name: Alibaba Alibaba Cloud ECS/Alibaba Cloud ECS, BIOS 3.0.ES.AL.P.091.02 08/04/2024
[   89.845430] Workqueue: events work_for_cpu_fn
[   89.850649] RIP: 0010:amdgpu_device_fini_hw+0x1f2/0x570 [amdgpu]
[   89.858316] Code: 65 49 63 f5 48 83 fe 11 0f 83 0c 03 00 00 4c 89 ff 49 83 ec 20 4d 8d 7f e0 48 85 ff 74 48 41 83 ed 01 49 8b 84 24 18 6f 04 00 <48> 8b 40 10 48 8b 40 28 48 85 c0 74 c4 ff d0 0f 1f 00 41 89 c1 85
[   89.879966] RSP: 0018:ff23a0f9401dbd18 EFLAGS: 00010206
[   89.886123] RAX: 0000000000000000 RBX: ff13ecb721800000 RCX: 0000000000000000
[   89.894411] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ff13ecb721847028
[   89.902705] RBP: ff23a0f9401dbd68 R08: 0000000000000000 R09: 0000000000000000
[   89.910983] R10: 0000000000000000 R11: 0000000000000000 R12: ff13ecb721800120
[   89.919249] R13: 0000000000000008 R14: ff13ecb721800010 R15: ff13ecb721847008
[   89.927505] FS:  0000000000000000(0000) GS:ff13edb1fd000000(0000) knlGS:0000000000000000
[   89.936828] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   89.943513] CR2: 0000000000000010 CR3: 000001a8de63e001 CR4: 0000000000f73ef0
[   89.951761] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   89.960010] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[   89.968252] PKRU: 55555554
[   89.971516] Call Trace:
[   89.974487]  <TASK>
[   89.977067]  ? show_regs+0x6c/0x80
[   89.981115]  ? __die+0x24/0x80
[   89.984761]  ? page_fault_oops+0x175/0x5c0
[   89.989580]  ? vprintk_emit+0x127/0x400
[   89.994106]  ? do_user_addr_fault+0x4b2/0x870
[   89.999222]  ? exc_page_fault+0x82/0x1b0
[   90.003858]  ? asm_exc_page_fault+0x27/0x30
[   90.008804]  ? amdgpu_device_fini_hw+0x1f2/0x570 [amdgpu]
[   90.015539]  ? amdgpu_device_fini_hw+0x16d/0x570 [amdgpu]
[   90.022274]  ? blocking_notifier_chain_unregister+0x38/0x70
[   90.028773]  amdgpu_driver_unload_kms+0x4b/0x70 [amdgpu]
[   90.035419]  amdgpu_driver_load_kms+0x91/0xd0 [amdgpu]
[   90.041865]  amdgpu_pci_probe+0x1d1/0x650 [amdgpu]
[   90.047920]  local_pci_probe+0x44/0xb0
[   90.052388]  work_for_cpu_fn+0x17/0x30
[   90.056847]  process_one_work+0x178/0x3d0
[   90.061607]  worker_thread+0x2de/0x410
[   90.066066]  ? __pfx_worker_thread+0x10/0x10
[   90.071114]  kthread+0xe1/0x110
[   90.074891]  ? __pfx_kthread+0x10/0x10
[   90.079353]  ret_from_fork+0x44/0x70
[   90.083627]  ? __pfx_kthread+0x10/0x10
[   90.088085]  ret_from_fork_asm+0x1a/0x30
[   90.092742]  </TASK>
[   90.252277] ---[ end trace 0000000000000000 ]---

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index 272954f6e476..d430fab12355 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -752,7 +752,7 @@ void amdgpu_xcp_cfg_sysfs_fini(struct amdgpu_device *adev)
 	struct amdgpu_xcp_cfg *xcp_cfg;
 	int i;
 
-	if (!adev->xcp_mgr)
+	if (!adev->xcp_mgr || !adev->xcp_mgr->xcp_cfg)
 		return;
 
 	xcp_cfg =  adev->xcp_mgr->xcp_cfg;
-- 
2.43.5


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [v6 4/5] drm/amdgpu: enhance error handling in function amdgpu_pci_probe()
  2025-01-24  5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
                   ` (2 preceding siblings ...)
  2025-01-24  5:19 ` [v6 3/5] drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini() Jiang Liu
@ 2025-01-24  5:19 ` Jiang Liu
  2025-01-24  5:19 ` [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini() Jiang Liu
  4 siblings, 0 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24  5:19 UTC (permalink / raw)
  To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
	sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
	xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
  Cc: Jiang Liu

Enhance error handling in function amdgpu_pci_probe() to avoid
possible resource leakage.

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 3f26e850120c..f1378b867923 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2346,16 +2346,16 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
 		msleep(5000);
 		goto retry_init;
 	} else if (ret) {
-		goto err_pci;
+		goto unload_kms;
 	}
 
 	ret = amdgpu_xcp_dev_register(adev, ent);
 	if (ret)
-		goto err_pci;
+		goto unplug_drm;
 
 	ret = amdgpu_amdkfd_drm_client_create(adev);
 	if (ret)
-		goto err_pci;
+		goto deregister_xcp;
 
 	/*
 	 * 1. don't init fbdev on hw without DCE
@@ -2424,6 +2424,12 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
 
 	return 0;
 
+deregister_xcp:
+	amdgpu_xcp_dev_deregister(adev);
+unplug_drm:
+	drm_dev_unplug(ddev);
+unload_kms:
+	amdgpu_driver_unload_kms(ddev);
 err_pci:
 	pci_disable_device(pdev);
 	return ret;
-- 
2.43.5


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini()
  2025-01-24  5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
                   ` (3 preceding siblings ...)
  2025-01-24  5:19 ` [v6 4/5] drm/amdgpu: enhance error handling in function amdgpu_pci_probe() Jiang Liu
@ 2025-01-24  5:19 ` Jiang Liu
  2025-01-24 14:16   ` Christian König
  4 siblings, 1 reply; 7+ messages in thread
From: Jiang Liu @ 2025-01-24  5:19 UTC (permalink / raw)
  To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
	sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
	xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
  Cc: Jiang Liu

Introduce amdgpu_device_fini_schedulers() to clean scheduler related
resources, and avoid possible invalid memory access.

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 29 +++++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  9 -------
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ee695e70fb4f..1619bd2473c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2857,27 +2857,48 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
 		if (r) {
 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
 				  ring->name);
-			return r;
+			goto out_err;
 		}
 		r = amdgpu_uvd_entity_init(adev, ring);
 		if (r) {
 			DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
 				  ring->name);
-			return r;
+			goto out_sched_fini;
 		}
 		r = amdgpu_vce_entity_init(adev, ring);
 		if (r) {
 			DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
 				  ring->name);
-			return r;
+			goto out_sched_fini;
 		}
 	}
 
 	amdgpu_xcp_update_partition_sched_list(adev);
 
 	return 0;
+
+out_sched_fini:
+	drm_sched_fini(&adev->rings[i]->sched);
+out_err:
+	while (i--)
+		if (adev->rings[i] && !adev->rings[i]->no_scheduler)
+			drm_sched_fini(&adev->rings[i]->sched);
+	return r;
 }
 
+static void amdgpu_device_fini_schedulers(struct amdgpu_device *adev)
+{
+	int i;
+
+	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+		struct amdgpu_ring *ring = adev->rings[i];
+
+		if (!ring || ring->no_scheduler)
+			continue;
+
+		drm_sched_fini(&ring->sched);
+	}
+}
 
 /**
  * amdgpu_device_ip_init - run init for hardware IPs
@@ -3424,6 +3445,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
 
 	amdgpu_amdkfd_device_fini_sw(adev);
 
+	amdgpu_device_fini_schedulers(adev);
+
 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
 		if (!adev->ip_blocks[i].status.sw)
 			continue;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 2f24a6aa13bf..c95895a7b888 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -650,15 +650,6 @@ void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev)
 		if (!ring || !ring->fence_drv.initialized)
 			continue;
 
-		/*
-		 * Notice we check for sched.ops since there's some
-		 * override on the meaning of sched.ready by amdgpu.
-		 * The natural check would be sched.ready, which is
-		 * set as drm_sched_init() finishes...
-		 */
-		if (ring->sched.ops)
-			drm_sched_fini(&ring->sched);
-
 		for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
 			dma_fence_put(ring->fence_drv.fences[j]);
 		kfree(ring->fence_drv.fences);
-- 
2.43.5


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini()
  2025-01-24  5:19 ` [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini() Jiang Liu
@ 2025-01-24 14:16   ` Christian König
  0 siblings, 0 replies; 7+ messages in thread
From: Christian König @ 2025-01-24 14:16 UTC (permalink / raw)
  To: Jiang Liu, alexander.deucher, Xinhui.Pan, airlied, simona,
	sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
	xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx

Am 24.01.25 um 06:19 schrieb Jiang Liu:
> Introduce amdgpu_device_fini_schedulers() to clean scheduler related
> resources, and avoid possible invalid memory access.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>

I can't say much about the RAS stuff but that patch here is Reviewed-by: 
Christian König <christian.koenig@amd.com>.

Alex will probably pick the patch up as soon as the series is fully 
reviewed, if not just ping me and I will push it.

Thanks,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 29 +++++++++++++++++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  9 -------
>   2 files changed, 26 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ee695e70fb4f..1619bd2473c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2857,27 +2857,48 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
>   		if (r) {
>   			DRM_ERROR("Failed to create scheduler on ring %s.\n",
>   				  ring->name);
> -			return r;
> +			goto out_err;
>   		}
>   		r = amdgpu_uvd_entity_init(adev, ring);
>   		if (r) {
>   			DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
>   				  ring->name);
> -			return r;
> +			goto out_sched_fini;
>   		}
>   		r = amdgpu_vce_entity_init(adev, ring);
>   		if (r) {
>   			DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
>   				  ring->name);
> -			return r;
> +			goto out_sched_fini;
>   		}
>   	}
>   
>   	amdgpu_xcp_update_partition_sched_list(adev);
>   
>   	return 0;
> +
> +out_sched_fini:
> +	drm_sched_fini(&adev->rings[i]->sched);
> +out_err:
> +	while (i--)
> +		if (adev->rings[i] && !adev->rings[i]->no_scheduler)
> +			drm_sched_fini(&adev->rings[i]->sched);
> +	return r;
>   }
>   
> +static void amdgpu_device_fini_schedulers(struct amdgpu_device *adev)
> +{
> +	int i;
> +
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		struct amdgpu_ring *ring = adev->rings[i];
> +
> +		if (!ring || ring->no_scheduler)
> +			continue;
> +
> +		drm_sched_fini(&ring->sched);
> +	}
> +}
>   
>   /**
>    * amdgpu_device_ip_init - run init for hardware IPs
> @@ -3424,6 +3445,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
>   
>   	amdgpu_amdkfd_device_fini_sw(adev);
>   
> +	amdgpu_device_fini_schedulers(adev);
> +
>   	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
>   		if (!adev->ip_blocks[i].status.sw)
>   			continue;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 2f24a6aa13bf..c95895a7b888 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -650,15 +650,6 @@ void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev)
>   		if (!ring || !ring->fence_drv.initialized)
>   			continue;
>   
> -		/*
> -		 * Notice we check for sched.ops since there's some
> -		 * override on the meaning of sched.ready by amdgpu.
> -		 * The natural check would be sched.ready, which is
> -		 * set as drm_sched_init() finishes...
> -		 */
> -		if (ring->sched.ops)
> -			drm_sched_fini(&ring->sched);
> -
>   		for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
>   			dma_fence_put(ring->fence_drv.fences[j]);
>   		kfree(ring->fence_drv.fences);


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-01-24 14:16 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-01-24  5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
2025-01-24  5:19 ` [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free() Jiang Liu
2025-01-24  5:19 ` [v6 2/5] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms() Jiang Liu
2025-01-24  5:19 ` [v6 3/5] drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini() Jiang Liu
2025-01-24  5:19 ` [v6 4/5] drm/amdgpu: enhance error handling in function amdgpu_pci_probe() Jiang Liu
2025-01-24  5:19 ` [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini() Jiang Liu
2025-01-24 14:16   ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.