* [v6 0/5] Fix several bugs in error handling during device probe
@ 2025-01-24 5:19 Jiang Liu
2025-01-24 5:19 ` [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free() Jiang Liu
` (4 more replies)
0 siblings, 5 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24 5:19 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
This patchset tries to fix several memory leakages/invalid memory
accesses on error handling path during GPU driver loading/unloading.
They applies to:
https://gitlab.freedesktop.org/agd5f/linux.git amd-staging-drm-next
v6:
1) fix coding style of patch 5
v5:
1) drop first in v4, we have found a reliable way to fix the issue.
2) add patch 3 in v5 to fix a new issue
3) rework patch 5 according to review feedback
v4:
1) drop patch 1 in v3
2) split out amdxcp related change into a dedicated patch
3) use `guard(mutex)` instead of mutex_lock/unlock().
4) move patch 6 in v3 to next patch set
v3:
1) drop first patch of v2
2) rework the 0003/0004 patches of v2 according to review comments
3) add patch 0004 to fix possible resource leakage in amdgpu_pci_probe()
v2:
1) rebased to https://gitlab.freedesktop.org/agd5f/linux.git branch
amd-staging-drm-next.
2) removed the first patch, which is unnecessary.
3) add amdgpu_xcp_drm_dev_free() in patch 0003 to enhance amdxcp
driver to better support device remove and error handling.
4) reworked patch 0005 to fix it in amdgpu instead of drm core.
Jiang Liu (5):
drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free()
drm/amdgpu: fix use after free bug related to
amdgpu_driver_release_kms()
drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini()
drm/amdgpu: enhance error handling in function amdgpu_pci_probe()
drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini()
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 ++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 14 ++--
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 9 ---
drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 71 +++++++++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h | 3 +-
drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c | 63 +++++++++++++++---
drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h | 1 +
7 files changed, 152 insertions(+), 42 deletions(-)
--
2.43.5
^ permalink raw reply [flat|nested] 7+ messages in thread
* [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free()
2025-01-24 5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
@ 2025-01-24 5:19 ` Jiang Liu
2025-01-24 5:19 ` [v6 2/5] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms() Jiang Liu
` (3 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24 5:19 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Introduce new interface amdgpu_xcp_drm_dev_free() to free a specific
drm_device crreated by amdgpu_xcp_drm_dev_alloc(), which will be used
to do error recovery.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c | 63 +++++++++++++++++----
drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h | 1 +
2 files changed, 54 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c
index faed84172dd4..2ff5377d54cd 100644
--- a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c
+++ b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c
@@ -45,18 +45,26 @@ static const struct drm_driver amdgpu_xcp_driver = {
static int8_t pdev_num;
static struct xcp_device *xcp_dev[MAX_XCP_PLATFORM_DEVICE];
+static DEFINE_MUTEX(xcp_mutex);
int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev)
{
struct platform_device *pdev;
struct xcp_device *pxcp_dev;
char dev_name[20];
- int ret;
+ int ret, index;
+
+ guard(mutex)(&xcp_mutex);
if (pdev_num >= MAX_XCP_PLATFORM_DEVICE)
return -ENODEV;
- snprintf(dev_name, sizeof(dev_name), "amdgpu_xcp_%d", pdev_num);
+ for (index = 0; index < MAX_XCP_PLATFORM_DEVICE; index++) {
+ if (!xcp_dev[index])
+ break;
+ }
+
+ snprintf(dev_name, sizeof(dev_name), "amdgpu_xcp_%d", index);
pdev = platform_device_register_simple(dev_name, -1, NULL, 0);
if (IS_ERR(pdev))
return PTR_ERR(pdev);
@@ -72,8 +80,8 @@ int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev)
goto out_devres;
}
- xcp_dev[pdev_num] = pxcp_dev;
- xcp_dev[pdev_num]->pdev = pdev;
+ xcp_dev[index] = pxcp_dev;
+ xcp_dev[index]->pdev = pdev;
*ddev = &pxcp_dev->drm;
pdev_num++;
@@ -88,16 +96,51 @@ int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev)
}
EXPORT_SYMBOL(amdgpu_xcp_drm_dev_alloc);
+static void __amdgpu_xcp_drm_dev_free(int index)
+{
+ struct platform_device *pdev;
+
+ WARN_ON(!pdev_num);
+ pdev = xcp_dev[index]->pdev;
+ devres_release_group(&pdev->dev, NULL);
+ platform_device_unregister(pdev);
+ xcp_dev[index] = NULL;
+ pdev_num--;
+}
+
+void amdgpu_xcp_drm_dev_free(struct drm_device *ddev)
+{
+ struct xcp_device *pxcp_dev;
+ int index;
+
+ if (ddev == NULL)
+ return;
+
+ guard(mutex)(&xcp_mutex);
+ WARN_ON(!pdev_num);
+
+ pxcp_dev = container_of(ddev, struct xcp_device, drm);
+ for (index = 0; index < MAX_XCP_PLATFORM_DEVICE; index++) {
+ if (xcp_dev[index] == pxcp_dev) {
+ __amdgpu_xcp_drm_dev_free(index);
+ break;
+ }
+ }
+}
+EXPORT_SYMBOL(amdgpu_xcp_drm_dev_free);
+
void amdgpu_xcp_drv_release(void)
{
- for (--pdev_num; pdev_num >= 0; --pdev_num) {
- struct platform_device *pdev = xcp_dev[pdev_num]->pdev;
+ int index;
- devres_release_group(&pdev->dev, NULL);
- platform_device_unregister(pdev);
- xcp_dev[pdev_num] = NULL;
+ guard(mutex)(&xcp_mutex);
+
+ for (index = 0; pdev_num && index < MAX_XCP_PLATFORM_DEVICE; index++) {
+ if (xcp_dev[index])
+ __amdgpu_xcp_drm_dev_free(index);
}
- pdev_num = 0;
+
+ WARN_ON(pdev_num != 0);
}
EXPORT_SYMBOL(amdgpu_xcp_drv_release);
diff --git a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h
index c1c4b679bf95..580a1602c8e3 100644
--- a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h
+++ b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h
@@ -25,5 +25,6 @@
#define _AMDGPU_XCP_DRV_H_
int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev);
+void amdgpu_xcp_drm_dev_free(struct drm_device *ddev);
void amdgpu_xcp_drv_release(void);
#endif /* _AMDGPU_XCP_DRV_H_ */
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [v6 2/5] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms()
2025-01-24 5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
2025-01-24 5:19 ` [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free() Jiang Liu
@ 2025-01-24 5:19 ` Jiang Liu
2025-01-24 5:19 ` [v6 3/5] drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini() Jiang Liu
` (2 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24 5:19 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
If some GPU device failed to probe, `rmmod amdgpu` will trigger a use
after free bug related to amdgpu_driver_release_kms() as:
[16002.085540] BUG: kernel NULL pointer dereference, address: 0000000000000000
[16002.093792] #PF: supervisor read access in kernel mode
[16002.099993] #PF: error_code(0x0000) - not-present page
[16002.106188] PGD 0 P4D 0
[16002.109464] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[16002.115372] CPU: 2 PID: 14375 Comm: rmmod Kdump: loaded Tainted: G W E 6.10.0+ #2
[16002.125577] Hardware name: Alibaba Alibaba Cloud ECS/Alibaba Cloud ECS, BIOS 3.0.ES.AL.P.087.05 04/07/2024
[16002.136858] RIP: 0010:drm_sched_fini+0x3f/0xe0 [gpu_sched]
[16002.143463] Code: 60 c6 87 be 00 00 00 01 e8 ce e0 90 d8 48 8d bb 80 00 00 00 e8 c2 e0 90 d8 8b 43 20 85 c0 74 51 45 31 e4 48 8b
8b 2c e8 48 89 ef e8 f5 0e 59 d9 48 8b 45 10 48 8d 55 10 48 39
[16002.164992] RSP: 0018:ffffb570dbbb7da8 EFLAGS: 00010246
[16002.171316] RAX: 0000000000000000 RBX: ffff96b0fdadc878 RCX: 0000000000000000
[16002.179784] RDX: 000fffffffe00000 RSI: 0000000000000000 RDI: ffff96b0fdadc8f8
[16002.188252] RBP: ffff96b0fdadc800 R08: ffff97abbd035040 R09: ffffffff9ac52540
[16002.196722] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[16002.205179] R13: 0000000000000000 R14: ffff96b0fdadfc00 R15: 0000000000000000
[16002.213648] FS: 00007f2737000740(0000) GS:ffff97abbd100000(0000) knlGS:0000000000000000
[16002.223189] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[16002.230103] CR2: 0000000000000000 CR3: 000000011be3a005 CR4: 0000000000f70ef0
[16002.238581] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[16002.247053] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
dxcp]
[16002.337645] __do_sys_delete_module.constprop.0+0x176/0x310
[16002.344324] do_syscall_64+0x5d/0x170
[16002.348858] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[16002.354956] RIP: 0033:0x7f2736a620cb-12-26
Fix it by removing xcp drm devices when failed to probe GPU devices.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
Tested-by: Shuo Liu <shuox.liu@linux.alibaba.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 69 ++++++++++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h | 3 +-
4 files changed, 62 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0a121aab5c74..ee695e70fb4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4777,6 +4777,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
amdgpu_reset_put_reset_domain(adev->reset_domain);
adev->reset_domain = NULL;
+ amdgpu_xcp_mgr_fini(adev);
+
kfree(adev->pci_state);
}
@@ -6677,7 +6679,7 @@ void amdgpu_device_halt(struct amdgpu_device *adev)
struct pci_dev *pdev = adev->pdev;
struct drm_device *ddev = adev_to_drm(adev);
- amdgpu_xcp_dev_unplug(adev);
+ amdgpu_xcp_dev_deregister(adev);
drm_dev_unplug(ddev);
amdgpu_irq_disable_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index acb9dc3705ac..3f26e850120c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2435,7 +2435,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
- amdgpu_xcp_dev_unplug(adev);
+ amdgpu_xcp_dev_deregister(adev);
amdgpu_gmc_prepare_nps_mode_change(adev);
drm_dev_unplug(dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index e209b5e101df..272954f6e476 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -268,21 +268,33 @@ static int amdgpu_xcp_dev_alloc(struct amdgpu_device *adev)
return ret;
}
- /* Redirect all IOCTLs to the primary device */
- adev->xcp_mgr->xcp[i].rdev = p_ddev->render->dev;
- adev->xcp_mgr->xcp[i].pdev = p_ddev->primary->dev;
- adev->xcp_mgr->xcp[i].driver = (struct drm_driver *)p_ddev->driver;
- adev->xcp_mgr->xcp[i].vma_offset_manager = p_ddev->vma_offset_manager;
- p_ddev->render->dev = ddev;
- p_ddev->primary->dev = ddev;
- p_ddev->vma_offset_manager = ddev->vma_offset_manager;
- p_ddev->driver = &amdgpu_partition_driver;
adev->xcp_mgr->xcp[i].ddev = p_ddev;
}
return 0;
}
+static void amdgpu_xcp_dev_free(struct amdgpu_device *adev)
+{
+ struct drm_device *p_ddev;
+ int i;
+
+ if (!adev->xcp_mgr)
+ return;
+
+ for (i = 1; i < MAX_XCP; i++) {
+ if (!adev->xcp_mgr->xcp[i].ddev)
+ break;
+
+ p_ddev = adev->xcp_mgr->xcp[i].ddev;
+ adev->xcp_mgr->xcp[i].ddev = NULL;
+
+ amdgpu_xcp_drm_dev_free(p_ddev);
+ }
+
+ adev->xcp_mgr->xcp->ddev = NULL;
+}
+
int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
int init_num_xcps,
struct amdgpu_xcp_mgr_funcs *xcp_funcs)
@@ -310,6 +322,13 @@ int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
return amdgpu_xcp_dev_alloc(adev);
}
+void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev)
+{
+ amdgpu_xcp_dev_free(adev);
+ kfree(adev->xcp_mgr);
+ adev->xcp_mgr = NULL;
+}
+
int amdgpu_xcp_get_partition(struct amdgpu_xcp_mgr *xcp_mgr,
enum AMDGPU_XCP_IP_BLOCK ip, int instance)
{
@@ -348,23 +367,44 @@ int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
const struct pci_device_id *ent)
{
int i, ret;
+ struct drm_device *p_ddev;
+ struct drm_device *ddev;
if (!adev->xcp_mgr)
return 0;
+ ddev = adev_to_drm(adev);
+
for (i = 1; i < MAX_XCP; i++) {
- if (!adev->xcp_mgr->xcp[i].ddev)
+ if (!adev->xcp_mgr->xcp[i].ddev || adev->xcp_mgr->xcp[i].driver)
break;
+ /* Redirect all IOCTLs to the primary device */
+ p_ddev = adev->xcp_mgr->xcp[i].ddev;
+ adev->xcp_mgr->xcp[i].rdev = p_ddev->render->dev;
+ adev->xcp_mgr->xcp[i].pdev = p_ddev->primary->dev;
+ adev->xcp_mgr->xcp[i].driver = (struct drm_driver *)p_ddev->driver;
+ adev->xcp_mgr->xcp[i].vma_offset_manager = p_ddev->vma_offset_manager;
+ p_ddev->render->dev = ddev;
+ p_ddev->primary->dev = ddev;
+ p_ddev->driver = &amdgpu_partition_driver;
+ p_ddev->vma_offset_manager = ddev->vma_offset_manager;
+
ret = drm_dev_register(adev->xcp_mgr->xcp[i].ddev, ent->driver_data);
- if (ret)
+ if (ret) {
+ p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
+ p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev;
+ p_ddev->driver = adev->xcp_mgr->xcp[i].driver;
+ p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;
+ adev->xcp_mgr->xcp[i].driver = NULL;
return ret;
+ }
}
return 0;
}
-void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
+void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev)
{
struct drm_device *p_ddev;
int i;
@@ -373,15 +413,18 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
return;
for (i = 1; i < MAX_XCP; i++) {
- if (!adev->xcp_mgr->xcp[i].ddev)
+ if (!adev->xcp_mgr->xcp[i].ddev || !adev->xcp_mgr->xcp[i].driver)
break;
+ // Restore and free the original drm_device.
p_ddev = adev->xcp_mgr->xcp[i].ddev;
drm_dev_unplug(p_ddev);
+
p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev;
p_ddev->driver = adev->xcp_mgr->xcp[i].driver;
p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;
+ adev->xcp_mgr->xcp[i].driver = NULL;
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index b63f53242c57..97daf1a9236f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -155,6 +155,7 @@ int amdgpu_xcp_resume(struct amdgpu_xcp_mgr *xcp_mgr, int xcp_id);
int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
int init_xcps, struct amdgpu_xcp_mgr_funcs *xcp_funcs);
+void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev);
int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode);
int amdgpu_xcp_query_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, u32 flags);
int amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, int mode);
@@ -168,7 +169,7 @@ int amdgpu_xcp_get_inst_details(struct amdgpu_xcp *xcp,
int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
const struct pci_device_id *ent);
-void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev);
+void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev);
int amdgpu_xcp_open_device(struct amdgpu_device *adev,
struct amdgpu_fpriv *fpriv,
struct drm_file *file_priv);
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [v6 3/5] drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini()
2025-01-24 5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
2025-01-24 5:19 ` [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free() Jiang Liu
2025-01-24 5:19 ` [v6 2/5] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms() Jiang Liu
@ 2025-01-24 5:19 ` Jiang Liu
2025-01-24 5:19 ` [v6 4/5] drm/amdgpu: enhance error handling in function amdgpu_pci_probe() Jiang Liu
2025-01-24 5:19 ` [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini() Jiang Liu
4 siblings, 0 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24 5:19 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
When AMD gpu firmware files are missing, loading the amdgpu driver will
cause following invalid memory access:
[ 89.735573] amdgpu 0000:0a:00.0: amdgpu: Fetched VBIOS from platform
[ 89.735583] amdgpu: ATOM BIOS: 113-M3080202-101
[ 89.735676] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/psp_13_0_14_sos.bin failed with error -2
[ 89.735684] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <psp> failed -19
[ 89.746649] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/smu_13_0_14.bin failed with error -2
[ 89.746657] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <smu> failed -19
[ 89.757203] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/gc_9_4_4_rlc.bin failed with error -2
[ 89.757209] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <gfx_v9_4_3> failed -19
[ 89.768331] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/sdma_4_4_5.bin failed with error -2
[ 89.768341] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <sdma_v4_4_2> failed -19
[ 89.779385] amdgpu 0000:0a:00.0: Direct firmware load for amdgpu/vcn_4_0_3.bin failed with error -2
[ 89.779389] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init of IP block <vcn_v4_0_3> failed -19
[ 89.790320] amdgpu 0000:0a:00.0: amdgpu: Fatal error during GPU init
[ 89.797498] amdgpu 0000:0a:00.0: amdgpu: amdgpu: finishing device.
[ 89.797519] BUG: kernel NULL pointer dereference, address: 0000000000000010
[ 89.805377] #PF: supervisor read access in kernel mode
[ 89.811639] #PF: error_code(0x0000) - not-present page
[ 89.817825] PGD 0
[ 89.820455] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 89.826296] CPU: 0 UID: 0 PID: 8 Comm: kworker/0:0 Not tainted 6.12.0+ #3
[ 89.834265] Hardware name: Alibaba Alibaba Cloud ECS/Alibaba Cloud ECS, BIOS 3.0.ES.AL.P.091.02 08/04/2024
[ 89.845430] Workqueue: events work_for_cpu_fn
[ 89.850649] RIP: 0010:amdgpu_device_fini_hw+0x1f2/0x570 [amdgpu]
[ 89.858316] Code: 65 49 63 f5 48 83 fe 11 0f 83 0c 03 00 00 4c 89 ff 49 83 ec 20 4d 8d 7f e0 48 85 ff 74 48 41 83 ed 01 49 8b 84 24 18 6f 04 00 <48> 8b 40 10 48 8b 40 28 48 85 c0 74 c4 ff d0 0f 1f 00 41 89 c1 85
[ 89.879966] RSP: 0018:ff23a0f9401dbd18 EFLAGS: 00010206
[ 89.886123] RAX: 0000000000000000 RBX: ff13ecb721800000 RCX: 0000000000000000
[ 89.894411] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ff13ecb721847028
[ 89.902705] RBP: ff23a0f9401dbd68 R08: 0000000000000000 R09: 0000000000000000
[ 89.910983] R10: 0000000000000000 R11: 0000000000000000 R12: ff13ecb721800120
[ 89.919249] R13: 0000000000000008 R14: ff13ecb721800010 R15: ff13ecb721847008
[ 89.927505] FS: 0000000000000000(0000) GS:ff13edb1fd000000(0000) knlGS:0000000000000000
[ 89.936828] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 89.943513] CR2: 0000000000000010 CR3: 000001a8de63e001 CR4: 0000000000f73ef0
[ 89.951761] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 89.960010] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[ 89.968252] PKRU: 55555554
[ 89.971516] Call Trace:
[ 89.974487] <TASK>
[ 89.977067] ? show_regs+0x6c/0x80
[ 89.981115] ? __die+0x24/0x80
[ 89.984761] ? page_fault_oops+0x175/0x5c0
[ 89.989580] ? vprintk_emit+0x127/0x400
[ 89.994106] ? do_user_addr_fault+0x4b2/0x870
[ 89.999222] ? exc_page_fault+0x82/0x1b0
[ 90.003858] ? asm_exc_page_fault+0x27/0x30
[ 90.008804] ? amdgpu_device_fini_hw+0x1f2/0x570 [amdgpu]
[ 90.015539] ? amdgpu_device_fini_hw+0x16d/0x570 [amdgpu]
[ 90.022274] ? blocking_notifier_chain_unregister+0x38/0x70
[ 90.028773] amdgpu_driver_unload_kms+0x4b/0x70 [amdgpu]
[ 90.035419] amdgpu_driver_load_kms+0x91/0xd0 [amdgpu]
[ 90.041865] amdgpu_pci_probe+0x1d1/0x650 [amdgpu]
[ 90.047920] local_pci_probe+0x44/0xb0
[ 90.052388] work_for_cpu_fn+0x17/0x30
[ 90.056847] process_one_work+0x178/0x3d0
[ 90.061607] worker_thread+0x2de/0x410
[ 90.066066] ? __pfx_worker_thread+0x10/0x10
[ 90.071114] kthread+0xe1/0x110
[ 90.074891] ? __pfx_kthread+0x10/0x10
[ 90.079353] ret_from_fork+0x44/0x70
[ 90.083627] ? __pfx_kthread+0x10/0x10
[ 90.088085] ret_from_fork_asm+0x1a/0x30
[ 90.092742] </TASK>
[ 90.252277] ---[ end trace 0000000000000000 ]---
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index 272954f6e476..d430fab12355 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -752,7 +752,7 @@ void amdgpu_xcp_cfg_sysfs_fini(struct amdgpu_device *adev)
struct amdgpu_xcp_cfg *xcp_cfg;
int i;
- if (!adev->xcp_mgr)
+ if (!adev->xcp_mgr || !adev->xcp_mgr->xcp_cfg)
return;
xcp_cfg = adev->xcp_mgr->xcp_cfg;
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [v6 4/5] drm/amdgpu: enhance error handling in function amdgpu_pci_probe()
2025-01-24 5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
` (2 preceding siblings ...)
2025-01-24 5:19 ` [v6 3/5] drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini() Jiang Liu
@ 2025-01-24 5:19 ` Jiang Liu
2025-01-24 5:19 ` [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini() Jiang Liu
4 siblings, 0 replies; 7+ messages in thread
From: Jiang Liu @ 2025-01-24 5:19 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Enhance error handling in function amdgpu_pci_probe() to avoid
possible resource leakage.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 3f26e850120c..f1378b867923 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2346,16 +2346,16 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
msleep(5000);
goto retry_init;
} else if (ret) {
- goto err_pci;
+ goto unload_kms;
}
ret = amdgpu_xcp_dev_register(adev, ent);
if (ret)
- goto err_pci;
+ goto unplug_drm;
ret = amdgpu_amdkfd_drm_client_create(adev);
if (ret)
- goto err_pci;
+ goto deregister_xcp;
/*
* 1. don't init fbdev on hw without DCE
@@ -2424,6 +2424,12 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
return 0;
+deregister_xcp:
+ amdgpu_xcp_dev_deregister(adev);
+unplug_drm:
+ drm_dev_unplug(ddev);
+unload_kms:
+ amdgpu_driver_unload_kms(ddev);
err_pci:
pci_disable_device(pdev);
return ret;
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini()
2025-01-24 5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
` (3 preceding siblings ...)
2025-01-24 5:19 ` [v6 4/5] drm/amdgpu: enhance error handling in function amdgpu_pci_probe() Jiang Liu
@ 2025-01-24 5:19 ` Jiang Liu
2025-01-24 14:16 ` Christian König
4 siblings, 1 reply; 7+ messages in thread
From: Jiang Liu @ 2025-01-24 5:19 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Introduce amdgpu_device_fini_schedulers() to clean scheduler related
resources, and avoid possible invalid memory access.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 29 +++++++++++++++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 9 -------
2 files changed, 26 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ee695e70fb4f..1619bd2473c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2857,27 +2857,48 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
ring->name);
- return r;
+ goto out_err;
}
r = amdgpu_uvd_entity_init(adev, ring);
if (r) {
DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
ring->name);
- return r;
+ goto out_sched_fini;
}
r = amdgpu_vce_entity_init(adev, ring);
if (r) {
DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
ring->name);
- return r;
+ goto out_sched_fini;
}
}
amdgpu_xcp_update_partition_sched_list(adev);
return 0;
+
+out_sched_fini:
+ drm_sched_fini(&adev->rings[i]->sched);
+out_err:
+ while (i--)
+ if (adev->rings[i] && !adev->rings[i]->no_scheduler)
+ drm_sched_fini(&adev->rings[i]->sched);
+ return r;
}
+static void amdgpu_device_fini_schedulers(struct amdgpu_device *adev)
+{
+ int i;
+
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = adev->rings[i];
+
+ if (!ring || ring->no_scheduler)
+ continue;
+
+ drm_sched_fini(&ring->sched);
+ }
+}
/**
* amdgpu_device_ip_init - run init for hardware IPs
@@ -3424,6 +3445,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
amdgpu_amdkfd_device_fini_sw(adev);
+ amdgpu_device_fini_schedulers(adev);
+
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
if (!adev->ip_blocks[i].status.sw)
continue;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 2f24a6aa13bf..c95895a7b888 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -650,15 +650,6 @@ void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev)
if (!ring || !ring->fence_drv.initialized)
continue;
- /*
- * Notice we check for sched.ops since there's some
- * override on the meaning of sched.ready by amdgpu.
- * The natural check would be sched.ready, which is
- * set as drm_sched_init() finishes...
- */
- if (ring->sched.ops)
- drm_sched_fini(&ring->sched);
-
for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
dma_fence_put(ring->fence_drv.fences[j]);
kfree(ring->fence_drv.fences);
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini()
2025-01-24 5:19 ` [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini() Jiang Liu
@ 2025-01-24 14:16 ` Christian König
0 siblings, 0 replies; 7+ messages in thread
From: Christian König @ 2025-01-24 14:16 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Am 24.01.25 um 06:19 schrieb Jiang Liu:
> Introduce amdgpu_device_fini_schedulers() to clean scheduler related
> resources, and avoid possible invalid memory access.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
I can't say much about the RAS stuff but that patch here is Reviewed-by:
Christian König <christian.koenig@amd.com>.
Alex will probably pick the patch up as soon as the series is fully
reviewed, if not just ping me and I will push it.
Thanks,
Christian.
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 29 +++++++++++++++++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 9 -------
> 2 files changed, 26 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ee695e70fb4f..1619bd2473c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2857,27 +2857,48 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
> if (r) {
> DRM_ERROR("Failed to create scheduler on ring %s.\n",
> ring->name);
> - return r;
> + goto out_err;
> }
> r = amdgpu_uvd_entity_init(adev, ring);
> if (r) {
> DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
> ring->name);
> - return r;
> + goto out_sched_fini;
> }
> r = amdgpu_vce_entity_init(adev, ring);
> if (r) {
> DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
> ring->name);
> - return r;
> + goto out_sched_fini;
> }
> }
>
> amdgpu_xcp_update_partition_sched_list(adev);
>
> return 0;
> +
> +out_sched_fini:
> + drm_sched_fini(&adev->rings[i]->sched);
> +out_err:
> + while (i--)
> + if (adev->rings[i] && !adev->rings[i]->no_scheduler)
> + drm_sched_fini(&adev->rings[i]->sched);
> + return r;
> }
>
> +static void amdgpu_device_fini_schedulers(struct amdgpu_device *adev)
> +{
> + int i;
> +
> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> + struct amdgpu_ring *ring = adev->rings[i];
> +
> + if (!ring || ring->no_scheduler)
> + continue;
> +
> + drm_sched_fini(&ring->sched);
> + }
> +}
>
> /**
> * amdgpu_device_ip_init - run init for hardware IPs
> @@ -3424,6 +3445,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
>
> amdgpu_amdkfd_device_fini_sw(adev);
>
> + amdgpu_device_fini_schedulers(adev);
> +
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> if (!adev->ip_blocks[i].status.sw)
> continue;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 2f24a6aa13bf..c95895a7b888 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -650,15 +650,6 @@ void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev)
> if (!ring || !ring->fence_drv.initialized)
> continue;
>
> - /*
> - * Notice we check for sched.ops since there's some
> - * override on the meaning of sched.ready by amdgpu.
> - * The natural check would be sched.ready, which is
> - * set as drm_sched_init() finishes...
> - */
> - if (ring->sched.ops)
> - drm_sched_fini(&ring->sched);
> -
> for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
> dma_fence_put(ring->fence_drv.fences[j]);
> kfree(ring->fence_drv.fences);
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2025-01-24 14:16 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-01-24 5:19 [v6 0/5] Fix several bugs in error handling during device probe Jiang Liu
2025-01-24 5:19 ` [v6 1/5] drm/amdxcp: introduce new API amdgpu_xcp_drm_dev_free() Jiang Liu
2025-01-24 5:19 ` [v6 2/5] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms() Jiang Liu
2025-01-24 5:19 ` [v6 3/5] drm/amdgpu: fix invalid memory access in amdgpu_xcp_cfg_sysfs_fini() Jiang Liu
2025-01-24 5:19 ` [v6 4/5] drm/amdgpu: enhance error handling in function amdgpu_pci_probe() Jiang Liu
2025-01-24 5:19 ` [v6 5/5] drm/amdgpu: fix invalid memory access in amdgpu_fence_driver_sw_fini() Jiang Liu
2025-01-24 14:16 ` Christian König
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.