All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: cache in more vm fault information
@ 2024-03-06 18:19 Sunil Khatri
  2024-03-06 18:19 ` [PATCH] drm/amdgpu: add vm fault information to devcoredump Sunil Khatri
  2024-03-06 18:20 ` [PATCH] drm/amdgpu: cache in more vm fault information Khatri, Sunil
  0 siblings, 2 replies; 9+ messages in thread
From: Sunil Khatri @ 2024-03-06 18:19 UTC (permalink / raw)
  To: Alex Deucher, Christian König, Shashank Sharma
  Cc: amd-gfx, dri-devel, linux-kernel, Mukul Joshi,
	Arunpravin Paneer Selvam, Sunil Khatri

When an  page fault interrupt is raised there
is a lot more information that is useful for
developers to analyse the pagefault.

Add all such information in the last cached
pagefault from an interrupt handler.

Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 ++++++-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 2 +-
 7 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 4299ce386322..b77e8e28769d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2905,7 +2905,7 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
  * Cache the fault info for later use by userspace in debugging.
  */
 void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
-				  unsigned int pasid,
+				  struct amdgpu_iv_entry *entry,
 				  uint64_t addr,
 				  uint32_t status,
 				  unsigned int vmhub)
@@ -2915,7 +2915,7 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
 
 	xa_lock_irqsave(&adev->vm_manager.pasids, flags);
 
-	vm = xa_load(&adev->vm_manager.pasids, pasid);
+	vm = xa_load(&adev->vm_manager.pasids, entry->pasid);
 	/* Don't update the fault cache if status is 0.  In the multiple
 	 * fault case, subsequent faults will return a 0 status which is
 	 * useless for userspace and replaces the useful fault status, so
@@ -2924,6 +2924,11 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
 	if (vm && status) {
 		vm->fault_info.addr = addr;
 		vm->fault_info.status = status;
+		vm->fault_info.client_id = entry->client_id;
+		vm->fault_info.src_id = entry->src_id;
+		vm->fault_info.vmid = entry->vmid;
+		vm->fault_info.pasid = entry->pasid;
+		vm->fault_info.ring_id = entry->ring_id;
 		if (AMDGPU_IS_GFXHUB(vmhub)) {
 			vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;
 			vm->fault_info.vmhub |=
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 047ec1930d12..c7782a89bdb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -286,6 +286,11 @@ struct amdgpu_vm_fault_info {
 	uint32_t	status;
 	/* which vmhub? gfxhub, mmhub, etc. */
 	unsigned int	vmhub;
+	unsigned int	client_id;
+	unsigned int	src_id;
+	unsigned int	ring_id;
+	unsigned int	pasid;
+	unsigned int	vmid;
 };
 
 struct amdgpu_vm {
@@ -605,7 +610,7 @@ static inline void amdgpu_vm_eviction_unlock(struct amdgpu_vm *vm)
 }
 
 void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
-				  unsigned int pasid,
+				  struct amdgpu_iv_entry *entry,
 				  uint64_t addr,
 				  uint32_t status,
 				  unsigned int vmhub);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..6b177ce8db0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -150,7 +150,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 		status = RREG32(hub->vm_l2_pro_fault_status);
 		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
-		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
+		amdgpu_vm_update_fault_cache(adev, entry, addr, status,
 					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 527dc917e049..bcf254856a3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -121,7 +121,7 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
 		status = RREG32(hub->vm_l2_pro_fault_status);
 		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
-		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
+		amdgpu_vm_update_fault_cache(adev, entry, addr, status,
 					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 3da7b6a2b00d..e9517ebbe1fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -1270,7 +1270,7 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev,
 	if (!addr && !status)
 		return 0;
 
-	amdgpu_vm_update_fault_cache(adev, entry->pasid,
+	amdgpu_vm_update_fault_cache(adev, entry,
 				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));
 
 	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index d20e5f20ee31..a271bf832312 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1438,7 +1438,7 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
 	if (!addr && !status)
 		return 0;
 
-	amdgpu_vm_update_fault_cache(adev, entry->pasid,
+	amdgpu_vm_update_fault_cache(adev, entry,
 				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));
 
 	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 47b63a4ce68b..dc9fb1fb9540 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -666,7 +666,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 	rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
 	WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
 
-	amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
+	amdgpu_vm_update_fault_cache(adev, entry, addr, status, vmhub);
 
 	dev_err(adev->dev,
 		"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH] drm/amdgpu: add vm fault information to devcoredump
  2024-03-06 18:19 [PATCH] drm/amdgpu: cache in more vm fault information Sunil Khatri
@ 2024-03-06 18:19 ` Sunil Khatri
  2024-03-06 19:21   ` Deucher, Alexander
  2024-03-07  8:17   ` Christian König
  2024-03-06 18:20 ` [PATCH] drm/amdgpu: cache in more vm fault information Khatri, Sunil
  1 sibling, 2 replies; 9+ messages in thread
From: Sunil Khatri @ 2024-03-06 18:19 UTC (permalink / raw)
  To: Alex Deucher, Christian König, Shashank Sharma
  Cc: amd-gfx, dri-devel, linux-kernel, Mukul Joshi,
	Arunpravin Paneer Selvam, Sunil Khatri

Add page fault information to the devcoredump.

Output of devcoredump:
**** AMDGPU Device Coredump ****
version: 1
kernel: 6.7.0-amd-staging-drm-next
module: amdgpu
time: 29.725011811
process_name: soft_recovery_p PID: 1720

Ring timed out details
IP Type: 0 Ring Name: gfx_0.0.0

[gfxhub] Page fault observed for GPU family:143
Faulty page starting at address 0x0000000000000000
Protection fault status register:0x301031

VRAM is lost due to GPU reset!

Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 147100c27c2d..d7fea6cdf2f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
 			   coredump->ring->name);
 	}
 
+	if (coredump->fault_info.status) {
+		struct amdgpu_vm_fault_info *fault_info = &coredump->fault_info;
+
+		drm_printf(&p, "\n[%s] Page fault observed for GPU family:%d\n",
+			   fault_info->vmhub ? "mmhub" : "gfxhub",
+			   coredump->adev->family);
+		drm_printf(&p, "Faulty page starting at address 0x%016llx\n",
+			   fault_info->addr);
+		drm_printf(&p, "Protection fault status register:0x%x\n",
+			   fault_info->status);
+	}
+
 	if (coredump->reset_vram_lost)
-		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
+		drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
 	if (coredump->adev->reset_info.num_regs) {
 		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
 
@@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
 	if (job) {
 		s_job = &job->base;
 		coredump->ring = to_amdgpu_ring(s_job->sched);
+		coredump->fault_info = job->vm->fault_info;
 	}
 
 	coredump->adev = adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 60522963aaca..3197955264f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
 	struct timespec64               reset_time;
 	bool                            reset_vram_lost;
 	struct amdgpu_ring			*ring;
+	struct amdgpu_vm_fault_info	fault_info;
 };
 #endif
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* RE: [PATCH] drm/amdgpu: cache in more vm fault information
  2024-03-06 18:19 [PATCH] drm/amdgpu: cache in more vm fault information Sunil Khatri
  2024-03-06 18:19 ` [PATCH] drm/amdgpu: add vm fault information to devcoredump Sunil Khatri
@ 2024-03-06 18:20 ` Khatri, Sunil
  1 sibling, 0 replies; 9+ messages in thread
From: Khatri, Sunil @ 2024-03-06 18:20 UTC (permalink / raw)
  To: Khatri, Sunil, Deucher, Alexander, Koenig, Christian,
	Sharma, Shashank
  Cc: amd-gfx@lists.freedesktop.org, dri-devel@lists.freedesktop.org,
	linux-kernel@vger.kernel.org, Joshi, Mukul,
	Paneer Selvam, Arunpravin

[AMD Official Use Only - General]

Ignore this. Triggered wrongly.

-----Original Message-----
From: Sunil Khatri <sunil.khatri@amd.com>
Sent: Wednesday, March 6, 2024 11:50 PM
To: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Sharma, Shashank <Shashank.Sharma@amd.com>
Cc: amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org; linux-kernel@vger.kernel.org; Joshi, Mukul <Mukul.Joshi@amd.com>; Paneer Selvam, Arunpravin <Arunpravin.PaneerSelvam@amd.com>; Khatri, Sunil <Sunil.Khatri@amd.com>
Subject: [PATCH] drm/amdgpu: cache in more vm fault information

When an  page fault interrupt is raised there is a lot more information that is useful for developers to analyse the pagefault.

Add all such information in the last cached pagefault from an interrupt handler.

Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++++--  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 ++++++-  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +-  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +-  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 2 +-  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 2 +-  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 2 +-
 7 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 4299ce386322..b77e8e28769d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2905,7 +2905,7 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
  * Cache the fault info for later use by userspace in debugging.
  */
 void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
-                                 unsigned int pasid,
+                                 struct amdgpu_iv_entry *entry,
                                  uint64_t addr,
                                  uint32_t status,
                                  unsigned int vmhub)
@@ -2915,7 +2915,7 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,

        xa_lock_irqsave(&adev->vm_manager.pasids, flags);

-       vm = xa_load(&adev->vm_manager.pasids, pasid);
+       vm = xa_load(&adev->vm_manager.pasids, entry->pasid);
        /* Don't update the fault cache if status is 0.  In the multiple
         * fault case, subsequent faults will return a 0 status which is
         * useless for userspace and replaces the useful fault status, so @@ -2924,6 +2924,11 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
        if (vm && status) {
                vm->fault_info.addr = addr;
                vm->fault_info.status = status;
+               vm->fault_info.client_id = entry->client_id;
+               vm->fault_info.src_id = entry->src_id;
+               vm->fault_info.vmid = entry->vmid;
+               vm->fault_info.pasid = entry->pasid;
+               vm->fault_info.ring_id = entry->ring_id;
                if (AMDGPU_IS_GFXHUB(vmhub)) {
                        vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;
                        vm->fault_info.vmhub |=
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 047ec1930d12..c7782a89bdb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -286,6 +286,11 @@ struct amdgpu_vm_fault_info {
        uint32_t        status;
        /* which vmhub? gfxhub, mmhub, etc. */
        unsigned int    vmhub;
+       unsigned int    client_id;
+       unsigned int    src_id;
+       unsigned int    ring_id;
+       unsigned int    pasid;
+       unsigned int    vmid;
 };

 struct amdgpu_vm {
@@ -605,7 +610,7 @@ static inline void amdgpu_vm_eviction_unlock(struct amdgpu_vm *vm)  }

 void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
-                                 unsigned int pasid,
+                                 struct amdgpu_iv_entry *entry,
                                  uint64_t addr,
                                  uint32_t status,
                                  unsigned int vmhub);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..6b177ce8db0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -150,7 +150,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
                status = RREG32(hub->vm_l2_pro_fault_status);
                WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);

-               amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
+               amdgpu_vm_update_fault_cache(adev, entry, addr, status,
                                             entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
        }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 527dc917e049..bcf254856a3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -121,7 +121,7 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
                status = RREG32(hub->vm_l2_pro_fault_status);
                WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);

-               amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
+               amdgpu_vm_update_fault_cache(adev, entry, addr, status,
                                             entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
        }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 3da7b6a2b00d..e9517ebbe1fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -1270,7 +1270,7 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev,
        if (!addr && !status)
                return 0;

-       amdgpu_vm_update_fault_cache(adev, entry->pasid,
+       amdgpu_vm_update_fault_cache(adev, entry,
                                     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));

        if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index d20e5f20ee31..a271bf832312 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1438,7 +1438,7 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
        if (!addr && !status)
                return 0;

-       amdgpu_vm_update_fault_cache(adev, entry->pasid,
+       amdgpu_vm_update_fault_cache(adev, entry,
                                     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));

        if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 47b63a4ce68b..dc9fb1fb9540 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -666,7 +666,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
        rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
        WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);

-       amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
+       amdgpu_vm_update_fault_cache(adev, entry, addr, status, vmhub);

        dev_err(adev->dev,
                "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
--
2.34.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* RE: [PATCH] drm/amdgpu: add vm fault information to devcoredump
  2024-03-06 18:19 ` [PATCH] drm/amdgpu: add vm fault information to devcoredump Sunil Khatri
@ 2024-03-06 19:21   ` Deucher, Alexander
  2024-03-07  4:02     ` Khatri, Sunil
  2024-03-07  8:17   ` Christian König
  1 sibling, 1 reply; 9+ messages in thread
From: Deucher, Alexander @ 2024-03-06 19:21 UTC (permalink / raw)
  To: Khatri, Sunil, Koenig, Christian, Sharma, Shashank
  Cc: amd-gfx@lists.freedesktop.org, dri-devel@lists.freedesktop.org,
	linux-kernel@vger.kernel.org, Joshi, Mukul,
	Paneer Selvam, Arunpravin, Khatri, Sunil

[Public]

> -----Original Message-----
> From: Sunil Khatri <sunil.khatri@amd.com>
> Sent: Wednesday, March 6, 2024 1:20 PM
> To: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian
> <Christian.Koenig@amd.com>; Sharma, Shashank
> <Shashank.Sharma@amd.com>
> Cc: amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org; linux-
> kernel@vger.kernel.org; Joshi, Mukul <Mukul.Joshi@amd.com>; Paneer
> Selvam, Arunpravin <Arunpravin.PaneerSelvam@amd.com>; Khatri, Sunil
> <Sunil.Khatri@amd.com>
> Subject: [PATCH] drm/amdgpu: add vm fault information to devcoredump
>
> Add page fault information to the devcoredump.
>
> Output of devcoredump:
> **** AMDGPU Device Coredump ****
> version: 1
> kernel: 6.7.0-amd-staging-drm-next
> module: amdgpu
> time: 29.725011811
> process_name: soft_recovery_p PID: 1720
>
> Ring timed out details
> IP Type: 0 Ring Name: gfx_0.0.0
>
> [gfxhub] Page fault observed for GPU family:143 Faulty page starting at

I think we should add a separate section for the GPU identification information (family, PCI ids, IP versions, etc.).  For this patch, I think fine to just print the fault address and status.

Alex

> address 0x0000000000000000 Protection fault status register:0x301031
>
> VRAM is lost due to GPU reset!
>
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>  2 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 147100c27c2d..d7fea6cdf2f9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t
> offset, size_t count,
>                          coredump->ring->name);
>       }
>
> +     if (coredump->fault_info.status) {
> +             struct amdgpu_vm_fault_info *fault_info = &coredump-
> >fault_info;
> +
> +             drm_printf(&p, "\n[%s] Page fault observed for GPU
> family:%d\n",
> +                        fault_info->vmhub ? "mmhub" : "gfxhub",
> +                        coredump->adev->family);
> +             drm_printf(&p, "Faulty page starting at address 0x%016llx\n",
> +                        fault_info->addr);
> +             drm_printf(&p, "Protection fault status register:0x%x\n",
> +                        fault_info->status);
> +     }
> +
>       if (coredump->reset_vram_lost)
> -             drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> +             drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
>       if (coredump->adev->reset_info.num_regs) {
>               drm_printf(&p, "AMDGPU register dumps:\nOffset:
> Value:\n");
>
> @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device
> *adev, bool vram_lost,
>       if (job) {
>               s_job = &job->base;
>               coredump->ring = to_amdgpu_ring(s_job->sched);
> +             coredump->fault_info = job->vm->fault_info;
>       }
>
>       coredump->adev = adev;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 60522963aaca..3197955264f9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
>       struct timespec64               reset_time;
>       bool                            reset_vram_lost;
>       struct amdgpu_ring                      *ring;
> +     struct amdgpu_vm_fault_info     fault_info;
>  };
>  #endif
>
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump
  2024-03-06 19:21   ` Deucher, Alexander
@ 2024-03-07  4:02     ` Khatri, Sunil
  0 siblings, 0 replies; 9+ messages in thread
From: Khatri, Sunil @ 2024-03-07  4:02 UTC (permalink / raw)
  To: Deucher, Alexander, Khatri, Sunil, Koenig, Christian,
	Sharma, Shashank
  Cc: amd-gfx@lists.freedesktop.org, dri-devel@lists.freedesktop.org,
	linux-kernel@vger.kernel.org, Joshi, Mukul,
	Paneer Selvam, Arunpravin


On 3/7/2024 12:51 AM, Deucher, Alexander wrote:
> [Public]
>
>> -----Original Message-----
>> From: Sunil Khatri <sunil.khatri@amd.com>
>> Sent: Wednesday, March 6, 2024 1:20 PM
>> To: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; Sharma, Shashank
>> <Shashank.Sharma@amd.com>
>> Cc: amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org; linux-
>> kernel@vger.kernel.org; Joshi, Mukul <Mukul.Joshi@amd.com>; Paneer
>> Selvam, Arunpravin <Arunpravin.PaneerSelvam@amd.com>; Khatri, Sunil
>> <Sunil.Khatri@amd.com>
>> Subject: [PATCH] drm/amdgpu: add vm fault information to devcoredump
>>
>> Add page fault information to the devcoredump.
>>
>> Output of devcoredump:
>> **** AMDGPU Device Coredump ****
>> version: 1
>> kernel: 6.7.0-amd-staging-drm-next
>> module: amdgpu
>> time: 29.725011811
>> process_name: soft_recovery_p PID: 1720
>>
>> Ring timed out details
>> IP Type: 0 Ring Name: gfx_0.0.0
>>
>> [gfxhub] Page fault observed for GPU family:143 Faulty page starting at
> I think we should add a separate section for the GPU identification information (family, PCI ids, IP versions, etc.).  For this patch, I think fine to just print the fault address and status.

Noted

Regards
Sunil

> Alex
>
>> address 0x0000000000000000 Protection fault status register:0x301031
>>
>> VRAM is lost due to GPU reset!
>>
>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++++++++++++++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>>   2 files changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index 147100c27c2d..d7fea6cdf2f9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t
>> offset, size_t count,
>>                           coredump->ring->name);
>>        }
>>
>> +     if (coredump->fault_info.status) {
>> +             struct amdgpu_vm_fault_info *fault_info = &coredump-
>>> fault_info;
>> +
>> +             drm_printf(&p, "\n[%s] Page fault observed for GPU
>> family:%d\n",
>> +                        fault_info->vmhub ? "mmhub" : "gfxhub",
>> +                        coredump->adev->family);
>> +             drm_printf(&p, "Faulty page starting at address 0x%016llx\n",
>> +                        fault_info->addr);
>> +             drm_printf(&p, "Protection fault status register:0x%x\n",
>> +                        fault_info->status);
>> +     }
>> +
>>        if (coredump->reset_vram_lost)
>> -             drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>> +             drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
>>        if (coredump->adev->reset_info.num_regs) {
>>                drm_printf(&p, "AMDGPU register dumps:\nOffset:
>> Value:\n");
>>
>> @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device
>> *adev, bool vram_lost,
>>        if (job) {
>>                s_job = &job->base;
>>                coredump->ring = to_amdgpu_ring(s_job->sched);
>> +             coredump->fault_info = job->vm->fault_info;
>>        }
>>
>>        coredump->adev = adev;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> index 60522963aaca..3197955264f9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> @@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
>>        struct timespec64               reset_time;
>>        bool                            reset_vram_lost;
>>        struct amdgpu_ring                      *ring;
>> +     struct amdgpu_vm_fault_info     fault_info;
>>   };
>>   #endif
>>
>> --
>> 2.34.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump
  2024-03-06 18:19 ` [PATCH] drm/amdgpu: add vm fault information to devcoredump Sunil Khatri
  2024-03-06 19:21   ` Deucher, Alexander
@ 2024-03-07  8:17   ` Christian König
  2024-03-07  8:37     ` Khatri, Sunil
  1 sibling, 1 reply; 9+ messages in thread
From: Christian König @ 2024-03-07  8:17 UTC (permalink / raw)
  To: Sunil Khatri, Alex Deucher, Shashank Sharma
  Cc: amd-gfx, dri-devel, linux-kernel, Mukul Joshi,
	Arunpravin Paneer Selvam

Am 06.03.24 um 19:19 schrieb Sunil Khatri:
> Add page fault information to the devcoredump.
>
> Output of devcoredump:
> **** AMDGPU Device Coredump ****
> version: 1
> kernel: 6.7.0-amd-staging-drm-next
> module: amdgpu
> time: 29.725011811
> process_name: soft_recovery_p PID: 1720
>
> Ring timed out details
> IP Type: 0 Ring Name: gfx_0.0.0
>
> [gfxhub] Page fault observed for GPU family:143
> Faulty page starting at address 0x0000000000000000
> Protection fault status register:0x301031
>
> VRAM is lost due to GPU reset!
>
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>   2 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 147100c27c2d..d7fea6cdf2f9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
>   			   coredump->ring->name);
>   	}
>   
> +	if (coredump->fault_info.status) {
> +		struct amdgpu_vm_fault_info *fault_info = &coredump->fault_info;
> +
> +		drm_printf(&p, "\n[%s] Page fault observed for GPU family:%d\n",
> +			   fault_info->vmhub ? "mmhub" : "gfxhub",
> +			   coredump->adev->family);
> +		drm_printf(&p, "Faulty page starting at address 0x%016llx\n",
> +			   fault_info->addr);
> +		drm_printf(&p, "Protection fault status register:0x%x\n",
> +			   fault_info->status);
> +	}
> +
>   	if (coredump->reset_vram_lost)
> -		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> +		drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
>   	if (coredump->adev->reset_info.num_regs) {
>   		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
>   
> @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>   	if (job) {
>   		s_job = &job->base;
>   		coredump->ring = to_amdgpu_ring(s_job->sched);
> +		coredump->fault_info = job->vm->fault_info;

That's illegal. The VM pointer might already be stale at this point.

I think you need to add the fault info of the last fault globally in the 
VRAM manager or move this to the process info Shashank is working on.

Regards,
Christian.

>   	}
>   
>   	coredump->adev = adev;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 60522963aaca..3197955264f9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
>   	struct timespec64               reset_time;
>   	bool                            reset_vram_lost;
>   	struct amdgpu_ring			*ring;
> +	struct amdgpu_vm_fault_info	fault_info;
>   };
>   #endif
>   


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump
  2024-03-07  8:17   ` Christian König
@ 2024-03-07  8:37     ` Khatri, Sunil
  2024-03-07 12:40       ` Christian König
  0 siblings, 1 reply; 9+ messages in thread
From: Khatri, Sunil @ 2024-03-07  8:37 UTC (permalink / raw)
  To: Christian König, Sunil Khatri, Alex Deucher, Shashank Sharma
  Cc: amd-gfx, dri-devel, linux-kernel, Mukul Joshi,
	Arunpravin Paneer Selvam


On 3/7/2024 1:47 PM, Christian König wrote:
> Am 06.03.24 um 19:19 schrieb Sunil Khatri:
>> Add page fault information to the devcoredump.
>>
>> Output of devcoredump:
>> **** AMDGPU Device Coredump ****
>> version: 1
>> kernel: 6.7.0-amd-staging-drm-next
>> module: amdgpu
>> time: 29.725011811
>> process_name: soft_recovery_p PID: 1720
>>
>> Ring timed out details
>> IP Type: 0 Ring Name: gfx_0.0.0
>>
>> [gfxhub] Page fault observed for GPU family:143
>> Faulty page starting at address 0x0000000000000000
>> Protection fault status register:0x301031
>>
>> VRAM is lost due to GPU reset!
>>
>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>>   2 files changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index 147100c27c2d..d7fea6cdf2f9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t 
>> offset, size_t count,
>>                  coredump->ring->name);
>>       }
>>   +    if (coredump->fault_info.status) {
>> +        struct amdgpu_vm_fault_info *fault_info = 
>> &coredump->fault_info;
>> +
>> +        drm_printf(&p, "\n[%s] Page fault observed for GPU 
>> family:%d\n",
>> +               fault_info->vmhub ? "mmhub" : "gfxhub",
>> +               coredump->adev->family);
>> +        drm_printf(&p, "Faulty page starting at address 0x%016llx\n",
>> +               fault_info->addr);
>> +        drm_printf(&p, "Protection fault status register:0x%x\n",
>> +               fault_info->status);
>> +    }
>> +
>>       if (coredump->reset_vram_lost)
>> -        drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>> +        drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
>>       if (coredump->adev->reset_info.num_regs) {
>>           drm_printf(&p, "AMDGPU register dumps:\nOffset:     
>> Value:\n");
>>   @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device 
>> *adev, bool vram_lost,
>>       if (job) {
>>           s_job = &job->base;
>>           coredump->ring = to_amdgpu_ring(s_job->sched);
>> +        coredump->fault_info = job->vm->fault_info;
>
> That's illegal. The VM pointer might already be stale at this point.
>
> I think you need to add the fault info of the last fault globally in 
> the VRAM manager or move this to the process info Shashank is working on.
> Are you saying that during the reset or otherwise a vm which is part 
> of this job could have been freed  and we might have a NULL 
> dereference or invalid reference? Till now based on the resets and 
> pagefaults that i have created till now using the same app which we 
> are using for IH overflow i am able to get the valid vm only.
>
> Assuming  amdgpu_vm is freed for this job or stale, are you suggesting 
> to update this information in adev-> vm_manager along with existing 
> per vm fault_info or only in vm_manager ?
>
> Regards,
> Christian.
>
>>       }
>>         coredump->adev = adev;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> index 60522963aaca..3197955264f9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> @@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
>>       struct timespec64               reset_time;
>>       bool                            reset_vram_lost;
>>       struct amdgpu_ring            *ring;
>> +    struct amdgpu_vm_fault_info    fault_info;
>>   };
>>   #endif
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump
  2024-03-07  8:37     ` Khatri, Sunil
@ 2024-03-07 12:40       ` Christian König
  2024-03-07 12:43         ` Khatri, Sunil
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2024-03-07 12:40 UTC (permalink / raw)
  To: Khatri, Sunil, Sunil Khatri, Alex Deucher, Shashank Sharma
  Cc: amd-gfx, dri-devel, linux-kernel, Mukul Joshi,
	Arunpravin Paneer Selvam, Sharma, Shashank

Am 07.03.24 um 09:37 schrieb Khatri, Sunil:
>
> On 3/7/2024 1:47 PM, Christian König wrote:
>> Am 06.03.24 um 19:19 schrieb Sunil Khatri:
>>> Add page fault information to the devcoredump.
>>>
>>> Output of devcoredump:
>>> **** AMDGPU Device Coredump ****
>>> version: 1
>>> kernel: 6.7.0-amd-staging-drm-next
>>> module: amdgpu
>>> time: 29.725011811
>>> process_name: soft_recovery_p PID: 1720
>>>
>>> Ring timed out details
>>> IP Type: 0 Ring Name: gfx_0.0.0
>>>
>>> [gfxhub] Page fault observed for GPU family:143
>>> Faulty page starting at address 0x0000000000000000
>>> Protection fault status register:0x301031
>>>
>>> VRAM is lost due to GPU reset!
>>>
>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++++++++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>>>   2 files changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> index 147100c27c2d..d7fea6cdf2f9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> @@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t 
>>> offset, size_t count,
>>>                  coredump->ring->name);
>>>       }
>>>   +    if (coredump->fault_info.status) {
>>> +        struct amdgpu_vm_fault_info *fault_info = 
>>> &coredump->fault_info;
>>> +
>>> +        drm_printf(&p, "\n[%s] Page fault observed for GPU 
>>> family:%d\n",
>>> +               fault_info->vmhub ? "mmhub" : "gfxhub",
>>> +               coredump->adev->family);
>>> +        drm_printf(&p, "Faulty page starting at address 0x%016llx\n",
>>> +               fault_info->addr);
>>> +        drm_printf(&p, "Protection fault status register:0x%x\n",
>>> +               fault_info->status);
>>> +    }
>>> +
>>>       if (coredump->reset_vram_lost)
>>> -        drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>>> +        drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
>>>       if (coredump->adev->reset_info.num_regs) {
>>>           drm_printf(&p, "AMDGPU register dumps:\nOffset:     
>>> Value:\n");
>>>   @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device 
>>> *adev, bool vram_lost,
>>>       if (job) {
>>>           s_job = &job->base;
>>>           coredump->ring = to_amdgpu_ring(s_job->sched);
>>> +        coredump->fault_info = job->vm->fault_info;
>>
>> That's illegal. The VM pointer might already be stale at this point.
>>
>> I think you need to add the fault info of the last fault globally in 
>> the VRAM manager or move this to the process info Shashank is working 
>> on.
>> Are you saying that during the reset or otherwise a vm which is part 
>> of this job could have been freed  and we might have a NULL 
>> dereference or invalid reference? Till now based on the resets and 
>> pagefaults that i have created till now using the same app which we 
>> are using for IH overflow i am able to get the valid vm only.
>>
>> Assuming  amdgpu_vm is freed for this job or stale, are you 
>> suggesting to update this information in adev-> vm_manager along with 
>> existing per vm fault_info or only in vm_manager ?

Good question. having it both in the VM as well as the VM manager sounds 
like the simplest option for now.

Regards,
Christian.

>>
>> Regards,
>> Christian.
>>
>>>       }
>>>         coredump->adev = adev;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> index 60522963aaca..3197955264f9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> @@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
>>>       struct timespec64               reset_time;
>>>       bool                            reset_vram_lost;
>>>       struct amdgpu_ring            *ring;
>>> +    struct amdgpu_vm_fault_info    fault_info;
>>>   };
>>>   #endif
>>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump
  2024-03-07 12:40       ` Christian König
@ 2024-03-07 12:43         ` Khatri, Sunil
  0 siblings, 0 replies; 9+ messages in thread
From: Khatri, Sunil @ 2024-03-07 12:43 UTC (permalink / raw)
  To: Christian König, Sunil Khatri, Alex Deucher, Shashank Sharma
  Cc: amd-gfx, dri-devel, linux-kernel, Mukul Joshi,
	Arunpravin Paneer Selvam


On 3/7/2024 6:10 PM, Christian König wrote:
> Am 07.03.24 um 09:37 schrieb Khatri, Sunil:
>>
>> On 3/7/2024 1:47 PM, Christian König wrote:
>>> Am 06.03.24 um 19:19 schrieb Sunil Khatri:
>>>> Add page fault information to the devcoredump.
>>>>
>>>> Output of devcoredump:
>>>> **** AMDGPU Device Coredump ****
>>>> version: 1
>>>> kernel: 6.7.0-amd-staging-drm-next
>>>> module: amdgpu
>>>> time: 29.725011811
>>>> process_name: soft_recovery_p PID: 1720
>>>>
>>>> Ring timed out details
>>>> IP Type: 0 Ring Name: gfx_0.0.0
>>>>
>>>> [gfxhub] Page fault observed for GPU family:143
>>>> Faulty page starting at address 0x0000000000000000
>>>> Protection fault status register:0x301031
>>>>
>>>> VRAM is lost due to GPU reset!
>>>>
>>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++++++++++++++-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>>>>   2 files changed, 15 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>> index 147100c27c2d..d7fea6cdf2f9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>>> @@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t 
>>>> offset, size_t count,
>>>>                  coredump->ring->name);
>>>>       }
>>>>   +    if (coredump->fault_info.status) {
>>>> +        struct amdgpu_vm_fault_info *fault_info = 
>>>> &coredump->fault_info;
>>>> +
>>>> +        drm_printf(&p, "\n[%s] Page fault observed for GPU 
>>>> family:%d\n",
>>>> +               fault_info->vmhub ? "mmhub" : "gfxhub",
>>>> +               coredump->adev->family);
>>>> +        drm_printf(&p, "Faulty page starting at address 0x%016llx\n",
>>>> +               fault_info->addr);
>>>> +        drm_printf(&p, "Protection fault status register:0x%x\n",
>>>> +               fault_info->status);
>>>> +    }
>>>> +
>>>>       if (coredump->reset_vram_lost)
>>>> -        drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>>>> +        drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
>>>>       if (coredump->adev->reset_info.num_regs) {
>>>>           drm_printf(&p, "AMDGPU register dumps:\nOffset:     
>>>> Value:\n");
>>>>   @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device 
>>>> *adev, bool vram_lost,
>>>>       if (job) {
>>>>           s_job = &job->base;
>>>>           coredump->ring = to_amdgpu_ring(s_job->sched);
>>>> +        coredump->fault_info = job->vm->fault_info;
>>>
>>> That's illegal. The VM pointer might already be stale at this point.
>>>
>>> I think you need to add the fault info of the last fault globally in 
>>> the VRAM manager or move this to the process info Shashank is 
>>> working on.
>>> Are you saying that during the reset or otherwise a vm which is part 
>>> of this job could have been freed  and we might have a NULL 
>>> dereference or invalid reference? Till now based on the resets and 
>>> pagefaults that i have created till now using the same app which we 
>>> are using for IH overflow i am able to get the valid vm only.
>>>
>>> Assuming  amdgpu_vm is freed for this job or stale, are you 
>>> suggesting to update this information in adev-> vm_manager along 
>>> with existing per vm fault_info or only in vm_manager ?
>
> Good question. having it both in the VM as well as the VM manager 
> sounds like the simplest option for now.

Let me update the patch then with information in VM manager.

Regards
Sunil

>
> Regards,
> Christian.
>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>       }
>>>>         coredump->adev = adev;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>> index 60522963aaca..3197955264f9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>>> @@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
>>>>       struct timespec64               reset_time;
>>>>       bool                            reset_vram_lost;
>>>>       struct amdgpu_ring            *ring;
>>>> +    struct amdgpu_vm_fault_info    fault_info;
>>>>   };
>>>>   #endif
>>>
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-03-07 12:43 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-03-06 18:19 [PATCH] drm/amdgpu: cache in more vm fault information Sunil Khatri
2024-03-06 18:19 ` [PATCH] drm/amdgpu: add vm fault information to devcoredump Sunil Khatri
2024-03-06 19:21   ` Deucher, Alexander
2024-03-07  4:02     ` Khatri, Sunil
2024-03-07  8:17   ` Christian König
2024-03-07  8:37     ` Khatri, Sunil
2024-03-07 12:40       ` Christian König
2024-03-07 12:43         ` Khatri, Sunil
2024-03-06 18:20 ` [PATCH] drm/amdgpu: cache in more vm fault information Khatri, Sunil

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.