* [PATCH 1/4] drm/amdgpu: check if vram is lost v2
@ 2017-05-16 9:25 Chunming Zhou
[not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
0 siblings, 1 reply; 22+ messages in thread
From: Chunming Zhou @ 2017-05-16 9:25 UTC (permalink / raw)
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Chunming Zhou
bakup first 64 byte of gart table as reset magic, check if magic is same
after gpu hw reset.
v2: use memcmp instead of manual innovation.
Change-Id: I9a73720da4084ea8677c3031dfb62e8157ee5704
Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++-
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index de08ff0..f9da215 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1502,6 +1502,7 @@ struct amdgpu_ssg {
#endif
};
+#define AMDGPU_RESET_MAGIC_NUM 64
struct amdgpu_device {
struct device *dev;
struct drm_device *ddev;
@@ -1705,6 +1706,7 @@ struct amdgpu_device {
/* record hw reset is performed */
bool has_hw_reset;
+ u8 reset_magic[AMDGPU_RESET_MAGIC_NUM];
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0a31fb1..c56ae4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1685,6 +1685,17 @@ static int amdgpu_init(struct amdgpu_device *adev)
return 0;
}
+static void amdgpu_fill_reset_magic(struct amdgpu_device *adev)
+{
+ memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
+}
+
+static bool amdgpu_check_vram_lost(struct amdgpu_device *adev)
+{
+ return !!memcmp(adev->gart.ptr, adev->reset_magic,
+ AMDGPU_RESET_MAGIC_NUM);
+}
+
static int amdgpu_late_init(struct amdgpu_device *adev)
{
int i = 0, r;
@@ -1715,6 +1726,8 @@ static int amdgpu_late_init(struct amdgpu_device *adev)
}
}
+ amdgpu_fill_reset_magic(adev);
+
return 0;
}
@@ -2830,7 +2843,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
struct drm_atomic_state *state = NULL;
int i, r;
int resched;
- bool need_full_reset;
+ bool need_full_reset, vram_lost = false;
if (amdgpu_sriov_vf(adev))
return amdgpu_sriov_gpu_reset(adev, true);
@@ -2899,12 +2912,17 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
r = amdgpu_resume_phase1(adev);
if (r)
goto out;
+ vram_lost = amdgpu_check_vram_lost(adev);
+ if (vram_lost)
+ DRM_ERROR("VRAM is lost!\n");
r = amdgpu_ttm_recover_gart(adev);
if (r)
goto out;
r = amdgpu_resume_phase2(adev);
if (r)
goto out;
+ if (vram_lost)
+ amdgpu_fill_reset_magic(adev);
}
}
out:
--
1.9.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 22+ messages in thread[parent not found: <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>]
* [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when vram is lost v2 [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org> @ 2017-05-16 9:25 ` Chunming Zhou [not found] ` <1494926750-1081-2-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org> 2017-05-16 9:25 ` [PATCH 3/4] drm/amdgpu: skip all jobs of guilty vm Chunming Zhou ` (2 subsequent siblings) 3 siblings, 1 reply; 22+ messages in thread From: Chunming Zhou @ 2017-05-16 9:25 UTC (permalink / raw) To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Chunming Zhou below ioctl will return -ENODEV: amdgpu_cs_ioctl amdgpu_cs_wait_ioctl amdgpu_cs_wait_fences_ioctl amdgpu_gem_va_ioctl amdgpu_info_ioctl v2: only for map and replace cases in amdgpu_gem_va_ioctl Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 9 +++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 ++++++++++ 5 files changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index f9da215..dcd6203 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -855,6 +855,7 @@ struct amdgpu_fpriv { struct amdgpu_ctx_mgr ctx_mgr; spinlock_t sem_handles_lock; struct idr sem_handles; + u32 vram_lost_counter; }; /* @@ -1607,6 +1608,7 @@ struct amdgpu_device { atomic64_t num_bytes_moved; atomic64_t num_evictions; atomic_t gpu_reset_counter; + atomic_t vram_lost_counter; /* data for buffer migration throttling */ struct { @@ -2005,6 +2007,8 @@ static inline void amdgpu_unregister_atpx_handler(void) {} extern const struct drm_ioctl_desc amdgpu_ioctls_kms[]; extern const int amdgpu_max_kms_ioctl; +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, + struct amdgpu_fpriv *fpriv); int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags); int amdgpu_driver_unload_kms(struct drm_device *dev); void amdgpu_driver_lastclose_kms(struct drm_device *dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index b803412..911aa02 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_fpriv *fpriv = filp->driver_priv; union drm_amdgpu_cs *cs = data; struct amdgpu_cs_parser parser = {}; bool reserved_buffers = false; @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) if (!adev->accel_working) return -EBUSY; + if (amdgpu_kms_vram_lost(adev, fpriv)) + return -ENODEV; parser.adev = adev; parser.filp = filp; @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data, { union drm_amdgpu_wait_cs *wait = data; struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_fpriv *fpriv = filp->driver_priv; unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout); struct amdgpu_ring *ring = NULL; struct amdgpu_ctx *ctx; struct fence *fence; long r; + if (amdgpu_kms_vram_lost(adev, fpriv)) + return -ENODEV; r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait->in.ip_instance, wait->in.ring, &ring); if (r) @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_fpriv *fpriv = filp->driver_priv; union drm_amdgpu_wait_fences *wait = data; uint32_t fence_count = wait->in.fence_count; struct drm_amdgpu_fence *fences_user; struct drm_amdgpu_fence *fences; int r; + if (amdgpu_kms_vram_lost(adev, fpriv)) + return -ENODEV; /* Get the fences from userspace */ fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence), GFP_KERNEL); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c56ae4a..2f0fcf8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2913,8 +2913,10 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) if (r) goto out; vram_lost = amdgpu_check_vram_lost(adev); - if (vram_lost) + if (vram_lost) { DRM_ERROR("VRAM is lost!\n"); + atomic_inc(&adev->vram_lost_counter); + } r = amdgpu_ttm_recover_gart(adev); if (r) goto out; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index d8275ef..83bc94c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -802,6 +802,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, args->operation); return -EINVAL; } + if ((args->operation == AMDGPU_VA_OP_MAP) || + (args->operation == AMDGPU_VA_OP_REPLACE)) { + if (amdgpu_kms_vram_lost(adev, fpriv)) + return -ENODEV; + } INIT_LIST_HEAD(&list); if ((args->operation != AMDGPU_VA_OP_CLEAR) && diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 368829a..a231aa1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info, static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_fpriv *fpriv = filp->driver_priv; struct drm_amdgpu_info *info = data; struct amdgpu_mode_info *minfo = &adev->mode_info; void __user *out = (void __user *)(uintptr_t)info->return_pointer; @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file if (!info->return_size || !info->return_pointer) return -EINVAL; + if (amdgpu_kms_vram_lost(adev, fpriv)) + return -ENODEV; switch (info->query) { case AMDGPU_INFO_VIRTUAL_RANGE: { @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct drm_device *dev) vga_switcheroo_process_delayed_switch(); } +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, + struct amdgpu_fpriv *fpriv) +{ + return fpriv->vram_lost_counter != atomic_read(&adev->vram_lost_counter); +} + /** * amdgpu_driver_open_kms - drm callback for open * @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) amdgpu_ctx_mgr_init(&fpriv->ctx_mgr); + fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter); file_priv->driver_priv = fpriv; out_suspend: -- 1.9.1 _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply related [flat|nested] 22+ messages in thread
[parent not found: <1494926750-1081-2-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>]
* RE: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when vram is lost v2 [not found] ` <1494926750-1081-2-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org> @ 2017-05-23 15:08 ` Deucher, Alexander [not found] ` <BN6PR12MB1652E1AFF691F58C92FC2CD7F7F90-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: Deucher, Alexander @ 2017-05-23 15:08 UTC (permalink / raw) To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org Cc: Zhou, David(ChunMing) > -----Original Message----- > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf > Of Chunming Zhou > Sent: Tuesday, May 16, 2017 5:26 AM > To: amd-gfx@lists.freedesktop.org > Cc: Zhou, David(ChunMing) > Subject: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when > vram is lost v2 > > below ioctl will return -ENODEV: > amdgpu_cs_ioctl > amdgpu_cs_wait_ioctl > amdgpu_cs_wait_fences_ioctl > amdgpu_gem_va_ioctl > amdgpu_info_ioctl Do we want to block the info ioctl? Isn't that where the lost context query is? Alex > > v2: only for map and replace cases in amdgpu_gem_va_ioctl > > Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e > Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 9 +++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- > drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 5 +++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 ++++++++++ > 5 files changed, 31 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index f9da215..dcd6203 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -855,6 +855,7 @@ struct amdgpu_fpriv { > struct amdgpu_ctx_mgr ctx_mgr; > spinlock_t sem_handles_lock; > struct idr sem_handles; > + u32 vram_lost_counter; > }; > > /* > @@ -1607,6 +1608,7 @@ struct amdgpu_device { > atomic64_t num_bytes_moved; > atomic64_t num_evictions; > atomic_t gpu_reset_counter; > + atomic_t vram_lost_counter; > > /* data for buffer migration throttling */ > struct { > @@ -2005,6 +2007,8 @@ static inline void > amdgpu_unregister_atpx_handler(void) {} > extern const struct drm_ioctl_desc amdgpu_ioctls_kms[]; > extern const int amdgpu_max_kms_ioctl; > > +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, > + struct amdgpu_fpriv *fpriv); > int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags); > int amdgpu_driver_unload_kms(struct drm_device *dev); > void amdgpu_driver_lastclose_kms(struct drm_device *dev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index b803412..911aa02 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct > amdgpu_cs_parser *p, > int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file > *filp) > { > struct amdgpu_device *adev = dev->dev_private; > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > union drm_amdgpu_cs *cs = data; > struct amdgpu_cs_parser parser = {}; > bool reserved_buffers = false; > @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void > *data, struct drm_file *filp) > > if (!adev->accel_working) > return -EBUSY; > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; > > parser.adev = adev; > parser.filp = filp; > @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device > *dev, void *data, > { > union drm_amdgpu_wait_cs *wait = data; > struct amdgpu_device *adev = dev->dev_private; > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout); > struct amdgpu_ring *ring = NULL; > struct amdgpu_ctx *ctx; > struct fence *fence; > long r; > > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; > r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait- > >in.ip_instance, > wait->in.ring, &ring); > if (r) > @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct > drm_device *dev, void *data, > struct drm_file *filp) > { > struct amdgpu_device *adev = dev->dev_private; > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > union drm_amdgpu_wait_fences *wait = data; > uint32_t fence_count = wait->in.fence_count; > struct drm_amdgpu_fence *fences_user; > struct drm_amdgpu_fence *fences; > int r; > > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; > /* Get the fences from userspace */ > fences = kmalloc_array(fence_count, sizeof(struct > drm_amdgpu_fence), > GFP_KERNEL); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index c56ae4a..2f0fcf8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2913,8 +2913,10 @@ int amdgpu_gpu_reset(struct amdgpu_device > *adev) > if (r) > goto out; > vram_lost = amdgpu_check_vram_lost(adev); > - if (vram_lost) > + if (vram_lost) { > DRM_ERROR("VRAM is lost!\n"); > + atomic_inc(&adev->vram_lost_counter); > + } > r = amdgpu_ttm_recover_gart(adev); > if (r) > goto out; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > index d8275ef..83bc94c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > @@ -802,6 +802,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, > void *data, > args->operation); > return -EINVAL; > } > + if ((args->operation == AMDGPU_VA_OP_MAP) || > + (args->operation == AMDGPU_VA_OP_REPLACE)) { > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; > + } > > INIT_LIST_HEAD(&list); > if ((args->operation != AMDGPU_VA_OP_CLEAR) && > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > index 368829a..a231aa1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct > drm_amdgpu_info_firmware *fw_info, > static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct > drm_file *filp) > { > struct amdgpu_device *adev = dev->dev_private; > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > struct drm_amdgpu_info *info = data; > struct amdgpu_mode_info *minfo = &adev->mode_info; > void __user *out = (void __user *)(uintptr_t)info->return_pointer; > @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, > void *data, struct drm_file > > if (!info->return_size || !info->return_pointer) > return -EINVAL; > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; > > switch (info->query) { > case AMDGPU_INFO_VIRTUAL_RANGE: { > @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct > drm_device *dev) > vga_switcheroo_process_delayed_switch(); > } > > +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, > + struct amdgpu_fpriv *fpriv) > +{ > + return fpriv->vram_lost_counter != atomic_read(&adev- > >vram_lost_counter); > +} > + > /** > * amdgpu_driver_open_kms - drm callback for open > * > @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device > *dev, struct drm_file *file_priv) > > amdgpu_ctx_mgr_init(&fpriv->ctx_mgr); > > + fpriv->vram_lost_counter = atomic_read(&adev- > >vram_lost_counter); > file_priv->driver_priv = fpriv; > > out_suspend: > -- > 1.9.1 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <BN6PR12MB1652E1AFF691F58C92FC2CD7F7F90-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>]
* Re: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when vram is lost v2 [not found] ` <BN6PR12MB1652E1AFF691F58C92FC2CD7F7F90-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org> @ 2017-05-23 15:16 ` Christian König [not found] ` <69717c0b-b2c1-589a-c466-5d6be9518eda-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: Christian König @ 2017-05-23 15:16 UTC (permalink / raw) To: Deucher, Alexander, Zhou, David(ChunMing), amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org Am 23.05.2017 um 17:08 schrieb Deucher, Alexander: >> -----Original Message----- >> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf >> Of Chunming Zhou >> Sent: Tuesday, May 16, 2017 5:26 AM >> To: amd-gfx@lists.freedesktop.org >> Cc: Zhou, David(ChunMing) >> Subject: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when >> vram is lost v2 >> >> below ioctl will return -ENODEV: >> amdgpu_cs_ioctl >> amdgpu_cs_wait_ioctl >> amdgpu_cs_wait_fences_ioctl >> amdgpu_gem_va_ioctl >> amdgpu_info_ioctl > Do we want to block the info ioctl? Isn't that where the lost context query is? No, that's amdgpu_ctx_ioctl. But I think the conclusion is that we want to move the vram_lost counter to be per CTX and not per device. Christian. > > Alex > >> v2: only for map and replace cases in amdgpu_gem_va_ioctl >> >> Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e >> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 9 +++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 5 +++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 ++++++++++ >> 5 files changed, 31 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index f9da215..dcd6203 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -855,6 +855,7 @@ struct amdgpu_fpriv { >> struct amdgpu_ctx_mgr ctx_mgr; >> spinlock_t sem_handles_lock; >> struct idr sem_handles; >> + u32 vram_lost_counter; >> }; >> >> /* >> @@ -1607,6 +1608,7 @@ struct amdgpu_device { >> atomic64_t num_bytes_moved; >> atomic64_t num_evictions; >> atomic_t gpu_reset_counter; >> + atomic_t vram_lost_counter; >> >> /* data for buffer migration throttling */ >> struct { >> @@ -2005,6 +2007,8 @@ static inline void >> amdgpu_unregister_atpx_handler(void) {} >> extern const struct drm_ioctl_desc amdgpu_ioctls_kms[]; >> extern const int amdgpu_max_kms_ioctl; >> >> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, >> + struct amdgpu_fpriv *fpriv); >> int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags); >> int amdgpu_driver_unload_kms(struct drm_device *dev); >> void amdgpu_driver_lastclose_kms(struct drm_device *dev); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> index b803412..911aa02 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct >> amdgpu_cs_parser *p, >> int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file >> *filp) >> { >> struct amdgpu_device *adev = dev->dev_private; >> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >> union drm_amdgpu_cs *cs = data; >> struct amdgpu_cs_parser parser = {}; >> bool reserved_buffers = false; >> @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void >> *data, struct drm_file *filp) >> >> if (!adev->accel_working) >> return -EBUSY; >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; >> >> parser.adev = adev; >> parser.filp = filp; >> @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device >> *dev, void *data, >> { >> union drm_amdgpu_wait_cs *wait = data; >> struct amdgpu_device *adev = dev->dev_private; >> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >> unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout); >> struct amdgpu_ring *ring = NULL; >> struct amdgpu_ctx *ctx; >> struct fence *fence; >> long r; >> >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; >> r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait- >>> in.ip_instance, >> wait->in.ring, &ring); >> if (r) >> @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct >> drm_device *dev, void *data, >> struct drm_file *filp) >> { >> struct amdgpu_device *adev = dev->dev_private; >> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >> union drm_amdgpu_wait_fences *wait = data; >> uint32_t fence_count = wait->in.fence_count; >> struct drm_amdgpu_fence *fences_user; >> struct drm_amdgpu_fence *fences; >> int r; >> >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; >> /* Get the fences from userspace */ >> fences = kmalloc_array(fence_count, sizeof(struct >> drm_amdgpu_fence), >> GFP_KERNEL); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index c56ae4a..2f0fcf8 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -2913,8 +2913,10 @@ int amdgpu_gpu_reset(struct amdgpu_device >> *adev) >> if (r) >> goto out; >> vram_lost = amdgpu_check_vram_lost(adev); >> - if (vram_lost) >> + if (vram_lost) { >> DRM_ERROR("VRAM is lost!\n"); >> + atomic_inc(&adev->vram_lost_counter); >> + } >> r = amdgpu_ttm_recover_gart(adev); >> if (r) >> goto out; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >> index d8275ef..83bc94c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >> @@ -802,6 +802,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, >> void *data, >> args->operation); >> return -EINVAL; >> } >> + if ((args->operation == AMDGPU_VA_OP_MAP) || >> + (args->operation == AMDGPU_VA_OP_REPLACE)) { >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; >> + } >> >> INIT_LIST_HEAD(&list); >> if ((args->operation != AMDGPU_VA_OP_CLEAR) && >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> index 368829a..a231aa1 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct >> drm_amdgpu_info_firmware *fw_info, >> static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct >> drm_file *filp) >> { >> struct amdgpu_device *adev = dev->dev_private; >> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >> struct drm_amdgpu_info *info = data; >> struct amdgpu_mode_info *minfo = &adev->mode_info; >> void __user *out = (void __user *)(uintptr_t)info->return_pointer; >> @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, >> void *data, struct drm_file >> >> if (!info->return_size || !info->return_pointer) >> return -EINVAL; >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; >> >> switch (info->query) { >> case AMDGPU_INFO_VIRTUAL_RANGE: { >> @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct >> drm_device *dev) >> vga_switcheroo_process_delayed_switch(); >> } >> >> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, >> + struct amdgpu_fpriv *fpriv) >> +{ >> + return fpriv->vram_lost_counter != atomic_read(&adev- >>> vram_lost_counter); >> +} >> + >> /** >> * amdgpu_driver_open_kms - drm callback for open >> * >> @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device >> *dev, struct drm_file *file_priv) >> >> amdgpu_ctx_mgr_init(&fpriv->ctx_mgr); >> >> + fpriv->vram_lost_counter = atomic_read(&adev- >>> vram_lost_counter); >> file_priv->driver_priv = fpriv; >> >> out_suspend: >> -- >> 1.9.1 >> >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <69717c0b-b2c1-589a-c466-5d6be9518eda-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>]
* Re: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when vram is lost v2 [not found] ` <69717c0b-b2c1-589a-c466-5d6be9518eda-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org> @ 2017-05-24 2:20 ` zhoucm1 0 siblings, 0 replies; 22+ messages in thread From: zhoucm1 @ 2017-05-24 2:20 UTC (permalink / raw) To: Christian König, Deucher, Alexander, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org On 2017年05月23日 23:16, Christian König wrote: > Am 23.05.2017 um 17:08 schrieb Deucher, Alexander: >>> -----Original Message----- >>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf >>> Of Chunming Zhou >>> Sent: Tuesday, May 16, 2017 5:26 AM >>> To: amd-gfx@lists.freedesktop.org >>> Cc: Zhou, David(ChunMing) >>> Subject: [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when >>> vram is lost v2 >>> >>> below ioctl will return -ENODEV: >>> amdgpu_cs_ioctl >>> amdgpu_cs_wait_ioctl >>> amdgpu_cs_wait_fences_ioctl >>> amdgpu_gem_va_ioctl >>> amdgpu_info_ioctl >> Do we want to block the info ioctl? Isn't that where the lost >> context query is? > > No, that's amdgpu_ctx_ioctl. > > But I think the conclusion is that we want to move the vram_lost > counter to be per CTX and not per device. Yes, Monk is working on it for virt case, after it, I think we can reuse it. Regards, David zhou > > Christian. > >> >> Alex >> >>> v2: only for map and replace cases in amdgpu_gem_va_ioctl >>> >>> Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e >>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++++ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 9 +++++++++ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 5 +++++ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 ++++++++++ >>> 5 files changed, 31 insertions(+), 1 deletion(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> index f9da215..dcd6203 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> @@ -855,6 +855,7 @@ struct amdgpu_fpriv { >>> struct amdgpu_ctx_mgr ctx_mgr; >>> spinlock_t sem_handles_lock; >>> struct idr sem_handles; >>> + u32 vram_lost_counter; >>> }; >>> >>> /* >>> @@ -1607,6 +1608,7 @@ struct amdgpu_device { >>> atomic64_t num_bytes_moved; >>> atomic64_t num_evictions; >>> atomic_t gpu_reset_counter; >>> + atomic_t vram_lost_counter; >>> >>> /* data for buffer migration throttling */ >>> struct { >>> @@ -2005,6 +2007,8 @@ static inline void >>> amdgpu_unregister_atpx_handler(void) {} >>> extern const struct drm_ioctl_desc amdgpu_ioctls_kms[]; >>> extern const int amdgpu_max_kms_ioctl; >>> >>> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, >>> + struct amdgpu_fpriv *fpriv); >>> int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long >>> flags); >>> int amdgpu_driver_unload_kms(struct drm_device *dev); >>> void amdgpu_driver_lastclose_kms(struct drm_device *dev); >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >>> index b803412..911aa02 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >>> @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct >>> amdgpu_cs_parser *p, >>> int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct >>> drm_file >>> *filp) >>> { >>> struct amdgpu_device *adev = dev->dev_private; >>> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >>> union drm_amdgpu_cs *cs = data; >>> struct amdgpu_cs_parser parser = {}; >>> bool reserved_buffers = false; >>> @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void >>> *data, struct drm_file *filp) >>> >>> if (!adev->accel_working) >>> return -EBUSY; >>> + if (amdgpu_kms_vram_lost(adev, fpriv)) >>> + return -ENODEV; >>> >>> parser.adev = adev; >>> parser.filp = filp; >>> @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device >>> *dev, void *data, >>> { >>> union drm_amdgpu_wait_cs *wait = data; >>> struct amdgpu_device *adev = dev->dev_private; >>> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >>> unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout); >>> struct amdgpu_ring *ring = NULL; >>> struct amdgpu_ctx *ctx; >>> struct fence *fence; >>> long r; >>> >>> + if (amdgpu_kms_vram_lost(adev, fpriv)) >>> + return -ENODEV; >>> r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait- >>>> in.ip_instance, >>> wait->in.ring, &ring); >>> if (r) >>> @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct >>> drm_device *dev, void *data, >>> struct drm_file *filp) >>> { >>> struct amdgpu_device *adev = dev->dev_private; >>> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >>> union drm_amdgpu_wait_fences *wait = data; >>> uint32_t fence_count = wait->in.fence_count; >>> struct drm_amdgpu_fence *fences_user; >>> struct drm_amdgpu_fence *fences; >>> int r; >>> >>> + if (amdgpu_kms_vram_lost(adev, fpriv)) >>> + return -ENODEV; >>> /* Get the fences from userspace */ >>> fences = kmalloc_array(fence_count, sizeof(struct >>> drm_amdgpu_fence), >>> GFP_KERNEL); >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index c56ae4a..2f0fcf8 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -2913,8 +2913,10 @@ int amdgpu_gpu_reset(struct amdgpu_device >>> *adev) >>> if (r) >>> goto out; >>> vram_lost = amdgpu_check_vram_lost(adev); >>> - if (vram_lost) >>> + if (vram_lost) { >>> DRM_ERROR("VRAM is lost!\n"); >>> + atomic_inc(&adev->vram_lost_counter); >>> + } >>> r = amdgpu_ttm_recover_gart(adev); >>> if (r) >>> goto out; >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >>> index d8275ef..83bc94c 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >>> @@ -802,6 +802,11 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, >>> void *data, >>> args->operation); >>> return -EINVAL; >>> } >>> + if ((args->operation == AMDGPU_VA_OP_MAP) || >>> + (args->operation == AMDGPU_VA_OP_REPLACE)) { >>> + if (amdgpu_kms_vram_lost(adev, fpriv)) >>> + return -ENODEV; >>> + } >>> >>> INIT_LIST_HEAD(&list); >>> if ((args->operation != AMDGPU_VA_OP_CLEAR) && >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >>> index 368829a..a231aa1 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >>> @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct >>> drm_amdgpu_info_firmware *fw_info, >>> static int amdgpu_info_ioctl(struct drm_device *dev, void *data, >>> struct >>> drm_file *filp) >>> { >>> struct amdgpu_device *adev = dev->dev_private; >>> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >>> struct drm_amdgpu_info *info = data; >>> struct amdgpu_mode_info *minfo = &adev->mode_info; >>> void __user *out = (void __user >>> *)(uintptr_t)info->return_pointer; >>> @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device >>> *dev, >>> void *data, struct drm_file >>> >>> if (!info->return_size || !info->return_pointer) >>> return -EINVAL; >>> + if (amdgpu_kms_vram_lost(adev, fpriv)) >>> + return -ENODEV; >>> >>> switch (info->query) { >>> case AMDGPU_INFO_VIRTUAL_RANGE: { >>> @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct >>> drm_device *dev) >>> vga_switcheroo_process_delayed_switch(); >>> } >>> >>> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, >>> + struct amdgpu_fpriv *fpriv) >>> +{ >>> + return fpriv->vram_lost_counter != atomic_read(&adev- >>>> vram_lost_counter); >>> +} >>> + >>> /** >>> * amdgpu_driver_open_kms - drm callback for open >>> * >>> @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device >>> *dev, struct drm_file *file_priv) >>> >>> amdgpu_ctx_mgr_init(&fpriv->ctx_mgr); >>> >>> + fpriv->vram_lost_counter = atomic_read(&adev- >>>> vram_lost_counter); >>> file_priv->driver_priv = fpriv; >>> >>> out_suspend: >>> -- >>> 1.9.1 >>> >>> _______________________________________________ >>> amd-gfx mailing list >>> amd-gfx@lists.freedesktop.org >>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH 3/4] drm/amdgpu: skip all jobs of guilty vm [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org> 2017-05-16 9:25 ` [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when " Chunming Zhou @ 2017-05-16 9:25 ` Chunming Zhou 2017-05-16 9:25 ` [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter Chunming Zhou 2017-05-16 10:49 ` [PATCH 1/4] drm/amdgpu: check if vram is lost v2 Christian König 3 siblings, 0 replies; 22+ messages in thread From: Chunming Zhou @ 2017-05-16 9:25 UTC (permalink / raw) To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Chunming Zhou Change-Id: Ib305282a21d37d6872afe4c1ce63d53b6517f338 Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 4a5ffa5..da2f87e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -167,6 +167,7 @@ static struct fence *amdgpu_job_run(struct amd_sched_job *sched_job) { struct fence *fence = NULL; struct amdgpu_job *job; + struct amdgpu_fpriv *fpriv = NULL; int r; if (!sched_job) { @@ -178,10 +179,16 @@ static struct fence *amdgpu_job_run(struct amd_sched_job *sched_job) BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); trace_amdgpu_sched_run_job(job); - r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence); - if (r) - DRM_ERROR("Error scheduling IBs (%d)\n", r); - + if (job->vm) + fpriv = container_of(job->vm, struct amdgpu_fpriv, vm); + /* skip ib schedule when vram is lost */ + if (fpriv && amdgpu_kms_vram_lost(job->adev, fpriv)) + DRM_ERROR("Skip scheduling IBs!\n"); + else { + r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence); + if (r) + DRM_ERROR("Error scheduling IBs (%d)\n", r); + } /* if gpu reset, hw fence will be replaced here */ fence_put(job->fence); job->fence = fence_get(fence); -- 1.9.1 _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply related [flat|nested] 22+ messages in thread
* [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org> 2017-05-16 9:25 ` [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when " Chunming Zhou 2017-05-16 9:25 ` [PATCH 3/4] drm/amdgpu: skip all jobs of guilty vm Chunming Zhou @ 2017-05-16 9:25 ` Chunming Zhou [not found] ` <1494926750-1081-4-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org> 2017-05-16 10:49 ` [PATCH 1/4] drm/amdgpu: check if vram is lost v2 Christian König 3 siblings, 1 reply; 22+ messages in thread From: Chunming Zhou @ 2017-05-16 9:25 UTC (permalink / raw) To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Chunming Zhou Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++ include/uapi/drm/amdgpu_drm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index bca1fb5..f3e7525 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) case AMDGPU_VM_OP_UNRESERVE_VMID: amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB); break; + case AMDGPU_VM_OP_RESET: + fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter); + break; default: return -EINVAL; } diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 81141e1..ed2ef9b 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -246,6 +246,7 @@ struct drm_amdgpu_sem_in { /* vm ioctl */ #define AMDGPU_VM_OP_RESERVE_VMID 1 #define AMDGPU_VM_OP_UNRESERVE_VMID 2 +#define AMDGPU_VM_OP_RESET 3 struct drm_amdgpu_vm_in { /** AMDGPU_VM_OP_* */ -- 1.9.1 _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply related [flat|nested] 22+ messages in thread
[parent not found: <1494926750-1081-4-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <1494926750-1081-4-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org> @ 2017-05-17 1:18 ` Michel Dänzer [not found] ` <58988726-543a-535a-3011-860d29b9f2da-otUistvHUpPR7s880joybQ@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: Michel Dänzer @ 2017-05-17 1:18 UTC (permalink / raw) To: Chunming Zhou; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW On 16/05/17 06:25 PM, Chunming Zhou wrote: > Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 > Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++ > include/uapi/drm/amdgpu_drm.h | 1 + > 2 files changed, 4 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index bca1fb5..f3e7525 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) > case AMDGPU_VM_OP_UNRESERVE_VMID: > amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB); > break; > + case AMDGPU_VM_OP_RESET: > + fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter); > + break; How do you envision the UMDs using this? I can mostly think of them calling this ioctl when a context is created or destroyed. But that would also allow any other remaining contexts using the same DRM file descriptor to use all ioctls again. So, I think there needs to be a vram_lost_counter in struct amdgpu_ctx instead of in struct amdgpu_fpriv. But then I'm not sure this ioctl will be useful... I guess the UMD could try re-uploading the contents of crucial BOs (shader code, resource descriptors etc.) for an existing context and then call this ioctl. How about the GPUVM page tables? Will the kernel driver automatically re-generate those as needed, or will the UMD also need to e.g. destroy and re-create the VM mappings for all BOs before calling this ioctl? It's hard to be sure whether that's workable for the UMD without at least a working prototype... -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Mesa and X developer _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <58988726-543a-535a-3011-860d29b9f2da-otUistvHUpPR7s880joybQ@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <58988726-543a-535a-3011-860d29b9f2da-otUistvHUpPR7s880joybQ@public.gmane.org> @ 2017-05-17 3:04 ` zhoucm1 [not found] ` <591BBDA2.1070900-5C7GfCeVMHo@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: zhoucm1 @ 2017-05-17 3:04 UTC (permalink / raw) To: Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW On 2017年05月17日 09:18, Michel Dänzer wrote: > On 16/05/17 06:25 PM, Chunming Zhou wrote: >> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++ >> include/uapi/drm/amdgpu_drm.h | 1 + >> 2 files changed, 4 insertions(+) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >> index bca1fb5..f3e7525 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) >> case AMDGPU_VM_OP_UNRESERVE_VMID: >> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB); >> break; >> + case AMDGPU_VM_OP_RESET: >> + fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter); >> + break; > How do you envision the UMDs using this? I can mostly think of them > calling this ioctl when a context is created or destroyed. But that > would also allow any other remaining contexts using the same DRM file > descriptor to use all ioctls again. So, I think there needs to be a > vram_lost_counter in struct amdgpu_ctx instead of in struct amdgpu_fpriv. struct amdgpu_fpriv for vram_lost_counter is proper place, especially for ioctl return value. if you need to reset ctx one by one, we can mark all contexts of that vm, and then reset by userspace. > > But then I'm not sure this ioctl will be useful... I guess the UMD could > try re-uploading the contents of crucial BOs (shader code, resource > descriptors etc.) for an existing context and then call this ioctl. How > about the GPUVM page tables? Will the kernel driver automatically > re-generate those as needed, or will the UMD also need to e.g. destroy > and re-create the VM mappings for all BOs before calling this ioctl? VM page table will recover by vm shadow BOs. > > It's hard to be sure whether that's workable for the UMD without at > least a working prototype... Totally agree, if you can help to do this in userspace, I'd like to support you from kernel side, or Christian. Thanks, David Zhou > > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <591BBDA2.1070900-5C7GfCeVMHo@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <591BBDA2.1070900-5C7GfCeVMHo@public.gmane.org> @ 2017-05-17 3:15 ` Michel Dänzer [not found] ` <29fe2142-7fd1-e23a-49d9-c38dc685db92-otUistvHUpPR7s880joybQ@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: Michel Dänzer @ 2017-05-17 3:15 UTC (permalink / raw) To: zhoucm1; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW On 17/05/17 12:04 PM, zhoucm1 wrote: > On 2017年05月17日 09:18, Michel Dänzer wrote: >> On 16/05/17 06:25 PM, Chunming Zhou wrote: >>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++ >>> include/uapi/drm/amdgpu_drm.h | 1 + >>> 2 files changed, 4 insertions(+) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>> index bca1fb5..f3e7525 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, >>> void *data, struct drm_file *filp) >>> case AMDGPU_VM_OP_UNRESERVE_VMID: >>> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB); >>> break; >>> + case AMDGPU_VM_OP_RESET: >>> + fpriv->vram_lost_counter = >>> atomic_read(&adev->vram_lost_counter); >>> + break; >> How do you envision the UMDs using this? I can mostly think of them >> calling this ioctl when a context is created or destroyed. But that >> would also allow any other remaining contexts using the same DRM file >> descriptor to use all ioctls again. So, I think there needs to be a >> vram_lost_counter in struct amdgpu_ctx instead of in struct amdgpu_fpriv. > struct amdgpu_fpriv for vram_lost_counter is proper place, especially > for ioctl return value. > if you need to reset ctx one by one, we can mark all contexts of that > vm, and then reset by userspace. I'm not following. With vram_lost_counter in amdgpu_fpriv, if any context calls this ioctl, all other contexts using the same file descriptor will also be considered safe again, right? So I'm still not sure how this is supposed to be used by the UMDs. Can you describe your idea for that? >> It's hard to be sure whether that's workable for the UMD without at >> least a working prototype... > Totally agree, if you can help to do this in userspace, I'd like to > support you from kernel side, or Christian. I'm busy with other stuff. -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Mesa and X developer _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <29fe2142-7fd1-e23a-49d9-c38dc685db92-otUistvHUpPR7s880joybQ@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <29fe2142-7fd1-e23a-49d9-c38dc685db92-otUistvHUpPR7s880joybQ@public.gmane.org> @ 2017-05-17 4:28 ` zhoucm1 [not found] ` <591BD17C.8050903-5C7GfCeVMHo@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: zhoucm1 @ 2017-05-17 4:28 UTC (permalink / raw) To: Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW On 2017年05月17日 11:15, Michel Dänzer wrote: > On 17/05/17 12:04 PM, zhoucm1 wrote: >> On 2017年05月17日 09:18, Michel Dänzer wrote: >>> On 16/05/17 06:25 PM, Chunming Zhou wrote: >>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +++ >>>> include/uapi/drm/amdgpu_drm.h | 1 + >>>> 2 files changed, 4 insertions(+) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>> index bca1fb5..f3e7525 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, >>>> void *data, struct drm_file *filp) >>>> case AMDGPU_VM_OP_UNRESERVE_VMID: >>>> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, AMDGPU_GFXHUB); >>>> break; >>>> + case AMDGPU_VM_OP_RESET: >>>> + fpriv->vram_lost_counter = >>>> atomic_read(&adev->vram_lost_counter); >>>> + break; >>> How do you envision the UMDs using this? I can mostly think of them >>> calling this ioctl when a context is created or destroyed. But that >>> would also allow any other remaining contexts using the same DRM file >>> descriptor to use all ioctls again. So, I think there needs to be a >>> vram_lost_counter in struct amdgpu_ctx instead of in struct amdgpu_fpriv. >> struct amdgpu_fpriv for vram_lost_counter is proper place, especially >> for ioctl return value. >> if you need to reset ctx one by one, we can mark all contexts of that >> vm, and then reset by userspace. > I'm not following. With vram_lost_counter in amdgpu_fpriv, if any > context calls this ioctl, all other contexts using the same file > descriptor will also be considered safe again, right? Yes, but it really depends on userspace requirement, if you need to reset ctx one by one, we can mark all contexts of that vm to guilty, and then reset one context by userspace. > So I'm still not > sure how this is supposed to be used by the UMDs. Can you describe your > idea for that? Correct first, this idea is picked up from Christian. We just one to provide a possibility to handle ENODEV and recover system, rather than just system dead when vram is lost. And how UMDs handle reset? which obviously need to more discussion between kernel and userspace. Regards, David Zhou > > >>> It's hard to be sure whether that's workable for the UMD without at >>> least a working prototype... >> Totally agree, if you can help to do this in userspace, I'd like to >> support you from kernel side, or Christian. > I'm busy with other stuff. > > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <591BD17C.8050903-5C7GfCeVMHo@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <591BD17C.8050903-5C7GfCeVMHo@public.gmane.org> @ 2017-05-17 6:57 ` Michel Dänzer [not found] ` <7d87bc8e-9c09-ad25-de6e-dfbd8116bf6e-otUistvHUpPR7s880joybQ@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: Michel Dänzer @ 2017-05-17 6:57 UTC (permalink / raw) To: zhoucm1; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW On 17/05/17 01:28 PM, zhoucm1 wrote: > On 2017年05月17日 11:15, Michel Dänzer wrote: >> On 17/05/17 12:04 PM, zhoucm1 wrote: >>> On 2017年05月17日 09:18, Michel Dänzer wrote: >>>> On 16/05/17 06:25 PM, Chunming Zhou wrote: >>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >>>>> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>> index bca1fb5..f3e7525 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, >>>>> void *data, struct drm_file *filp) >>>>> case AMDGPU_VM_OP_UNRESERVE_VMID: >>>>> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, >>>>> AMDGPU_GFXHUB); >>>>> break; >>>>> + case AMDGPU_VM_OP_RESET: >>>>> + fpriv->vram_lost_counter = >>>>> atomic_read(&adev->vram_lost_counter); >>>>> + break; >>>> How do you envision the UMDs using this? I can mostly think of them >>>> calling this ioctl when a context is created or destroyed. But that >>>> would also allow any other remaining contexts using the same DRM file >>>> descriptor to use all ioctls again. So, I think there needs to be a >>>> vram_lost_counter in struct amdgpu_ctx instead of in struct >>>> amdgpu_fpriv. >>> struct amdgpu_fpriv for vram_lost_counter is proper place, especially >>> for ioctl return value. >>> if you need to reset ctx one by one, we can mark all contexts of that >>> vm, and then reset by userspace. >> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any >> context calls this ioctl, all other contexts using the same file >> descriptor will also be considered safe again, right? > Yes, but it really depends on userspace requirement, if you need to > reset ctx one by one, we can mark all contexts of that vm to guilty, and > then reset one context by userspace. Still not sure what you mean by that. E.g. what do you mean by "guilty"? I thought that refers to the context which caused a hang. But it seems like you're using it to refer to any context which hasn't reacted yet to VRAM contents being lost. Also not sure what you mean by "if you need to reset ctx one by one". -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Mesa and X developer _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <7d87bc8e-9c09-ad25-de6e-dfbd8116bf6e-otUistvHUpPR7s880joybQ@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <7d87bc8e-9c09-ad25-de6e-dfbd8116bf6e-otUistvHUpPR7s880joybQ@public.gmane.org> @ 2017-05-17 7:13 ` zhoucm1 [not found] ` <591BF825.6090505-5C7GfCeVMHo@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: zhoucm1 @ 2017-05-17 7:13 UTC (permalink / raw) To: Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW On 2017年05月17日 14:57, Michel Dänzer wrote: > On 17/05/17 01:28 PM, zhoucm1 wrote: >> On 2017年05月17日 11:15, Michel Dänzer wrote: >>> On 17/05/17 12:04 PM, zhoucm1 wrote: >>>> On 2017年05月17日 09:18, Michel Dänzer wrote: >>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote: >>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >>>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >>>>>> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>> index bca1fb5..f3e7525 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, >>>>>> void *data, struct drm_file *filp) >>>>>> case AMDGPU_VM_OP_UNRESERVE_VMID: >>>>>> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, >>>>>> AMDGPU_GFXHUB); >>>>>> break; >>>>>> + case AMDGPU_VM_OP_RESET: >>>>>> + fpriv->vram_lost_counter = >>>>>> atomic_read(&adev->vram_lost_counter); >>>>>> + break; >>>>> How do you envision the UMDs using this? I can mostly think of them >>>>> calling this ioctl when a context is created or destroyed. But that >>>>> would also allow any other remaining contexts using the same DRM file >>>>> descriptor to use all ioctls again. So, I think there needs to be a >>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct >>>>> amdgpu_fpriv. >>>> struct amdgpu_fpriv for vram_lost_counter is proper place, especially >>>> for ioctl return value. >>>> if you need to reset ctx one by one, we can mark all contexts of that >>>> vm, and then reset by userspace. >>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any >>> context calls this ioctl, all other contexts using the same file >>> descriptor will also be considered safe again, right? >> Yes, but it really depends on userspace requirement, if you need to >> reset ctx one by one, we can mark all contexts of that vm to guilty, and >> then reset one context by userspace. > Still not sure what you mean by that. > > E.g. what do you mean by "guilty"? I thought that refers to the context > which caused a hang. But it seems like you're using it to refer to any > context which hasn't reacted yet to VRAM contents being lost. When vram is lost, we treat all contexts need to reset. Regards, David Zhou > > Also not sure what you mean by "if you need to reset ctx one by one". > > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <591BF825.6090505-5C7GfCeVMHo@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <591BF825.6090505-5C7GfCeVMHo@public.gmane.org> @ 2017-05-17 8:01 ` Michel Dänzer [not found] ` <31db7a30-dd98-5cb2-4125-187d3d0e2a49-otUistvHUpPR7s880joybQ@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: Michel Dänzer @ 2017-05-17 8:01 UTC (permalink / raw) To: zhoucm1; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW On 17/05/17 04:13 PM, zhoucm1 wrote: > On 2017年05月17日 14:57, Michel Dänzer wrote: >> On 17/05/17 01:28 PM, zhoucm1 wrote: >>> On 2017年05月17日 11:15, Michel Dänzer wrote: >>>> On 17/05/17 12:04 PM, zhoucm1 wrote: >>>>> On 2017年05月17日 09:18, Michel Dänzer wrote: >>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote: >>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >>>>>>> >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>> index bca1fb5..f3e7525 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, >>>>>>> void *data, struct drm_file *filp) >>>>>>> case AMDGPU_VM_OP_UNRESERVE_VMID: >>>>>>> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, >>>>>>> AMDGPU_GFXHUB); >>>>>>> break; >>>>>>> + case AMDGPU_VM_OP_RESET: >>>>>>> + fpriv->vram_lost_counter = >>>>>>> atomic_read(&adev->vram_lost_counter); >>>>>>> + break; >>>>>> How do you envision the UMDs using this? I can mostly think of them >>>>>> calling this ioctl when a context is created or destroyed. But that >>>>>> would also allow any other remaining contexts using the same DRM file >>>>>> descriptor to use all ioctls again. So, I think there needs to be a >>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct >>>>>> amdgpu_fpriv. >>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, especially >>>>> for ioctl return value. >>>>> if you need to reset ctx one by one, we can mark all contexts of that >>>>> vm, and then reset by userspace. >>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any >>>> context calls this ioctl, all other contexts using the same file >>>> descriptor will also be considered safe again, right? >>> Yes, but it really depends on userspace requirement, if you need to >>> reset ctx one by one, we can mark all contexts of that vm to guilty, and >>> then reset one context by userspace. >> Still not sure what you mean by that. >> >> E.g. what do you mean by "guilty"? I thought that refers to the context >> which caused a hang. But it seems like you're using it to refer to any >> context which hasn't reacted yet to VRAM contents being lost. > When vram is lost, we treat all contexts need to reset. Essentially, your patches only track VRAM contents being lost per file descriptor, not per context. I'm not sure (rather skeptical) that this is suitable for OpenGL UMDs, since state is usually tracked per context. Marek / Nicolai? -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Mesa and X developer _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <31db7a30-dd98-5cb2-4125-187d3d0e2a49-otUistvHUpPR7s880joybQ@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <31db7a30-dd98-5cb2-4125-187d3d0e2a49-otUistvHUpPR7s880joybQ@public.gmane.org> @ 2017-05-17 8:40 ` Christian König [not found] ` <7a302ebe-1de1-734f-fb21-aadcc7904d37-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: Christian König @ 2017-05-17 8:40 UTC (permalink / raw) To: Michel Dänzer, zhoucm1; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW Am 17.05.2017 um 10:01 schrieb Michel Dänzer: > On 17/05/17 04:13 PM, zhoucm1 wrote: >> On 2017年05月17日 14:57, Michel Dänzer wrote: >>> On 17/05/17 01:28 PM, zhoucm1 wrote: >>>> On 2017年05月17日 11:15, Michel Dänzer wrote: >>>>> On 17/05/17 12:04 PM, zhoucm1 wrote: >>>>>> On 2017年05月17日 09:18, Michel Dänzer wrote: >>>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote: >>>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >>>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >>>>>>>> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>> index bca1fb5..f3e7525 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, >>>>>>>> void *data, struct drm_file *filp) >>>>>>>> case AMDGPU_VM_OP_UNRESERVE_VMID: >>>>>>>> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, >>>>>>>> AMDGPU_GFXHUB); >>>>>>>> break; >>>>>>>> + case AMDGPU_VM_OP_RESET: >>>>>>>> + fpriv->vram_lost_counter = >>>>>>>> atomic_read(&adev->vram_lost_counter); >>>>>>>> + break; >>>>>>> How do you envision the UMDs using this? I can mostly think of them >>>>>>> calling this ioctl when a context is created or destroyed. But that >>>>>>> would also allow any other remaining contexts using the same DRM file >>>>>>> descriptor to use all ioctls again. So, I think there needs to be a >>>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct >>>>>>> amdgpu_fpriv. >>>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, especially >>>>>> for ioctl return value. >>>>>> if you need to reset ctx one by one, we can mark all contexts of that >>>>>> vm, and then reset by userspace. >>>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any >>>>> context calls this ioctl, all other contexts using the same file >>>>> descriptor will also be considered safe again, right? >>>> Yes, but it really depends on userspace requirement, if you need to >>>> reset ctx one by one, we can mark all contexts of that vm to guilty, and >>>> then reset one context by userspace. >>> Still not sure what you mean by that. >>> >>> E.g. what do you mean by "guilty"? I thought that refers to the context >>> which caused a hang. But it seems like you're using it to refer to any >>> context which hasn't reacted yet to VRAM contents being lost. >> When vram is lost, we treat all contexts need to reset. > Essentially, your patches only track VRAM contents being lost per file > descriptor, not per context. I'm not sure (rather skeptical) that this > is suitable for OpenGL UMDs, since state is usually tracked per context. > Marek / Nicolai? Oh, yeah that's a good point. The problem with tracking it per context is that Vulkan also wants the ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are context less. But thinking more about this blocking those two doesn't make much sense. The VM content can be restored and why should be disallow reading GPU info? Christian. _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <7a302ebe-1de1-734f-fb21-aadcc7904d37-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <7a302ebe-1de1-734f-fb21-aadcc7904d37-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org> @ 2017-05-17 8:46 ` zhoucm1 [not found] ` <591C0DFB.8030604-5C7GfCeVMHo@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: zhoucm1 @ 2017-05-17 8:46 UTC (permalink / raw) To: Christian König, Michel Dänzer Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW [-- Attachment #1.1: Type: text/plain, Size: 4220 bytes --] On 2017年05月17日 16:40, Christian König wrote: > Am 17.05.2017 um 10:01 schrieb Michel Dänzer: >> On 17/05/17 04:13 PM, zhoucm1 wrote: >>> On 2017年05月17日 14:57, Michel Dänzer wrote: >>>> On 17/05/17 01:28 PM, zhoucm1 wrote: >>>>> On 2017年05月17日 11:15, Michel Dänzer wrote: >>>>>> On 17/05/17 12:04 PM, zhoucm1 wrote: >>>>>>> On 2017年05月17日 09:18, Michel Dänzer wrote: >>>>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote: >>>>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >>>>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou-5C7GfCeVMHo@public.gmane.org> >>>>>>>>> >>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>> index bca1fb5..f3e7525 100644 >>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, >>>>>>>>> void *data, struct drm_file *filp) >>>>>>>>> case AMDGPU_VM_OP_UNRESERVE_VMID: >>>>>>>>> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, >>>>>>>>> AMDGPU_GFXHUB); >>>>>>>>> break; >>>>>>>>> + case AMDGPU_VM_OP_RESET: >>>>>>>>> + fpriv->vram_lost_counter = >>>>>>>>> atomic_read(&adev->vram_lost_counter); >>>>>>>>> + break; >>>>>>>> How do you envision the UMDs using this? I can mostly think of >>>>>>>> them >>>>>>>> calling this ioctl when a context is created or destroyed. But >>>>>>>> that >>>>>>>> would also allow any other remaining contexts using the same >>>>>>>> DRM file >>>>>>>> descriptor to use all ioctls again. So, I think there needs to >>>>>>>> be a >>>>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct >>>>>>>> amdgpu_fpriv. >>>>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, >>>>>>> especially >>>>>>> for ioctl return value. >>>>>>> if you need to reset ctx one by one, we can mark all contexts of >>>>>>> that >>>>>>> vm, and then reset by userspace. >>>>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any >>>>>> context calls this ioctl, all other contexts using the same file >>>>>> descriptor will also be considered safe again, right? >>>>> Yes, but it really depends on userspace requirement, if you need to >>>>> reset ctx one by one, we can mark all contexts of that vm to >>>>> guilty, and >>>>> then reset one context by userspace. >>>> Still not sure what you mean by that. >>>> >>>> E.g. what do you mean by "guilty"? I thought that refers to the >>>> context >>>> which caused a hang. But it seems like you're using it to refer to any >>>> context which hasn't reacted yet to VRAM contents being lost. >>> When vram is lost, we treat all contexts need to reset. >> Essentially, your patches only track VRAM contents being lost per file >> descriptor, not per context. I'm not sure (rather skeptical) that this >> is suitable for OpenGL UMDs, since state is usually tracked per context. >> Marek / Nicolai? > > Oh, yeah that's a good point. > > The problem with tracking it per context is that Vulkan also wants the > ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are > context less. > > But thinking more about this blocking those two doesn't make much > sense. The VM content can be restored and why should be disallow > reading GPU info? I can re-paste the Vulkan APIs requiring ENODEV: " The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST according to the spec. I tries to provide a list of u/k interfaces that could be called for each vk API. vkCreateDevice -amdgpu_device_initialize. -amdgpu_query_gpu_info vkQueueSubmit -amdgpu_cs_submit vkWaitForFences amdgpu_cs_wait_fences vkGetEventStatus vkQueueWaitIdle vkDeviceWaitIdle vkGetQueryPoolResults** amdgpu_cs_query_Fence_status vkQueueBindSparse** amdgpu_bo_va_op amdgpu_bo_va_op_raw vkCreateSwapchainKHR** vkAcquireNextImageKHR** vkQueuePresentKHR Not related with u/k interface.** ** Besides those listed above, I think amdgpu_cs_signal_Sem/amdgpu_cs_wait_sem should respond to gpu reset as well." > > Christian. > [-- Attachment #1.2: Type: text/html, Size: 9346 bytes --] [-- Attachment #2: Type: text/plain, Size: 154 bytes --] _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <591C0DFB.8030604-5C7GfCeVMHo@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <591C0DFB.8030604-5C7GfCeVMHo@public.gmane.org> @ 2017-05-17 8:55 ` Michel Dänzer 2017-05-17 8:56 ` Christian König 1 sibling, 0 replies; 22+ messages in thread From: Michel Dänzer @ 2017-05-17 8:55 UTC (permalink / raw) To: zhoucm1, Christian König; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW On 17/05/17 05:46 PM, zhoucm1 wrote: > > > On 2017年05月17日 16:40, Christian König wrote: >> Am 17.05.2017 um 10:01 schrieb Michel Dänzer: >>> On 17/05/17 04:13 PM, zhoucm1 wrote: >>>> On 2017年05月17日 14:57, Michel Dänzer wrote: >>>>> On 17/05/17 01:28 PM, zhoucm1 wrote: >>>>>> On 2017年05月17日 11:15, Michel Dänzer wrote: >>>>>>> On 17/05/17 12:04 PM, zhoucm1 wrote: >>>>>>>> On 2017年05月17日 09:18, Michel Dänzer wrote: >>>>>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote: >>>>>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >>>>>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> >>>>>>>>>> >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>>> index bca1fb5..f3e7525 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, >>>>>>>>>> void *data, struct drm_file *filp) >>>>>>>>>> case AMDGPU_VM_OP_UNRESERVE_VMID: >>>>>>>>>> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, >>>>>>>>>> AMDGPU_GFXHUB); >>>>>>>>>> break; >>>>>>>>>> + case AMDGPU_VM_OP_RESET: >>>>>>>>>> + fpriv->vram_lost_counter = >>>>>>>>>> atomic_read(&adev->vram_lost_counter); >>>>>>>>>> + break; >>>>>>>>> How do you envision the UMDs using this? I can mostly think of >>>>>>>>> them >>>>>>>>> calling this ioctl when a context is created or destroyed. But >>>>>>>>> that >>>>>>>>> would also allow any other remaining contexts using the same >>>>>>>>> DRM file >>>>>>>>> descriptor to use all ioctls again. So, I think there needs to >>>>>>>>> be a >>>>>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct >>>>>>>>> amdgpu_fpriv. >>>>>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, >>>>>>>> especially >>>>>>>> for ioctl return value. >>>>>>>> if you need to reset ctx one by one, we can mark all contexts of >>>>>>>> that >>>>>>>> vm, and then reset by userspace. >>>>>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any >>>>>>> context calls this ioctl, all other contexts using the same file >>>>>>> descriptor will also be considered safe again, right? >>>>>> Yes, but it really depends on userspace requirement, if you need to >>>>>> reset ctx one by one, we can mark all contexts of that vm to >>>>>> guilty, and >>>>>> then reset one context by userspace. >>>>> Still not sure what you mean by that. >>>>> >>>>> E.g. what do you mean by "guilty"? I thought that refers to the >>>>> context >>>>> which caused a hang. But it seems like you're using it to refer to any >>>>> context which hasn't reacted yet to VRAM contents being lost. >>>> When vram is lost, we treat all contexts need to reset. >>> Essentially, your patches only track VRAM contents being lost per file >>> descriptor, not per context. I'm not sure (rather skeptical) that this >>> is suitable for OpenGL UMDs, since state is usually tracked per context. >>> Marek / Nicolai? >> >> Oh, yeah that's a good point. >> >> The problem with tracking it per context is that Vulkan also wants the >> ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are >> context less. >> >> But thinking more about this blocking those two doesn't make much >> sense. The VM content can be restored and why should be disallow >> reading GPU info? > I can re-paste the Vulkan APIs requiring ENODEV: > " > > The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST according > to the spec. > > I tries to provide a list of u/k interfaces that could be called for > each vk API. > > > > vkCreateDevice > > - amdgpu_device_initialize. > - amdgpu_query_gpu_info [...] > vkQueueBindSparse** > > amdgpu_bo_va_op > amdgpu_bo_va_op_raw So these *can* return VK_ERROR_DEVICE_LOST, but do they *have to* do that after a reset? :) -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Mesa and X developer _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <591C0DFB.8030604-5C7GfCeVMHo@public.gmane.org> 2017-05-17 8:55 ` Michel Dänzer @ 2017-05-17 8:56 ` Christian König [not found] ` <46582a1e-e019-34ac-1913-ed4a2a992e4c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org> 1 sibling, 1 reply; 22+ messages in thread From: Christian König @ 2017-05-17 8:56 UTC (permalink / raw) To: zhoucm1, Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW [-- Attachment #1.1: Type: text/plain, Size: 4975 bytes --] Am 17.05.2017 um 10:46 schrieb zhoucm1: > > > On 2017年05月17日 16:40, Christian König wrote: >> Am 17.05.2017 um 10:01 schrieb Michel Dänzer: >>> On 17/05/17 04:13 PM, zhoucm1 wrote: >>>> On 2017年05月17日 14:57, Michel Dänzer wrote: >>>>> On 17/05/17 01:28 PM, zhoucm1 wrote: >>>>>> On 2017年05月17日 11:15, Michel Dänzer wrote: >>>>>>> On 17/05/17 12:04 PM, zhoucm1 wrote: >>>>>>>> On 2017年05月17日 09:18, Michel Dänzer wrote: >>>>>>>>> On 16/05/17 06:25 PM, Chunming Zhou wrote: >>>>>>>>>> Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 >>>>>>>>>> Signed-off-by: Chunming Zhou <David1.Zhou-5C7GfCeVMHo@public.gmane.org> >>>>>>>>>> >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>>> index bca1fb5..f3e7525 100644 >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c >>>>>>>>>> @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device >>>>>>>>>> *dev, >>>>>>>>>> void *data, struct drm_file *filp) >>>>>>>>>> case AMDGPU_VM_OP_UNRESERVE_VMID: >>>>>>>>>> amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, >>>>>>>>>> AMDGPU_GFXHUB); >>>>>>>>>> break; >>>>>>>>>> + case AMDGPU_VM_OP_RESET: >>>>>>>>>> + fpriv->vram_lost_counter = >>>>>>>>>> atomic_read(&adev->vram_lost_counter); >>>>>>>>>> + break; >>>>>>>>> How do you envision the UMDs using this? I can mostly think of >>>>>>>>> them >>>>>>>>> calling this ioctl when a context is created or destroyed. But >>>>>>>>> that >>>>>>>>> would also allow any other remaining contexts using the same >>>>>>>>> DRM file >>>>>>>>> descriptor to use all ioctls again. So, I think there needs to >>>>>>>>> be a >>>>>>>>> vram_lost_counter in struct amdgpu_ctx instead of in struct >>>>>>>>> amdgpu_fpriv. >>>>>>>> struct amdgpu_fpriv for vram_lost_counter is proper place, >>>>>>>> especially >>>>>>>> for ioctl return value. >>>>>>>> if you need to reset ctx one by one, we can mark all contexts >>>>>>>> of that >>>>>>>> vm, and then reset by userspace. >>>>>>> I'm not following. With vram_lost_counter in amdgpu_fpriv, if any >>>>>>> context calls this ioctl, all other contexts using the same file >>>>>>> descriptor will also be considered safe again, right? >>>>>> Yes, but it really depends on userspace requirement, if you need to >>>>>> reset ctx one by one, we can mark all contexts of that vm to >>>>>> guilty, and >>>>>> then reset one context by userspace. >>>>> Still not sure what you mean by that. >>>>> >>>>> E.g. what do you mean by "guilty"? I thought that refers to the >>>>> context >>>>> which caused a hang. But it seems like you're using it to refer to >>>>> any >>>>> context which hasn't reacted yet to VRAM contents being lost. >>>> When vram is lost, we treat all contexts need to reset. >>> Essentially, your patches only track VRAM contents being lost per file >>> descriptor, not per context. I'm not sure (rather skeptical) that this >>> is suitable for OpenGL UMDs, since state is usually tracked per >>> context. >>> Marek / Nicolai? >> >> Oh, yeah that's a good point. >> >> The problem with tracking it per context is that Vulkan also wants >> the ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which >> are context less. >> >> But thinking more about this blocking those two doesn't make much >> sense. The VM content can be restored and why should be disallow >> reading GPU info? > I can re-paste the Vulkan APIs requiring ENODEV: > " > > The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST > according to the spec. > > I tries to provide a list of u/k interfaces that could be called for > each vk API. > Well those are the Vulkan requirements, but that doesn't necessary mean we must follow that on the kernel side. Keep in mind that Vulkan can't made any requirements towards the kernel driver. IIRC we already have a query Vulkan can use to figure out if a GPU reset happened or not. So they could use that instead. Regards, Christian. > vkCreateDevice > > -amdgpu_device_initialize. > > -amdgpu_query_gpu_info > > vkQueueSubmit > > -amdgpu_cs_submit > > vkWaitForFences > > amdgpu_cs_wait_fences > > vkGetEventStatus > > vkQueueWaitIdle > > vkDeviceWaitIdle > > vkGetQueryPoolResults** > > amdgpu_cs_query_Fence_status > > vkQueueBindSparse** > > amdgpu_bo_va_op > > amdgpu_bo_va_op_raw > > vkCreateSwapchainKHR** > > vkAcquireNextImageKHR** > > vkQueuePresentKHR > > Not related with u/k interface.** > > ** > > Besides those listed above, I think > amdgpu_cs_signal_Sem/amdgpu_cs_wait_sem should respond to gpu reset as > well." >> >> Christian. >> > > > > _______________________________________________ > amd-gfx mailing list > amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx [-- Attachment #1.2: Type: text/html, Size: 10254 bytes --] [-- Attachment #2: Type: text/plain, Size: 154 bytes --] _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <46582a1e-e019-34ac-1913-ed4a2a992e4c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>]
* Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <46582a1e-e019-34ac-1913-ed4a2a992e4c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org> @ 2017-05-17 9:49 ` Marek Olšák [not found] ` <CAAxE2A7sRZXx3MnRSO76DW=61X06nfVs2AHe_a-r+K+46tfJPw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> 0 siblings, 1 reply; 22+ messages in thread From: Marek Olšák @ 2017-05-17 9:49 UTC (permalink / raw) To: Christian König; +Cc: zhoucm1, Michel Dänzer, amd-gfx mailing list David, We already have a query that returns whether a device is lost. It's called amdgpu_cs_query_reset_state. That should return whether a hang was caused by a certain context or whether the hang happened but the context is innocent. You can extend it to accept no context, in which case it will return either NO_RESET (everything is OK) or UNKNOWN_RESET (= when a hang happened but the caller didn't specify the context). Marek On Wed, May 17, 2017 at 10:56 AM, Christian König <deathsimple@vodafone.de> wrote: > Am 17.05.2017 um 10:46 schrieb zhoucm1: > > > > On 2017年05月17日 16:40, Christian König wrote: > > Am 17.05.2017 um 10:01 schrieb Michel Dänzer: > > On 17/05/17 04:13 PM, zhoucm1 wrote: > > On 2017年05月17日 14:57, Michel Dänzer wrote: > > On 17/05/17 01:28 PM, zhoucm1 wrote: > > On 2017年05月17日 11:15, Michel Dänzer wrote: > > On 17/05/17 12:04 PM, zhoucm1 wrote: > > On 2017年05月17日 09:18, Michel Dänzer wrote: > > On 16/05/17 06:25 PM, Chunming Zhou wrote: > > Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 > Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index bca1fb5..f3e7525 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, > void *data, struct drm_file *filp) > case AMDGPU_VM_OP_UNRESERVE_VMID: > amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, > AMDGPU_GFXHUB); > break; > + case AMDGPU_VM_OP_RESET: > + fpriv->vram_lost_counter = > atomic_read(&adev->vram_lost_counter); > + break; > > How do you envision the UMDs using this? I can mostly think of them > calling this ioctl when a context is created or destroyed. But that > would also allow any other remaining contexts using the same DRM file > descriptor to use all ioctls again. So, I think there needs to be a > vram_lost_counter in struct amdgpu_ctx instead of in struct > amdgpu_fpriv. > > struct amdgpu_fpriv for vram_lost_counter is proper place, especially > for ioctl return value. > if you need to reset ctx one by one, we can mark all contexts of that > vm, and then reset by userspace. > > I'm not following. With vram_lost_counter in amdgpu_fpriv, if any > context calls this ioctl, all other contexts using the same file > descriptor will also be considered safe again, right? > > Yes, but it really depends on userspace requirement, if you need to > reset ctx one by one, we can mark all contexts of that vm to guilty, and > then reset one context by userspace. > > Still not sure what you mean by that. > > E.g. what do you mean by "guilty"? I thought that refers to the context > which caused a hang. But it seems like you're using it to refer to any > context which hasn't reacted yet to VRAM contents being lost. > > When vram is lost, we treat all contexts need to reset. > > Essentially, your patches only track VRAM contents being lost per file > descriptor, not per context. I'm not sure (rather skeptical) that this > is suitable for OpenGL UMDs, since state is usually tracked per context. > Marek / Nicolai? > > > Oh, yeah that's a good point. > > The problem with tracking it per context is that Vulkan also wants the > ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are context > less. > > But thinking more about this blocking those two doesn't make much sense. The > VM content can be restored and why should be disallow reading GPU info? > > I can re-paste the Vulkan APIs requiring ENODEV: > " > > The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST according to > the spec. > > I tries to provide a list of u/k interfaces that could be called for each vk > API. > > > Well those are the Vulkan requirements, but that doesn't necessary mean we > must follow that on the kernel side. Keep in mind that Vulkan can't made any > requirements towards the kernel driver. > > IIRC we already have a query Vulkan can use to figure out if a GPU reset > happened or not. So they could use that instead. > > Regards, > Christian. > > > > vkCreateDevice > > - amdgpu_device_initialize. > > - amdgpu_query_gpu_info > > > > vkQueueSubmit > > - amdgpu_cs_submit > > > > vkWaitForFences > > amdgpu_cs_wait_fences > > > > vkGetEventStatus > > vkQueueWaitIdle > > vkDeviceWaitIdle > > vkGetQueryPoolResults > > amdgpu_cs_query_Fence_status > > > > vkQueueBindSparse > > amdgpu_bo_va_op > > amdgpu_bo_va_op_raw > > > > vkCreateSwapchainKHR > > vkAcquireNextImageKHR > > vkQueuePresentKHR > > Not related with u/k interface. > > > > Besides those listed above, I think amdgpu_cs_signal_Sem/amdgpu_cs_wait_sem > should respond to gpu reset as well." > > > Christian. > > > > > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <CAAxE2A7sRZXx3MnRSO76DW=61X06nfVs2AHe_a-r+K+46tfJPw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* RE: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter [not found] ` <CAAxE2A7sRZXx3MnRSO76DW=61X06nfVs2AHe_a-r+K+46tfJPw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org> @ 2017-05-17 10:15 ` Zhou, David(ChunMing) 0 siblings, 0 replies; 22+ messages in thread From: Zhou, David(ChunMing) @ 2017-05-17 10:15 UTC (permalink / raw) To: Marek Olšák, Christian König Cc: Michel Dänzer, amd-gfx mailing list Yes, agree, it's easy to extend it as your like. For innocent context, which would need us to find out the context of guilty job, which is TODO. Regards, David Zhou -----Original Message----- From: Marek Olšák [mailto:maraeo@gmail.com] Sent: Wednesday, May 17, 2017 5:50 PM To: Christian König <deathsimple@vodafone.de> Cc: Zhou, David(ChunMing) <David1.Zhou@amd.com>; Michel Dänzer <michel@daenzer.net>; amd-gfx mailing list <amd-gfx@lists.freedesktop.org> Subject: Re: [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter David, We already have a query that returns whether a device is lost. It's called amdgpu_cs_query_reset_state. That should return whether a hang was caused by a certain context or whether the hang happened but the context is innocent. You can extend it to accept no context, in which case it will return either NO_RESET (everything is OK) or UNKNOWN_RESET (= when a hang happened but the caller didn't specify the context). Marek On Wed, May 17, 2017 at 10:56 AM, Christian König <deathsimple@vodafone.de> wrote: > Am 17.05.2017 um 10:46 schrieb zhoucm1: > > > > On 2017年05月17日 16:40, Christian König wrote: > > Am 17.05.2017 um 10:01 schrieb Michel Dänzer: > > On 17/05/17 04:13 PM, zhoucm1 wrote: > > On 2017年05月17日 14:57, Michel Dänzer wrote: > > On 17/05/17 01:28 PM, zhoucm1 wrote: > > On 2017年05月17日 11:15, Michel Dänzer wrote: > > On 17/05/17 12:04 PM, zhoucm1 wrote: > > On 2017年05月17日 09:18, Michel Dänzer wrote: > > On 16/05/17 06:25 PM, Chunming Zhou wrote: > > Change-Id: I8eb6d7f558da05510e429d3bf1d48c8cec6c1977 > Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index bca1fb5..f3e7525 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -2547,6 +2547,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void > *data, struct drm_file *filp) > case AMDGPU_VM_OP_UNRESERVE_VMID: > amdgpu_vm_free_reserved_vmid(adev, &fpriv->vm, > AMDGPU_GFXHUB); > break; > + case AMDGPU_VM_OP_RESET: > + fpriv->vram_lost_counter = > atomic_read(&adev->vram_lost_counter); > + break; > > How do you envision the UMDs using this? I can mostly think of them > calling this ioctl when a context is created or destroyed. But that > would also allow any other remaining contexts using the same DRM file > descriptor to use all ioctls again. So, I think there needs to be a > vram_lost_counter in struct amdgpu_ctx instead of in struct > amdgpu_fpriv. > > struct amdgpu_fpriv for vram_lost_counter is proper place, especially > for ioctl return value. > if you need to reset ctx one by one, we can mark all contexts of that > vm, and then reset by userspace. > > I'm not following. With vram_lost_counter in amdgpu_fpriv, if any > context calls this ioctl, all other contexts using the same file > descriptor will also be considered safe again, right? > > Yes, but it really depends on userspace requirement, if you need to > reset ctx one by one, we can mark all contexts of that vm to guilty, > and then reset one context by userspace. > > Still not sure what you mean by that. > > E.g. what do you mean by "guilty"? I thought that refers to the > context which caused a hang. But it seems like you're using it to > refer to any context which hasn't reacted yet to VRAM contents being lost. > > When vram is lost, we treat all contexts need to reset. > > Essentially, your patches only track VRAM contents being lost per file > descriptor, not per context. I'm not sure (rather skeptical) that this > is suitable for OpenGL UMDs, since state is usually tracked per context. > Marek / Nicolai? > > > Oh, yeah that's a good point. > > The problem with tracking it per context is that Vulkan also wants the > ENODEV on the amdgpu_gem_va_ioct() and amdgpu_info_ioctl() which are > context less. > > But thinking more about this blocking those two doesn't make much > sense. The VM content can be restored and why should be disallow reading GPU info? > > I can re-paste the Vulkan APIs requiring ENODEV: > " > > The Vulkan APIs listed below could return VK_ERROR_DEVICE_LOST > according to the spec. > > I tries to provide a list of u/k interfaces that could be called for > each vk API. > > > Well those are the Vulkan requirements, but that doesn't necessary > mean we must follow that on the kernel side. Keep in mind that Vulkan > can't made any requirements towards the kernel driver. > > IIRC we already have a query Vulkan can use to figure out if a GPU > reset happened or not. So they could use that instead. > > Regards, > Christian. > > > > vkCreateDevice > > - amdgpu_device_initialize. > > - amdgpu_query_gpu_info > > > > vkQueueSubmit > > - amdgpu_cs_submit > > > > vkWaitForFences > > amdgpu_cs_wait_fences > > > > vkGetEventStatus > > vkQueueWaitIdle > > vkDeviceWaitIdle > > vkGetQueryPoolResults > > amdgpu_cs_query_Fence_status > > > > vkQueueBindSparse > > amdgpu_bo_va_op > > amdgpu_bo_va_op_raw > > > > vkCreateSwapchainKHR > > vkAcquireNextImageKHR > > vkQueuePresentKHR > > Not related with u/k interface. > > > > Besides those listed above, I think > amdgpu_cs_signal_Sem/amdgpu_cs_wait_sem > should respond to gpu reset as well." > > > Christian. > > > > > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH 1/4] drm/amdgpu: check if vram is lost v2 [not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org> ` (2 preceding siblings ...) 2017-05-16 9:25 ` [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter Chunming Zhou @ 2017-05-16 10:49 ` Christian König [not found] ` <0c1d89c5-65ea-cdad-100a-80d0377b865c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org> 3 siblings, 1 reply; 22+ messages in thread From: Christian König @ 2017-05-16 10:49 UTC (permalink / raw) To: Chunming Zhou, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW Am 16.05.2017 um 11:25 schrieb Chunming Zhou: > bakup first 64 byte of gart table as reset magic, check if magic is same > after gpu hw reset. > v2: use memcmp instead of manual innovation. > > Change-Id: I9a73720da4084ea8677c3031dfb62e8157ee5704 > Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> Patch #1-#3 are Reviewed-by: Christian König <christian.koenig@amd.com> Patch #4: You need to add the new enum on line 591 or otherwise you will get an "unsupported operation" error. Line 604 should be changed as well or otherwise we need a BO for this operation. A libdrm test case to just call this IOCTL would probably be a good idea. Additional to that I would ping Marek (Mesa) and Michel (DDX) for their opinion on this. Could be that this is completely superfluous and the UMDs needs something else. Regards, Christian. > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++- > 2 files changed, 21 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index de08ff0..f9da215 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1502,6 +1502,7 @@ struct amdgpu_ssg { > #endif > }; > > +#define AMDGPU_RESET_MAGIC_NUM 64 > struct amdgpu_device { > struct device *dev; > struct drm_device *ddev; > @@ -1705,6 +1706,7 @@ struct amdgpu_device { > > /* record hw reset is performed */ > bool has_hw_reset; > + u8 reset_magic[AMDGPU_RESET_MAGIC_NUM]; > > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 0a31fb1..c56ae4a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -1685,6 +1685,17 @@ static int amdgpu_init(struct amdgpu_device *adev) > return 0; > } > > +static void amdgpu_fill_reset_magic(struct amdgpu_device *adev) > +{ > + memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); > +} > + > +static bool amdgpu_check_vram_lost(struct amdgpu_device *adev) > +{ > + return !!memcmp(adev->gart.ptr, adev->reset_magic, > + AMDGPU_RESET_MAGIC_NUM); > +} > + > static int amdgpu_late_init(struct amdgpu_device *adev) > { > int i = 0, r; > @@ -1715,6 +1726,8 @@ static int amdgpu_late_init(struct amdgpu_device *adev) > } > } > > + amdgpu_fill_reset_magic(adev); > + > return 0; > } > > @@ -2830,7 +2843,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) > struct drm_atomic_state *state = NULL; > int i, r; > int resched; > - bool need_full_reset; > + bool need_full_reset, vram_lost = false; > > if (amdgpu_sriov_vf(adev)) > return amdgpu_sriov_gpu_reset(adev, true); > @@ -2899,12 +2912,17 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) > r = amdgpu_resume_phase1(adev); > if (r) > goto out; > + vram_lost = amdgpu_check_vram_lost(adev); > + if (vram_lost) > + DRM_ERROR("VRAM is lost!\n"); > r = amdgpu_ttm_recover_gart(adev); > if (r) > goto out; > r = amdgpu_resume_phase2(adev); > if (r) > goto out; > + if (vram_lost) > + amdgpu_fill_reset_magic(adev); > } > } > out: _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
[parent not found: <0c1d89c5-65ea-cdad-100a-80d0377b865c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>]
* Re: [PATCH 1/4] drm/amdgpu: check if vram is lost v2 [not found] ` <0c1d89c5-65ea-cdad-100a-80d0377b865c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org> @ 2017-05-17 4:37 ` zhoucm1 0 siblings, 0 replies; 22+ messages in thread From: zhoucm1 @ 2017-05-17 4:37 UTC (permalink / raw) To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW On 2017年05月16日 18:49, Christian König wrote: > Am 16.05.2017 um 11:25 schrieb Chunming Zhou: >> bakup first 64 byte of gart table as reset magic, check if magic is same >> after gpu hw reset. >> v2: use memcmp instead of manual innovation. >> >> Change-Id: I9a73720da4084ea8677c3031dfb62e8157ee5704 >> Signed-off-by: Chunming Zhou <David1.Zhou@amd.com> > > Patch #1-#3 are Reviewed-by: Christian König <christian.koenig@amd.com> pushed. > > Patch #4: > > You need to add the new enum on line 591 or otherwise you will get an > "unsupported operation" error. > > Line 604 should be changed as well or otherwise we need a BO for this > operation. are you sure you are talking this patch#4? I cannot address what you said. > > A libdrm test case to just call this IOCTL would probably be a good idea. > > Additional to that I would ping Marek (Mesa) and Michel (DDX) for > their opinion on this. Could be that this is completely superfluous > and the UMDs needs something else. Michel seems have different opinion/concern, maybe we need more discussions before we make new interfaces. Thanks, David Zhou > > Regards, > Christian. > >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++- >> 2 files changed, 21 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index de08ff0..f9da215 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -1502,6 +1502,7 @@ struct amdgpu_ssg { >> #endif >> }; >> +#define AMDGPU_RESET_MAGIC_NUM 64 >> struct amdgpu_device { >> struct device *dev; >> struct drm_device *ddev; >> @@ -1705,6 +1706,7 @@ struct amdgpu_device { >> /* record hw reset is performed */ >> bool has_hw_reset; >> + u8 reset_magic[AMDGPU_RESET_MAGIC_NUM]; >> }; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 0a31fb1..c56ae4a 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -1685,6 +1685,17 @@ static int amdgpu_init(struct amdgpu_device >> *adev) >> return 0; >> } >> +static void amdgpu_fill_reset_magic(struct amdgpu_device *adev) >> +{ >> + memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); >> +} >> + >> +static bool amdgpu_check_vram_lost(struct amdgpu_device *adev) >> +{ >> + return !!memcmp(adev->gart.ptr, adev->reset_magic, >> + AMDGPU_RESET_MAGIC_NUM); >> +} >> + >> static int amdgpu_late_init(struct amdgpu_device *adev) >> { >> int i = 0, r; >> @@ -1715,6 +1726,8 @@ static int amdgpu_late_init(struct >> amdgpu_device *adev) >> } >> } >> + amdgpu_fill_reset_magic(adev); >> + >> return 0; >> } >> @@ -2830,7 +2843,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) >> struct drm_atomic_state *state = NULL; >> int i, r; >> int resched; >> - bool need_full_reset; >> + bool need_full_reset, vram_lost = false; >> if (amdgpu_sriov_vf(adev)) >> return amdgpu_sriov_gpu_reset(adev, true); >> @@ -2899,12 +2912,17 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) >> r = amdgpu_resume_phase1(adev); >> if (r) >> goto out; >> + vram_lost = amdgpu_check_vram_lost(adev); >> + if (vram_lost) >> + DRM_ERROR("VRAM is lost!\n"); >> r = amdgpu_ttm_recover_gart(adev); >> if (r) >> goto out; >> r = amdgpu_resume_phase2(adev); >> if (r) >> goto out; >> + if (vram_lost) >> + amdgpu_fill_reset_magic(adev); >> } >> } >> out: > > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 22+ messages in thread
end of thread, other threads:[~2017-05-24 2:20 UTC | newest]
Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-05-16 9:25 [PATCH 1/4] drm/amdgpu: check if vram is lost v2 Chunming Zhou
[not found] ` <1494926750-1081-1-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
2017-05-16 9:25 ` [PATCH 2/4] drm/amdgpu: return -ENODEV to user space when " Chunming Zhou
[not found] ` <1494926750-1081-2-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
2017-05-23 15:08 ` Deucher, Alexander
[not found] ` <BN6PR12MB1652E1AFF691F58C92FC2CD7F7F90-/b2+HYfkarQqUD6E6FAiowdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-05-23 15:16 ` Christian König
[not found] ` <69717c0b-b2c1-589a-c466-5d6be9518eda-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-24 2:20 ` zhoucm1
2017-05-16 9:25 ` [PATCH 3/4] drm/amdgpu: skip all jobs of guilty vm Chunming Zhou
2017-05-16 9:25 ` [PATCH 4/4] drm/amdgpu: reset fpriv vram_lost_counter Chunming Zhou
[not found] ` <1494926750-1081-4-git-send-email-David1.Zhou-5C7GfCeVMHo@public.gmane.org>
2017-05-17 1:18 ` Michel Dänzer
[not found] ` <58988726-543a-535a-3011-860d29b9f2da-otUistvHUpPR7s880joybQ@public.gmane.org>
2017-05-17 3:04 ` zhoucm1
[not found] ` <591BBDA2.1070900-5C7GfCeVMHo@public.gmane.org>
2017-05-17 3:15 ` Michel Dänzer
[not found] ` <29fe2142-7fd1-e23a-49d9-c38dc685db92-otUistvHUpPR7s880joybQ@public.gmane.org>
2017-05-17 4:28 ` zhoucm1
[not found] ` <591BD17C.8050903-5C7GfCeVMHo@public.gmane.org>
2017-05-17 6:57 ` Michel Dänzer
[not found] ` <7d87bc8e-9c09-ad25-de6e-dfbd8116bf6e-otUistvHUpPR7s880joybQ@public.gmane.org>
2017-05-17 7:13 ` zhoucm1
[not found] ` <591BF825.6090505-5C7GfCeVMHo@public.gmane.org>
2017-05-17 8:01 ` Michel Dänzer
[not found] ` <31db7a30-dd98-5cb2-4125-187d3d0e2a49-otUistvHUpPR7s880joybQ@public.gmane.org>
2017-05-17 8:40 ` Christian König
[not found] ` <7a302ebe-1de1-734f-fb21-aadcc7904d37-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-17 8:46 ` zhoucm1
[not found] ` <591C0DFB.8030604-5C7GfCeVMHo@public.gmane.org>
2017-05-17 8:55 ` Michel Dänzer
2017-05-17 8:56 ` Christian König
[not found] ` <46582a1e-e019-34ac-1913-ed4a2a992e4c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-17 9:49 ` Marek Olšák
[not found] ` <CAAxE2A7sRZXx3MnRSO76DW=61X06nfVs2AHe_a-r+K+46tfJPw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-17 10:15 ` Zhou, David(ChunMing)
2017-05-16 10:49 ` [PATCH 1/4] drm/amdgpu: check if vram is lost v2 Christian König
[not found] ` <0c1d89c5-65ea-cdad-100a-80d0377b865c-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-17 4:37 ` zhoucm1
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.