All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 3/3] drm/amdgpu: fix gpu reset issue
@ 2017-04-21  7:08 Roger.He
       [not found] ` <1492758490-12631-1-git-send-email-Hongbo.He-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Roger.He @ 2017-04-21  7:08 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: david1.zhou-5C7GfCeVMHo, Roger.He

Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
Signed-off-by: Roger.He <Hongbo.He@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 71364f5..ab0ffa8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1526,6 +1526,7 @@ struct amdgpu_device {
 	atomic64_t			num_bytes_moved;
 	atomic64_t			num_evictions;
 	atomic_t			gpu_reset_counter;
+	atomic_t			gpu_reset_state;
 
 	/* data for buffer migration throttling */
 	struct {
@@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
 
 /* Common functions */
+static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
+{
+	return atomic_read(&adev->gpu_reset_state);
+}
+
 int amdgpu_gpu_reset(struct amdgpu_device *adev);
 bool amdgpu_need_backup(struct amdgpu_device *adev);
 void amdgpu_pci_config_reset(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f882496..0fb4716 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->grbm_idx_mutex);
 	mutex_init(&adev->mn_lock);
 	hash_init(adev->mn_hash);
+	atomic_set(&adev->gpu_reset_state, 0);
 
 	amdgpu_check_arguments(adev);
 
@@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
 }
 
 /**
+ * amdgpu_device_set_reset_state - set gpu reset state
+ *
+ * @adev: amdgpu device pointer
+ * @state: true when start to reset gpu; false: reset done
+ */
+static inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
+							bool state)
+{
+	atomic_set(&adev->gpu_reset_state, state);
+}
+
+/**
  * amdgpu_gpu_reset - reset the asic
  *
  * @adev: amdgpu device pointer
@@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 	}
 
 	atomic_inc(&adev->gpu_reset_counter);
-
+	amdgpu_device_set_reset_state(adev, true);
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
 	/* store modesetting */
@@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 		dev_info(adev->dev, "GPU reset failed\n");
 	}
 
+	amdgpu_device_set_reset_state(adev, false);
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ead00d7..8cc14af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
 	struct drm_file *file_priv = filp->private_data;
 	struct drm_device *dev;
 	long ret;
+
 	dev = file_priv->minor->dev;
 	ret = pm_runtime_get_sync(dev->dev);
 	if (ret < 0)
 		return ret;
 
+	while (amdgpu_device_is_reset(dev->dev_private))
+		msleep(100);
+
 	ret = drm_ioctl(filp, cmd, arg);
 
 	pm_runtime_mark_last_busy(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
index 2648291..22b8059 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
@@ -36,10 +36,15 @@
 long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	unsigned int nr = DRM_IOCTL_NR(cmd);
+	struct drm_file *file_priv = filp->private_data;
+	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
 	int ret;
 
-	if (nr < DRM_COMMAND_BASE)
+	if (nr < DRM_COMMAND_BASE) {
+		while (amdgpu_device_is_reset(adev))
+			msleep(100);
 		return drm_compat_ioctl(filp, cmd, arg);
+	}
 
 	ret = amdgpu_drm_ioctl(filp, cmd, arg);
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found] ` <1492758490-12631-1-git-send-email-Hongbo.He-5C7GfCeVMHo@public.gmane.org>
@ 2017-04-21  7:18   ` zhoucm1
       [not found]     ` <58F9B22E.7050502-5C7GfCeVMHo@public.gmane.org>
  2017-04-21  8:27   ` Christian König
  1 sibling, 1 reply; 18+ messages in thread
From: zhoucm1 @ 2017-04-21  7:18 UTC (permalink / raw)
  To: Roger.He, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2017年04月21日 15:08, Roger.He wrote:

> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
> Signed-off-by: Roger.He <Hongbo.He@amd.com>
Please add more explaination in comment, like "prevent ioctl during gpu 
reset, otherwise many un-expected behaviour happen."

Regards,
David Zhou
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>   4 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 71364f5..ab0ffa8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>   	atomic64_t			num_bytes_moved;
>   	atomic64_t			num_evictions;
>   	atomic_t			gpu_reset_counter;
> +	atomic_t			gpu_reset_state;
>   
>   	/* data for buffer migration throttling */
>   	struct {
> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>   
>   /* Common functions */
> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
> +{
> +	return atomic_read(&adev->gpu_reset_state);
> +}
> +
>   int amdgpu_gpu_reset(struct amdgpu_device *adev);
>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>   void amdgpu_pci_config_reset(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f882496..0fb4716 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	mutex_init(&adev->grbm_idx_mutex);
>   	mutex_init(&adev->mn_lock);
>   	hash_init(adev->mn_hash);
> +	atomic_set(&adev->gpu_reset_state, 0);
>   
>   	amdgpu_check_arguments(adev);
>   
> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>   }
>   
>   /**
> + * amdgpu_device_set_reset_state - set gpu reset state
> + *
> + * @adev: amdgpu device pointer
> + * @state: true when start to reset gpu; false: reset done
> + */
> +static inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
> +							bool state)
> +{
> +	atomic_set(&adev->gpu_reset_state, state);
> +}
> +
> +/**
>    * amdgpu_gpu_reset - reset the asic
>    *
>    * @adev: amdgpu device pointer
> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   	}
>   
>   	atomic_inc(&adev->gpu_reset_counter);
> -
> +	amdgpu_device_set_reset_state(adev, true);
>   	/* block TTM */
>   	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>   	/* store modesetting */
> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   		dev_info(adev->dev, "GPU reset failed\n");
>   	}
>   
> +	amdgpu_device_set_reset_state(adev, false);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index ead00d7..8cc14af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>   	struct drm_file *file_priv = filp->private_data;
>   	struct drm_device *dev;
>   	long ret;
> +
>   	dev = file_priv->minor->dev;
>   	ret = pm_runtime_get_sync(dev->dev);
>   	if (ret < 0)
>   		return ret;
>   
> +	while (amdgpu_device_is_reset(dev->dev_private))
> +		msleep(100);
> +
>   	ret = drm_ioctl(filp, cmd, arg);
>   
>   	pm_runtime_mark_last_busy(dev->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> index 2648291..22b8059 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> @@ -36,10 +36,15 @@
>   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   {
>   	unsigned int nr = DRM_IOCTL_NR(cmd);
> +	struct drm_file *file_priv = filp->private_data;
> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>   	int ret;
>   
> -	if (nr < DRM_COMMAND_BASE)
> +	if (nr < DRM_COMMAND_BASE) {
> +		while (amdgpu_device_is_reset(adev))
> +			msleep(100);
>   		return drm_compat_ioctl(filp, cmd, arg);
> +	}
>   
>   	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found] ` <1492758490-12631-1-git-send-email-Hongbo.He-5C7GfCeVMHo@public.gmane.org>
  2017-04-21  7:18   ` zhoucm1
@ 2017-04-21  8:27   ` Christian König
       [not found]     ` <ef2c40cd-bb55-5921-9318-e0147f1b4719-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  1 sibling, 1 reply; 18+ messages in thread
From: Christian König @ 2017-04-21  8:27 UTC (permalink / raw)
  To: Roger.He, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: david1.zhou-5C7GfCeVMHo

NAK, that is exactly what we wanted to avoid.

We used to have an exclusive lock for this and it cause a whole bunch of 
problems.

Please elaborate why that should be necessary.

Regards,
Christian.

Am 21.04.2017 um 09:08 schrieb Roger.He:
> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
> Signed-off-by: Roger.He <Hongbo.He@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>   4 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 71364f5..ab0ffa8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>   	atomic64_t			num_bytes_moved;
>   	atomic64_t			num_evictions;
>   	atomic_t			gpu_reset_counter;
> +	atomic_t			gpu_reset_state;
>   
>   	/* data for buffer migration throttling */
>   	struct {
> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>   
>   /* Common functions */
> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
> +{
> +	return atomic_read(&adev->gpu_reset_state);
> +}
> +
>   int amdgpu_gpu_reset(struct amdgpu_device *adev);
>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>   void amdgpu_pci_config_reset(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f882496..0fb4716 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	mutex_init(&adev->grbm_idx_mutex);
>   	mutex_init(&adev->mn_lock);
>   	hash_init(adev->mn_hash);
> +	atomic_set(&adev->gpu_reset_state, 0);
>   
>   	amdgpu_check_arguments(adev);
>   
> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>   }
>   
>   /**
> + * amdgpu_device_set_reset_state - set gpu reset state
> + *
> + * @adev: amdgpu device pointer
> + * @state: true when start to reset gpu; false: reset done
> + */
> +static inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
> +							bool state)
> +{
> +	atomic_set(&adev->gpu_reset_state, state);
> +}
> +
> +/**
>    * amdgpu_gpu_reset - reset the asic
>    *
>    * @adev: amdgpu device pointer
> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   	}
>   
>   	atomic_inc(&adev->gpu_reset_counter);
> -
> +	amdgpu_device_set_reset_state(adev, true);
>   	/* block TTM */
>   	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>   	/* store modesetting */
> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   		dev_info(adev->dev, "GPU reset failed\n");
>   	}
>   
> +	amdgpu_device_set_reset_state(adev, false);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index ead00d7..8cc14af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>   	struct drm_file *file_priv = filp->private_data;
>   	struct drm_device *dev;
>   	long ret;
> +
>   	dev = file_priv->minor->dev;
>   	ret = pm_runtime_get_sync(dev->dev);
>   	if (ret < 0)
>   		return ret;
>   
> +	while (amdgpu_device_is_reset(dev->dev_private))
> +		msleep(100);
> +
>   	ret = drm_ioctl(filp, cmd, arg);
>   
>   	pm_runtime_mark_last_busy(dev->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> index 2648291..22b8059 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> @@ -36,10 +36,15 @@
>   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   {
>   	unsigned int nr = DRM_IOCTL_NR(cmd);
> +	struct drm_file *file_priv = filp->private_data;
> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>   	int ret;
>   
> -	if (nr < DRM_COMMAND_BASE)
> +	if (nr < DRM_COMMAND_BASE) {
> +		while (amdgpu_device_is_reset(adev))
> +			msleep(100);
>   		return drm_compat_ioctl(filp, cmd, arg);
> +	}
>   
>   	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>   


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]     ` <ef2c40cd-bb55-5921-9318-e0147f1b4719-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-04-21  8:54       ` zhoucm1
  2017-04-21  9:00       ` He, Hongbo
  2017-05-04 10:31       ` zhoucm1
  2 siblings, 0 replies; 18+ messages in thread
From: zhoucm1 @ 2017-04-21  8:54 UTC (permalink / raw)
  To: Christian König, Roger.He,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Roger,

Could you try to move 'id->current_gpu_reset_count = 
atomic_read(&adev->gpu_reset_counter);' in amdgpu_vm_grab_id to 
no_flush_needed label.

Regards,
David Zhou

On 2017年04月21日 16:27, Christian König wrote:
> NAK, that is exactly what we wanted to avoid.
>
> We used to have an exclusive lock for this and it cause a whole bunch 
> of problems.
>
> Please elaborate why that should be necessary.
>
> Regards,
> Christian.
>
> Am 21.04.2017 um 09:08 schrieb Roger.He:
>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>   4 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 71364f5..ab0ffa8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>       atomic64_t            num_bytes_moved;
>>       atomic64_t            num_evictions;
>>       atomic_t            gpu_reset_counter;
>> +    atomic_t            gpu_reset_state;
>>         /* data for buffer migration throttling */
>>       struct {
>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring 
>> *ring)
>>   #define amdgpu_psp_check_fw_loading_status(adev, i) 
>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>     /* Common functions */
>> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
>> +{
>> +    return atomic_read(&adev->gpu_reset_state);
>> +}
>> +
>>   int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>>   void amdgpu_pci_config_reset(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f882496..0fb4716 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>       mutex_init(&adev->grbm_idx_mutex);
>>       mutex_init(&adev->mn_lock);
>>       hash_init(adev->mn_hash);
>> +    atomic_set(&adev->gpu_reset_state, 0);
>>         amdgpu_check_arguments(adev);
>>   @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct 
>> amdgpu_device *adev, bool voluntary)
>>   }
>>     /**
>> + * amdgpu_device_set_reset_state - set gpu reset state
>> + *
>> + * @adev: amdgpu device pointer
>> + * @state: true when start to reset gpu; false: reset done
>> + */
>> +static inline void amdgpu_device_set_reset_state(struct 
>> amdgpu_device *adev,
>> +                            bool state)
>> +{
>> +    atomic_set(&adev->gpu_reset_state, state);
>> +}
>> +
>> +/**
>>    * amdgpu_gpu_reset - reset the asic
>>    *
>>    * @adev: amdgpu device pointer
>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>       }
>>         atomic_inc(&adev->gpu_reset_counter);
>> -
>> +    amdgpu_device_set_reset_state(adev, true);
>>       /* block TTM */
>>       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>       /* store modesetting */
>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>           dev_info(adev->dev, "GPU reset failed\n");
>>       }
>>   +    amdgpu_device_set_reset_state(adev, false);
>>       return r;
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index ead00d7..8cc14af 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>       struct drm_file *file_priv = filp->private_data;
>>       struct drm_device *dev;
>>       long ret;
>> +
>>       dev = file_priv->minor->dev;
>>       ret = pm_runtime_get_sync(dev->dev);
>>       if (ret < 0)
>>           return ret;
>>   +    while (amdgpu_device_is_reset(dev->dev_private))
>> +        msleep(100);
>> +
>>       ret = drm_ioctl(filp, cmd, arg);
>>         pm_runtime_mark_last_busy(dev->dev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> index 2648291..22b8059 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> @@ -36,10 +36,15 @@
>>   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, 
>> unsigned long arg)
>>   {
>>       unsigned int nr = DRM_IOCTL_NR(cmd);
>> +    struct drm_file *file_priv = filp->private_data;
>> +    struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>       int ret;
>>   -    if (nr < DRM_COMMAND_BASE)
>> +    if (nr < DRM_COMMAND_BASE) {
>> +        while (amdgpu_device_is_reset(adev))
>> +            msleep(100);
>>           return drm_compat_ioctl(filp, cmd, arg);
>> +    }
>>         ret = amdgpu_drm_ioctl(filp, cmd, arg);
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]     ` <ef2c40cd-bb55-5921-9318-e0147f1b4719-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2017-04-21  8:54       ` zhoucm1
@ 2017-04-21  9:00       ` He, Hongbo
       [not found]         ` <MWHPR1201MB01272142B19FFEB9EAD281F3FD1A0-3iK1xFAIwjq9imrIu4W8xGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  2017-05-04 10:31       ` zhoucm1
  2 siblings, 1 reply; 18+ messages in thread
From: He, Hongbo @ 2017-04-21  9:00 UTC (permalink / raw)
  To: Christian König,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhou, David(ChunMing)

Hi Christian:

During GPU reset, IOCTL still  submit command to GPU.
That will result in lots of strange problems when we test this feature.

I have no better idea to handle this case. At that time, I have tried with return error directly rather than waiting GPU reset completion. 
But that doesn't work and it lead to other problems.

Thanks
Roger(Hongbo.He)

-----Original Message-----
From: Christian König [mailto:deathsimple@vodafone.de] 
Sent: Friday, April 21, 2017 4:27 PM
To: He, Hongbo; amd-gfx@lists.freedesktop.org
Cc: Zhou, David(ChunMing)
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue

NAK, that is exactly what we wanted to avoid.

We used to have an exclusive lock for this and it cause a whole bunch of problems.

Please elaborate why that should be necessary.

Regards,
Christian.

Am 21.04.2017 um 09:08 schrieb Roger.He:
> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
> Signed-off-by: Roger.He <Hongbo.He@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>   4 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 71364f5..ab0ffa8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>   	atomic64_t			num_bytes_moved;
>   	atomic64_t			num_evictions;
>   	atomic_t			gpu_reset_counter;
> +	atomic_t			gpu_reset_state;
>   
>   	/* data for buffer migration throttling */
>   	struct {
> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_psp_check_fw_loading_status(adev, i) 
> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>   
>   /* Common functions */
> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev) 
> +{
> +	return atomic_read(&adev->gpu_reset_state);
> +}
> +
>   int amdgpu_gpu_reset(struct amdgpu_device *adev);
>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>   void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f882496..0fb4716 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	mutex_init(&adev->grbm_idx_mutex);
>   	mutex_init(&adev->mn_lock);
>   	hash_init(adev->mn_hash);
> +	atomic_set(&adev->gpu_reset_state, 0);
>   
>   	amdgpu_check_arguments(adev);
>   
> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>   }
>   
>   /**
> + * amdgpu_device_set_reset_state - set gpu reset state
> + *
> + * @adev: amdgpu device pointer
> + * @state: true when start to reset gpu; false: reset done  */ static 
> +inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
> +							bool state)
> +{
> +	atomic_set(&adev->gpu_reset_state, state); }
> +
> +/**
>    * amdgpu_gpu_reset - reset the asic
>    *
>    * @adev: amdgpu device pointer
> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   	}
>   
>   	atomic_inc(&adev->gpu_reset_counter);
> -
> +	amdgpu_device_set_reset_state(adev, true);
>   	/* block TTM */
>   	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>   	/* store modesetting */
> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   		dev_info(adev->dev, "GPU reset failed\n");
>   	}
>   
> +	amdgpu_device_set_reset_state(adev, false);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index ead00d7..8cc14af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>   	struct drm_file *file_priv = filp->private_data;
>   	struct drm_device *dev;
>   	long ret;
> +
>   	dev = file_priv->minor->dev;
>   	ret = pm_runtime_get_sync(dev->dev);
>   	if (ret < 0)
>   		return ret;
>   
> +	while (amdgpu_device_is_reset(dev->dev_private))
> +		msleep(100);
> +
>   	ret = drm_ioctl(filp, cmd, arg);
>   
>   	pm_runtime_mark_last_busy(dev->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> index 2648291..22b8059 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> @@ -36,10 +36,15 @@
>   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   {
>   	unsigned int nr = DRM_IOCTL_NR(cmd);
> +	struct drm_file *file_priv = filp->private_data;
> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>   	int ret;
>   
> -	if (nr < DRM_COMMAND_BASE)
> +	if (nr < DRM_COMMAND_BASE) {
> +		while (amdgpu_device_is_reset(adev))
> +			msleep(100);
>   		return drm_compat_ioctl(filp, cmd, arg);
> +	}
>   
>   	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>   


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]         ` <MWHPR1201MB01272142B19FFEB9EAD281F3FD1A0-3iK1xFAIwjq9imrIu4W8xGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2017-04-21  9:15           ` Christian König
       [not found]             ` <05b9d783-75ab-9924-6881-472540fc77d1-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Christian König @ 2017-04-21  9:15 UTC (permalink / raw)
  To: He, Hongbo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhou, David(ChunMing)

> During GPU reset, IOCTL still  submit command to GPU.
No it won't. We disable the GPU scheduler during GPU reset, so that 
isn't an issue.

Do you have a concrete problem (e.g. dmesg?) you ran into?

Regards,
Christian.

Am 21.04.2017 um 11:00 schrieb He, Hongbo:
> Hi Christian:
>
> During GPU reset, IOCTL still  submit command to GPU.
> That will result in lots of strange problems when we test this feature.
>
> I have no better idea to handle this case. At that time, I have tried with return error directly rather than waiting GPU reset completion.
> But that doesn't work and it lead to other problems.
>
> Thanks
> Roger(Hongbo.He)
>
> -----Original Message-----
> From: Christian König [mailto:deathsimple@vodafone.de]
> Sent: Friday, April 21, 2017 4:27 PM
> To: He, Hongbo; amd-gfx@lists.freedesktop.org
> Cc: Zhou, David(ChunMing)
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>
> NAK, that is exactly what we wanted to avoid.
>
> We used to have an exclusive lock for this and it cause a whole bunch of problems.
>
> Please elaborate why that should be necessary.
>
> Regards,
> Christian.
>
> Am 21.04.2017 um 09:08 schrieb Roger.He:
>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>    4 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 71364f5..ab0ffa8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>    	atomic64_t			num_bytes_moved;
>>    	atomic64_t			num_evictions;
>>    	atomic_t			gpu_reset_counter;
>> +	atomic_t			gpu_reset_state;
>>    
>>    	/* data for buffer migration throttling */
>>    	struct {
>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>    #define amdgpu_psp_check_fw_loading_status(adev, i)
>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>    
>>    /* Common functions */
>> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
>> +{
>> +	return atomic_read(&adev->gpu_reset_state);
>> +}
>> +
>>    int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>    bool amdgpu_need_backup(struct amdgpu_device *adev);
>>    void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f882496..0fb4716 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	mutex_init(&adev->grbm_idx_mutex);
>>    	mutex_init(&adev->mn_lock);
>>    	hash_init(adev->mn_hash);
>> +	atomic_set(&adev->gpu_reset_state, 0);
>>    
>>    	amdgpu_check_arguments(adev);
>>    
>> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>>    }
>>    
>>    /**
>> + * amdgpu_device_set_reset_state - set gpu reset state
>> + *
>> + * @adev: amdgpu device pointer
>> + * @state: true when start to reset gpu; false: reset done  */ static
>> +inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
>> +							bool state)
>> +{
>> +	atomic_set(&adev->gpu_reset_state, state); }
>> +
>> +/**
>>     * amdgpu_gpu_reset - reset the asic
>>     *
>>     * @adev: amdgpu device pointer
>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>    	}
>>    
>>    	atomic_inc(&adev->gpu_reset_counter);
>> -
>> +	amdgpu_device_set_reset_state(adev, true);
>>    	/* block TTM */
>>    	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>    	/* store modesetting */
>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>    		dev_info(adev->dev, "GPU reset failed\n");
>>    	}
>>    
>> +	amdgpu_device_set_reset_state(adev, false);
>>    	return r;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index ead00d7..8cc14af 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>    	struct drm_file *file_priv = filp->private_data;
>>    	struct drm_device *dev;
>>    	long ret;
>> +
>>    	dev = file_priv->minor->dev;
>>    	ret = pm_runtime_get_sync(dev->dev);
>>    	if (ret < 0)
>>    		return ret;
>>    
>> +	while (amdgpu_device_is_reset(dev->dev_private))
>> +		msleep(100);
>> +
>>    	ret = drm_ioctl(filp, cmd, arg);
>>    
>>    	pm_runtime_mark_last_busy(dev->dev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> index 2648291..22b8059 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> @@ -36,10 +36,15 @@
>>    long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>    {
>>    	unsigned int nr = DRM_IOCTL_NR(cmd);
>> +	struct drm_file *file_priv = filp->private_data;
>> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>    	int ret;
>>    
>> -	if (nr < DRM_COMMAND_BASE)
>> +	if (nr < DRM_COMMAND_BASE) {
>> +		while (amdgpu_device_is_reset(adev))
>> +			msleep(100);
>>    		return drm_compat_ioctl(filp, cmd, arg);
>> +	}
>>    
>>    	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>    
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]             ` <05b9d783-75ab-9924-6881-472540fc77d1-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-04-21  9:54               ` He, Hongbo
       [not found]                 ` <MWHPR1201MB0127A593761702A7A179855EFD1A0-3iK1xFAIwjq9imrIu4W8xGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: He, Hongbo @ 2017-04-21  9:54 UTC (permalink / raw)
  To: Christian König,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhou, David(ChunMing)

> During GPU reset, IOCTL still  submit command to GPU.
No it won't. We disable the GPU scheduler during GPU reset, so that isn't an issue.
          Could you point out the related code? Thanks.


Thanks
Roger(Hongbo.He)

-----Original Message-----
From: Christian König [mailto:deathsimple@vodafone.de] 
Sent: Friday, April 21, 2017 5:16 PM
To: He, Hongbo; amd-gfx@lists.freedesktop.org
Cc: Zhou, David(ChunMing)
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue

> During GPU reset, IOCTL still  submit command to GPU.
No it won't. We disable the GPU scheduler during GPU reset, so that isn't an issue.

Do you have a concrete problem (e.g. dmesg?) you ran into?

Regards,
Christian.

Am 21.04.2017 um 11:00 schrieb He, Hongbo:
> Hi Christian:
>
> During GPU reset, IOCTL still  submit command to GPU.
> That will result in lots of strange problems when we test this feature.
>
> I have no better idea to handle this case. At that time, I have tried with return error directly rather than waiting GPU reset completion.
> But that doesn't work and it lead to other problems.
>
> Thanks
> Roger(Hongbo.He)
>
> -----Original Message-----
> From: Christian König [mailto:deathsimple@vodafone.de]
> Sent: Friday, April 21, 2017 4:27 PM
> To: He, Hongbo; amd-gfx@lists.freedesktop.org
> Cc: Zhou, David(ChunMing)
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>
> NAK, that is exactly what we wanted to avoid.
>
> We used to have an exclusive lock for this and it cause a whole bunch of problems.
>
> Please elaborate why that should be necessary.
>
> Regards,
> Christian.
>
> Am 21.04.2017 um 09:08 schrieb Roger.He:
>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>    4 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 71364f5..ab0ffa8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>    	atomic64_t			num_bytes_moved;
>>    	atomic64_t			num_evictions;
>>    	atomic_t			gpu_reset_counter;
>> +	atomic_t			gpu_reset_state;
>>    
>>    	/* data for buffer migration throttling */
>>    	struct {
>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>    #define amdgpu_psp_check_fw_loading_status(adev, i) 
>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>    
>>    /* Common functions */
>> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev) 
>> +{
>> +	return atomic_read(&adev->gpu_reset_state);
>> +}
>> +
>>    int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>    bool amdgpu_need_backup(struct amdgpu_device *adev);
>>    void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff 
>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f882496..0fb4716 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	mutex_init(&adev->grbm_idx_mutex);
>>    	mutex_init(&adev->mn_lock);
>>    	hash_init(adev->mn_hash);
>> +	atomic_set(&adev->gpu_reset_state, 0);
>>    
>>    	amdgpu_check_arguments(adev);
>>    
>> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>>    }
>>    
>>    /**
>> + * amdgpu_device_set_reset_state - set gpu reset state
>> + *
>> + * @adev: amdgpu device pointer
>> + * @state: true when start to reset gpu; false: reset done  */ 
>> +static inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
>> +							bool state)
>> +{
>> +	atomic_set(&adev->gpu_reset_state, state); }
>> +
>> +/**
>>     * amdgpu_gpu_reset - reset the asic
>>     *
>>     * @adev: amdgpu device pointer
>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>    	}
>>    
>>    	atomic_inc(&adev->gpu_reset_counter);
>> -
>> +	amdgpu_device_set_reset_state(adev, true);
>>    	/* block TTM */
>>    	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>    	/* store modesetting */
>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>    		dev_info(adev->dev, "GPU reset failed\n");
>>    	}
>>    
>> +	amdgpu_device_set_reset_state(adev, false);
>>    	return r;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index ead00d7..8cc14af 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>    	struct drm_file *file_priv = filp->private_data;
>>    	struct drm_device *dev;
>>    	long ret;
>> +
>>    	dev = file_priv->minor->dev;
>>    	ret = pm_runtime_get_sync(dev->dev);
>>    	if (ret < 0)
>>    		return ret;
>>    
>> +	while (amdgpu_device_is_reset(dev->dev_private))
>> +		msleep(100);
>> +
>>    	ret = drm_ioctl(filp, cmd, arg);
>>    
>>    	pm_runtime_mark_last_busy(dev->dev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> index 2648291..22b8059 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> @@ -36,10 +36,15 @@
>>    long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>    {
>>    	unsigned int nr = DRM_IOCTL_NR(cmd);
>> +	struct drm_file *file_priv = filp->private_data;
>> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>    	int ret;
>>    
>> -	if (nr < DRM_COMMAND_BASE)
>> +	if (nr < DRM_COMMAND_BASE) {
>> +		while (amdgpu_device_is_reset(adev))
>> +			msleep(100);
>>    		return drm_compat_ioctl(filp, cmd, arg);
>> +	}
>>    
>>    	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>    
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]                 ` <MWHPR1201MB0127A593761702A7A179855EFD1A0-3iK1xFAIwjq9imrIu4W8xGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2017-04-21 11:47                   ` Christian König
       [not found]                     ` <eeb3931b-ef67-8645-a3bc-bc7f3db4002e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Christian König @ 2017-04-21 11:47 UTC (permalink / raw)
  To: He, Hongbo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhou, David(ChunMing)

See amdgpu_gpu_reset():

>         /* block scheduler */
>         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                 struct amdgpu_ring *ring = adev->rings[i];
>
>                 if (!ring)
>                         continue;
>                 kthread_park(ring->sched.thread);
>                 amd_sched_hw_job_reset(&ring->sched);
>         }

and

>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = adev->rings[i];
>                         if (!ring)
>                                 continue;
>
> amd_sched_job_recovery(&ring->sched);
>                         kthread_unpark(ring->sched.thread);
>                 }

Regards,
Christian.

Am 21.04.2017 um 11:54 schrieb He, Hongbo:
>> During GPU reset, IOCTL still  submit command to GPU.
> No it won't. We disable the GPU scheduler during GPU reset, so that isn't an issue.
>            Could you point out the related code? Thanks.
>
>
> Thanks
> Roger(Hongbo.He)
>
> -----Original Message-----
> From: Christian König [mailto:deathsimple@vodafone.de]
> Sent: Friday, April 21, 2017 5:16 PM
> To: He, Hongbo; amd-gfx@lists.freedesktop.org
> Cc: Zhou, David(ChunMing)
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>
>> During GPU reset, IOCTL still  submit command to GPU.
> No it won't. We disable the GPU scheduler during GPU reset, so that isn't an issue.
>
> Do you have a concrete problem (e.g. dmesg?) you ran into?
>
> Regards,
> Christian.
>
> Am 21.04.2017 um 11:00 schrieb He, Hongbo:
>> Hi Christian:
>>
>> During GPU reset, IOCTL still  submit command to GPU.
>> That will result in lots of strange problems when we test this feature.
>>
>> I have no better idea to handle this case. At that time, I have tried with return error directly rather than waiting GPU reset completion.
>> But that doesn't work and it lead to other problems.
>>
>> Thanks
>> Roger(Hongbo.He)
>>
>> -----Original Message-----
>> From: Christian König [mailto:deathsimple@vodafone.de]
>> Sent: Friday, April 21, 2017 4:27 PM
>> To: He, Hongbo; amd-gfx@lists.freedesktop.org
>> Cc: Zhou, David(ChunMing)
>> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>>
>> NAK, that is exactly what we wanted to avoid.
>>
>> We used to have an exclusive lock for this and it cause a whole bunch of problems.
>>
>> Please elaborate why that should be necessary.
>>
>> Regards,
>> Christian.
>>
>> Am 21.04.2017 um 09:08 schrieb Roger.He:
>>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>>     4 files changed, 31 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 71364f5..ab0ffa8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>>     	atomic64_t			num_bytes_moved;
>>>     	atomic64_t			num_evictions;
>>>     	atomic_t			gpu_reset_counter;
>>> +	atomic_t			gpu_reset_state;
>>>     
>>>     	/* data for buffer migration throttling */
>>>     	struct {
>>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>     #define amdgpu_psp_check_fw_loading_status(adev, i)
>>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>>     
>>>     /* Common functions */
>>> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
>>> +{
>>> +	return atomic_read(&adev->gpu_reset_state);
>>> +}
>>> +
>>>     int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>>     bool amdgpu_need_backup(struct amdgpu_device *adev);
>>>     void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff
>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index f882496..0fb4716 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>     	mutex_init(&adev->grbm_idx_mutex);
>>>     	mutex_init(&adev->mn_lock);
>>>     	hash_init(adev->mn_hash);
>>> +	atomic_set(&adev->gpu_reset_state, 0);
>>>     
>>>     	amdgpu_check_arguments(adev);
>>>     
>>> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>>>     }
>>>     
>>>     /**
>>> + * amdgpu_device_set_reset_state - set gpu reset state
>>> + *
>>> + * @adev: amdgpu device pointer
>>> + * @state: true when start to reset gpu; false: reset done  */
>>> +static inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
>>> +							bool state)
>>> +{
>>> +	atomic_set(&adev->gpu_reset_state, state); }
>>> +
>>> +/**
>>>      * amdgpu_gpu_reset - reset the asic
>>>      *
>>>      * @adev: amdgpu device pointer
>>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>     	}
>>>     
>>>     	atomic_inc(&adev->gpu_reset_counter);
>>> -
>>> +	amdgpu_device_set_reset_state(adev, true);
>>>     	/* block TTM */
>>>     	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>>     	/* store modesetting */
>>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>     		dev_info(adev->dev, "GPU reset failed\n");
>>>     	}
>>>     
>>> +	amdgpu_device_set_reset_state(adev, false);
>>>     	return r;
>>>     }
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index ead00d7..8cc14af 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>>     	struct drm_file *file_priv = filp->private_data;
>>>     	struct drm_device *dev;
>>>     	long ret;
>>> +
>>>     	dev = file_priv->minor->dev;
>>>     	ret = pm_runtime_get_sync(dev->dev);
>>>     	if (ret < 0)
>>>     		return ret;
>>>     
>>> +	while (amdgpu_device_is_reset(dev->dev_private))
>>> +		msleep(100);
>>> +
>>>     	ret = drm_ioctl(filp, cmd, arg);
>>>     
>>>     	pm_runtime_mark_last_busy(dev->dev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> index 2648291..22b8059 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> @@ -36,10 +36,15 @@
>>>     long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>>     {
>>>     	unsigned int nr = DRM_IOCTL_NR(cmd);
>>> +	struct drm_file *file_priv = filp->private_data;
>>> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>>     	int ret;
>>>     
>>> -	if (nr < DRM_COMMAND_BASE)
>>> +	if (nr < DRM_COMMAND_BASE) {
>>> +		while (amdgpu_device_is_reset(adev))
>>> +			msleep(100);
>>>     		return drm_compat_ioctl(filp, cmd, arg);
>>> +	}
>>>     
>>>     	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>>     
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]                     ` <eeb3931b-ef67-8645-a3bc-bc7f3db4002e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-04-21 12:14                       ` He, Hongbo
  0 siblings, 0 replies; 18+ messages in thread
From: He, Hongbo @ 2017-04-21 12:14 UTC (permalink / raw)
  To: Christian König,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhou, David(ChunMing)

Thank you!

-----Original Message-----
From: Christian König [mailto:deathsimple@vodafone.de] 
Sent: Friday, April 21, 2017 7:47 PM
To: He, Hongbo; amd-gfx@lists.freedesktop.org
Cc: Zhou, David(ChunMing)
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue

See amdgpu_gpu_reset():

>         /* block scheduler */
>         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                 struct amdgpu_ring *ring = adev->rings[i];
>
>                 if (!ring)
>                         continue;
>                 kthread_park(ring->sched.thread);
>                 amd_sched_hw_job_reset(&ring->sched);
>         }

and

>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = adev->rings[i];
>                         if (!ring)
>                                 continue;
>
> amd_sched_job_recovery(&ring->sched);
>                         kthread_unpark(ring->sched.thread);
>                 }

Regards,
Christian.

Am 21.04.2017 um 11:54 schrieb He, Hongbo:
>> During GPU reset, IOCTL still  submit command to GPU.
> No it won't. We disable the GPU scheduler during GPU reset, so that isn't an issue.
>            Could you point out the related code? Thanks.
>
>
> Thanks
> Roger(Hongbo.He)
>
> -----Original Message-----
> From: Christian König [mailto:deathsimple@vodafone.de]
> Sent: Friday, April 21, 2017 5:16 PM
> To: He, Hongbo; amd-gfx@lists.freedesktop.org
> Cc: Zhou, David(ChunMing)
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>
>> During GPU reset, IOCTL still  submit command to GPU.
> No it won't. We disable the GPU scheduler during GPU reset, so that isn't an issue.
>
> Do you have a concrete problem (e.g. dmesg?) you ran into?
>
> Regards,
> Christian.
>
> Am 21.04.2017 um 11:00 schrieb He, Hongbo:
>> Hi Christian:
>>
>> During GPU reset, IOCTL still  submit command to GPU.
>> That will result in lots of strange problems when we test this feature.
>>
>> I have no better idea to handle this case. At that time, I have tried with return error directly rather than waiting GPU reset completion.
>> But that doesn't work and it lead to other problems.
>>
>> Thanks
>> Roger(Hongbo.He)
>>
>> -----Original Message-----
>> From: Christian König [mailto:deathsimple@vodafone.de]
>> Sent: Friday, April 21, 2017 4:27 PM
>> To: He, Hongbo; amd-gfx@lists.freedesktop.org
>> Cc: Zhou, David(ChunMing)
>> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>>
>> NAK, that is exactly what we wanted to avoid.
>>
>> We used to have an exclusive lock for this and it cause a whole bunch of problems.
>>
>> Please elaborate why that should be necessary.
>>
>> Regards,
>> Christian.
>>
>> Am 21.04.2017 um 09:08 schrieb Roger.He:
>>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>>     4 files changed, 31 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 71364f5..ab0ffa8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>>     	atomic64_t			num_bytes_moved;
>>>     	atomic64_t			num_evictions;
>>>     	atomic_t			gpu_reset_counter;
>>> +	atomic_t			gpu_reset_state;
>>>     
>>>     	/* data for buffer migration throttling */
>>>     	struct {
>>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>     #define amdgpu_psp_check_fw_loading_status(adev, i) 
>>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>>     
>>>     /* Common functions */
>>> +static inline int amdgpu_device_is_reset(struct amdgpu_device 
>>> +*adev) {
>>> +	return atomic_read(&adev->gpu_reset_state);
>>> +}
>>> +
>>>     int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>>     bool amdgpu_need_backup(struct amdgpu_device *adev);
>>>     void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff 
>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index f882496..0fb4716 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>     	mutex_init(&adev->grbm_idx_mutex);
>>>     	mutex_init(&adev->mn_lock);
>>>     	hash_init(adev->mn_hash);
>>> +	atomic_set(&adev->gpu_reset_state, 0);
>>>     
>>>     	amdgpu_check_arguments(adev);
>>>     
>>> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>>>     }
>>>     
>>>     /**
>>> + * amdgpu_device_set_reset_state - set gpu reset state
>>> + *
>>> + * @adev: amdgpu device pointer
>>> + * @state: true when start to reset gpu; false: reset done  */ 
>>> +static inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
>>> +							bool state)
>>> +{
>>> +	atomic_set(&adev->gpu_reset_state, state); }
>>> +
>>> +/**
>>>      * amdgpu_gpu_reset - reset the asic
>>>      *
>>>      * @adev: amdgpu device pointer
>>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>     	}
>>>     
>>>     	atomic_inc(&adev->gpu_reset_counter);
>>> -
>>> +	amdgpu_device_set_reset_state(adev, true);
>>>     	/* block TTM */
>>>     	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>>     	/* store modesetting */
>>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>     		dev_info(adev->dev, "GPU reset failed\n");
>>>     	}
>>>     
>>> +	amdgpu_device_set_reset_state(adev, false);
>>>     	return r;
>>>     }
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index ead00d7..8cc14af 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>>     	struct drm_file *file_priv = filp->private_data;
>>>     	struct drm_device *dev;
>>>     	long ret;
>>> +
>>>     	dev = file_priv->minor->dev;
>>>     	ret = pm_runtime_get_sync(dev->dev);
>>>     	if (ret < 0)
>>>     		return ret;
>>>     
>>> +	while (amdgpu_device_is_reset(dev->dev_private))
>>> +		msleep(100);
>>> +
>>>     	ret = drm_ioctl(filp, cmd, arg);
>>>     
>>>     	pm_runtime_mark_last_busy(dev->dev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> index 2648291..22b8059 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> @@ -36,10 +36,15 @@
>>>     long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>>     {
>>>     	unsigned int nr = DRM_IOCTL_NR(cmd);
>>> +	struct drm_file *file_priv = filp->private_data;
>>> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>>     	int ret;
>>>     
>>> -	if (nr < DRM_COMMAND_BASE)
>>> +	if (nr < DRM_COMMAND_BASE) {
>>> +		while (amdgpu_device_is_reset(adev))
>>> +			msleep(100);
>>>     		return drm_compat_ioctl(filp, cmd, arg);
>>> +	}
>>>     
>>>     	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>>     
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]     ` <ef2c40cd-bb55-5921-9318-e0147f1b4719-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2017-04-21  8:54       ` zhoucm1
  2017-04-21  9:00       ` He, Hongbo
@ 2017-05-04 10:31       ` zhoucm1
       [not found]         ` <590B0307.2090001-5C7GfCeVMHo@public.gmane.org>
  2 siblings, 1 reply; 18+ messages in thread
From: zhoucm1 @ 2017-05-04 10:31 UTC (permalink / raw)
  To: Christian König, Roger.He,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Christian,

We possibly must re-consider this patch, the direct issue is VM FAULT 
after gpu reset. one obvious bug is we could bind gtt to gart during GPU 
reset, binding gtt to gart needs to fill gart table and tlb flush, which 
is parallel with gpu reset thread.

Regards,
David Zhou

On 2017年04月21日 16:27, Christian König wrote:
> NAK, that is exactly what we wanted to avoid.
>
> We used to have an exclusive lock for this and it cause a whole bunch 
> of problems.
>
> Please elaborate why that should be necessary.
>
> Regards,
> Christian.
>
> Am 21.04.2017 um 09:08 schrieb Roger.He:
>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>   4 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 71364f5..ab0ffa8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>       atomic64_t            num_bytes_moved;
>>       atomic64_t            num_evictions;
>>       atomic_t            gpu_reset_counter;
>> +    atomic_t            gpu_reset_state;
>>         /* data for buffer migration throttling */
>>       struct {
>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring 
>> *ring)
>>   #define amdgpu_psp_check_fw_loading_status(adev, i) 
>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>     /* Common functions */
>> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
>> +{
>> +    return atomic_read(&adev->gpu_reset_state);
>> +}
>> +
>>   int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>>   void amdgpu_pci_config_reset(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f882496..0fb4716 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>       mutex_init(&adev->grbm_idx_mutex);
>>       mutex_init(&adev->mn_lock);
>>       hash_init(adev->mn_hash);
>> +    atomic_set(&adev->gpu_reset_state, 0);
>>         amdgpu_check_arguments(adev);
>>   @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct 
>> amdgpu_device *adev, bool voluntary)
>>   }
>>     /**
>> + * amdgpu_device_set_reset_state - set gpu reset state
>> + *
>> + * @adev: amdgpu device pointer
>> + * @state: true when start to reset gpu; false: reset done
>> + */
>> +static inline void amdgpu_device_set_reset_state(struct 
>> amdgpu_device *adev,
>> +                            bool state)
>> +{
>> +    atomic_set(&adev->gpu_reset_state, state);
>> +}
>> +
>> +/**
>>    * amdgpu_gpu_reset - reset the asic
>>    *
>>    * @adev: amdgpu device pointer
>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>       }
>>         atomic_inc(&adev->gpu_reset_counter);
>> -
>> +    amdgpu_device_set_reset_state(adev, true);
>>       /* block TTM */
>>       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>       /* store modesetting */
>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>           dev_info(adev->dev, "GPU reset failed\n");
>>       }
>>   +    amdgpu_device_set_reset_state(adev, false);
>>       return r;
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index ead00d7..8cc14af 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>       struct drm_file *file_priv = filp->private_data;
>>       struct drm_device *dev;
>>       long ret;
>> +
>>       dev = file_priv->minor->dev;
>>       ret = pm_runtime_get_sync(dev->dev);
>>       if (ret < 0)
>>           return ret;
>>   +    while (amdgpu_device_is_reset(dev->dev_private))
>> +        msleep(100);
>> +
>>       ret = drm_ioctl(filp, cmd, arg);
>>         pm_runtime_mark_last_busy(dev->dev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> index 2648291..22b8059 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> @@ -36,10 +36,15 @@
>>   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, 
>> unsigned long arg)
>>   {
>>       unsigned int nr = DRM_IOCTL_NR(cmd);
>> +    struct drm_file *file_priv = filp->private_data;
>> +    struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>       int ret;
>>   -    if (nr < DRM_COMMAND_BASE)
>> +    if (nr < DRM_COMMAND_BASE) {
>> +        while (amdgpu_device_is_reset(adev))
>> +            msleep(100);
>>           return drm_compat_ioctl(filp, cmd, arg);
>> +    }
>>         ret = amdgpu_drm_ioctl(filp, cmd, arg);
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]     ` <58F9B22E.7050502-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-04 10:42       ` Liu, Monk
       [not found]         ` <DM5PR12MB161079A7788AC1D9E4CD5D6284EA0-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Liu, Monk @ 2017-05-04 10:42 UTC (permalink / raw)
  To: Zhou, David(ChunMing), He, Hongbo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

First of all,  Your patch will impact on SR-IOV case, because "gpu_reset_state" will be referenced by two place: 
1) IOCTL; 2)GPU-RESET ,
SRIOV use srio_gpu_reset, but SRIOV use the same IOCTL routines, that way the gpu_reset_state is not consistent for SR-IOV case

I suggest you firstly unify this gpu_reset_state for both SRIOV and bare-metal case 

For sriov, we have "adev->gfx.in_reset " and you now have "adev->gpu_reset_state",
I suggest you replace all "adev->gfx.in_reset" with "adev->gpu_reset_state",

BR Monk


-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of zhoucm1
Sent: Friday, April 21, 2017 3:18 PM
To: He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue



On 2017年04月21日 15:08, Roger.He wrote:

> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
> Signed-off-by: Roger.He <Hongbo.He@amd.com>
Please add more explaination in comment, like "prevent ioctl during gpu reset, otherwise many un-expected behaviour happen."

Regards,
David Zhou
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>   4 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 71364f5..ab0ffa8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>   	atomic64_t			num_bytes_moved;
>   	atomic64_t			num_evictions;
>   	atomic_t			gpu_reset_counter;
> +	atomic_t			gpu_reset_state;
>   
>   	/* data for buffer migration throttling */
>   	struct {
> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_psp_check_fw_loading_status(adev, i) 
> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>   
>   /* Common functions */
> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev) 
> +{
> +	return atomic_read(&adev->gpu_reset_state);
> +}
> +
>   int amdgpu_gpu_reset(struct amdgpu_device *adev);
>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>   void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f882496..0fb4716 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	mutex_init(&adev->grbm_idx_mutex);
>   	mutex_init(&adev->mn_lock);
>   	hash_init(adev->mn_hash);
> +	atomic_set(&adev->gpu_reset_state, 0);
>   
>   	amdgpu_check_arguments(adev);
>   
> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>   }
>   
>   /**
> + * amdgpu_device_set_reset_state - set gpu reset state
> + *
> + * @adev: amdgpu device pointer
> + * @state: true when start to reset gpu; false: reset done  */ static 
> +inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
> +							bool state)
> +{
> +	atomic_set(&adev->gpu_reset_state, state); }
> +
> +/**
>    * amdgpu_gpu_reset - reset the asic
>    *
>    * @adev: amdgpu device pointer
> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   	}
>   
>   	atomic_inc(&adev->gpu_reset_counter);
> -
> +	amdgpu_device_set_reset_state(adev, true);
>   	/* block TTM */
>   	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>   	/* store modesetting */
> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   		dev_info(adev->dev, "GPU reset failed\n");
>   	}
>   
> +	amdgpu_device_set_reset_state(adev, false);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index ead00d7..8cc14af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>   	struct drm_file *file_priv = filp->private_data;
>   	struct drm_device *dev;
>   	long ret;
> +
>   	dev = file_priv->minor->dev;
>   	ret = pm_runtime_get_sync(dev->dev);
>   	if (ret < 0)
>   		return ret;
>   
> +	while (amdgpu_device_is_reset(dev->dev_private))
> +		msleep(100);
> +
>   	ret = drm_ioctl(filp, cmd, arg);
>   
>   	pm_runtime_mark_last_busy(dev->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> index 2648291..22b8059 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> @@ -36,10 +36,15 @@
>   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   {
>   	unsigned int nr = DRM_IOCTL_NR(cmd);
> +	struct drm_file *file_priv = filp->private_data;
> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>   	int ret;
>   
> -	if (nr < DRM_COMMAND_BASE)
> +	if (nr < DRM_COMMAND_BASE) {
> +		while (amdgpu_device_is_reset(adev))
> +			msleep(100);
>   		return drm_compat_ioctl(filp, cmd, arg);
> +	}
>   
>   	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>   

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]         ` <590B0307.2090001-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-04 11:20           ` Christian König
       [not found]             ` <f2c6a892-5929-8644-ca34-09f89e6359a4-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Christian König @ 2017-05-04 11:20 UTC (permalink / raw)
  To: zhoucm1, Roger.He, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

But that should be harmless because we rebuild the GART table after GPU 
reset, don't we?

Apart from that I'm fine with having a lock or locks to prevent 
concurrent things from happening while we do the reset.

But we should NOT do the mistake again and try to prevent everything is 
one big global lock. That was rather bad idea and using an atomic isn't 
better in any way.

Regards,
Christian.

Am 04.05.2017 um 12:31 schrieb zhoucm1:
> Hi Christian,
>
> We possibly must re-consider this patch, the direct issue is VM FAULT 
> after gpu reset. one obvious bug is we could bind gtt to gart during 
> GPU reset, binding gtt to gart needs to fill gart table and tlb flush, 
> which is parallel with gpu reset thread.
>
> Regards,
> David Zhou
>
> On 2017年04月21日 16:27, Christian König wrote:
>> NAK, that is exactly what we wanted to avoid.
>>
>> We used to have an exclusive lock for this and it cause a whole bunch 
>> of problems.
>>
>> Please elaborate why that should be necessary.
>>
>> Regards,
>> Christian.
>>
>> Am 21.04.2017 um 09:08 schrieb Roger.He:
>>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>>   4 files changed, 31 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 71364f5..ab0ffa8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>>       atomic64_t            num_bytes_moved;
>>>       atomic64_t            num_evictions;
>>>       atomic_t            gpu_reset_counter;
>>> +    atomic_t            gpu_reset_state;
>>>         /* data for buffer migration throttling */
>>>       struct {
>>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring 
>>> *ring)
>>>   #define amdgpu_psp_check_fw_loading_status(adev, i) 
>>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>>     /* Common functions */
>>> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
>>> +{
>>> +    return atomic_read(&adev->gpu_reset_state);
>>> +}
>>> +
>>>   int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>>>   void amdgpu_pci_config_reset(struct amdgpu_device *adev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index f882496..0fb4716 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device 
>>> *adev,
>>>       mutex_init(&adev->grbm_idx_mutex);
>>>       mutex_init(&adev->mn_lock);
>>>       hash_init(adev->mn_hash);
>>> +    atomic_set(&adev->gpu_reset_state, 0);
>>>         amdgpu_check_arguments(adev);
>>>   @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct 
>>> amdgpu_device *adev, bool voluntary)
>>>   }
>>>     /**
>>> + * amdgpu_device_set_reset_state - set gpu reset state
>>> + *
>>> + * @adev: amdgpu device pointer
>>> + * @state: true when start to reset gpu; false: reset done
>>> + */
>>> +static inline void amdgpu_device_set_reset_state(struct 
>>> amdgpu_device *adev,
>>> +                            bool state)
>>> +{
>>> +    atomic_set(&adev->gpu_reset_state, state);
>>> +}
>>> +
>>> +/**
>>>    * amdgpu_gpu_reset - reset the asic
>>>    *
>>>    * @adev: amdgpu device pointer
>>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>       }
>>>         atomic_inc(&adev->gpu_reset_counter);
>>> -
>>> +    amdgpu_device_set_reset_state(adev, true);
>>>       /* block TTM */
>>>       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>>       /* store modesetting */
>>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>           dev_info(adev->dev, "GPU reset failed\n");
>>>       }
>>>   +    amdgpu_device_set_reset_state(adev, false);
>>>       return r;
>>>   }
>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index ead00d7..8cc14af 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>>       struct drm_file *file_priv = filp->private_data;
>>>       struct drm_device *dev;
>>>       long ret;
>>> +
>>>       dev = file_priv->minor->dev;
>>>       ret = pm_runtime_get_sync(dev->dev);
>>>       if (ret < 0)
>>>           return ret;
>>>   +    while (amdgpu_device_is_reset(dev->dev_private))
>>> +        msleep(100);
>>> +
>>>       ret = drm_ioctl(filp, cmd, arg);
>>>         pm_runtime_mark_last_busy(dev->dev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> index 2648291..22b8059 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> @@ -36,10 +36,15 @@
>>>   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, 
>>> unsigned long arg)
>>>   {
>>>       unsigned int nr = DRM_IOCTL_NR(cmd);
>>> +    struct drm_file *file_priv = filp->private_data;
>>> +    struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>>       int ret;
>>>   -    if (nr < DRM_COMMAND_BASE)
>>> +    if (nr < DRM_COMMAND_BASE) {
>>> +        while (amdgpu_device_is_reset(adev))
>>> +            msleep(100);
>>>           return drm_compat_ioctl(filp, cmd, arg);
>>> +    }
>>>         ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]             ` <f2c6a892-5929-8644-ca34-09f89e6359a4-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-04 11:28               ` Liu, Monk
       [not found]                 ` <DM5PR12MB1610FEFA4B2654FF67B2F1B684EA0-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Liu, Monk @ 2017-05-04 11:28 UTC (permalink / raw)
  To: Christian König, Zhou, David(ChunMing), He, Hongbo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

Christian

Why not use rw lock ? that way only gpu reset will make those iOCTL blocked, 
and in usual case the performance is still the same 

BR Monk

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
Sent: Thursday, May 04, 2017 7:21 PM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue

But that should be harmless because we rebuild the GART table after GPU reset, don't we?

Apart from that I'm fine with having a lock or locks to prevent concurrent things from happening while we do the reset.

But we should NOT do the mistake again and try to prevent everything is one big global lock. That was rather bad idea and using an atomic isn't better in any way.

Regards,
Christian.

Am 04.05.2017 um 12:31 schrieb zhoucm1:
> Hi Christian,
>
> We possibly must re-consider this patch, the direct issue is VM FAULT 
> after gpu reset. one obvious bug is we could bind gtt to gart during 
> GPU reset, binding gtt to gart needs to fill gart table and tlb flush, 
> which is parallel with gpu reset thread.
>
> Regards,
> David Zhou
>
> On 2017年04月21日 16:27, Christian König wrote:
>> NAK, that is exactly what we wanted to avoid.
>>
>> We used to have an exclusive lock for this and it cause a whole bunch 
>> of problems.
>>
>> Please elaborate why that should be necessary.
>>
>> Regards,
>> Christian.
>>
>> Am 21.04.2017 um 09:08 schrieb Roger.He:
>>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>>   4 files changed, 31 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 71364f5..ab0ffa8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>>       atomic64_t            num_bytes_moved;
>>>       atomic64_t            num_evictions;
>>>       atomic_t            gpu_reset_counter;
>>> +    atomic_t            gpu_reset_state;
>>>         /* data for buffer migration throttling */
>>>       struct {
>>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring
>>> *ring)
>>>   #define amdgpu_psp_check_fw_loading_status(adev, i) 
>>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>>     /* Common functions */
>>> +static inline int amdgpu_device_is_reset(struct amdgpu_device 
>>> +*adev) {
>>> +    return atomic_read(&adev->gpu_reset_state);
>>> +}
>>> +
>>>   int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>>>   void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff 
>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index f882496..0fb4716 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device 
>>> *adev,
>>>       mutex_init(&adev->grbm_idx_mutex);
>>>       mutex_init(&adev->mn_lock);
>>>       hash_init(adev->mn_hash);
>>> +    atomic_set(&adev->gpu_reset_state, 0);
>>>         amdgpu_check_arguments(adev);
>>>   @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct 
>>> amdgpu_device *adev, bool voluntary)
>>>   }
>>>     /**
>>> + * amdgpu_device_set_reset_state - set gpu reset state
>>> + *
>>> + * @adev: amdgpu device pointer
>>> + * @state: true when start to reset gpu; false: reset done  */ 
>>> +static inline void amdgpu_device_set_reset_state(struct
>>> amdgpu_device *adev,
>>> +                            bool state) {
>>> +    atomic_set(&adev->gpu_reset_state, state); }
>>> +
>>> +/**
>>>    * amdgpu_gpu_reset - reset the asic
>>>    *
>>>    * @adev: amdgpu device pointer
>>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>       }
>>>         atomic_inc(&adev->gpu_reset_counter);
>>> -
>>> +    amdgpu_device_set_reset_state(adev, true);
>>>       /* block TTM */
>>>       resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>>       /* store modesetting */
>>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>           dev_info(adev->dev, "GPU reset failed\n");
>>>       }
>>>   +    amdgpu_device_set_reset_state(adev, false);
>>>       return r;
>>>   }
>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index ead00d7..8cc14af 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>>       struct drm_file *file_priv = filp->private_data;
>>>       struct drm_device *dev;
>>>       long ret;
>>> +
>>>       dev = file_priv->minor->dev;
>>>       ret = pm_runtime_get_sync(dev->dev);
>>>       if (ret < 0)
>>>           return ret;
>>>   +    while (amdgpu_device_is_reset(dev->dev_private))
>>> +        msleep(100);
>>> +
>>>       ret = drm_ioctl(filp, cmd, arg);
>>>         pm_runtime_mark_last_busy(dev->dev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> index 2648291..22b8059 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> @@ -36,10 +36,15 @@
>>>   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, 
>>> unsigned long arg)
>>>   {
>>>       unsigned int nr = DRM_IOCTL_NR(cmd);
>>> +    struct drm_file *file_priv = filp->private_data;
>>> +    struct amdgpu_device *adev = 
>>> + file_priv->minor->dev->dev_private;
>>>       int ret;
>>>   -    if (nr < DRM_COMMAND_BASE)
>>> +    if (nr < DRM_COMMAND_BASE) {
>>> +        while (amdgpu_device_is_reset(adev))
>>> +            msleep(100);
>>>           return drm_compat_ioctl(filp, cmd, arg);
>>> +    }
>>>         ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]         ` <DM5PR12MB161079A7788AC1D9E4CD5D6284EA0-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-05-04 11:29           ` Christian König
       [not found]             ` <c88049b2-2440-142e-129a-542c136b8d61-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2017-05-04 14:46           ` Deucher, Alexander
  1 sibling, 1 reply; 18+ messages in thread
From: Christian König @ 2017-05-04 11:29 UTC (permalink / raw)
  To: Liu, Monk, Zhou, David(ChunMing), He, Hongbo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

> I suggest you firstly unify this gpu_reset_state for both SRIOV and bare-metal case
Yeah, that sounds like a good idea to me as well.

I suggest to have a function which covers things necessary for both 
cases, e.g. stopping the scheduler and blocking TTM.

And inside that one we call specific functions for bare metal and SRIOV 
for the special handling.

Regards,
Christian.

Am 04.05.2017 um 12:42 schrieb Liu, Monk:
> First of all,  Your patch will impact on SR-IOV case, because "gpu_reset_state" will be referenced by two place:
> 1) IOCTL; 2)GPU-RESET ,
> SRIOV use srio_gpu_reset, but SRIOV use the same IOCTL routines, that way the gpu_reset_state is not consistent for SR-IOV case
>
> I suggest you firstly unify this gpu_reset_state for both SRIOV and bare-metal case
>
> For sriov, we have "adev->gfx.in_reset " and you now have "adev->gpu_reset_state",
> I suggest you replace all "adev->gfx.in_reset" with "adev->gpu_reset_state",
>
> BR Monk
>
>
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of zhoucm1
> Sent: Friday, April 21, 2017 3:18 PM
> To: He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>
>
>
> On 2017年04月21日 15:08, Roger.He wrote:
>
>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
> Please add more explaination in comment, like "prevent ioctl during gpu reset, otherwise many un-expected behaviour happen."
>
> Regards,
> David Zhou
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>    4 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 71364f5..ab0ffa8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>    	atomic64_t			num_bytes_moved;
>>    	atomic64_t			num_evictions;
>>    	atomic_t			gpu_reset_counter;
>> +	atomic_t			gpu_reset_state;
>>    
>>    	/* data for buffer migration throttling */
>>    	struct {
>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>    #define amdgpu_psp_check_fw_loading_status(adev, i)
>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>    
>>    /* Common functions */
>> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
>> +{
>> +	return atomic_read(&adev->gpu_reset_state);
>> +}
>> +
>>    int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>    bool amdgpu_need_backup(struct amdgpu_device *adev);
>>    void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f882496..0fb4716 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	mutex_init(&adev->grbm_idx_mutex);
>>    	mutex_init(&adev->mn_lock);
>>    	hash_init(adev->mn_hash);
>> +	atomic_set(&adev->gpu_reset_state, 0);
>>    
>>    	amdgpu_check_arguments(adev);
>>    
>> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>>    }
>>    
>>    /**
>> + * amdgpu_device_set_reset_state - set gpu reset state
>> + *
>> + * @adev: amdgpu device pointer
>> + * @state: true when start to reset gpu; false: reset done  */ static
>> +inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
>> +							bool state)
>> +{
>> +	atomic_set(&adev->gpu_reset_state, state); }
>> +
>> +/**
>>     * amdgpu_gpu_reset - reset the asic
>>     *
>>     * @adev: amdgpu device pointer
>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>    	}
>>    
>>    	atomic_inc(&adev->gpu_reset_counter);
>> -
>> +	amdgpu_device_set_reset_state(adev, true);
>>    	/* block TTM */
>>    	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>    	/* store modesetting */
>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>    		dev_info(adev->dev, "GPU reset failed\n");
>>    	}
>>    
>> +	amdgpu_device_set_reset_state(adev, false);
>>    	return r;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index ead00d7..8cc14af 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>    	struct drm_file *file_priv = filp->private_data;
>>    	struct drm_device *dev;
>>    	long ret;
>> +
>>    	dev = file_priv->minor->dev;
>>    	ret = pm_runtime_get_sync(dev->dev);
>>    	if (ret < 0)
>>    		return ret;
>>    
>> +	while (amdgpu_device_is_reset(dev->dev_private))
>> +		msleep(100);
>> +
>>    	ret = drm_ioctl(filp, cmd, arg);
>>    
>>    	pm_runtime_mark_last_busy(dev->dev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> index 2648291..22b8059 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> @@ -36,10 +36,15 @@
>>    long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>    {
>>    	unsigned int nr = DRM_IOCTL_NR(cmd);
>> +	struct drm_file *file_priv = filp->private_data;
>> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>    	int ret;
>>    
>> -	if (nr < DRM_COMMAND_BASE)
>> +	if (nr < DRM_COMMAND_BASE) {
>> +		while (amdgpu_device_is_reset(adev))
>> +			msleep(100);
>>    		return drm_compat_ioctl(filp, cmd, arg);
>> +	}
>>    
>>    	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>    
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]                 ` <DM5PR12MB1610FEFA4B2654FF67B2F1B684EA0-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-05-04 11:32                   ` Christian König
       [not found]                     ` <6321c998-f337-39af-3d64-070341e5fbe4-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Christian König @ 2017-05-04 11:32 UTC (permalink / raw)
  To: Liu, Monk, Zhou, David(ChunMing), He, Hongbo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

That's what we used to have with the exclusive lock.

The issue is not what kind of lock or locking or not locking at all, but 
rather where to place it.

Having a rw lock called something like "in_reset" is ok to me, but we 
shouldn't take it in the IOCTL path.

Instead we should block for example the GART TLB flush while being 
inside a reset etc...

Otherwise we run into the same problems (lock inversions for example) as 
we had before.

Regards,
Christian.

Am 04.05.2017 um 13:28 schrieb Liu, Monk:
> Christian
>
> Why not use rw lock ? that way only gpu reset will make those iOCTL blocked,
> and in usual case the performance is still the same
>
> BR Monk
>
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
> Sent: Thursday, May 04, 2017 7:21 PM
> To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>
> But that should be harmless because we rebuild the GART table after GPU reset, don't we?
>
> Apart from that I'm fine with having a lock or locks to prevent concurrent things from happening while we do the reset.
>
> But we should NOT do the mistake again and try to prevent everything is one big global lock. That was rather bad idea and using an atomic isn't better in any way.
>
> Regards,
> Christian.
>
> Am 04.05.2017 um 12:31 schrieb zhoucm1:
>> Hi Christian,
>>
>> We possibly must re-consider this patch, the direct issue is VM FAULT
>> after gpu reset. one obvious bug is we could bind gtt to gart during
>> GPU reset, binding gtt to gart needs to fill gart table and tlb flush,
>> which is parallel with gpu reset thread.
>>
>> Regards,
>> David Zhou
>>
>> On 2017年04月21日 16:27, Christian König wrote:
>>> NAK, that is exactly what we wanted to avoid.
>>>
>>> We used to have an exclusive lock for this and it cause a whole bunch
>>> of problems.
>>>
>>> Please elaborate why that should be necessary.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 21.04.2017 um 09:08 schrieb Roger.He:
>>>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>>>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>>>    4 files changed, 31 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 71364f5..ab0ffa8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>>>        atomic64_t            num_bytes_moved;
>>>>        atomic64_t            num_evictions;
>>>>        atomic_t            gpu_reset_counter;
>>>> +    atomic_t            gpu_reset_state;
>>>>          /* data for buffer migration throttling */
>>>>        struct {
>>>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring
>>>> *ring)
>>>>    #define amdgpu_psp_check_fw_loading_status(adev, i)
>>>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>>>      /* Common functions */
>>>> +static inline int amdgpu_device_is_reset(struct amdgpu_device
>>>> +*adev) {
>>>> +    return atomic_read(&adev->gpu_reset_state);
>>>> +}
>>>> +
>>>>    int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>>>    bool amdgpu_need_backup(struct amdgpu_device *adev);
>>>>    void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff
>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index f882496..0fb4716 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device
>>>> *adev,
>>>>        mutex_init(&adev->grbm_idx_mutex);
>>>>        mutex_init(&adev->mn_lock);
>>>>        hash_init(adev->mn_hash);
>>>> +    atomic_set(&adev->gpu_reset_state, 0);
>>>>          amdgpu_check_arguments(adev);
>>>>    @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct
>>>> amdgpu_device *adev, bool voluntary)
>>>>    }
>>>>      /**
>>>> + * amdgpu_device_set_reset_state - set gpu reset state
>>>> + *
>>>> + * @adev: amdgpu device pointer
>>>> + * @state: true when start to reset gpu; false: reset done  */
>>>> +static inline void amdgpu_device_set_reset_state(struct
>>>> amdgpu_device *adev,
>>>> +                            bool state) {
>>>> +    atomic_set(&adev->gpu_reset_state, state); }
>>>> +
>>>> +/**
>>>>     * amdgpu_gpu_reset - reset the asic
>>>>     *
>>>>     * @adev: amdgpu device pointer
>>>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>>        }
>>>>          atomic_inc(&adev->gpu_reset_counter);
>>>> -
>>>> +    amdgpu_device_set_reset_state(adev, true);
>>>>        /* block TTM */
>>>>        resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>>>        /* store modesetting */
>>>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>>            dev_info(adev->dev, "GPU reset failed\n");
>>>>        }
>>>>    +    amdgpu_device_set_reset_state(adev, false);
>>>>        return r;
>>>>    }
>>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index ead00d7..8cc14af 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>>>        struct drm_file *file_priv = filp->private_data;
>>>>        struct drm_device *dev;
>>>>        long ret;
>>>> +
>>>>        dev = file_priv->minor->dev;
>>>>        ret = pm_runtime_get_sync(dev->dev);
>>>>        if (ret < 0)
>>>>            return ret;
>>>>    +    while (amdgpu_device_is_reset(dev->dev_private))
>>>> +        msleep(100);
>>>> +
>>>>        ret = drm_ioctl(filp, cmd, arg);
>>>>          pm_runtime_mark_last_busy(dev->dev);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>>> index 2648291..22b8059 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>>> @@ -36,10 +36,15 @@
>>>>    long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd,
>>>> unsigned long arg)
>>>>    {
>>>>        unsigned int nr = DRM_IOCTL_NR(cmd);
>>>> +    struct drm_file *file_priv = filp->private_data;
>>>> +    struct amdgpu_device *adev =
>>>> + file_priv->minor->dev->dev_private;
>>>>        int ret;
>>>>    -    if (nr < DRM_COMMAND_BASE)
>>>> +    if (nr < DRM_COMMAND_BASE) {
>>>> +        while (amdgpu_device_is_reset(adev))
>>>> +            msleep(100);
>>>>            return drm_compat_ioctl(filp, cmd, arg);
>>>> +    }
>>>>          ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]             ` <c88049b2-2440-142e-129a-542c136b8d61-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-04 11:34               ` Liu, Monk
  0 siblings, 0 replies; 18+ messages in thread
From: Liu, Monk @ 2017-05-04 11:34 UTC (permalink / raw)
  To: Christian König, Zhou, David(ChunMing), He, Hongbo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

I still want to keep different police and design for bare-metal and sr-iov on gpu reset 
e.g. what I designed is per sched reset, not that like current bare-metal's reset all sched, etc...

after we all make this feature stable, we can consider merge most part into one 

BR Monk

-----Original Message-----
From: Christian König [mailto:deathsimple@vodafone.de] 
Sent: Thursday, May 04, 2017 7:30 PM
To: Liu, Monk <Monk.Liu@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue

> I suggest you firstly unify this gpu_reset_state for both SRIOV and 
> bare-metal case
Yeah, that sounds like a good idea to me as well.

I suggest to have a function which covers things necessary for both cases, e.g. stopping the scheduler and blocking TTM.

And inside that one we call specific functions for bare metal and SRIOV for the special handling.

Regards,
Christian.

Am 04.05.2017 um 12:42 schrieb Liu, Monk:
> First of all,  Your patch will impact on SR-IOV case, because "gpu_reset_state" will be referenced by two place:
> 1) IOCTL; 2)GPU-RESET ,
> SRIOV use srio_gpu_reset, but SRIOV use the same IOCTL routines, that 
> way the gpu_reset_state is not consistent for SR-IOV case
>
> I suggest you firstly unify this gpu_reset_state for both SRIOV and 
> bare-metal case
>
> For sriov, we have "adev->gfx.in_reset " and you now have 
> "adev->gpu_reset_state", I suggest you replace all 
> "adev->gfx.in_reset" with "adev->gpu_reset_state",
>
> BR Monk
>
>
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf 
> Of zhoucm1
> Sent: Friday, April 21, 2017 3:18 PM
> To: He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>
>
>
> On 2017年04月21日 15:08, Roger.He wrote:
>
>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
> Please add more explaination in comment, like "prevent ioctl during gpu reset, otherwise many un-expected behaviour happen."
>
> Regards,
> David Zhou
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>    4 files changed, 31 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 71364f5..ab0ffa8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>    	atomic64_t			num_bytes_moved;
>>    	atomic64_t			num_evictions;
>>    	atomic_t			gpu_reset_counter;
>> +	atomic_t			gpu_reset_state;
>>    
>>    	/* data for buffer migration throttling */
>>    	struct {
>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>    #define amdgpu_psp_check_fw_loading_status(adev, i) 
>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>    
>>    /* Common functions */
>> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev) 
>> +{
>> +	return atomic_read(&adev->gpu_reset_state);
>> +}
>> +
>>    int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>    bool amdgpu_need_backup(struct amdgpu_device *adev);
>>    void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff 
>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f882496..0fb4716 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>    	mutex_init(&adev->grbm_idx_mutex);
>>    	mutex_init(&adev->mn_lock);
>>    	hash_init(adev->mn_hash);
>> +	atomic_set(&adev->gpu_reset_state, 0);
>>    
>>    	amdgpu_check_arguments(adev);
>>    
>> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>>    }
>>    
>>    /**
>> + * amdgpu_device_set_reset_state - set gpu reset state
>> + *
>> + * @adev: amdgpu device pointer
>> + * @state: true when start to reset gpu; false: reset done  */ 
>> +static inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
>> +							bool state)
>> +{
>> +	atomic_set(&adev->gpu_reset_state, state); }
>> +
>> +/**
>>     * amdgpu_gpu_reset - reset the asic
>>     *
>>     * @adev: amdgpu device pointer
>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>    	}
>>    
>>    	atomic_inc(&adev->gpu_reset_counter);
>> -
>> +	amdgpu_device_set_reset_state(adev, true);
>>    	/* block TTM */
>>    	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>    	/* store modesetting */
>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>    		dev_info(adev->dev, "GPU reset failed\n");
>>    	}
>>    
>> +	amdgpu_device_set_reset_state(adev, false);
>>    	return r;
>>    }
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index ead00d7..8cc14af 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>    	struct drm_file *file_priv = filp->private_data;
>>    	struct drm_device *dev;
>>    	long ret;
>> +
>>    	dev = file_priv->minor->dev;
>>    	ret = pm_runtime_get_sync(dev->dev);
>>    	if (ret < 0)
>>    		return ret;
>>    
>> +	while (amdgpu_device_is_reset(dev->dev_private))
>> +		msleep(100);
>> +
>>    	ret = drm_ioctl(filp, cmd, arg);
>>    
>>    	pm_runtime_mark_last_busy(dev->dev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> index 2648291..22b8059 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>> @@ -36,10 +36,15 @@
>>    long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>    {
>>    	unsigned int nr = DRM_IOCTL_NR(cmd);
>> +	struct drm_file *file_priv = filp->private_data;
>> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>    	int ret;
>>    
>> -	if (nr < DRM_COMMAND_BASE)
>> +	if (nr < DRM_COMMAND_BASE) {
>> +		while (amdgpu_device_is_reset(adev))
>> +			msleep(100);
>>    		return drm_compat_ioctl(filp, cmd, arg);
>> +	}
>>    
>>    	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>    
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]                     ` <6321c998-f337-39af-3d64-070341e5fbe4-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-04 11:41                       ` Liu, Monk
  0 siblings, 0 replies; 18+ messages in thread
From: Liu, Monk @ 2017-05-04 11:41 UTC (permalink / raw)
  To: Christian König, Zhou, David(ChunMing), He, Hongbo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

Agreed.
We can put rw lock in the place that accessing hardware or pde/pte,
But I'm not sure this is safe enough, maybe one IOCTL access hw at top and bottom, and need consistence access, 
If gpu reset happens maybe the consistence is broken .... just my worry 

But we can do from your approach to try first 

BR Monk

-----Original Message-----
From: Christian König [mailto:deathsimple@vodafone.de] 
Sent: Thursday, May 04, 2017 7:33 PM
To: Liu, Monk <Monk.Liu@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue

That's what we used to have with the exclusive lock.

The issue is not what kind of lock or locking or not locking at all, but rather where to place it.

Having a rw lock called something like "in_reset" is ok to me, but we shouldn't take it in the IOCTL path.

Instead we should block for example the GART TLB flush while being inside a reset etc...

Otherwise we run into the same problems (lock inversions for example) as we had before.

Regards,
Christian.

Am 04.05.2017 um 13:28 schrieb Liu, Monk:
> Christian
>
> Why not use rw lock ? that way only gpu reset will make those iOCTL 
> blocked, and in usual case the performance is still the same
>
> BR Monk
>
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf 
> Of Christian König
> Sent: Thursday, May 04, 2017 7:21 PM
> To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; He, Hongbo 
> <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>
> But that should be harmless because we rebuild the GART table after GPU reset, don't we?
>
> Apart from that I'm fine with having a lock or locks to prevent concurrent things from happening while we do the reset.
>
> But we should NOT do the mistake again and try to prevent everything is one big global lock. That was rather bad idea and using an atomic isn't better in any way.
>
> Regards,
> Christian.
>
> Am 04.05.2017 um 12:31 schrieb zhoucm1:
>> Hi Christian,
>>
>> We possibly must re-consider this patch, the direct issue is VM FAULT 
>> after gpu reset. one obvious bug is we could bind gtt to gart during 
>> GPU reset, binding gtt to gart needs to fill gart table and tlb 
>> flush, which is parallel with gpu reset thread.
>>
>> Regards,
>> David Zhou
>>
>> On 2017年04月21日 16:27, Christian König wrote:
>>> NAK, that is exactly what we wanted to avoid.
>>>
>>> We used to have an exclusive lock for this and it cause a whole 
>>> bunch of problems.
>>>
>>> Please elaborate why that should be necessary.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 21.04.2017 um 09:08 schrieb Roger.He:
>>>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>>>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>>>    4 files changed, 31 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 71364f5..ab0ffa8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>>>        atomic64_t            num_bytes_moved;
>>>>        atomic64_t            num_evictions;
>>>>        atomic_t            gpu_reset_counter;
>>>> +    atomic_t            gpu_reset_state;
>>>>          /* data for buffer migration throttling */
>>>>        struct {
>>>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring
>>>> *ring)
>>>>    #define amdgpu_psp_check_fw_loading_status(adev, i) 
>>>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>>>      /* Common functions */
>>>> +static inline int amdgpu_device_is_reset(struct amdgpu_device
>>>> +*adev) {
>>>> +    return atomic_read(&adev->gpu_reset_state);
>>>> +}
>>>> +
>>>>    int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>>>    bool amdgpu_need_backup(struct amdgpu_device *adev);
>>>>    void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff 
>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index f882496..0fb4716 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device 
>>>> *adev,
>>>>        mutex_init(&adev->grbm_idx_mutex);
>>>>        mutex_init(&adev->mn_lock);
>>>>        hash_init(adev->mn_hash);
>>>> +    atomic_set(&adev->gpu_reset_state, 0);
>>>>          amdgpu_check_arguments(adev);
>>>>    @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct 
>>>> amdgpu_device *adev, bool voluntary)
>>>>    }
>>>>      /**
>>>> + * amdgpu_device_set_reset_state - set gpu reset state
>>>> + *
>>>> + * @adev: amdgpu device pointer
>>>> + * @state: true when start to reset gpu; false: reset done  */ 
>>>> +static inline void amdgpu_device_set_reset_state(struct
>>>> amdgpu_device *adev,
>>>> +                            bool state) {
>>>> +    atomic_set(&adev->gpu_reset_state, state); }
>>>> +
>>>> +/**
>>>>     * amdgpu_gpu_reset - reset the asic
>>>>     *
>>>>     * @adev: amdgpu device pointer
>>>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>>        }
>>>>          atomic_inc(&adev->gpu_reset_counter);
>>>> -
>>>> +    amdgpu_device_set_reset_state(adev, true);
>>>>        /* block TTM */
>>>>        resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>>>        /* store modesetting */
>>>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>>            dev_info(adev->dev, "GPU reset failed\n");
>>>>        }
>>>>    +    amdgpu_device_set_reset_state(adev, false);
>>>>        return r;
>>>>    }
>>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index ead00d7..8cc14af 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>>>        struct drm_file *file_priv = filp->private_data;
>>>>        struct drm_device *dev;
>>>>        long ret;
>>>> +
>>>>        dev = file_priv->minor->dev;
>>>>        ret = pm_runtime_get_sync(dev->dev);
>>>>        if (ret < 0)
>>>>            return ret;
>>>>    +    while (amdgpu_device_is_reset(dev->dev_private))
>>>> +        msleep(100);
>>>> +
>>>>        ret = drm_ioctl(filp, cmd, arg);
>>>>          pm_runtime_mark_last_busy(dev->dev);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>>> index 2648291..22b8059 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>>> @@ -36,10 +36,15 @@
>>>>    long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int 
>>>> cmd, unsigned long arg)
>>>>    {
>>>>        unsigned int nr = DRM_IOCTL_NR(cmd);
>>>> +    struct drm_file *file_priv = filp->private_data;
>>>> +    struct amdgpu_device *adev =
>>>> + file_priv->minor->dev->dev_private;
>>>>        int ret;
>>>>    -    if (nr < DRM_COMMAND_BASE)
>>>> +    if (nr < DRM_COMMAND_BASE) {
>>>> +        while (amdgpu_device_is_reset(adev))
>>>> +            msleep(100);
>>>>            return drm_compat_ioctl(filp, cmd, arg);
>>>> +    }
>>>>          ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
       [not found]         ` <DM5PR12MB161079A7788AC1D9E4CD5D6284EA0-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2017-05-04 11:29           ` Christian König
@ 2017-05-04 14:46           ` Deucher, Alexander
  1 sibling, 0 replies; 18+ messages in thread
From: Deucher, Alexander @ 2017-05-04 14:46 UTC (permalink / raw)
  To: Liu, Monk, Zhou, David(ChunMing), He, Hongbo,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of Liu, Monk
> Sent: Thursday, May 04, 2017 6:42 AM
> To: Zhou, David(ChunMing); He, Hongbo; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
> 
> First of all,  Your patch will impact on SR-IOV case, because "gpu_reset_state"
> will be referenced by two place:
> 1) IOCTL; 2)GPU-RESET ,
> SRIOV use srio_gpu_reset, but SRIOV use the same IOCTL routines, that way
> the gpu_reset_state is not consistent for SR-IOV case
> 
> I suggest you firstly unify this gpu_reset_state for both SRIOV and bare-
> metal case
> 
> For sriov, we have "adev->gfx.in_reset " and you now have "adev-
> >gpu_reset_state",
> I suggest you replace all "adev->gfx.in_reset" with "adev->gpu_reset_state",

Yes, I agree. Please unify these.

Alex

> 
> BR Monk
> 
> 
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
> Of zhoucm1
> Sent: Friday, April 21, 2017 3:18 PM
> To: He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
> 
> 
> 
> On 2017年04月21日 15:08, Roger.He wrote:
> 
> > Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
> > Signed-off-by: Roger.He <Hongbo.He@amd.com>
> Please add more explaination in comment, like "prevent ioctl during gpu
> reset, otherwise many un-expected behaviour happen."
> 
> Regards,
> David Zhou
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16
> +++++++++++++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
> >   4 files changed, 31 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 71364f5..ab0ffa8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -1526,6 +1526,7 @@ struct amdgpu_device {
> >   	atomic64_t			num_bytes_moved;
> >   	atomic64_t			num_evictions;
> >   	atomic_t			gpu_reset_counter;
> > +	atomic_t			gpu_reset_state;
> >
> >   	/* data for buffer migration throttling */
> >   	struct {
> > @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct
> amdgpu_ring *ring)
> >   #define amdgpu_psp_check_fw_loading_status(adev, i)
> > (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
> >
> >   /* Common functions */
> > +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
> > +{
> > +	return atomic_read(&adev->gpu_reset_state);
> > +}
> > +
> >   int amdgpu_gpu_reset(struct amdgpu_device *adev);
> >   bool amdgpu_need_backup(struct amdgpu_device *adev);
> >   void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index f882496..0fb4716 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
> >   	mutex_init(&adev->grbm_idx_mutex);
> >   	mutex_init(&adev->mn_lock);
> >   	hash_init(adev->mn_hash);
> > +	atomic_set(&adev->gpu_reset_state, 0);
> >
> >   	amdgpu_check_arguments(adev);
> >
> > @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct
> amdgpu_device *adev, bool voluntary)
> >   }
> >
> >   /**
> > + * amdgpu_device_set_reset_state - set gpu reset state
> > + *
> > + * @adev: amdgpu device pointer
> > + * @state: true when start to reset gpu; false: reset done  */ static
> > +inline void amdgpu_device_set_reset_state(struct amdgpu_device
> *adev,
> > +							bool state)
> > +{
> > +	atomic_set(&adev->gpu_reset_state, state); }
> > +
> > +/**
> >    * amdgpu_gpu_reset - reset the asic
> >    *
> >    * @adev: amdgpu device pointer
> > @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device
> *adev)
> >   	}
> >
> >   	atomic_inc(&adev->gpu_reset_counter);
> > -
> > +	amdgpu_device_set_reset_state(adev, true);
> >   	/* block TTM */
> >   	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> >   	/* store modesetting */
> > @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device
> *adev)
> >   		dev_info(adev->dev, "GPU reset failed\n");
> >   	}
> >
> > +	amdgpu_device_set_reset_state(adev, false);
> >   	return r;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > index ead00d7..8cc14af 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
> >   	struct drm_file *file_priv = filp->private_data;
> >   	struct drm_device *dev;
> >   	long ret;
> > +
> >   	dev = file_priv->minor->dev;
> >   	ret = pm_runtime_get_sync(dev->dev);
> >   	if (ret < 0)
> >   		return ret;
> >
> > +	while (amdgpu_device_is_reset(dev->dev_private))
> > +		msleep(100);
> > +
> >   	ret = drm_ioctl(filp, cmd, arg);
> >
> >   	pm_runtime_mark_last_busy(dev->dev);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> > index 2648291..22b8059 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> > @@ -36,10 +36,15 @@
> >   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd,
> unsigned long arg)
> >   {
> >   	unsigned int nr = DRM_IOCTL_NR(cmd);
> > +	struct drm_file *file_priv = filp->private_data;
> > +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
> >   	int ret;
> >
> > -	if (nr < DRM_COMMAND_BASE)
> > +	if (nr < DRM_COMMAND_BASE) {
> > +		while (amdgpu_device_is_reset(adev))
> > +			msleep(100);
> >   		return drm_compat_ioctl(filp, cmd, arg);
> > +	}
> >
> >   	ret = amdgpu_drm_ioctl(filp, cmd, arg);
> >
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2017-05-04 14:46 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-04-21  7:08 [PATCH 3/3] drm/amdgpu: fix gpu reset issue Roger.He
     [not found] ` <1492758490-12631-1-git-send-email-Hongbo.He-5C7GfCeVMHo@public.gmane.org>
2017-04-21  7:18   ` zhoucm1
     [not found]     ` <58F9B22E.7050502-5C7GfCeVMHo@public.gmane.org>
2017-05-04 10:42       ` Liu, Monk
     [not found]         ` <DM5PR12MB161079A7788AC1D9E4CD5D6284EA0-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-05-04 11:29           ` Christian König
     [not found]             ` <c88049b2-2440-142e-129a-542c136b8d61-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-04 11:34               ` Liu, Monk
2017-05-04 14:46           ` Deucher, Alexander
2017-04-21  8:27   ` Christian König
     [not found]     ` <ef2c40cd-bb55-5921-9318-e0147f1b4719-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-04-21  8:54       ` zhoucm1
2017-04-21  9:00       ` He, Hongbo
     [not found]         ` <MWHPR1201MB01272142B19FFEB9EAD281F3FD1A0-3iK1xFAIwjq9imrIu4W8xGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2017-04-21  9:15           ` Christian König
     [not found]             ` <05b9d783-75ab-9924-6881-472540fc77d1-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-04-21  9:54               ` He, Hongbo
     [not found]                 ` <MWHPR1201MB0127A593761702A7A179855EFD1A0-3iK1xFAIwjq9imrIu4W8xGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2017-04-21 11:47                   ` Christian König
     [not found]                     ` <eeb3931b-ef67-8645-a3bc-bc7f3db4002e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-04-21 12:14                       ` He, Hongbo
2017-05-04 10:31       ` zhoucm1
     [not found]         ` <590B0307.2090001-5C7GfCeVMHo@public.gmane.org>
2017-05-04 11:20           ` Christian König
     [not found]             ` <f2c6a892-5929-8644-ca34-09f89e6359a4-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-04 11:28               ` Liu, Monk
     [not found]                 ` <DM5PR12MB1610FEFA4B2654FF67B2F1B684EA0-2J9CzHegvk++jCVTvoAFKAdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-05-04 11:32                   ` Christian König
     [not found]                     ` <6321c998-f337-39af-3d64-070341e5fbe4-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-04 11:41                       ` Liu, Monk

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.