All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: extend lock range for race condition when gpu reset
@ 2017-05-05  7:22 Roger.He
       [not found] ` <1493968962-10463-1-git-send-email-Hongbo.He-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Roger.He @ 2017-05-05  7:22 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Roger.He

This covers the following case:
1. A task has done a GART bind/unbind but has not yet updated adev->gtt_list.
2. A GPU reset happens at this point; GTT recovery only restores the entries in adev->gtt_list.

Change-Id: Ifb4360e3b68624f2be67fa82100623cf4c451873
Signed-off-by: Roger.He <Hongbo.He@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c |  6 ++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c  | 22 ++++++++++++++--------
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 90a69bf..5310781 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -556,7 +556,7 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
 void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
 int amdgpu_gart_init(struct amdgpu_device *adev);
 void amdgpu_gart_fini(struct amdgpu_device *adev);
-void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
+int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
 			int pages);
 int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset,
 		     int pages, struct page **pagelist,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index e7406ce..ccef3cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -221,8 +221,9 @@ void amdgpu_gart_table_vram_free(struct amdgpu_device *adev)
  *
  * Unbinds the requested pages from the gart page table and
  * replaces them with the dummy page (all asics).
+ * Returns 0 for success, -EINVAL for failure.
  */
-void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
+int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
 			int pages)
 {
 	unsigned t;
@@ -234,7 +235,7 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
 
 	if (!adev->gart.ready) {
 		WARN(1, "trying to unbind memory from uninitialized GART !\n");
-		return;
+		return -EINVAL;
 	}
 
 	t = offset / AMDGPU_GPU_PAGE_SIZE;
@@ -255,6 +256,7 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
 	}
 	mb();
 	amdgpu_gart_flush_gpu_tlb(adev, 0);
+	return 0;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index c3fb2f9..278f55b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -745,6 +745,7 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, struct ttm_mem_reg *bo_mem)
 		return r;
 	}
 
+	spin_lock(&gtt->adev->gtt_list_lock);
 	flags = amdgpu_ttm_tt_pte_flags(gtt->adev, ttm, bo_mem);
 	gtt->offset = (u64)bo_mem->start << PAGE_SHIFT;
 	r = amdgpu_gart_bind(gtt->adev, gtt->offset, ttm->num_pages,
@@ -753,12 +754,13 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, struct ttm_mem_reg *bo_mem)
 	if (r) {
 		DRM_ERROR("failed to bind %lu pages at 0x%08llX\n",
 			  ttm->num_pages, gtt->offset);
-		return r;
+		goto error_gart_bind;
 	}
-	spin_lock(&gtt->adev->gtt_list_lock);
+
 	list_add_tail(&gtt->list, &gtt->adev->gtt_list);
+error_gart_bind:
 	spin_unlock(&gtt->adev->gtt_list_lock);
-	return 0;
+	return r;
 }
 
 int amdgpu_ttm_recover_gart(struct amdgpu_device *adev)
@@ -789,6 +791,7 @@ int amdgpu_ttm_recover_gart(struct amdgpu_device *adev)
 static int amdgpu_ttm_backend_unbind(struct ttm_tt *ttm)
 {
 	struct amdgpu_ttm_tt *gtt = (void *)ttm;
+	int r;
 
 	if (gtt->userptr)
 		amdgpu_ttm_tt_unpin_userptr(ttm);
@@ -797,14 +800,17 @@ static int amdgpu_ttm_backend_unbind(struct ttm_tt *ttm)
 		return 0;
 
 	/* unbind shouldn't be done for GDS/GWS/OA in ttm_bo_clean_mm */
-	if (gtt->adev->gart.ready)
-		amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
-
 	spin_lock(&gtt->adev->gtt_list_lock);
+	r = amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
+	if (r) {
+		DRM_ERROR("failed to unbind %lu pages at 0x%08llX\n",
+			  gtt->ttm.ttm.num_pages, gtt->offset);
+		goto error_unbind;
+	}
 	list_del_init(&gtt->list);
+error_unbind:
 	spin_unlock(&gtt->adev->gtt_list_lock);
-
-	return 0;
+	return r;
 }
 
 static void amdgpu_ttm_backend_destroy(struct ttm_tt *ttm)
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdgpu: extend lock range for race condition when gpu reset
       [not found] ` <1493968962-10463-1-git-send-email-Hongbo.He-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-05  7:33   ` zhoucm1
       [not found]     ` <590C2AB9.201-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: zhoucm1 @ 2017-05-05  7:33 UTC (permalink / raw)
  To: Roger.He, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Reviewed-by: Chunming Zhou <david1.zhou@amd.com>

On 2017年05月05日 15:22, Roger.He wrote:
> to cover below case:
> 1. A task gart bind/unbind but not add to adev->gtt_list yet
> 2. at this time gpu reset, gtt only recover those gtt in adev->gtt_list
>
> Change-Id: Ifb4360e3b68624f2be67fa82100623cf4c451873
> Signed-off-by: Roger.He <Hongbo.He@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c |  6 ++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c  | 22 ++++++++++++++--------
>   3 files changed, 19 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 90a69bf..5310781 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -556,7 +556,7 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>   int amdgpu_gart_init(struct amdgpu_device *adev);
>   void amdgpu_gart_fini(struct amdgpu_device *adev);
> -void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
> +int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>   			int pages);
>   int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset,
>   		     int pages, struct page **pagelist,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> index e7406ce..ccef3cf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> @@ -221,8 +221,9 @@ void amdgpu_gart_table_vram_free(struct amdgpu_device *adev)
>    *
>    * Unbinds the requested pages from the gart page table and
>    * replaces them with the dummy page (all asics).
> + * Returns 0 for success, -EINVAL for failure.
>    */
> -void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
> +int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>   			int pages)
>   {
>   	unsigned t;
> @@ -234,7 +235,7 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>   
>   	if (!adev->gart.ready) {
>   		WARN(1, "trying to unbind memory from uninitialized GART !\n");
> -		return;
> +		return -EINVAL;
>   	}
>   
>   	t = offset / AMDGPU_GPU_PAGE_SIZE;
> @@ -255,6 +256,7 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>   	}
>   	mb();
>   	amdgpu_gart_flush_gpu_tlb(adev, 0);
> +	return 0;
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index c3fb2f9..278f55b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -745,6 +745,7 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, struct ttm_mem_reg *bo_mem)
>   		return r;
>   	}
>   
> +	spin_lock(&gtt->adev->gtt_list_lock);
>   	flags = amdgpu_ttm_tt_pte_flags(gtt->adev, ttm, bo_mem);
>   	gtt->offset = (u64)bo_mem->start << PAGE_SHIFT;
>   	r = amdgpu_gart_bind(gtt->adev, gtt->offset, ttm->num_pages,
> @@ -753,12 +754,13 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, struct ttm_mem_reg *bo_mem)
>   	if (r) {
>   		DRM_ERROR("failed to bind %lu pages at 0x%08llX\n",
>   			  ttm->num_pages, gtt->offset);
> -		return r;
> +		goto error_gart_bind;
>   	}
> -	spin_lock(&gtt->adev->gtt_list_lock);
> +
>   	list_add_tail(&gtt->list, &gtt->adev->gtt_list);
> +error_gart_bind:
>   	spin_unlock(&gtt->adev->gtt_list_lock);
> -	return 0;
> +	return r;
>   }
>   
>   int amdgpu_ttm_recover_gart(struct amdgpu_device *adev)
> @@ -789,6 +791,7 @@ int amdgpu_ttm_recover_gart(struct amdgpu_device *adev)
>   static int amdgpu_ttm_backend_unbind(struct ttm_tt *ttm)
>   {
>   	struct amdgpu_ttm_tt *gtt = (void *)ttm;
> +	int r;
>   
>   	if (gtt->userptr)
>   		amdgpu_ttm_tt_unpin_userptr(ttm);
> @@ -797,14 +800,17 @@ static int amdgpu_ttm_backend_unbind(struct ttm_tt *ttm)
>   		return 0;
>   
>   	/* unbind shouldn't be done for GDS/GWS/OA in ttm_bo_clean_mm */
> -	if (gtt->adev->gart.ready)
> -		amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
> -
>   	spin_lock(&gtt->adev->gtt_list_lock);
> +	r = amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
> +	if (r) {
> +		DRM_ERROR("failed to unbind %lu pages at 0x%08llX\n",
> +			  gtt->ttm.ttm.num_pages, gtt->offset);
> +		goto error_unbind;
> +	}
>   	list_del_init(&gtt->list);
> +error_unbind:
>   	spin_unlock(&gtt->adev->gtt_list_lock);
> -
> -	return 0;
> +	return r;
>   }
>   
>   static void amdgpu_ttm_backend_destroy(struct ttm_tt *ttm)

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdgpu: extend lock range for race condition when gpu reset
       [not found]     ` <590C2AB9.201-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-05 12:04       ` Christian König
       [not found]         ` <50c920fd-2645-8d9c-44d5-34a97f3c6f85-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2017-05-05 12:04 UTC (permalink / raw)
  To: zhoucm1, Roger.He, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Reviewed-by: Christian König <christian.koenig@amd.com>

Am 05.05.2017 um 09:33 schrieb zhoucm1:
> Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
>
> On 2017年05月05日 15:22, Roger.He wrote:
>> to cover below case:
>> 1. A task gart bind/unbind but not add to adev->gtt_list yet
>> 2. at this time gpu reset, gtt only recover those gtt in adev->gtt_list
>>
>> Change-Id: Ifb4360e3b68624f2be67fa82100623cf4c451873
>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  2 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c |  6 ++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c  | 22 ++++++++++++++--------
>>   3 files changed, 19 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 90a69bf..5310781 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -556,7 +556,7 @@ int amdgpu_gart_table_vram_pin(struct 
>> amdgpu_device *adev);
>>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>>   int amdgpu_gart_init(struct amdgpu_device *adev);
>>   void amdgpu_gart_fini(struct amdgpu_device *adev);
>> -void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>> +int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>               int pages);
>>   int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset,
>>                int pages, struct page **pagelist,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> index e7406ce..ccef3cf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> @@ -221,8 +221,9 @@ void amdgpu_gart_table_vram_free(struct 
>> amdgpu_device *adev)
>>    *
>>    * Unbinds the requested pages from the gart page table and
>>    * replaces them with the dummy page (all asics).
>> + * Returns 0 for success, -EINVAL for failure.
>>    */
>> -void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>> +int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>               int pages)
>>   {
>>       unsigned t;
>> @@ -234,7 +235,7 @@ void amdgpu_gart_unbind(struct amdgpu_device 
>> *adev, uint64_t offset,
>>         if (!adev->gart.ready) {
>>           WARN(1, "trying to unbind memory from uninitialized GART 
>> !\n");
>> -        return;
>> +        return -EINVAL;
>>       }
>>         t = offset / AMDGPU_GPU_PAGE_SIZE;
>> @@ -255,6 +256,7 @@ void amdgpu_gart_unbind(struct amdgpu_device 
>> *adev, uint64_t offset,
>>       }
>>       mb();
>>       amdgpu_gart_flush_gpu_tlb(adev, 0);
>> +    return 0;
>>   }
>>     /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index c3fb2f9..278f55b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -745,6 +745,7 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, 
>> struct ttm_mem_reg *bo_mem)
>>           return r;
>>       }
>>   +    spin_lock(&gtt->adev->gtt_list_lock);
>>       flags = amdgpu_ttm_tt_pte_flags(gtt->adev, ttm, bo_mem);
>>       gtt->offset = (u64)bo_mem->start << PAGE_SHIFT;
>>       r = amdgpu_gart_bind(gtt->adev, gtt->offset, ttm->num_pages,
>> @@ -753,12 +754,13 @@ int amdgpu_ttm_bind(struct ttm_buffer_object 
>> *bo, struct ttm_mem_reg *bo_mem)
>>       if (r) {
>>           DRM_ERROR("failed to bind %lu pages at 0x%08llX\n",
>>                 ttm->num_pages, gtt->offset);
>> -        return r;
>> +        goto error_gart_bind;
>>       }
>> -    spin_lock(&gtt->adev->gtt_list_lock);
>> +
>>       list_add_tail(&gtt->list, &gtt->adev->gtt_list);
>> +error_gart_bind:
>>       spin_unlock(&gtt->adev->gtt_list_lock);
>> -    return 0;
>> +    return r;
>>   }
>>     int amdgpu_ttm_recover_gart(struct amdgpu_device *adev)
>> @@ -789,6 +791,7 @@ int amdgpu_ttm_recover_gart(struct amdgpu_device 
>> *adev)
>>   static int amdgpu_ttm_backend_unbind(struct ttm_tt *ttm)
>>   {
>>       struct amdgpu_ttm_tt *gtt = (void *)ttm;
>> +    int r;
>>         if (gtt->userptr)
>>           amdgpu_ttm_tt_unpin_userptr(ttm);
>> @@ -797,14 +800,17 @@ static int amdgpu_ttm_backend_unbind(struct 
>> ttm_tt *ttm)
>>           return 0;
>>         /* unbind shouldn't be done for GDS/GWS/OA in ttm_bo_clean_mm */
>> -    if (gtt->adev->gart.ready)
>> -        amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
>> -
>>       spin_lock(&gtt->adev->gtt_list_lock);
>> +    r = amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
>> +    if (r) {
>> +        DRM_ERROR("failed to unbind %lu pages at 0x%08llX\n",
>> +              gtt->ttm.ttm.num_pages, gtt->offset);
>> +        goto error_unbind;
>> +    }
>>       list_del_init(&gtt->list);
>> +error_unbind:
>>       spin_unlock(&gtt->adev->gtt_list_lock);
>> -
>> -    return 0;
>> +    return r;
>>   }
>>     static void amdgpu_ttm_backend_destroy(struct ttm_tt *ttm)
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH] drm/amdgpu: extend lock range for race condition when gpu reset
       [not found]         ` <50c920fd-2645-8d9c-44d5-34a97f3c6f85-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-06-16  6:29           ` Yu, Xiangliang
       [not found]             ` <BY2PR1201MB093501712C1A8297EA44A45CEBC10-O28G1zQ8oGkaqtME6NEo1mrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Yu, Xiangliang @ 2017-06-16  6:29 UTC (permalink / raw)
  To: Christian König, Zhou, David(ChunMing), He, Roger,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

Hi Hongbo,

I got the soft lockup message below when running the Vulkan test with two VFs; it works fine if I revert this patch. Could you help check it when you are free?

[ 1920.163455] NMI watchdog: BUG: soft lockup - CPU#1 stuck for 23s! [deqp-vk:2175]
[ 1920.163459] Modules linked in: amdkfd amd_iommu_v2 amdgpu(OE) ttm drm_kms_helper drm i2c_algo_bit fb_sys_fops syscopyarea sysfillrect sysimgblt snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_seq_device snd_timer aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd serio_raw snd mac_hid soundcore i2c_piix4 binfmt_misc parport_pc ppdev sunrpc lp parport autofs4 psmouse pata_acpi floppy
[ 1920.163482] CPU: 1 PID: 2175 Comm: deqp-vk Tainted: G           OE   4.9.0-custom #1
[ 1920.163483] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 1920.163484] task: ffff880139c63a80 task.stack: ffffc9000313c000
[ 1920.163485] RIP: 0010:[<ffffffff810c7c75>]  [<ffffffff810c7c75>] native_queued_spin_lock_slowpath+0x15/0x1a0
[ 1920.163491] RSP: 0018:ffffc9000313f6d8  EFLAGS: 00000202
[ 1920.163492] RAX: 0000000000000001 RBX: ffff88012f054d80 RCX: 0000000000000000
[ 1920.163492] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8801334899a8
[ 1920.163493] RBP: ffffc9000313f6d8 R08: 0000000000000000 R09: 0000000000000128
[ 1920.163493] R10: 000001b9269bb35e R11: 0000000000000001 R12: ffff88012f054e28
[ 1920.163494] R13: ffff88012f054d80 R14: ffff8800369ecc00 R15: ffff880133482778
[ 1920.163495] FS:  00007fc0cf438740(0000) GS:ffff88013fc80000(0000) knlGS:0000000000000000
[ 1920.163496] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1920.163496] CR2: 00007fc0c5100fa0 CR3: 000000013870d000 CR4: 00000000001406e0
[ 1920.163500] Stack:
[ 1920.163500]  ffffc9000313f6e8 ffffffff811816ef ffffc9000313f6f8 ffffffff817e1a10
[ 1920.163502]  ffffc9000313f738 ffffffffa03242c0 ffff8800369ecc00 0000000000000400
[ 1920.163503]  0000000000000001 ffff88012f054d80 0000000000000002 ffff88012e831380
[ 1920.163505] Call Trace:
[ 1920.163509]  [<ffffffff811816ef>] queued_spin_lock_slowpath+0xb/0xf
[ 1920.163513]  [<ffffffff817e1a10>] _raw_spin_lock+0x20/0x30
[ 1920.163562]  [<ffffffffa03242c0>] amdgpu_ttm_backend_unbind+0x50/0x1e0 [amdgpu]
[ 1920.163567]  [<ffffffffa02ecbee>] ttm_tt_unbind+0x1e/0x30 [ttm]
[ 1920.163570]  [<ffffffffa02ece27>] ttm_tt_destroy+0x17/0x60 [ttm]
[ 1920.163572]  [<ffffffffa02ed370>] ttm_bo_cleanup_memtype_use+0x30/0x70 [ttm]
[ 1920.163575]  [<ffffffffa02ee4fa>] ttm_bo_release+0x1ca/0x2a0 [ttm]
[ 1920.163578]  [<ffffffffa02ee5f4>] ttm_bo_unref+0x24/0x30 [ttm]
[ 1920.163580]  [<ffffffffa02f1ce7>] ttm_bo_pipeline_move+0x2a7/0x3a0 [ttm]
[ 1920.163599]  [<ffffffffa0325fbc>] amdgpu_move_blit+0x1bc/0x260 [amdgpu]
[ 1920.163617]  [<ffffffffa03263e9>] amdgpu_bo_move+0xb9/0x230 [amdgpu]
[ 1920.163620]  [<ffffffffa02eec38>] ttm_bo_handle_move_mem+0x268/0x590 [ttm]
[ 1920.163623]  [<ffffffffa02ef7ed>] ? ttm_bo_mem_space+0x38d/0x440 [ttm]
[ 1920.163625]  [<ffffffff8103d583>] ? __save_stack_trace+0x73/0xd0
[ 1920.163628]  [<ffffffffa02efd74>] ttm_bo_validate+0x114/0x130 [ttm]
[ 1920.163657]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu]
[ 1920.163675]  [<ffffffffa032e325>] amdgpu_cs_bo_validate.isra.5+0x75/0xb0 [amdgpu]
[ 1920.163693]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu]
[ 1920.163710]  [<ffffffffa032e3a9>] amdgpu_cs_validate+0x49/0x1b0 [amdgpu]
[ 1920.163727]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu]
[ 1920.163743]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu]
[ 1920.163762]  [<ffffffffa033d57f>] amdgpu_vm_validate_level.isra.9+0x4f/0x90 [amdgpu]
[ 1920.163791]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu]
[ 1920.163807]  [<ffffffffa033d596>] amdgpu_vm_validate_level.isra.9+0x66/0x90 [amdgpu]
[ 1920.163823]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu]
[ 1920.163840]  [<ffffffffa033d596>] amdgpu_vm_validate_level.isra.9+0x66/0x90 [amdgpu]
[ 1920.163856]  [<ffffffffa033e606>] amdgpu_vm_validate_pt_bos+0x26/0x30 [amdgpu]
[ 1920.163872]  [<ffffffffa032fef8>] amdgpu_cs_ioctl+0xca8/0x1490 [amdgpu]
[ 1920.163889]  [<ffffffffa023cbac>] drm_ioctl+0x32c/0x440 [drm]
[ 1920.163904]  [<ffffffffa032f250>] ? amdgpu_cs_find_mapping+0xb0/0xb0 [amdgpu]
[ 1920.163906]  [<ffffffff811fe356>] ? mem_cgroup_commit_charge+0x76/0xe0
[ 1920.163908]  [<ffffffff811c5529>] ? page_add_new_anon_rmap+0x89/0xc0
[ 1920.163910]  [<ffffffff81194249>] ? lru_cache_add_active_or_unevictable+0x39/0xc0
[ 1920.163925]  [<ffffffffa031104c>] amdgpu_drm_ioctl+0x4c/0x80 [amdgpu]
[ 1920.163927]  [<ffffffff81220db6>] do_vfs_ioctl+0x96/0x5b0
[ 1920.163929]  [<ffffffff810674d7>] ? __do_page_fault+0x267/0x4d0
[ 1920.163930]  [<ffffffff81221349>] SyS_ioctl+0x79/0x90
[ 1920.163932]  [<ffffffff8100392e>] do_syscall_64+0x6e/0x180
[ 1920.163933]  [<ffffffff817e1d2f>] entry_SYSCALL64_slow_path+0x25/0x25

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
Sent: Friday, May 05, 2017 8:04 PM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: extend lock range for race condition when gpu reset

Reviewed-by: Christian König <christian.koenig@amd.com>

Am 05.05.2017 um 09:33 schrieb zhoucm1:
> Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
>
> On 2017年05月05日 15:22, Roger.He wrote:
>> to cover below case:
>> 1. A task gart bind/unbind but not add to adev->gtt_list yet 2. at 
>> this time gpu reset, gtt only recover those gtt in adev->gtt_list
>>
>> Change-Id: Ifb4360e3b68624f2be67fa82100623cf4c451873
>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  2 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c |  6 ++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c  | 22 ++++++++++++++--------
>>   3 files changed, 19 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 90a69bf..5310781 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -556,7 +556,7 @@ int amdgpu_gart_table_vram_pin(struct 
>> amdgpu_device *adev);
>>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>>   int amdgpu_gart_init(struct amdgpu_device *adev);
>>   void amdgpu_gart_fini(struct amdgpu_device *adev); -void 
>> amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>> +int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>               int pages);
>>   int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset,
>>                int pages, struct page **pagelist, diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> index e7406ce..ccef3cf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> @@ -221,8 +221,9 @@ void amdgpu_gart_table_vram_free(struct
>> amdgpu_device *adev)
>>    *
>>    * Unbinds the requested pages from the gart page table and
>>    * replaces them with the dummy page (all asics).
>> + * Returns 0 for success, -EINVAL for failure.
>>    */
>> -void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>> +int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>               int pages)
>>   {
>>       unsigned t;
>> @@ -234,7 +235,7 @@ void amdgpu_gart_unbind(struct amdgpu_device 
>> *adev, uint64_t offset,
>>         if (!adev->gart.ready) {
>>           WARN(1, "trying to unbind memory from uninitialized GART 
>> !\n");
>> -        return;
>> +        return -EINVAL;
>>       }
>>         t = offset / AMDGPU_GPU_PAGE_SIZE; @@ -255,6 +256,7 @@ void 
>> amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>       }
>>       mb();
>>       amdgpu_gart_flush_gpu_tlb(adev, 0);
>> +    return 0;
>>   }
>>     /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index c3fb2f9..278f55b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -745,6 +745,7 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, 
>> struct ttm_mem_reg *bo_mem)
>>           return r;
>>       }
>>   +    spin_lock(&gtt->adev->gtt_list_lock);
>>       flags = amdgpu_ttm_tt_pte_flags(gtt->adev, ttm, bo_mem);
>>       gtt->offset = (u64)bo_mem->start << PAGE_SHIFT;
>>       r = amdgpu_gart_bind(gtt->adev, gtt->offset, ttm->num_pages, @@ 
>> -753,12 +754,13 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, 
>> struct ttm_mem_reg *bo_mem)
>>       if (r) {
>>           DRM_ERROR("failed to bind %lu pages at 0x%08llX\n",
>>                 ttm->num_pages, gtt->offset);
>> -        return r;
>> +        goto error_gart_bind;
>>       }
>> -    spin_lock(&gtt->adev->gtt_list_lock);
>> +
>>       list_add_tail(&gtt->list, &gtt->adev->gtt_list);
>> +error_gart_bind:
>>       spin_unlock(&gtt->adev->gtt_list_lock);
>> -    return 0;
>> +    return r;
>>   }
>>     int amdgpu_ttm_recover_gart(struct amdgpu_device *adev) @@ -789,6 
>> +791,7 @@ int amdgpu_ttm_recover_gart(struct amdgpu_device
>> *adev)
>>   static int amdgpu_ttm_backend_unbind(struct ttm_tt *ttm)
>>   {
>>       struct amdgpu_ttm_tt *gtt = (void *)ttm;
>> +    int r;
>>         if (gtt->userptr)
>>           amdgpu_ttm_tt_unpin_userptr(ttm); @@ -797,14 +800,17 @@ 
>> static int amdgpu_ttm_backend_unbind(struct ttm_tt *ttm)
>>           return 0;
>>         /* unbind shouldn't be done for GDS/GWS/OA in ttm_bo_clean_mm */
>> -    if (gtt->adev->gart.ready)
>> -        amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
>> -
>>       spin_lock(&gtt->adev->gtt_list_lock);
>> +    r = amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
>> +    if (r) {
>> +        DRM_ERROR("failed to unbind %lu pages at 0x%08llX\n",
>> +              gtt->ttm.ttm.num_pages, gtt->offset);
>> +        goto error_unbind;
>> +    }
>>       list_del_init(&gtt->list);
>> +error_unbind:
>>       spin_unlock(&gtt->adev->gtt_list_lock);
>> -
>> -    return 0;
>> +    return r;
>>   }
>>     static void amdgpu_ttm_backend_destroy(struct ttm_tt *ttm)
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH] drm/amdgpu: extend lock range for race condition when gpu reset
       [not found]             ` <BY2PR1201MB093501712C1A8297EA44A45CEBC10-O28G1zQ8oGkaqtME6NEo1mrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2017-06-16  6:57               ` He, Roger
  0 siblings, 0 replies; 5+ messages in thread
From: He, Roger @ 2017-06-16  6:57 UTC (permalink / raw)
  To: Yu, Xiangliang, Christian König, Zhou, David(ChunMing),
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

Hi Xiangliang:
      Could you try replacing the spinlock with a mutex?

Thanks
Roger(Hongbo.He)
-----Original Message-----
From: Yu, Xiangliang 
Sent: Friday, June 16, 2017 2:30 PM
To: Christian König <deathsimple@vodafone.de>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; He, Roger <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu: extend lock range for race condition when gpu reset

Hi Hongbo,

I got the soft lockup message below when running a Vulkan test with two VFs, and it works fine after reverting this patch. Could you help check it when you are free?

[ 1920.163455] NMI watchdog: BUG: soft lockup - CPU#1 stuck for 23s! [deqp-vk:2175] [ 1920.163459] Modules linked in: amdkfd amd_iommu_v2 amdgpu(OE) ttm drm_kms_helper drm i2c_algo_bit fb_sys_fops syscopyarea sysfillrect sysimgblt snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_seq_device snd_timer aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd serio_raw snd mac_hid soundcore i2c_piix4 binfmt_misc parport_pc ppdev sunrpc lp parport autofs4 psmouse pata_acpi floppy
[ 1920.163482] CPU: 1 PID: 2175 Comm: deqp-vk Tainted: G           OE   4.9.0-custom #1
[ 1920.163483] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 1920.163484] task: ffff880139c63a80 task.stack: ffffc9000313c000 [ 1920.163485] RIP: 0010:[<ffffffff810c7c75>]  [<ffffffff810c7c75>] native_queued_spin_lock_slowpath+0x15/0x1a0
[ 1920.163491] RSP: 0018:ffffc9000313f6d8  EFLAGS: 00000202 [ 1920.163492] RAX: 0000000000000001 RBX: ffff88012f054d80 RCX: 0000000000000000 [ 1920.163492] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8801334899a8 [ 1920.163493] RBP: ffffc9000313f6d8 R08: 0000000000000000 R09: 0000000000000128 [ 1920.163493] R10: 000001b9269bb35e R11: 0000000000000001 R12: ffff88012f054e28 [ 1920.163494] R13: ffff88012f054d80 R14: ffff8800369ecc00 R15: ffff880133482778 [ 1920.163495] FS:  00007fc0cf438740(0000) GS:ffff88013fc80000(0000) knlGS:0000000000000000 [ 1920.163496] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1920.163496] CR2: 00007fc0c5100fa0 CR3: 000000013870d000 CR4: 00000000001406e0 [ 1920.163500] Stack:
[ 1920.163500]  ffffc9000313f6e8 ffffffff811816ef ffffc9000313f6f8 ffffffff817e1a10 [ 1920.163502]  ffffc9000313f738 ffffffffa03242c0 ffff8800369ecc00 0000000000000400 [ 1920.163503]  0000000000000001 ffff88012f054d80 0000000000000002 ffff88012e831380 [ 1920.163505] Call Trace:
[ 1920.163509]  [<ffffffff811816ef>] queued_spin_lock_slowpath+0xb/0xf [ 1920.163513]  [<ffffffff817e1a10>] _raw_spin_lock+0x20/0x30 [ 1920.163562]  [<ffffffffa03242c0>] amdgpu_ttm_backend_unbind+0x50/0x1e0 [amdgpu] [ 1920.163567]  [<ffffffffa02ecbee>] ttm_tt_unbind+0x1e/0x30 [ttm] [ 1920.163570]  [<ffffffffa02ece27>] ttm_tt_destroy+0x17/0x60 [ttm] [ 1920.163572]  [<ffffffffa02ed370>] ttm_bo_cleanup_memtype_use+0x30/0x70 [ttm] [ 1920.163575]  [<ffffffffa02ee4fa>] ttm_bo_release+0x1ca/0x2a0 [ttm] [ 1920.163578]  [<ffffffffa02ee5f4>] ttm_bo_unref+0x24/0x30 [ttm] [ 1920.163580]  [<ffffffffa02f1ce7>] ttm_bo_pipeline_move+0x2a7/0x3a0 [ttm] [ 1920.163599]  [<ffffffffa0325fbc>] amdgpu_move_blit+0x1bc/0x260 [amdgpu] [ 1920.163617]  [<ffffffffa03263e9>] amdgpu_bo_move+0xb9/0x230 [amdgpu] [ 1920.163620]  [<ffffffffa02eec38>] ttm_bo_handle_move_mem+0x268/0x590 [ttm] [ 1920.163623]  [<ffffffffa02ef7ed>] ? ttm_bo_mem_space+0x38d/0x440 [ttm] [ 1920.163625]  [<ffffffff8103d583>] ? __save_stack_trace+0x73/0xd0 [ 1920.163628]  [<ffffffffa02efd74>] ttm_bo_validate+0x114/0x130 [ttm] [ 1920.163657]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu] [ 1920.163675]  [<ffffffffa032e325>] amdgpu_cs_bo_validate.isra.5+0x75/0xb0 [amdgpu] [ 1920.163693]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu] [ 1920.163710]  [<ffffffffa032e3a9>] amdgpu_cs_validate+0x49/0x1b0 [amdgpu] [ 1920.163727]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu] [ 1920.163743]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu] [ 1920.163762]  [<ffffffffa033d57f>] amdgpu_vm_validate_level.isra.9+0x4f/0x90 [amdgpu] [ 1920.163791]  [<ffffffffa032e360>] ? amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu] [ 1920.163807]  [<ffffffffa033d596>] amdgpu_vm_validate_level.isra.9+0x66/0x90 [amdgpu] [ 1920.163823]  [<ffffffffa032e360>] ? 
amdgpu_cs_bo_validate.isra.5+0xb0/0xb0 [amdgpu] [ 1920.163840]  [<ffffffffa033d596>] amdgpu_vm_validate_level.isra.9+0x66/0x90 [amdgpu] [ 1920.163856]  [<ffffffffa033e606>] amdgpu_vm_validate_pt_bos+0x26/0x30 [amdgpu] [ 1920.163872]  [<ffffffffa032fef8>] amdgpu_cs_ioctl+0xca8/0x1490 [amdgpu] [ 1920.163889]  [<ffffffffa023cbac>] drm_ioctl+0x32c/0x440 [drm] [ 1920.163904]  [<ffffffffa032f250>] ? amdgpu_cs_find_mapping+0xb0/0xb0 [amdgpu] [ 1920.163906]  [<ffffffff811fe356>] ? mem_cgroup_commit_charge+0x76/0xe0
[ 1920.163908]  [<ffffffff811c5529>] ? page_add_new_anon_rmap+0x89/0xc0 [ 1920.163910]  [<ffffffff81194249>] ? lru_cache_add_active_or_unevictable+0x39/0xc0
[ 1920.163925]  [<ffffffffa031104c>] amdgpu_drm_ioctl+0x4c/0x80 [amdgpu] [ 1920.163927]  [<ffffffff81220db6>] do_vfs_ioctl+0x96/0x5b0 [ 1920.163929]  [<ffffffff810674d7>] ? __do_page_fault+0x267/0x4d0 [ 1920.163930]  [<ffffffff81221349>] SyS_ioctl+0x79/0x90 [ 1920.163932]  [<ffffffff8100392e>] do_syscall_64+0x6e/0x180 [ 1920.163933]  [<ffffffff817e1d2f>] entry_SYSCALL64_slow_path+0x25/0x25

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Christian König
Sent: Friday, May 05, 2017 8:04 PM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; He, Hongbo <Hongbo.He@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: extend lock range for race condition when gpu reset

Reviewed-by: Christian König <christian.koenig@amd.com>

Am 05.05.2017 um 09:33 schrieb zhoucm1:
> Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
>
> On 2017年05月05日 15:22, Roger.He wrote:
>> to cover below case:
>> 1. A task gart bind/unbind but not add to adev->gtt_list yet 2. at 
>> this time gpu reset, gtt only recover those gtt in adev->gtt_list
>>
>> Change-Id: Ifb4360e3b68624f2be67fa82100623cf4c451873
>> Signed-off-by: Roger.He <Hongbo.He@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  2 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c |  6 ++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c  | 22 ++++++++++++++--------
>>   3 files changed, 19 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 90a69bf..5310781 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -556,7 +556,7 @@ int amdgpu_gart_table_vram_pin(struct 
>> amdgpu_device *adev);
>>   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
>>   int amdgpu_gart_init(struct amdgpu_device *adev);
>>   void amdgpu_gart_fini(struct amdgpu_device *adev); -void 
>> amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>> +int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>               int pages);
>>   int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset,
>>                int pages, struct page **pagelist, diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> index e7406ce..ccef3cf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> @@ -221,8 +221,9 @@ void amdgpu_gart_table_vram_free(struct
>> amdgpu_device *adev)
>>    *
>>    * Unbinds the requested pages from the gart page table and
>>    * replaces them with the dummy page (all asics).
>> + * Returns 0 for success, -EINVAL for failure.
>>    */
>> -void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>> +int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>               int pages)
>>   {
>>       unsigned t;
>> @@ -234,7 +235,7 @@ void amdgpu_gart_unbind(struct amdgpu_device 
>> *adev, uint64_t offset,
>>         if (!adev->gart.ready) {
>>           WARN(1, "trying to unbind memory from uninitialized GART 
>> !\n");
>> -        return;
>> +        return -EINVAL;
>>       }
>>         t = offset / AMDGPU_GPU_PAGE_SIZE; @@ -255,6 +256,7 @@ void 
>> amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
>>       }
>>       mb();
>>       amdgpu_gart_flush_gpu_tlb(adev, 0);
>> +    return 0;
>>   }
>>     /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index c3fb2f9..278f55b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -745,6 +745,7 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, 
>> struct ttm_mem_reg *bo_mem)
>>           return r;
>>       }
>>   +    spin_lock(&gtt->adev->gtt_list_lock);
>>       flags = amdgpu_ttm_tt_pte_flags(gtt->adev, ttm, bo_mem);
>>       gtt->offset = (u64)bo_mem->start << PAGE_SHIFT;
>>       r = amdgpu_gart_bind(gtt->adev, gtt->offset, ttm->num_pages, @@
>> -753,12 +754,13 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, 
>> struct ttm_mem_reg *bo_mem)
>>       if (r) {
>>           DRM_ERROR("failed to bind %lu pages at 0x%08llX\n",
>>                 ttm->num_pages, gtt->offset);
>> -        return r;
>> +        goto error_gart_bind;
>>       }
>> -    spin_lock(&gtt->adev->gtt_list_lock);
>> +
>>       list_add_tail(&gtt->list, &gtt->adev->gtt_list);
>> +error_gart_bind:
>>       spin_unlock(&gtt->adev->gtt_list_lock);
>> -    return 0;
>> +    return r;
>>   }
>>     int amdgpu_ttm_recover_gart(struct amdgpu_device *adev) @@ -789,6
>> +791,7 @@ int amdgpu_ttm_recover_gart(struct amdgpu_device
>> *adev)
>>   static int amdgpu_ttm_backend_unbind(struct ttm_tt *ttm)
>>   {
>>       struct amdgpu_ttm_tt *gtt = (void *)ttm;
>> +    int r;
>>         if (gtt->userptr)
>>           amdgpu_ttm_tt_unpin_userptr(ttm); @@ -797,14 +800,17 @@ 
>> static int amdgpu_ttm_backend_unbind(struct ttm_tt *ttm)
>>           return 0;
>>         /* unbind shouldn't be done for GDS/GWS/OA in ttm_bo_clean_mm */
>> -    if (gtt->adev->gart.ready)
>> -        amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
>> -
>>       spin_lock(&gtt->adev->gtt_list_lock);
>> +    r = amdgpu_gart_unbind(gtt->adev, gtt->offset, ttm->num_pages);
>> +    if (r) {
>> +        DRM_ERROR("failed to unbind %lu pages at 0x%08llX\n",
>> +              gtt->ttm.ttm.num_pages, gtt->offset);
>> +        goto error_unbind;
>> +    }
>>       list_del_init(&gtt->list);
>> +error_unbind:
>>       spin_unlock(&gtt->adev->gtt_list_lock);
>> -
>> -    return 0;
>> +    return r;
>>   }
>>     static void amdgpu_ttm_backend_destroy(struct ttm_tt *ttm)
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-06-16  6:57 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2017-05-05  7:22 [PATCH] drm/amdgpu: extend lock range for race condition when gpu reset Roger.He
     [not found] ` <1493968962-10463-1-git-send-email-Hongbo.He-5C7GfCeVMHo@public.gmane.org>
2017-05-05  7:33   ` zhoucm1
     [not found]     ` <590C2AB9.201-5C7GfCeVMHo@public.gmane.org>
2017-05-05 12:04       ` Christian König
     [not found]         ` <50c920fd-2645-8d9c-44d5-34a97f3c6f85-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-06-16  6:29           ` Yu, Xiangliang
     [not found]             ` <BY2PR1201MB093501712C1A8297EA44A45CEBC10-O28G1zQ8oGkaqtME6NEo1mrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2017-06-16  6:57               ` He, Roger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.