All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu:fix world switch hang
@ 2017-06-06  9:27 Monk Liu
       [not found] ` <1496741253-24964-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 3+ messages in thread
From: Monk Liu @ 2017-06-06  9:27 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

for SR-IOV, we must keep the pipeline-sync in the protection
of COND_EXEC, otherwise the command consumed by CPG is not
consistent when world switch triggerd, e.g.:

world switch hit and the IB frame is skipped so the fence
won't signal, thus CP will jump to the next DMAframe's pipeline-sync
command, and it will make CP hang foever.

after pipelin-sync moved into COND_EXEC the consistency can be
guaranteed

Change-Id: I3059535745e2e1215343299b17f27fe081fc01bc
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 +++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 025e03b..947bd34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -130,6 +130,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 
 	unsigned i;
 	int r = 0;
+	bool need_pipe_sync = false;
 
 	if (num_ibs == 0)
 		return -EINVAL;
@@ -165,7 +166,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	if (ring->funcs->emit_pipeline_sync && job &&
 	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync)) ||
 	     amdgpu_vm_need_pipeline_sync(ring, job))) {
-		amdgpu_ring_emit_pipeline_sync(ring);
+		need_pipe_sync = true;
 		fence_put(tmp);
 	}
 
@@ -173,7 +174,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		ring->funcs->insert_start(ring);
 
 	if (vm) {
-		r = amdgpu_vm_flush(ring, job);
+		r = amdgpu_vm_flush(ring, job, need_pipe_sync);
 		if (r) {
 			amdgpu_ring_undo(ring);
 			return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index b065f8b..6d62e34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -747,7 +747,7 @@ static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
  *
  * Emit a VM flush when it is necessary.
  */
-int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
+int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync)
 {
 	struct amdgpu_device *adev = ring->adev;
 	unsigned vmhub = ring->funcs->vmhub;
@@ -770,12 +770,15 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
 		vm_flush_needed = true;
 	}
 
-	if (!vm_flush_needed && !gds_switch_needed)
+	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
 		return 0;
 
 	if (ring->funcs->init_cond_exec)
 		patch_offset = amdgpu_ring_init_cond_exec(ring);
 
+	if (need_pipe_sync)
+		amdgpu_ring_emit_pipeline_sync(ring);
+
 	if (ring->funcs->emit_vm_flush && vm_flush_needed) {
 		struct fence *fence;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 9aa00d9..cc0009f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -222,7 +222,7 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
 int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
 		      struct amdgpu_sync *sync, struct fence *fence,
 		      struct amdgpu_job *job);
-int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job);
+int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync);
 void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub,
 			unsigned vmid);
 void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/amdgpu:fix world switch hang
       [not found] ` <1496741253-24964-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-06 12:01   ` Christian König
       [not found]     ` <f4d58615-318c-a093-e31a-2fa8995a6a85-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 3+ messages in thread
From: Christian König @ 2017-06-06 12:01 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 06.06.2017 um 11:27 schrieb Monk Liu:
> for SR-IOV, we must keep the pipeline-sync in the protection
> of COND_EXEC, otherwise the command consumed by CPG is not
> consistent when world switch triggerd, e.g.:
>
> world switch hit and the IB frame is skipped so the fence
> won't signal, thus CP will jump to the next DMAframe's pipeline-sync
> command, and it will make CP hang foever.
>
> after pipelin-sync moved into COND_EXEC the consistency can be
> guaranteed
>
> Change-Id: I3059535745e2e1215343299b17f27fe081fc01bc
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 5 +++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 +++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
>   3 files changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 025e03b..947bd34 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -130,6 +130,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   
>   	unsigned i;
>   	int r = 0;
> +	bool need_pipe_sync = false;
>   
>   	if (num_ibs == 0)
>   		return -EINVAL;
> @@ -165,7 +166,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	if (ring->funcs->emit_pipeline_sync && job &&
>   	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync)) ||
>   	     amdgpu_vm_need_pipeline_sync(ring, job))) {
> -		amdgpu_ring_emit_pipeline_sync(ring);
> +		need_pipe_sync = true;
>   		fence_put(tmp);
>   	}
>   
> @@ -173,7 +174,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   		ring->funcs->insert_start(ring);
>   
>   	if (vm) {
> -		r = amdgpu_vm_flush(ring, job);
> +		r = amdgpu_vm_flush(ring, job, need_pipe_sync);
>   		if (r) {
>   			amdgpu_ring_undo(ring);
>   			return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index b065f8b..6d62e34 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -747,7 +747,7 @@ static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
>    *
>    * Emit a VM flush when it is necessary.
>    */
> -int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
> +int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync)
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	unsigned vmhub = ring->funcs->vmhub;
> @@ -770,12 +770,15 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
>   		vm_flush_needed = true;
>   	}
>   
> -	if (!vm_flush_needed && !gds_switch_needed)
> +	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>   		return 0;
>   
>   	if (ring->funcs->init_cond_exec)
>   		patch_offset = amdgpu_ring_init_cond_exec(ring);
>   
> +	if (need_pipe_sync)
> +		amdgpu_ring_emit_pipeline_sync(ring);
> +
>   	if (ring->funcs->emit_vm_flush && vm_flush_needed) {
>   		struct fence *fence;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 9aa00d9..cc0009f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -222,7 +222,7 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>   int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>   		      struct amdgpu_sync *sync, struct fence *fence,
>   		      struct amdgpu_job *job);
> -int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job);
> +int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync);
>   void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub,
>   			unsigned vmid);
>   void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev);


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/amdgpu:fix world switch hang
       [not found]     ` <f4d58615-318c-a093-e31a-2fa8995a6a85-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-06-07  1:52       ` zhoucm1
  0 siblings, 0 replies; 3+ messages in thread
From: zhoucm1 @ 2017-06-07  1:52 UTC (permalink / raw)
  To: Christian König, Monk Liu,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2017年06月06日 20:01, Christian König wrote:
> Am 06.06.2017 um 11:27 schrieb Monk Liu:
>> for SR-IOV, we must keep the pipeline-sync in the protection
>> of COND_EXEC, otherwise the command consumed by CPG is not
>> consistent when world switch triggerd, e.g.:
>>
>> world switch hit and the IB frame is skipped so the fence
>> won't signal, thus CP will jump to the next DMAframe's pipeline-sync
>> command, and it will make CP hang foever.
>>
>> after pipelin-sync moved into COND_EXEC the consistency can be
>> guaranteed
>>
>> Change-Id: I3059535745e2e1215343299b17f27fe081fc01bc
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>
> Reviewed-by: Christian König <christian.koenig@amd.com>
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 5 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 +++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
>>   3 files changed, 9 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> index 025e03b..947bd34 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> @@ -130,6 +130,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, 
>> unsigned num_ibs,
>>         unsigned i;
>>       int r = 0;
>> +    bool need_pipe_sync = false;
>>         if (num_ibs == 0)
>>           return -EINVAL;
>> @@ -165,7 +166,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, 
>> unsigned num_ibs,
>>       if (ring->funcs->emit_pipeline_sync && job &&
>>           ((tmp = amdgpu_sync_get_fence(&job->sched_sync)) ||
>>            amdgpu_vm_need_pipeline_sync(ring, job))) {
>> -        amdgpu_ring_emit_pipeline_sync(ring);
>> +        need_pipe_sync = true;
>>           fence_put(tmp);
>>       }
>>   @@ -173,7 +174,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring 
>> *ring, unsigned num_ibs,
>>           ring->funcs->insert_start(ring);
>>         if (vm) {
>> -        r = amdgpu_vm_flush(ring, job);
>> +        r = amdgpu_vm_flush(ring, job, need_pipe_sync);
There is a potential case lost, if no-vm job needs pipeline_sync, then 
which will be lost.

Regards,
David Zhou
>>           if (r) {
>>               amdgpu_ring_undo(ring);
>>               return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index b065f8b..6d62e34 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -747,7 +747,7 @@ static bool amdgpu_vm_is_large_bar(struct 
>> amdgpu_device *adev)
>>    *
>>    * Emit a VM flush when it is necessary.
>>    */
>> -int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
>> +int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job 
>> *job, bool need_pipe_sync)
>>   {
>>       struct amdgpu_device *adev = ring->adev;
>>       unsigned vmhub = ring->funcs->vmhub;
>> @@ -770,12 +770,15 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, 
>> struct amdgpu_job *job)
>>           vm_flush_needed = true;
>>       }
>>   -    if (!vm_flush_needed && !gds_switch_needed)
>> +    if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>           return 0;
>>         if (ring->funcs->init_cond_exec)
>>           patch_offset = amdgpu_ring_init_cond_exec(ring);
>>   +    if (need_pipe_sync)
>> +        amdgpu_ring_emit_pipeline_sync(ring);
>> +
>>       if (ring->funcs->emit_vm_flush && vm_flush_needed) {
>>           struct fence *fence;
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index 9aa00d9..cc0009f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -222,7 +222,7 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>>   int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>>                 struct amdgpu_sync *sync, struct fence *fence,
>>                 struct amdgpu_job *job);
>> -int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job);
>> +int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job 
>> *job, bool need_pipe_sync);
>>   void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vmhub,
>>               unsigned vmid);
>>   void amdgpu_vm_reset_all_ids(struct amdgpu_device *adev);
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2017-06-07  1:52 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-06-06  9:27 [PATCH] drm/amdgpu:fix world switch hang Monk Liu
     [not found] ` <1496741253-24964-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-06-06 12:01   ` Christian König
     [not found]     ` <f4d58615-318c-a093-e31a-2fa8995a6a85-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-06-07  1:52       ` zhoucm1

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.