All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/amdgpu:cleanup force_completion
@ 2017-10-20  3:33 Monk Liu
       [not found] ` <1508470385-16164-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Monk Liu @ 2017-10-20  3:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

cleanups, now only operate on the given ring

Change-Id: I42ee081696ac348660d38be68807f34090ffcce2
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 29 +++++++----------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  3 +--
 3 files changed, 11 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 99acf29..dae1e5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2871,7 +2871,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
 		amd_sched_hw_job_reset(&ring->sched);
 
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
-		amdgpu_fence_driver_force_completion_ring(ring);
+		amdgpu_fence_driver_force_completion(ring);
 	}
 
 	/* request to take full control of GPU before re-initialization  */
@@ -2990,9 +2990,9 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 			continue;
 		kthread_park(ring->sched.thread);
 		amd_sched_hw_job_reset(&ring->sched);
+		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
+		amdgpu_fence_driver_force_completion(ring);
 	}
-	/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
-	amdgpu_fence_driver_force_completion(adev);
 
 	need_full_reset = amdgpu_need_full_reset(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index fb9f88ef..2167dac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -499,7 +499,7 @@ void amdgpu_fence_driver_fini(struct amdgpu_device *adev)
 		r = amdgpu_fence_wait_empty(ring);
 		if (r) {
 			/* no need to trigger GPU reset as we are unloading */
-			amdgpu_fence_driver_force_completion(adev);
+			amdgpu_fence_driver_force_completion(ring);
 		}
 		amdgpu_irq_put(adev, ring->fence_drv.irq_src,
 			       ring->fence_drv.irq_type);
@@ -534,7 +534,7 @@ void amdgpu_fence_driver_suspend(struct amdgpu_device *adev)
 		r = amdgpu_fence_wait_empty(ring);
 		if (r) {
 			/* delay GPU reset to resume */
-			amdgpu_fence_driver_force_completion(adev);
+			amdgpu_fence_driver_force_completion(ring);
 		}
 
 		/* disable the interrupt */
@@ -571,30 +571,15 @@ void amdgpu_fence_driver_resume(struct amdgpu_device *adev)
 }
 
 /**
- * amdgpu_fence_driver_force_completion - force all fence waiter to complete
+ * amdgpu_fence_driver_force_completion - force signal latest fence of ring
  *
- * @adev: amdgpu device pointer
+ * @ring: fence of the ring to signal
  *
- * In case of GPU reset failure make sure no process keep waiting on fence
- * that will never complete.
  */
-void amdgpu_fence_driver_force_completion(struct amdgpu_device *adev)
+void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
 {
-	int i;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
-		struct amdgpu_ring *ring = adev->rings[i];
-		if (!ring || !ring->fence_drv.initialized)
-			continue;
-
-		amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
-	}
-}
-
-void amdgpu_fence_driver_force_completion_ring(struct amdgpu_ring *ring)
-{
-	if (ring)
-		amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
+	amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
+	amdgpu_fence_process(ring);
 }
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index b18c2b9..a6b89e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -79,8 +79,7 @@ struct amdgpu_fence_driver {
 
 int amdgpu_fence_driver_init(struct amdgpu_device *adev);
 void amdgpu_fence_driver_fini(struct amdgpu_device *adev);
-void amdgpu_fence_driver_force_completion(struct amdgpu_device *adev);
-void amdgpu_fence_driver_force_completion_ring(struct amdgpu_ring *ring);
+void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
 
 int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 				  unsigned num_hw_submission);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/5] drm/amdgpu:add hang_limit for sched
       [not found] ` <1508470385-16164-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-20  3:33   ` Monk Liu
       [not found]     ` <1508470385-16164-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-10-20  3:33   ` [PATCH 3/5] drm/amdgpu:implement guilty pointer Monk Liu
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 15+ messages in thread
From: Monk Liu @ 2017-10-20  3:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

since gpu_scheduler source domain cannot access amdgpu variable
so need create the hang_limit membewr for sched, and it can
refer it for the upcoming GPU RESET patches

Change-Id: I977ae2717e55a8b87c59e58a288bffc3b458b653
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     | 2 ++
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 2167dac..f6a55cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -453,6 +453,8 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 				  ring->name);
 			return r;
 		}
+
+		ring->sched.hang_limit = amdgpu_job_hang_limit;
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 52c8e54..1dac7bc 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -142,6 +142,7 @@ struct amd_gpu_scheduler {
 	struct task_struct		*thread;
 	struct list_head	ring_mirror_list;
 	spinlock_t			job_list_lock;
+	int hang_limit;
 };
 
 int amd_sched_init(struct amd_gpu_scheduler *sched,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 3/5] drm/amdgpu:implement guilty pointer
       [not found] ` <1508470385-16164-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-10-20  3:33   ` [PATCH 2/5] drm/amdgpu:add hang_limit for sched Monk Liu
@ 2017-10-20  3:33   ` Monk Liu
       [not found]     ` <1508470385-16164-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-10-20  3:33   ` [PATCH 4/5] drm/amdgpu:cleanup job reset routine Monk Liu
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 15+ messages in thread
From: Monk Liu @ 2017-10-20  3:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

for user context there will be a guilty pointer in entity
that points to the guilty member of its context, thus we
cant track if a given entity is not from kernel ctx and
if it is already marked guilty.

Change-Id: Ie0a30830d89f52a6c4a514e67206140698a46367
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       | 1 +
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 +
 3 files changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 774edc1..6a4178b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -744,6 +744,7 @@ struct amdgpu_ctx {
 	enum amd_sched_priority init_priority;
 	enum amd_sched_priority override_priority;
 	struct mutex            lock;
+	atomic_t	guilty;
 };
 
 struct amdgpu_ctx_mgr {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c184468..d822e95 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -93,6 +93,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
 					  rq, amdgpu_sched_jobs);
 		if (r)
 			goto failed;
+		ctx->rings[i].entity.guilty = &ctx->guilty;
 	}
 
 	r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr);
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 1dac7bc..2d59fc5 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -50,6 +50,7 @@ struct amd_sched_entity {
 
 	struct dma_fence		*dependency;
 	struct dma_fence_cb		cb;
+	atomic_t	*guilty; /* points to ctx's guilty */
 };
 
 /**
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 4/5] drm/amdgpu:cleanup job reset routine
       [not found] ` <1508470385-16164-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-10-20  3:33   ` [PATCH 2/5] drm/amdgpu:add hang_limit for sched Monk Liu
  2017-10-20  3:33   ` [PATCH 3/5] drm/amdgpu:implement guilty pointer Monk Liu
@ 2017-10-20  3:33   ` Monk Liu
       [not found]     ` <1508470385-16164-4-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-10-20  3:33   ` [PATCH 5/5] drm/amdgpu:skip job for guilty ctx in parser_init Monk Liu
  2017-10-20  7:17   ` [PATCH 1/5] drm/amdgpu:cleanup force_completion Christian König
  4 siblings, 1 reply; 15+ messages in thread
From: Monk Liu @ 2017-10-20  3:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

merge the setting guilty on context into this function
to avoid implement extra routine.

Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  4 ++--
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 13 ++++++++++++-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dae1e5b..a07544d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
 			amd_sched_job_kickout(&job->base);
 
 		/* only do job_reset on the hang ring if @job not NULL */
-		amd_sched_hw_job_reset(&ring->sched);
+		amd_sched_hw_job_reset(&ring->sched, NULL);
 
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
@@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 		if (!ring || !ring->sched.thread)
 			continue;
 		kthread_park(ring->sched.thread);
-		amd_sched_hw_job_reset(&ring->sched);
+		amd_sched_hw_job_reset(&ring->sched, NULL);
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
 	}
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index e4d3b4e..322b6c1 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -442,7 +442,14 @@ static void amd_sched_job_timedout(struct work_struct *work)
 	job->sched->ops->timedout_job(job);
 }
 
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
+static void amd_sched_set_guilty(struct amd_sched_job *s_job)
+{
+	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
+		if (s_job->s_entity->guilty)
+			atomic_set(s_job->s_entity->guilty, 1);
+}
+
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
 {
 	struct amd_sched_job *s_job;
 
@@ -455,6 +462,10 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
 			s_job->s_fence->parent = NULL;
 			atomic_dec(&sched->hw_rq_count);
 		}
+
+		if (bad && bad == s_job) {
+			amd_sched_set_guilty(s_job);
+		}
 	}
 	spin_unlock(&sched->job_list_lock);
 }
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 2d59fc5..fff7cc7 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -172,7 +172,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
 		       struct amd_gpu_scheduler *sched,
 		       struct amd_sched_entity *entity,
 		       void *owner);
-void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
+void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *job);
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
 bool amd_sched_dependency_optimized(struct dma_fence* fence,
 				    struct amd_sched_entity *entity);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 5/5] drm/amdgpu:skip job for guilty ctx in parser_init
       [not found] ` <1508470385-16164-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2017-10-20  3:33   ` [PATCH 4/5] drm/amdgpu:cleanup job reset routine Monk Liu
@ 2017-10-20  3:33   ` Monk Liu
       [not found]     ` <1508470385-16164-5-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-10-20  7:17   ` [PATCH 1/5] drm/amdgpu:cleanup force_completion Christian König
  4 siblings, 1 reply; 15+ messages in thread
From: Monk Liu @ 2017-10-20  3:33 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

Change-Id: I44019f6475b1eaaba55633cf5f8bb84284f19a2c
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index f7fceb6..1d5cbe7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -90,6 +90,12 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
 		goto free_chunk;
 	}
 
+	/* skip guilty context job */
+	if (atomic_read(&p->ctx->guilty) == 1) {
+		ret = -ECANCELED;
+		goto free_chunk;
+	}
+
 	mutex_lock(&p->ctx->lock);
 
 	/* get chunks */
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] drm/amdgpu:cleanup force_completion
       [not found] ` <1508470385-16164-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
                     ` (3 preceding siblings ...)
  2017-10-20  3:33   ` [PATCH 5/5] drm/amdgpu:skip job for guilty ctx in parser_init Monk Liu
@ 2017-10-20  7:17   ` Christian König
  4 siblings, 0 replies; 15+ messages in thread
From: Christian König @ 2017-10-20  7:17 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 20.10.2017 um 05:33 schrieb Monk Liu:
> cleanups, now only operate on the given ring
>
> Change-Id: I42ee081696ac348660d38be68807f34090ffcce2
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 +++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 29 +++++++----------------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  3 +--
>   3 files changed, 11 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 99acf29..dae1e5b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2871,7 +2871,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
>   		amd_sched_hw_job_reset(&ring->sched);
>   
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
> -		amdgpu_fence_driver_force_completion_ring(ring);
> +		amdgpu_fence_driver_force_completion(ring);
>   	}
>   
>   	/* request to take full control of GPU before re-initialization  */
> @@ -2990,9 +2990,9 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   			continue;
>   		kthread_park(ring->sched.thread);
>   		amd_sched_hw_job_reset(&ring->sched);
> +		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
> +		amdgpu_fence_driver_force_completion(ring);
>   	}
> -	/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
> -	amdgpu_fence_driver_force_completion(adev);
>   
>   	need_full_reset = amdgpu_need_full_reset(adev);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index fb9f88ef..2167dac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -499,7 +499,7 @@ void amdgpu_fence_driver_fini(struct amdgpu_device *adev)
>   		r = amdgpu_fence_wait_empty(ring);
>   		if (r) {
>   			/* no need to trigger GPU reset as we are unloading */
> -			amdgpu_fence_driver_force_completion(adev);
> +			amdgpu_fence_driver_force_completion(ring);
>   		}
>   		amdgpu_irq_put(adev, ring->fence_drv.irq_src,
>   			       ring->fence_drv.irq_type);
> @@ -534,7 +534,7 @@ void amdgpu_fence_driver_suspend(struct amdgpu_device *adev)
>   		r = amdgpu_fence_wait_empty(ring);
>   		if (r) {
>   			/* delay GPU reset to resume */
> -			amdgpu_fence_driver_force_completion(adev);
> +			amdgpu_fence_driver_force_completion(ring);
>   		}
>   
>   		/* disable the interrupt */
> @@ -571,30 +571,15 @@ void amdgpu_fence_driver_resume(struct amdgpu_device *adev)
>   }
>   
>   /**
> - * amdgpu_fence_driver_force_completion - force all fence waiter to complete
> + * amdgpu_fence_driver_force_completion - force signal latest fence of ring
>    *
> - * @adev: amdgpu device pointer
> + * @ring: fence of the ring to signal
>    *
> - * In case of GPU reset failure make sure no process keep waiting on fence
> - * that will never complete.
>    */
> -void amdgpu_fence_driver_force_completion(struct amdgpu_device *adev)
> +void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
>   {
> -	int i;
> -
> -	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
> -		struct amdgpu_ring *ring = adev->rings[i];
> -		if (!ring || !ring->fence_drv.initialized)
> -			continue;
> -
> -		amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
> -	}
> -}
> -
> -void amdgpu_fence_driver_force_completion_ring(struct amdgpu_ring *ring)
> -{
> -	if (ring)
> -		amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
> +	amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
> +	amdgpu_fence_process(ring);
>   }
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index b18c2b9..a6b89e3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -79,8 +79,7 @@ struct amdgpu_fence_driver {
>   
>   int amdgpu_fence_driver_init(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_fini(struct amdgpu_device *adev);
> -void amdgpu_fence_driver_force_completion(struct amdgpu_device *adev);
> -void amdgpu_fence_driver_force_completion_ring(struct amdgpu_ring *ring);
> +void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
>   
>   int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   				  unsigned num_hw_submission);


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/5] drm/amdgpu:add hang_limit for sched
       [not found]     ` <1508470385-16164-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-20  9:03       ` Christian König
       [not found]         ` <2002cd84-8f78-52c3-d56c-faa43add4050-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Christian König @ 2017-10-20  9:03 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 20.10.2017 um 05:33 schrieb Monk Liu:
> since gpu_scheduler source domain cannot access amdgpu variable
> so need create the hang_limit membewr for sched, and it can
> refer it for the upcoming GPU RESET patches
>
> Change-Id: I977ae2717e55a8b87c59e58a288bffc3b458b653
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     | 2 ++
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 +
>   2 files changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 2167dac..f6a55cf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -453,6 +453,8 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   				  ring->name);
>   			return r;
>   		}
> +
> +		ring->sched.hang_limit = amdgpu_job_hang_limit;

Don't set it directly, make it a parameter of amd_sched_init().

>   	}
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 52c8e54..1dac7bc 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -142,6 +142,7 @@ struct amd_gpu_scheduler {
>   	struct task_struct		*thread;
>   	struct list_head	ring_mirror_list;
>   	spinlock_t			job_list_lock;
> +	int hang_limit;

Please indent the new field the same way as the other one in the structure.

Christian.

>   };
>   
>   int amd_sched_init(struct amd_gpu_scheduler *sched,


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] drm/amdgpu:cleanup job reset routine
       [not found]     ` <1508470385-16164-4-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-20  9:06       ` Christian König
       [not found]         ` <85ee460e-1f65-69c7-c17b-329d9b31d205-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Christian König @ 2017-10-20  9:06 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 20.10.2017 um 05:33 schrieb Monk Liu:
> merge the setting guilty on context into this function
> to avoid implement extra routine.
>
> Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  4 ++--
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 13 ++++++++++++-
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
>   3 files changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index dae1e5b..a07544d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
>   			amd_sched_job_kickout(&job->base);
>   
>   		/* only do job_reset on the hang ring if @job not NULL */
> -		amd_sched_hw_job_reset(&ring->sched);
> +		amd_sched_hw_job_reset(&ring->sched, NULL);
>   
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
> @@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   		if (!ring || !ring->sched.thread)
>   			continue;
>   		kthread_park(ring->sched.thread);
> -		amd_sched_hw_job_reset(&ring->sched);
> +		amd_sched_hw_job_reset(&ring->sched, NULL);
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index e4d3b4e..322b6c1 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -442,7 +442,14 @@ static void amd_sched_job_timedout(struct work_struct *work)
>   	job->sched->ops->timedout_job(job);
>   }
>   
> -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
> +static void amd_sched_set_guilty(struct amd_sched_job *s_job)
> +{
> +	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
> +		if (s_job->s_entity->guilty)
> +			atomic_set(s_job->s_entity->guilty, 1);

The entity might be long freed at this point.

You need to walk the list of entities (with lock held) and check if 
entity->fence_context == job->s_fence->scheduled.context and only if you 
found the right one set the guilty flag there.

Christian.

> +}
> +
> +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
>   {
>   	struct amd_sched_job *s_job;
>   
> @@ -455,6 +462,10 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
>   			s_job->s_fence->parent = NULL;
>   			atomic_dec(&sched->hw_rq_count);
>   		}
> +
> +		if (bad && bad == s_job) {
> +			amd_sched_set_guilty(s_job);
> +		}
>   	}
>   	spin_unlock(&sched->job_list_lock);
>   }
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 2d59fc5..fff7cc7 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -172,7 +172,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
>   		       struct amd_gpu_scheduler *sched,
>   		       struct amd_sched_entity *entity,
>   		       void *owner);
> -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
> +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *job);
>   void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
>   bool amd_sched_dependency_optimized(struct dma_fence* fence,
>   				    struct amd_sched_entity *entity);


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/5] drm/amdgpu:implement guilty pointer
       [not found]     ` <1508470385-16164-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-20  9:08       ` Christian König
       [not found]         ` <f3a9431e-31fd-e8a5-79e9-d8bb09de366b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Christian König @ 2017-10-20  9:08 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 20.10.2017 um 05:33 schrieb Monk Liu:
> for user context there will be a guilty pointer in entity
> that points to the guilty member of its context, thus we
> cant track if a given entity is not from kernel ctx and
> if it is already marked guilty.
>
> Change-Id: Ie0a30830d89f52a6c4a514e67206140698a46367
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           | 1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       | 1 +
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 +
>   3 files changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 774edc1..6a4178b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -744,6 +744,7 @@ struct amdgpu_ctx {
>   	enum amd_sched_priority init_priority;
>   	enum amd_sched_priority override_priority;
>   	struct mutex            lock;
> +	atomic_t	guilty;
>   };
>   
>   struct amdgpu_ctx_mgr {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index c184468..d822e95 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -93,6 +93,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
>   					  rq, amdgpu_sched_jobs);
>   		if (r)
>   			goto failed;
> +		ctx->rings[i].entity.guilty = &ctx->guilty;

Don't set this directly, make it a parameter of amd_sched_entity_init().

And please split up the patches differently.

First all changes to the scheduler, including the guilty marking.

Then the changes to amdgpu.

Regards,
Christian.

>   	}
>   
>   	r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr);
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 1dac7bc..2d59fc5 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -50,6 +50,7 @@ struct amd_sched_entity {
>   
>   	struct dma_fence		*dependency;
>   	struct dma_fence_cb		cb;
> +	atomic_t	*guilty; /* points to ctx's guilty */
>   };
>   
>   /**


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 5/5] drm/amdgpu:skip job for guilty ctx in parser_init
       [not found]     ` <1508470385-16164-5-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-20  9:09       ` Christian König
  0 siblings, 0 replies; 15+ messages in thread
From: Christian König @ 2017-10-20  9:09 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 20.10.2017 um 05:33 schrieb Monk Liu:
> Change-Id: I44019f6475b1eaaba55633cf5f8bb84284f19a2c
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 ++++++
>   1 file changed, 6 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index f7fceb6..1d5cbe7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -90,6 +90,12 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>   		goto free_chunk;
>   	}
>   
> +	/* skip guilty context job */
> +	if (atomic_read(&p->ctx->guilty) == 1) {
> +		ret = -ECANCELED;
> +		goto free_chunk;
> +	}
> +
>   	mutex_lock(&p->ctx->lock);
>   
>   	/* get chunks */


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH 2/5] drm/amdgpu:add hang_limit for sched
       [not found]         ` <2002cd84-8f78-52c3-d56c-faa43add4050-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-10-20  9:41           ` Liu, Monk
  0 siblings, 0 replies; 15+ messages in thread
From: Liu, Monk @ 2017-10-20  9:41 UTC (permalink / raw)
  To: Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

> Don't set it directly, make it a parameter of amd_sched_init().
Okay


the indent is somehow always looks incorrect in email, but it already aligned actually 

-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com] 
Sent: 2017年10月20日 17:03
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/5] drm/amdgpu:add hang_limit for sched

Am 20.10.2017 um 05:33 schrieb Monk Liu:
> since gpu_scheduler source domain cannot access amdgpu variable so 
> need create the hang_limit membewr for sched, and it can refer it for 
> the upcoming GPU RESET patches
>
> Change-Id: I977ae2717e55a8b87c59e58a288bffc3b458b653
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     | 2 ++
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 +
>   2 files changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 2167dac..f6a55cf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -453,6 +453,8 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   				  ring->name);
>   			return r;
>   		}
> +
> +		ring->sched.hang_limit = amdgpu_job_hang_limit;

Don't set it directly, make it a parameter of amd_sched_init().

>   	}
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 52c8e54..1dac7bc 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -142,6 +142,7 @@ struct amd_gpu_scheduler {
>   	struct task_struct		*thread;
>   	struct list_head	ring_mirror_list;
>   	spinlock_t			job_list_lock;
> +	int hang_limit;

Please indent the new field the same way as the other one in the structure.

Christian.

>   };
>   
>   int amd_sched_init(struct amd_gpu_scheduler *sched,


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH 4/5] drm/amdgpu:cleanup job reset routine
       [not found]         ` <85ee460e-1f65-69c7-c17b-329d9b31d205-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-10-20 10:00           ` Liu, Monk
  0 siblings, 0 replies; 15+ messages in thread
From: Liu, Monk @ 2017-10-20 10:00 UTC (permalink / raw)
  To: Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

> You need to walk the list of entities (with lock held) and check if 
entity->fence_context == job->s_fence->scheduled.context and only if you
found the right one set the guilty flag there.


Yeah you remind me, I did this in the old gpu reset patch, will add them back !

BR Monk
-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com] 
Sent: 2017年10月20日 17:07
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/5] drm/amdgpu:cleanup job reset routine

Am 20.10.2017 um 05:33 schrieb Monk Liu:
> merge the setting guilty on context into this function to avoid 
> implement extra routine.
>
> Change-Id: I7a0063464fdc85d5ac9080046380e745565ff540
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  4 ++--
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 13 ++++++++++++-
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
>   3 files changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index dae1e5b..a07544d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2868,7 +2868,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
>   			amd_sched_job_kickout(&job->base);
>   
>   		/* only do job_reset on the hang ring if @job not NULL */
> -		amd_sched_hw_job_reset(&ring->sched);
> +		amd_sched_hw_job_reset(&ring->sched, NULL);
>   
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
> @@ -2989,7 +2989,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   		if (!ring || !ring->sched.thread)
>   			continue;
>   		kthread_park(ring->sched.thread);
> -		amd_sched_hw_job_reset(&ring->sched);
> +		amd_sched_hw_job_reset(&ring->sched, NULL);
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index e4d3b4e..322b6c1 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -442,7 +442,14 @@ static void amd_sched_job_timedout(struct work_struct *work)
>   	job->sched->ops->timedout_job(job);
>   }
>   
> -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
> +static void amd_sched_set_guilty(struct amd_sched_job *s_job) {
> +	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
> +		if (s_job->s_entity->guilty)
> +			atomic_set(s_job->s_entity->guilty, 1);

The entity might be long freed at this point.

You need to walk the list of entities (with lock held) and check if 
entity->fence_context == job->s_fence->scheduled.context and only if you
found the right one set the guilty flag there.

Christian.

> +}
> +
> +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
>   {
>   	struct amd_sched_job *s_job;
>   
> @@ -455,6 +462,10 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
>   			s_job->s_fence->parent = NULL;
>   			atomic_dec(&sched->hw_rq_count);
>   		}
> +
> +		if (bad && bad == s_job) {
> +			amd_sched_set_guilty(s_job);
> +		}
>   	}
>   	spin_unlock(&sched->job_list_lock);
>   }
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 2d59fc5..fff7cc7 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -172,7 +172,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
>   		       struct amd_gpu_scheduler *sched,
>   		       struct amd_sched_entity *entity,
>   		       void *owner);
> -void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
> +void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *job);
>   void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
>   bool amd_sched_dependency_optimized(struct dma_fence* fence,
>   				    struct amd_sched_entity *entity);


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH 3/5] drm/amdgpu:implement guilty pointer
       [not found]         ` <f3a9431e-31fd-e8a5-79e9-d8bb09de366b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-10-23  4:26           ` Liu, Monk
       [not found]             ` <BLUPR12MB0449682AC30F4365B0A5068284460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Liu, Monk @ 2017-10-23  4:26 UTC (permalink / raw)
  To: Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

Don't set this directly, make it a parameter of amd_sched_entity_init().

And please split up the patches differently.

First all changes to the scheduler, including the guilty marking.

Then the changes to amdgpu.


[ML] that way the first patch will break kernel from compiling because you change the interface of entity_init without changing all place 
Calling this interface 

I suggest just use one patch

-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com] 
Sent: 2017年10月20日 17:09
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 3/5] drm/amdgpu:implement guilty pointer

Am 20.10.2017 um 05:33 schrieb Monk Liu:
> for user context there will be a guilty pointer in entity that points 
> to the guilty member of its context, thus we cant track if a given 
> entity is not from kernel ctx and if it is already marked guilty.
>
> Change-Id: Ie0a30830d89f52a6c4a514e67206140698a46367
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           | 1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       | 1 +
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 +
>   3 files changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 774edc1..6a4178b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -744,6 +744,7 @@ struct amdgpu_ctx {
>   	enum amd_sched_priority init_priority;
>   	enum amd_sched_priority override_priority;
>   	struct mutex            lock;
> +	atomic_t	guilty;
>   };
>   
>   struct amdgpu_ctx_mgr {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index c184468..d822e95 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -93,6 +93,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
>   					  rq, amdgpu_sched_jobs);
>   		if (r)
>   			goto failed;
> +		ctx->rings[i].entity.guilty = &ctx->guilty;

Don't set this directly, make it a parameter of amd_sched_entity_init().

And please split up the patches differently.

First all changes to the scheduler, including the guilty marking.

Then the changes to amdgpu.

Regards,
Christian.

>   	}
>   
>   	r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr); diff --git 
> a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 1dac7bc..2d59fc5 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -50,6 +50,7 @@ struct amd_sched_entity {
>   
>   	struct dma_fence		*dependency;
>   	struct dma_fence_cb		cb;
> +	atomic_t	*guilty; /* points to ctx's guilty */
>   };
>   
>   /**


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/5] drm/amdgpu:implement guilty pointer
       [not found]             ` <BLUPR12MB0449682AC30F4365B0A5068284460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-10-23  7:17               ` Christian König
       [not found]                 ` <9ee4a45a-53c5-5a70-449f-714b90a3de08-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Christian König @ 2017-10-23  7:17 UTC (permalink / raw)
  To: Liu, Monk,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

Am 23.10.2017 um 06:26 schrieb Liu, Monk:
> Don't set this directly, make it a parameter of amd_sched_entity_init().
>
> And please split up the patches differently.
>
> First all changes to the scheduler, including the guilty marking.
>
> Then the changes to amdgpu.
>
>
> [ML] that way the first patch will break kernel from compiling because you change the interface of entity_init without changing all place
> Calling this interface

You obviously need to adjust the calls to entity_init and at least 
provide NULL as value for the new parameter.

> I suggest just use one patch

I prefer to separate that, but not a strict must have when you don't 
have time.

The scheduler and amdgpu using it should essentially be two different 
components.

Regards,
Christian.

>
> -----Original Message-----
> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]
> Sent: 2017年10月20日 17:09
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 3/5] drm/amdgpu:implement guilty pointer
>
> Am 20.10.2017 um 05:33 schrieb Monk Liu:
>> for user context there will be a guilty pointer in entity that points
>> to the guilty member of its context, thus we cant track if a given
>> entity is not from kernel ctx and if it is already marked guilty.
>>
>> Change-Id: Ie0a30830d89f52a6c4a514e67206140698a46367
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h           | 1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       | 1 +
>>    drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 +
>>    3 files changed, 3 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 774edc1..6a4178b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -744,6 +744,7 @@ struct amdgpu_ctx {
>>    	enum amd_sched_priority init_priority;
>>    	enum amd_sched_priority override_priority;
>>    	struct mutex            lock;
>> +	atomic_t	guilty;
>>    };
>>    
>>    struct amdgpu_ctx_mgr {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index c184468..d822e95 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -93,6 +93,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
>>    					  rq, amdgpu_sched_jobs);
>>    		if (r)
>>    			goto failed;
>> +		ctx->rings[i].entity.guilty = &ctx->guilty;
> Don't set this directly, make it a parameter of amd_sched_entity_init().
>
> And please split up the patches differently.
>
> First all changes to the scheduler, including the guilty marking.
>
> Then the changes to amdgpu.
>
> Regards,
> Christian.
>
>>    	}
>>    
>>    	r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr); diff --git
>> a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> index 1dac7bc..2d59fc5 100644
>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> @@ -50,6 +50,7 @@ struct amd_sched_entity {
>>    
>>    	struct dma_fence		*dependency;
>>    	struct dma_fence_cb		cb;
>> +	atomic_t	*guilty; /* points to ctx's guilty */
>>    };
>>    
>>    /**
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH 3/5] drm/amdgpu:implement guilty pointer
       [not found]                 ` <9ee4a45a-53c5-5a70-449f-714b90a3de08-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-23  7:53                   ` Liu, Monk
  0 siblings, 0 replies; 15+ messages in thread
From: Liu, Monk @ 2017-10-23  7:53 UTC (permalink / raw)
  To: Koenig, Christian,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org

Yeah, already did so, please check the amd-gfx

-----Original Message-----
From: Koenig, Christian 
Sent: 2017年10月23日 15:17
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 3/5] drm/amdgpu:implement guilty pointer

Am 23.10.2017 um 06:26 schrieb Liu, Monk:
> Don't set this directly, make it a parameter of amd_sched_entity_init().
>
> And please split up the patches differently.
>
> First all changes to the scheduler, including the guilty marking.
>
> Then the changes to amdgpu.
>
>
> [ML] that way the first patch will break kernel from compiling because 
> you change the interface of entity_init without changing all place 
> Calling this interface

You obviously need to adjust the calls to entity_init and at least provide NULL as value for the new parameter.

> I suggest just use one patch

I prefer to separate that, but not a strict must have when you don't have time.

The scheduler and amdgpu using it should essentially be two different components.

Regards,
Christian.

>
> -----Original Message-----
> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]
> Sent: 2017年10月20日 17:09
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 3/5] drm/amdgpu:implement guilty pointer
>
> Am 20.10.2017 um 05:33 schrieb Monk Liu:
>> for user context there will be a guilty pointer in entity that points 
>> to the guilty member of its context, thus we cant track if a given 
>> entity is not from kernel ctx and if it is already marked guilty.
>>
>> Change-Id: Ie0a30830d89f52a6c4a514e67206140698a46367
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h           | 1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       | 1 +
>>    drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 1 +
>>    3 files changed, 3 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 774edc1..6a4178b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -744,6 +744,7 @@ struct amdgpu_ctx {
>>    	enum amd_sched_priority init_priority;
>>    	enum amd_sched_priority override_priority;
>>    	struct mutex            lock;
>> +	atomic_t	guilty;
>>    };
>>    
>>    struct amdgpu_ctx_mgr {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index c184468..d822e95 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -93,6 +93,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
>>    					  rq, amdgpu_sched_jobs);
>>    		if (r)
>>    			goto failed;
>> +		ctx->rings[i].entity.guilty = &ctx->guilty;
> Don't set this directly, make it a parameter of amd_sched_entity_init().
>
> And please split up the patches differently.
>
> First all changes to the scheduler, including the guilty marking.
>
> Then the changes to amdgpu.
>
> Regards,
> Christian.
>
>>    	}
>>    
>>    	r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr); diff --git 
>> a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> index 1dac7bc..2d59fc5 100644
>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> @@ -50,6 +50,7 @@ struct amd_sched_entity {
>>    
>>    	struct dma_fence		*dependency;
>>    	struct dma_fence_cb		cb;
>> +	atomic_t	*guilty; /* points to ctx's guilty */
>>    };
>>    
>>    /**
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2017-10-23  7:53 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-10-20  3:33 [PATCH 1/5] drm/amdgpu:cleanup force_completion Monk Liu
     [not found] ` <1508470385-16164-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-20  3:33   ` [PATCH 2/5] drm/amdgpu:add hang_limit for sched Monk Liu
     [not found]     ` <1508470385-16164-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-20  9:03       ` Christian König
     [not found]         ` <2002cd84-8f78-52c3-d56c-faa43add4050-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-10-20  9:41           ` Liu, Monk
2017-10-20  3:33   ` [PATCH 3/5] drm/amdgpu:implement guilty pointer Monk Liu
     [not found]     ` <1508470385-16164-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-20  9:08       ` Christian König
     [not found]         ` <f3a9431e-31fd-e8a5-79e9-d8bb09de366b-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-10-23  4:26           ` Liu, Monk
     [not found]             ` <BLUPR12MB0449682AC30F4365B0A5068284460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-10-23  7:17               ` Christian König
     [not found]                 ` <9ee4a45a-53c5-5a70-449f-714b90a3de08-5C7GfCeVMHo@public.gmane.org>
2017-10-23  7:53                   ` Liu, Monk
2017-10-20  3:33   ` [PATCH 4/5] drm/amdgpu:cleanup job reset routine Monk Liu
     [not found]     ` <1508470385-16164-4-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-20  9:06       ` Christian König
     [not found]         ` <85ee460e-1f65-69c7-c17b-329d9b31d205-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-10-20 10:00           ` Liu, Monk
2017-10-20  3:33   ` [PATCH 5/5] drm/amdgpu:skip job for guilty ctx in parser_init Monk Liu
     [not found]     ` <1508470385-16164-5-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-10-20  9:09       ` Christian König
2017-10-20  7:17   ` [PATCH 1/5] drm/amdgpu:cleanup force_completion Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.