public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Record the current requests queue for execlists upon hang
@ 2016-10-12 16:14 Chris Wilson
  2016-10-12 18:20 ` ✓ Fi.CI.BAT: success for " Patchwork
  2016-10-13  9:51 ` [PATCH] " Mika Kuoppala
  0 siblings, 2 replies; 4+ messages in thread
From: Chris Wilson @ 2016-10-12 16:14 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Mika wanted to know what requests were pending at the time of a hang as
we now track which requests we have submitted to the hardware.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h       |  3 +-
 drivers/gpu/drm/i915/i915_gpu_error.c | 64 ++++++++++++++++++++++++-----------
 2 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index bf397b643cc0..6360e807c6ba 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -832,10 +832,11 @@ struct drm_i915_error_state {
 		struct drm_i915_error_request {
 			long jiffies;
 			pid_t pid;
+			u32 context;
 			u32 seqno;
 			u32 head;
 			u32 tail;
-		} *requests;
+		} *requests, execlist[2];
 
 		struct drm_i915_error_waiter {
 			char comm[TASK_COMM_LEN];
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 78cc13b9b2a5..026b78c66219 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -363,6 +363,20 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
 			   ee->instdone.row[slice][subslice]);
 }
 
+static void error_print_request(struct drm_i915_error_state_buf *m,
+				const char *prefix,
+				struct drm_i915_error_request *erq)
+{
+	if (!erq->seqno)
+		return;
+
+	err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
+		   prefix, erq->pid,
+		   erq->context, erq->seqno,
+		   jiffies_to_msecs(jiffies - erq->jiffies),
+		   erq->head, erq->tail);
+}
+
 static void error_print_engine(struct drm_i915_error_state_buf *m,
 			       struct drm_i915_error_engine *ee)
 {
@@ -434,6 +448,8 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  hangcheck: %s [%d]\n",
 		   hangcheck_action_to_str(ee->hangcheck_action),
 		   ee->hangcheck_score);
+	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
+	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
 }
 
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
@@ -649,14 +665,8 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			err_printf(m, "%s --- %d requests\n",
 				   dev_priv->engine[i].name,
 				   ee->num_requests);
-			for (j = 0; j < ee->num_requests; j++) {
-				err_printf(m, "  pid %d, seqno 0x%08x, emitted %ld, head 0x%08x, tail 0x%08x\n",
-					   ee->requests[j].pid,
-					   ee->requests[j].seqno,
-					   ee->requests[j].jiffies,
-					   ee->requests[j].head,
-					   ee->requests[j].tail);
-			}
+			for (j = 0; j < ee->num_requests; j++)
+				error_print_request(m, " ", &ee->requests[j]);
 		}
 
 		if (IS_ERR(ee->waiters)) {
@@ -1155,6 +1165,20 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
 	}
 }
 
+static void record_request(struct drm_i915_gem_request *request,
+			   struct drm_i915_error_request *erq)
+{
+	erq->context = request->ctx->hw_id;
+	erq->seqno = request->fence.seqno;
+	erq->jiffies = request->emitted_jiffies;
+	erq->head = request->head;
+	erq->tail = request->tail;
+
+	rcu_read_lock();
+	erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0;
+	rcu_read_unlock();
+}
+
 static void engine_record_requests(struct intel_engine_cs *engine,
 				   struct drm_i915_gem_request *first,
 				   struct drm_i915_error_engine *ee)
@@ -1178,8 +1202,6 @@ static void engine_record_requests(struct intel_engine_cs *engine,
 	count = 0;
 	request = first;
 	list_for_each_entry_from(request, &engine->request_list, link) {
-		struct drm_i915_error_request *erq;
-
 		if (count >= ee->num_requests) {
 			/*
 			 * If the ring request list was changed in
@@ -1199,19 +1221,22 @@ static void engine_record_requests(struct intel_engine_cs *engine,
 			break;
 		}
 
-		erq = &ee->requests[count++];
-		erq->seqno = request->fence.seqno;
-		erq->jiffies = request->emitted_jiffies;
-		erq->head = request->head;
-		erq->tail = request->tail;
-
-		rcu_read_lock();
-		erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0;
-		rcu_read_unlock();
+		record_request(request, &ee->requests[count++]);
 	}
 	ee->num_requests = count;
 }
 
+static void error_record_engine_execlists(struct intel_engine_cs *engine,
+					  struct drm_i915_error_engine *ee)
+{
+	unsigned int n;
+
+	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
+		if (engine->execlist_port[n].request)
+			record_request(engine->execlist_port[n].request,
+				       &ee->execlist[n]);
+}
+
 static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 				  struct drm_i915_error_state *error)
 {
@@ -1236,6 +1261,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 
 		error_record_engine_registers(error, engine, ee);
 		error_record_engine_waiters(engine, ee);
+		error_record_engine_execlists(engine, ee);
 
 		request = i915_gem_find_active_request(engine);
 		if (request) {
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Record the current requests queue for execlists upon hang
  2016-10-12 16:14 [PATCH] drm/i915: Record the current requests queue for execlists upon hang Chris Wilson
@ 2016-10-12 18:20 ` Patchwork
  2016-10-13  9:51 ` [PATCH] " Mika Kuoppala
  1 sibling, 0 replies; 4+ messages in thread
From: Patchwork @ 2016-10-12 18:20 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Record the current requests queue for execlists upon hang
URL   : https://patchwork.freedesktop.org/series/13660/
State : success

== Summary ==

Series 13660v1 drm/i915: Record the current requests queue for execlists upon hang
https://patchwork.freedesktop.org/api/1.0/series/13660/revisions/1/mbox/

Test drv_module_reload_basic:
                skip       -> PASS       (fi-skl-6770hq)
Test kms_flip:
        Subgroup basic-flip-vs-modeset:
                dmesg-warn -> PASS       (fi-skl-6770hq)
Test kms_psr_sink_crc:
        Subgroup psr_basic:
                dmesg-warn -> PASS       (fi-skl-6700hq)
Test vgem_basic:
        Subgroup unload:
                skip       -> PASS       (fi-kbl-7200u)
                skip       -> PASS       (fi-hsw-4770)

fi-bdw-5557u     total:248  pass:232  dwarn:0   dfail:0   fail:0   skip:16 
fi-bsw-n3050     total:248  pass:205  dwarn:0   dfail:0   fail:0   skip:43 
fi-bxt-t5700     total:248  pass:217  dwarn:0   dfail:0   fail:0   skip:31 
fi-byt-j1900     total:248  pass:213  dwarn:2   dfail:0   fail:1   skip:32 
fi-byt-n2820     total:248  pass:211  dwarn:0   dfail:0   fail:1   skip:36 
fi-hsw-4770      total:248  pass:225  dwarn:0   dfail:0   fail:0   skip:23 
fi-hsw-4770r     total:248  pass:225  dwarn:0   dfail:0   fail:0   skip:23 
fi-ivb-3520m     total:248  pass:222  dwarn:0   dfail:0   fail:0   skip:26 
fi-ivb-3770      total:248  pass:222  dwarn:0   dfail:0   fail:0   skip:26 
fi-kbl-7200u     total:248  pass:223  dwarn:0   dfail:0   fail:0   skip:25 
fi-skl-6260u     total:248  pass:233  dwarn:0   dfail:0   fail:0   skip:15 
fi-skl-6700hq    total:248  pass:225  dwarn:0   dfail:0   fail:0   skip:23 
fi-skl-6700k     total:248  pass:222  dwarn:1   dfail:0   fail:0   skip:25 
fi-skl-6770hq    total:248  pass:231  dwarn:1   dfail:0   fail:1   skip:15 
fi-snb-2520m     total:248  pass:211  dwarn:0   dfail:0   fail:0   skip:37 
fi-snb-2600      total:248  pass:210  dwarn:0   dfail:0   fail:0   skip:38 

Results at /archive/results/CI_IGT_test/Patchwork_2692/

14740bb25ec36fe4ce8042af3eb48aeb45e5bc13 drm-intel-nightly: 2016y-10m-12d-16h-18m-24s UTC integration manifest
988e045 drm/i915: Record the current requests queue for execlists upon hang

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/i915: Record the current requests queue for execlists upon hang
  2016-10-12 16:14 [PATCH] drm/i915: Record the current requests queue for execlists upon hang Chris Wilson
  2016-10-12 18:20 ` ✓ Fi.CI.BAT: success for " Patchwork
@ 2016-10-13  9:51 ` Mika Kuoppala
  2016-10-13 10:16   ` Chris Wilson
  1 sibling, 1 reply; 4+ messages in thread
From: Mika Kuoppala @ 2016-10-13  9:51 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Mika wanted to know what requests were pending at the time of a hang as
> we now track which requests we have submitted to the hardware.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  3 +-
>  drivers/gpu/drm/i915/i915_gpu_error.c | 64 ++++++++++++++++++++++++-----------
>  2 files changed, 47 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index bf397b643cc0..6360e807c6ba 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -832,10 +832,11 @@ struct drm_i915_error_state {
>  		struct drm_i915_error_request {
>  			long jiffies;
>  			pid_t pid;
> +			u32 context;
>  			u32 seqno;
>  			u32 head;
>  			u32 tail;
> -		} *requests;
> +		} *requests, execlist[2];
>  
>  		struct drm_i915_error_waiter {
>  			char comm[TASK_COMM_LEN];
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 78cc13b9b2a5..026b78c66219 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -363,6 +363,20 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
>  			   ee->instdone.row[slice][subslice]);
>  }
>  
> +static void error_print_request(struct drm_i915_error_state_buf *m,
> +				const char *prefix,
> +				struct drm_i915_error_request *erq)
> +{
> +	if (!erq->seqno)
> +		return;
> +
> +	err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
> +		   prefix, erq->pid,
> +		   erq->context, erq->seqno,
> +		   jiffies_to_msecs(jiffies - erq->jiffies),
> +		   erq->head, erq->tail);
> +}
> +
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
>  			       struct drm_i915_error_engine *ee)
>  {
> @@ -434,6 +448,8 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
>  	err_printf(m, "  hangcheck: %s [%d]\n",
>  		   hangcheck_action_to_str(ee->hangcheck_action),
>  		   ee->hangcheck_score);
> +	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
> +	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
>  }
>  
>  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
> @@ -649,14 +665,8 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  			err_printf(m, "%s --- %d requests\n",
>  				   dev_priv->engine[i].name,
>  				   ee->num_requests);
> -			for (j = 0; j < ee->num_requests; j++) {
> -				err_printf(m, "  pid %d, seqno 0x%08x, emitted %ld, head 0x%08x, tail 0x%08x\n",
> -					   ee->requests[j].pid,
> -					   ee->requests[j].seqno,
> -					   ee->requests[j].jiffies,
> -					   ee->requests[j].head,
> -					   ee->requests[j].tail);
> -			}
> +			for (j = 0; j < ee->num_requests; j++)
> +				error_print_request(m, " ", &ee->requests[j]);
>  		}
>  
>  		if (IS_ERR(ee->waiters)) {
> @@ -1155,6 +1165,20 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
>  	}
>  }
>  
> +static void record_request(struct drm_i915_gem_request *request,
> +			   struct drm_i915_error_request *erq)
> +{
> +	erq->context = request->ctx->hw_id;
> +	erq->seqno = request->fence.seqno;
> +	erq->jiffies = request->emitted_jiffies;
> +	erq->head = request->head;
> +	erq->tail = request->tail;
> +
> +	rcu_read_lock();
> +	erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0;

This lock is only for the pid_nr and has nothing to do with the ctx dereference?
Not that it was added by this patch...

> +	rcu_read_unlock();
> +}
> +
>  static void engine_record_requests(struct intel_engine_cs *engine,
>  				   struct drm_i915_gem_request *first,
>  				   struct drm_i915_error_engine *ee)
> @@ -1178,8 +1202,6 @@ static void engine_record_requests(struct intel_engine_cs *engine,
>  	count = 0;
>  	request = first;
>  	list_for_each_entry_from(request, &engine->request_list, link) {
> -		struct drm_i915_error_request *erq;
> -
>  		if (count >= ee->num_requests) {
>  			/*
>  			 * If the ring request list was changed in
> @@ -1199,19 +1221,22 @@ static void engine_record_requests(struct intel_engine_cs *engine,
>  			break;
>  		}
>  
> -		erq = &ee->requests[count++];
> -		erq->seqno = request->fence.seqno;
> -		erq->jiffies = request->emitted_jiffies;
> -		erq->head = request->head;
> -		erq->tail = request->tail;
> -
> -		rcu_read_lock();
> -		erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0;
> -		rcu_read_unlock();
> +		record_request(request, &ee->requests[count++]);
>  	}
>  	ee->num_requests = count;
>  }
>  
> +static void error_record_engine_execlists(struct intel_engine_cs *engine,
> +					  struct drm_i915_error_engine *ee)
> +{
> +	unsigned int n;
> +
> +	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> +		if (engine->execlist_port[n].request)
> +			record_request(engine->execlist_port[n].request,
> +				       &ee->execlist[n]);

OK, even if we get an interrupt around here and reset the ports,
the pointer should stay in request_list, so on that front we should be
safe.

And with retirement, we are in no more unsafe waters than with the other
requests.

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> +}
> +
>  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  				  struct drm_i915_error_state *error)
>  {
> @@ -1236,6 +1261,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  
>  		error_record_engine_registers(error, engine, ee);
>  		error_record_engine_waiters(engine, ee);
> +		error_record_engine_execlists(engine, ee);
>  
>  		request = i915_gem_find_active_request(engine);
>  		if (request) {
> -- 
> 2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/i915: Record the current requests queue for execlists upon hang
  2016-10-13  9:51 ` [PATCH] " Mika Kuoppala
@ 2016-10-13 10:16   ` Chris Wilson
  0 siblings, 0 replies; 4+ messages in thread
From: Chris Wilson @ 2016-10-13 10:16 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Thu, Oct 13, 2016 at 12:51:26PM +0300, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> > +static void record_request(struct drm_i915_gem_request *request,
> > +			   struct drm_i915_error_request *erq)
> > +{
> > +	erq->context = request->ctx->hw_id;
> > +	erq->seqno = request->fence.seqno;
> > +	erq->jiffies = request->emitted_jiffies;
> > +	erq->head = request->head;
> > +	erq->tail = request->tail;
> > +
> > +	rcu_read_lock();
> > +	erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0;
> 
> This lock is only for the pid_nr and nothing to do with ctx dereference?
> Not that it was added by this patch...

It's for the struct task lookup inside pid_nr.

But...

> > +	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> > +		if (engine->execlist_port[n].request)
> > +			record_request(engine->execlist_port[n].request,
> > +				       &ee->execlist[n]);
> 
> Ok even if we get interrupt at around here and reset the ports,
> the pointer should stay in request_list and at that part we should be
> safe.

Note that we don't even get interrupts anymore as we completely stop the
machine whilst capturing. So even rcu_read_lock() above is overkill,
mere documentation.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2016-10-13 10:16 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2016-10-12 16:14 [PATCH] drm/i915: Record the current requests queue for execlists upon hang Chris Wilson
2016-10-12 18:20 ` ✓ Fi.CI.BAT: success for " Patchwork
2016-10-13  9:51 ` [PATCH] " Mika Kuoppala
2016-10-13 10:16   ` Chris Wilson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox