intel-gfx.lists.freedesktop.org archive mirror
* [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request
@ 2015-12-11 22:59 Chris Wilson
  2015-12-11 22:59 ` [PATCH v2 2/3] drm/i915: Allow userspace to request no-error-capture upon GPU hangs Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Chris Wilson @ 2015-12-11 22:59 UTC (permalink / raw)
  To: intel-gfx

The request tells us where to read the ringbuf from, so use that
information to simplify the error capture. If no request was active at
the time of the hang, the ring is idle and there is no information
inside the ring pertaining to the hang.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 3e137fc701cf..6eefe9c36931 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -995,7 +995,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
 
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		struct intel_engine_cs *ring = &dev_priv->ring[i];
-		struct intel_ringbuffer *rbuf;
+		struct intel_ringbuffer *rbuf = NULL;
 
 		error->ring[i].pid = -1;
 
@@ -1039,26 +1039,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
 				}
 				rcu_read_unlock();
 			}
+
+			rbuf = request->ringbuf;
 		}
 
-		if (i915.enable_execlists) {
-			/* TODO: This is only a small fix to keep basic error
-			 * capture working, but we need to add more information
-			 * for it to be useful (e.g. dump the context being
-			 * executed).
-			 */
-			if (request)
-				rbuf = request->ctx->engine[ring->id].ringbuf;
-			else
-				rbuf = ring->default_context->engine[ring->id].ringbuf;
-		} else
-			rbuf = ring->buffer;
-
-		error->ring[i].cpu_ring_head = rbuf->head;
-		error->ring[i].cpu_ring_tail = rbuf->tail;
-
-		error->ring[i].ringbuffer =
-			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
+		if (rbuf) {
+			error->ring[i].cpu_ring_head = rbuf->head;
+			error->ring[i].cpu_ring_tail = rbuf->tail;
+			error->ring[i].ringbuffer =
+				i915_error_ggtt_object_create(dev_priv,
+							      rbuf->obj);
+		}
 
 		error->ring[i].hws_page =
 			i915_error_ggtt_object_create(dev_priv, ring->status_page.obj);
-- 
2.6.3


* [PATCH v2 2/3] drm/i915: Allow userspace to request no-error-capture upon GPU hangs
  2015-12-11 22:59 [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request Chris Wilson
@ 2015-12-11 22:59 ` Chris Wilson
  2015-12-15 16:59   ` Dave Gordon
  2015-12-11 22:59 ` [PATCH v2 3/3] drm/i915: Clean up GPU hang message Chris Wilson
  2015-12-14 11:14 ` [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request Dave Gordon
  2 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2015-12-11 22:59 UTC (permalink / raw)
  To: intel-gfx

igt likes to inject GPU hangs into its command streams. However, as we
expect these hangs, we don't actually want them recorded in the dmesg
output or stored in the i915_error_state (usually). To accommodate this,
allow userspace to set a flag on the context so that any hang emanating
from that context will not be recorded. We still do the error capture
(otherwise how do we find the guilty context and know its intent?), since
part of the reason for random GPU hang injection is to exercise the race
conditions between the error capture and normal execution.

v2: Split out the request->ringbuf error capture changes.
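
As a rough illustration (not part of this patch; error handling omitted),
userspace would opt in through the existing context setparam ioctl, using
the parameter added below to i915_drm.h:

#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Sketch: mark a context so that hangs it triggers are not kept around.
 * fd is the DRM device fd, ctx_id comes from context create. */
static int set_no_error_capture(int fd, __u32 ctx_id, int enable)
{
	struct drm_i915_gem_context_param p;

	memset(&p, 0, sizeof(p));
	p.ctx_id = ctx_id;
	p.param = I915_CONTEXT_PARAM_NO_ERROR_CAPTURE;
	p.value = enable;
	/* p.size must stay 0, or setparam rejects the request */

	return ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
}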

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h         |  8 ++++++--
 drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c   | 13 ++++++++-----
 include/uapi/drm/i915_drm.h             |  1 +
 4 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b33091c2c39e..c511b3cbf9b2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -493,6 +493,7 @@ struct drm_i915_error_state {
 	struct timeval time;
 
 	char error_msg[128];
+	bool simulated;
 	int iommu;
 	u32 reset_count;
 	u32 suspend_count;
@@ -845,7 +846,9 @@ struct i915_ctx_hang_stats {
 /* This must match up with the value previously used for execbuf2.rsvd1. */
 #define DEFAULT_CONTEXT_HANDLE 0
 
-#define CONTEXT_NO_ZEROMAP (1<<0)
+#define CONTEXT_NO_ZEROMAP		(1<<0)
+#define CONTEXT_NO_ERROR_CAPTURE	(1<<1)
+
 /**
  * struct intel_context - as the name implies, represents a context.
  * @ref: reference count.
@@ -870,11 +873,12 @@ struct intel_context {
 	int user_handle;
 	uint8_t remap_slice;
 	struct drm_i915_private *i915;
-	int flags;
 	struct drm_i915_file_private *file_priv;
 	struct i915_ctx_hang_stats hang_stats;
 	struct i915_hw_ppgtt *ppgtt;
 
+	unsigned flags;
+
 	/* Legacy ring buffer submission */
 	struct {
 		struct drm_i915_gem_object *rcs_state;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 900ffd044db8..d9998ab9d94d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -938,6 +938,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		else
 			args->value = to_i915(dev)->gtt.base.total;
 		break;
+	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
+		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -983,6 +986,16 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 			ctx->flags |= args->value ? CONTEXT_NO_ZEROMAP : 0;
 		}
 		break;
+	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
+		if (args->size) {
+			ret = -EINVAL;
+		} else {
+			if (args->value)
+				ctx->flags |= CONTEXT_NO_ERROR_CAPTURE;
+			else
+				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
+		}
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 6eefe9c36931..6db6d7e02aea 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1040,6 +1040,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
 				rcu_read_unlock();
 			}
 
+			error->simulated |= request->ctx->flags & CONTEXT_NO_ERROR_CAPTURE;
 			rbuf = request->ringbuf;
 		}
 
@@ -1336,12 +1337,14 @@ void i915_capture_error_state(struct drm_device *dev, bool wedged,
 	i915_error_capture_msg(dev, error, wedged, error_msg);
 	DRM_INFO("%s\n", error->error_msg);
 
-	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
-	if (dev_priv->gpu_error.first_error == NULL) {
-		dev_priv->gpu_error.first_error = error;
-		error = NULL;
+	if (!error->simulated) {
+		spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
+		if (dev_priv->gpu_error.first_error == NULL) {
+			dev_priv->gpu_error.first_error = error;
+			error = NULL;
+		}
+		spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
 	}
-	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
 
 	if (error) {
 		i915_error_state_free(&error->ref);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index acf21026c78a..7fee4416dcc7 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1140,6 +1140,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_BAN_PERIOD	0x1
 #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
+#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
 	__u64 value;
 };
 
-- 
2.6.3


* [PATCH v2 3/3] drm/i915: Clean up GPU hang message
  2015-12-11 22:59 [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request Chris Wilson
  2015-12-11 22:59 ` [PATCH v2 2/3] drm/i915: Allow userspace to request no-error-capture upon GPU hangs Chris Wilson
@ 2015-12-11 22:59 ` Chris Wilson
  2015-12-14 11:28   ` Dave Gordon
  2015-12-14 11:14 ` [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request Dave Gordon
  2 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2015-12-11 22:59 UTC (permalink / raw)
  To: intel-gfx

Remove some redundant kernel messages as we deduce a hung GPU and
capture the error state.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_irq.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 4cfbd694b3a8..365d4872a67d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2963,7 +2963,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	struct drm_device *dev = dev_priv->dev;
 	struct intel_engine_cs *ring;
 	int i;
-	int busy_count = 0, rings_hung = 0;
+	int busy_count = 0;
 	bool stuck[I915_NUM_RINGS] = { 0 };
 #define BUSY 1
 #define KICK 5
@@ -3057,17 +3057,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	}
 
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-			DRM_INFO("%s on %s\n",
-				 stuck[i] ? "stuck" : "no progress",
-				 ring->name);
-			rings_hung++;
-		}
+		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
+			return i915_handle_error(dev, true,
+						 "%s on %s",
+						 stuck[i] ? "No progress" : "Hang",
+						 ring->name);
 	}
 
-	if (rings_hung)
-		return i915_handle_error(dev, true, "Ring hung");
-
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
 		i915_queue_hangcheck(dev_priv);
-- 
2.6.3


* Re: [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request
  2015-12-11 22:59 [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request Chris Wilson
  2015-12-11 22:59 ` [PATCH v2 2/3] drm/i915: Allow userspace to request no-error-capture upon GPU hangs Chris Wilson
  2015-12-11 22:59 ` [PATCH v2 3/3] drm/i915: Clean up GPU hang message Chris Wilson
@ 2015-12-14 11:14 ` Dave Gordon
  2015-12-14 11:28   ` Chris Wilson
  2 siblings, 1 reply; 10+ messages in thread
From: Dave Gordon @ 2015-12-14 11:14 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On 11/12/15 22:59, Chris Wilson wrote:
> The request tells us where to read the ringbuf from, so use that
> information to simplify the error capture. If no request was active at
> the time of the hang, the ring is idle and there is no information
> inside the ring pertaining to the hang.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gpu_error.c | 29 ++++++++++-------------------
>   1 file changed, 10 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 3e137fc701cf..6eefe9c36931 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -995,7 +995,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
>
>   	for (i = 0; i < I915_NUM_RINGS; i++) {
>   		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_ringbuffer *rbuf;
> +		struct intel_ringbuffer *rbuf = NULL;
>
>   		error->ring[i].pid = -1;
>
> @@ -1039,26 +1039,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
>   				}
>   				rcu_read_unlock();
>   			}
> +
> +			rbuf = request->ringbuf;
>   		}
>
> -		if (i915.enable_execlists) {
> -			/* TODO: This is only a small fix to keep basic error
> -			 * capture working, but we need to add more information
> -			 * for it to be useful (e.g. dump the context being
> -			 * executed).
> -			 */
> -			if (request)
> -				rbuf = request->ctx->engine[ring->id].ringbuf;
> -			else
> -				rbuf = ring->default_context->engine[ring->id].ringbuf;
> -		} else
> -			rbuf = ring->buffer;
> -
> -		error->ring[i].cpu_ring_head = rbuf->head;
> -		error->ring[i].cpu_ring_tail = rbuf->tail;
> -
> -		error->ring[i].ringbuffer =
> -			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
> +		if (rbuf) {
> +			error->ring[i].cpu_ring_head = rbuf->head;
> +			error->ring[i].cpu_ring_tail = rbuf->tail;
> +			error->ring[i].ringbuffer =
> +				i915_error_ggtt_object_create(dev_priv,
> +							      rbuf->obj);
> +		}
>
>   		error->ring[i].hws_page =
>   			i915_error_ggtt_object_create(dev_priv, ring->status_page.obj);

I think the code you deleted is intended to capture the *default* 
ringbuffer if there is no request active -- sometimes we will switch an 
engine to the default context (and therefore ringbuffer) when there's no 
work to be done.

Another option that's sometimes useful is to capture the ringbuffer 
pointed to by the START register. This helps in finding situations where 
the driver and the GPU disagree about what should be in progress.
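
Roughly, I mean something along these lines -- treat it purely as a
sketch, since the register and helper names are from memory and would
need checking against the current tree:

/* Sketch: compare where the GPU claims to be executing from (RING_START)
 * with the ringbuffer the driver expected.  dev_priv is used implicitly
 * by the I915_READ() macro. */
static void note_start_mismatch(struct drm_i915_private *dev_priv,
				struct intel_engine_cs *ring,
				struct intel_ringbuffer *expected)
{
	u32 hw_start = I915_READ(RING_START(ring->mmio_base));
	u32 sw_start = 0;

	if (expected && expected->obj)
		sw_start = lower_32_bits(i915_gem_obj_ggtt_offset(expected->obj));

	if (hw_start != sw_start)
		DRM_DEBUG_DRIVER("%s: GPU START 0x%08x vs expected 0x%08x\n",
				 ring->name, hw_start, sw_start);
}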

I've got a few patches that update some of the error capture that's 
always been missing in execlist mode (like, actually capturing the 
active context), and add some more decoding of what we do capture.
John Harrison posted them as part of the "Preemption support for GPU 
scheduler" patchset last week, although they're not really anything to 
do with preemption per se.

One of them "drm/i915/error: improve CSB reporting" also updates this 
area of the code, so maybe I should incorporate your change into the 
next revision of that patch?

.Dave.

* Re: [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request
  2015-12-14 11:14 ` [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request Dave Gordon
@ 2015-12-14 11:28   ` Chris Wilson
  2015-12-15 16:53     ` Dave Gordon
  0 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2015-12-14 11:28 UTC (permalink / raw)
  To: Dave Gordon; +Cc: intel-gfx

On Mon, Dec 14, 2015 at 11:14:31AM +0000, Dave Gordon wrote:
> On 11/12/15 22:59, Chris Wilson wrote:
> >The request tells us where to read the ringbuf from, so use that
> >information to simplify the error capture. If no request was active at
> >the time of the hang, the ring is idle and there is no information
> >inside the ring pertaining to the hang.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/i915_gpu_error.c | 29 ++++++++++-------------------
> >  1 file changed, 10 insertions(+), 19 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> >index 3e137fc701cf..6eefe9c36931 100644
> >--- a/drivers/gpu/drm/i915/i915_gpu_error.c
> >+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> >@@ -995,7 +995,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
> >
> >  	for (i = 0; i < I915_NUM_RINGS; i++) {
> >  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> >-		struct intel_ringbuffer *rbuf;
> >+		struct intel_ringbuffer *rbuf = NULL;
> >
> >  		error->ring[i].pid = -1;
> >
> >@@ -1039,26 +1039,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
> >  				}
> >  				rcu_read_unlock();
> >  			}
> >+
> >+			rbuf = request->ringbuf;
> >  		}
> >
> >-		if (i915.enable_execlists) {
> >-			/* TODO: This is only a small fix to keep basic error
> >-			 * capture working, but we need to add more information
> >-			 * for it to be useful (e.g. dump the context being
> >-			 * executed).
> >-			 */
> >-			if (request)
> >-				rbuf = request->ctx->engine[ring->id].ringbuf;
> >-			else
> >-				rbuf = ring->default_context->engine[ring->id].ringbuf;
> >-		} else
> >-			rbuf = ring->buffer;
> >-
> >-		error->ring[i].cpu_ring_head = rbuf->head;
> >-		error->ring[i].cpu_ring_tail = rbuf->tail;
> >-
> >-		error->ring[i].ringbuffer =
> >-			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
> >+		if (rbuf) {
> >+			error->ring[i].cpu_ring_head = rbuf->head;
> >+			error->ring[i].cpu_ring_tail = rbuf->tail;
> >+			error->ring[i].ringbuffer =
> >+				i915_error_ggtt_object_create(dev_priv,
> >+							      rbuf->obj);
> >+		}
> >
> >  		error->ring[i].hws_page =
> >  			i915_error_ggtt_object_create(dev_priv, ring->status_page.obj);
> 
> I think the code you deleted is intended to capture the *default*
> ringbuffer if there is no request active -- sometimes we will switch
> an engine to the default context (and therefore ringbuffer) when
> there's no work to be done.

So answer the question, why? I don't have a use for it. This code in
particular is nothing more than a hack for execlists and in no way
reflects my intentions for the postmortem debugging tool.

> Another option that's sometimes useful is to capture the ringbuffer
> pointed to by the START register. This helps in finding situations
> where the driver and the GPU disagree about what should be in
> progress.

That is a possibility, except it is no more interesting than inspecting
START vs expected (and requires the stop_machine rework to walk the
lists without crashing).
 
> I've got a few patches that update some of the error capture that's
> always been missing in execlist mode (like, actually capturing the
> active context), and add some more decoding of what we do capture.

No decoding. That is easier done in userspace. And I sent patches to do
more capturing many, many months ago, along with fixing up most of the
invalid ppgtt state.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH v2 3/3] drm/i915: Clean up GPU hang message
  2015-12-11 22:59 ` [PATCH v2 3/3] drm/i915: Clean up GPU hang message Chris Wilson
@ 2015-12-14 11:28   ` Dave Gordon
  2015-12-14 11:39     ` Chris Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Dave Gordon @ 2015-12-14 11:28 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On 11/12/15 22:59, Chris Wilson wrote:
> Remove some redundant kernel messages as we deduce a hung GPU and
> capture the error state.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_irq.c | 16 ++++++----------
>   1 file changed, 6 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 4cfbd694b3a8..365d4872a67d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2963,7 +2963,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   	struct drm_device *dev = dev_priv->dev;
>   	struct intel_engine_cs *ring;
>   	int i;
> -	int busy_count = 0, rings_hung = 0;
> +	int busy_count = 0;
>   	bool stuck[I915_NUM_RINGS] = { 0 };
>   #define BUSY 1
>   #define KICK 5
> @@ -3057,17 +3057,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   	}
>
>   	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
> -			DRM_INFO("%s on %s\n",
> -				 stuck[i] ? "stuck" : "no progress",
> -				 ring->name);
> -			rings_hung++;
> -		}
> +		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
> +			return i915_handle_error(dev, true,
> +						 "%s on %s",
> +						 stuck[i] ? "No progress" : "Hang",
> +						 ring->name);
>   	}
>
> -	if (rings_hung)
> -		return i915_handle_error(dev, true, "Ring hung");
> -
>   	/* Reset timer in case GPU hangs without another request being added */
>   	if (busy_count)
>   		i915_queue_hangcheck(dev_priv);

This version provides less information (in dmesg & syslog) in the case 
that multiple rings have (been detected as) hung. Does this ever happen? 
Is the original more useful for finding bugs in hangcheck itself? If 
this is a test that has disabled error state capture (because it's 
*trying* to hang one or more rings) can we still know how many rings 
have been diagnosed as hung?

.Dave.

* Re: [PATCH v2 3/3] drm/i915: Clean up GPU hang message
  2015-12-14 11:28   ` Dave Gordon
@ 2015-12-14 11:39     ` Chris Wilson
  2015-12-14 13:45       ` Chris Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2015-12-14 11:39 UTC (permalink / raw)
  To: Dave Gordon; +Cc: intel-gfx

On Mon, Dec 14, 2015 at 11:28:39AM +0000, Dave Gordon wrote:
> On 11/12/15 22:59, Chris Wilson wrote:
> >Remove some redundant kernel messages as we deduce a hung GPU and
> >capture the error state.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/i915_irq.c | 16 ++++++----------
> >  1 file changed, 6 insertions(+), 10 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> >index 4cfbd694b3a8..365d4872a67d 100644
> >--- a/drivers/gpu/drm/i915/i915_irq.c
> >+++ b/drivers/gpu/drm/i915/i915_irq.c
> >@@ -2963,7 +2963,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> >  	struct drm_device *dev = dev_priv->dev;
> >  	struct intel_engine_cs *ring;
> >  	int i;
> >-	int busy_count = 0, rings_hung = 0;
> >+	int busy_count = 0;
> >  	bool stuck[I915_NUM_RINGS] = { 0 };
> >  #define BUSY 1
> >  #define KICK 5
> >@@ -3057,17 +3057,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> >  	}
> >
> >  	for_each_ring(ring, dev_priv, i) {
> >-		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
> >-			DRM_INFO("%s on %s\n",
> >-				 stuck[i] ? "stuck" : "no progress",
> >-				 ring->name);
> >-			rings_hung++;
> >-		}
> >+		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
> >+			return i915_handle_error(dev, true,
> >+						 "%s on %s",
> >+						 stuck[i] ? "No progress" : "Hang",
> >+						 ring->name);
> >  	}
> >
> >-	if (rings_hung)
> >-		return i915_handle_error(dev, true, "Ring hung");
> >-
> >  	/* Reset timer in case GPU hangs without another request being added */
> >  	if (busy_count)
> >  		i915_queue_hangcheck(dev_priv);
> 
> This version provides less information (in dmesg & syslog) in the
> case that multiple rings have (been detected as) hung. Does this
> ever happen?

Not often. And intended, since that information is already in the error
state.

> Is the original more useful for finding bugs in
> hangcheck itself?

No. See i915_hangcheck_info.

> If this is a test that has disabled error state
> capture (because it's *trying* to hang one or more rings) can we
> still know how many rings have been diagnosed as hung?

If you have a use case, then you can look at the interface you require.
i915_hangcheck_info should be sufficient in most cases, or at least a
good starting point. But you may want a more specific debugfs to avoid
parsing.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH v2 3/3] drm/i915: Clean up GPU hang message
  2015-12-14 11:39     ` Chris Wilson
@ 2015-12-14 13:45       ` Chris Wilson
  0 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2015-12-14 13:45 UTC (permalink / raw)
  To: Dave Gordon, intel-gfx

On Mon, Dec 14, 2015 at 11:39:47AM +0000, Chris Wilson wrote:
> On Mon, Dec 14, 2015 at 11:28:39AM +0000, Dave Gordon wrote:
> > If this is a test that has disabled error state
> > capture (because it's *trying* to hang one or more rings) can we
> > still know how many rings have been diagnosed as hung?
> 
> If you have a use case, then you can look at the interface you require.
> i915_hangcheck_info should be sufficient in most cases, or at least a
> good starting point. But you may want a more specific debugfs to avoid
> parsing.

Gah, even better is just to use the reset-stats ioctl!
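
i.e. something along these lines (rough sketch; error handling elided):

#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Sketch: ask the kernel how many of this context's batches were lost to
 * a hang, without touching dmesg, debugfs or the error state. */
static int lost_batches(int fd, __u32 ctx_id)
{
	struct drm_i915_reset_stats stats;

	memset(&stats, 0, sizeof(stats));
	stats.ctx_id = ctx_id;

	if (ioctl(fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
		return -1;

	/* batch_active: hung while running; batch_pending: lost while queued */
	return stats.batch_active + stats.batch_pending;
}
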
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH v2 1/3] drm/i915: Record the ringbuffer associated with the request
  2015-12-14 11:28   ` Chris Wilson
@ 2015-12-15 16:53     ` Dave Gordon
  0 siblings, 0 replies; 10+ messages in thread
From: Dave Gordon @ 2015-12-15 16:53 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On 14/12/15 11:28, Chris Wilson wrote:
> On Mon, Dec 14, 2015 at 11:14:31AM +0000, Dave Gordon wrote:
>> On 11/12/15 22:59, Chris Wilson wrote:
>>> The request tells us where to read the ringbuf from, so use that
>>> information to simplify the error capture. If no request was active at
>>> the time of the hang, the ring is idle and there is no information
>>> inside the ring pertaining to the hang.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>   drivers/gpu/drm/i915/i915_gpu_error.c | 29 ++++++++++-------------------
>>>   1 file changed, 10 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> index 3e137fc701cf..6eefe9c36931 100644
>>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>>> @@ -995,7 +995,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
>>>
>>>   	for (i = 0; i < I915_NUM_RINGS; i++) {
>>>   		struct intel_engine_cs *ring = &dev_priv->ring[i];
>>> -		struct intel_ringbuffer *rbuf;
>>> +		struct intel_ringbuffer *rbuf = NULL;
>>>
>>>   		error->ring[i].pid = -1;
>>>
>>> @@ -1039,26 +1039,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
>>>   				}
>>>   				rcu_read_unlock();
>>>   			}
>>> +
>>> +			rbuf = request->ringbuf;
>>>   		}
>>>
>>> -		if (i915.enable_execlists) {
>>> -			/* TODO: This is only a small fix to keep basic error
>>> -			 * capture working, but we need to add more information
>>> -			 * for it to be useful (e.g. dump the context being
>>> -			 * executed).
>>> -			 */
>>> -			if (request)
>>> -				rbuf = request->ctx->engine[ring->id].ringbuf;
>>> -			else
>>> -				rbuf = ring->default_context->engine[ring->id].ringbuf;
>>> -		} else
>>> -			rbuf = ring->buffer;
>>> -
>>> -		error->ring[i].cpu_ring_head = rbuf->head;
>>> -		error->ring[i].cpu_ring_tail = rbuf->tail;
>>> -
>>> -		error->ring[i].ringbuffer =
>>> -			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
>>> +		if (rbuf) {
>>> +			error->ring[i].cpu_ring_head = rbuf->head;
>>> +			error->ring[i].cpu_ring_tail = rbuf->tail;
>>> +			error->ring[i].ringbuffer =
>>> +				i915_error_ggtt_object_create(dev_priv,
>>> +							      rbuf->obj);
>>> +		}
>>>
>>>   		error->ring[i].hws_page =
>>>   			i915_error_ggtt_object_create(dev_priv, ring->status_page.obj);
>>
>> I think the code you deleted is intended to capture the *default*
>> ringbuffer if there is no request active -- sometimes we will switch
>> an engine to the default context (and therefore ringbuffer) when
>> there's no work to be done.
>
> So answer the question, why? I don't have a use for it. This code in
> particular is nothing more than a hack for execlists and in no way
> reflects my intentions for the postmortem debugging tool.
>
>> Another option that's sometimes useful is to capture the ringbuffer
>> pointed to by the START register. This helps in finding situations
>> where the driver and the GPU disagree about what should be in
>> progress.
>
> That is a possibility, except it is no more interesting than inspecting
> START vs expected (and requires the stop_machine rework to walk the
> lists without crashing).
>
>> I've got a few patches that update some of the error capture that's
>> always been missing in execlist mode (like, actually capturing the
>> active context), and add some more decoding of what we do capture.
>
> No decoding. That is easier done in userspace. And I sent patches to do
> more capturing many, many months ago, along with fixing up most of the
> invalid ppgtt state.
> -Chris

Anyway, the removal of the unnecessary execlist/non-execlist special case
is worthwhile, so

Reviewed-by: Dave Gordon <david.s.gordon@intel.com>

and then maybe I'll rework the default and/or START capture on top of 
this later.

.Dave.

* Re: [PATCH v2 2/3] drm/i915: Allow userspace to request no-error-capture upon GPU hangs
  2015-12-11 22:59 ` [PATCH v2 2/3] drm/i915: Allow userspace to request no-error-capture upon GPU hangs Chris Wilson
@ 2015-12-15 16:59   ` Dave Gordon
  0 siblings, 0 replies; 10+ messages in thread
From: Dave Gordon @ 2015-12-15 16:59 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On 11/12/15 22:59, Chris Wilson wrote:
> igt likes to inject GPU hangs into its command streams. However, as we
> expect these hangs, we don't actually want them recorded in the dmesg
> output or stored in the i915_error_state (usually). To accommodate this,
> allow userspace to set a flag on the context so that any hang emanating
> from that context will not be recorded. We still do the error capture
> (otherwise how do we find the guilty context and know its intent?), since
> part of the reason for random GPU hang injection is to exercise the race
> conditions between the error capture and normal execution.
>
> v2: Split out the request->ringbuf error capture changes.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.h         |  8 ++++++--
>   drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++
>   drivers/gpu/drm/i915/i915_gpu_error.c   | 13 ++++++++-----
>   include/uapi/drm/i915_drm.h             |  1 +
>   4 files changed, 28 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b33091c2c39e..c511b3cbf9b2 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -493,6 +493,7 @@ struct drm_i915_error_state {
>   	struct timeval time;
>
>   	char error_msg[128];
> +	bool simulated;
>   	int iommu;
>   	u32 reset_count;
>   	u32 suspend_count;
> @@ -845,7 +846,9 @@ struct i915_ctx_hang_stats {
>   /* This must match up with the value previously used for execbuf2.rsvd1. */
>   #define DEFAULT_CONTEXT_HANDLE 0

The #defines below need a comment, at the very least:

/* Bits in struct intel_context::flags below */

otherwise we can't tell which field they apply to.

With that fixed, then

Reviewed-by: Dave Gordon <david.s.gordon@intel.com>

> -#define CONTEXT_NO_ZEROMAP (1<<0)
> +#define CONTEXT_NO_ZEROMAP		(1<<0)
> +#define CONTEXT_NO_ERROR_CAPTURE	(1<<1)
> +
>   /**
>    * struct intel_context - as the name implies, represents a context.
>    * @ref: reference count.
> @@ -870,11 +873,12 @@ struct intel_context {
>   	int user_handle;
>   	uint8_t remap_slice;
>   	struct drm_i915_private *i915;
> -	int flags;
>   	struct drm_i915_file_private *file_priv;
>   	struct i915_ctx_hang_stats hang_stats;
>   	struct i915_hw_ppgtt *ppgtt;
>
> +	unsigned flags;
> +
>   	/* Legacy ring buffer submission */
>   	struct {
>   		struct drm_i915_gem_object *rcs_state;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 900ffd044db8..d9998ab9d94d 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -938,6 +938,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>   		else
>   			args->value = to_i915(dev)->gtt.base.total;
>   		break;
> +	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
> +		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> @@ -983,6 +986,16 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>   			ctx->flags |= args->value ? CONTEXT_NO_ZEROMAP : 0;
>   		}
>   		break;
> +	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
> +		if (args->size) {
> +			ret = -EINVAL;
> +		} else {
> +			if (args->value)
> +				ctx->flags |= CONTEXT_NO_ERROR_CAPTURE;
> +			else
> +				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
> +		}
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 6eefe9c36931..6db6d7e02aea 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1040,6 +1040,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
>   				rcu_read_unlock();
>   			}
>
> +			error->simulated |= request->ctx->flags & CONTEXT_NO_ERROR_CAPTURE;
>   			rbuf = request->ringbuf;
>   		}
>
> @@ -1336,12 +1337,14 @@ void i915_capture_error_state(struct drm_device *dev, bool wedged,
>   	i915_error_capture_msg(dev, error, wedged, error_msg);
>   	DRM_INFO("%s\n", error->error_msg);
>
> -	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> -	if (dev_priv->gpu_error.first_error == NULL) {
> -		dev_priv->gpu_error.first_error = error;
> -		error = NULL;
> +	if (!error->simulated) {
> +		spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> +		if (dev_priv->gpu_error.first_error == NULL) {
> +			dev_priv->gpu_error.first_error = error;
> +			error = NULL;
> +		}
> +		spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
>   	}
> -	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
>
>   	if (error) {
>   		i915_error_state_free(&error->ref);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index acf21026c78a..7fee4416dcc7 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1140,6 +1140,7 @@ struct drm_i915_gem_context_param {
>   #define I915_CONTEXT_PARAM_BAN_PERIOD	0x1
>   #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
>   #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
> +#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
>   	__u64 value;
>   };
>
>

