intel-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Allow userspace to request no-error-capture upon GPU hangs
@ 2015-12-11 22:18 Chris Wilson
  2015-12-16  8:54 ` Daniel Vetter
  0 siblings, 1 reply; 5+ messages in thread
From: Chris Wilson @ 2015-12-11 22:18 UTC (permalink / raw)
  To: intel-gfx

igt likes to inject GPU hangs into its command streams. However, as we
expect these hangs, we don't actually want them recorded in the dmesg
output or stored in the i915_error_state (usually). To accommodate this
allow userspace to set a flag on the context that any hang emanating
from that context will not be recorded. We still do the error capture
(otherwise how do we find the guilty context and know its intent?) as
part of the reason for random GPU hang injection is to exercise the race
conditions between the error capture and normal execution.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h         |  8 ++++++--
 drivers/gpu/drm/i915/i915_gem_context.c | 13 ++++++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c   | 36 ++++++++++++++-------------------
 include/uapi/drm/i915_drm.h             |  1 +
 4 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b33091c2c39e..c511b3cbf9b2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -493,6 +493,7 @@ struct drm_i915_error_state {
 	struct timeval time;
 
 	char error_msg[128];
+	bool simulated;
 	int iommu;
 	u32 reset_count;
 	u32 suspend_count;
@@ -845,7 +846,9 @@ struct i915_ctx_hang_stats {
 /* This must match up with the value previously used for execbuf2.rsvd1. */
 #define DEFAULT_CONTEXT_HANDLE 0
 
-#define CONTEXT_NO_ZEROMAP (1<<0)
+#define CONTEXT_NO_ZEROMAP		(1<<0)
+#define CONTEXT_NO_ERROR_CAPTURE	(1<<1)
+
 /**
  * struct intel_context - as the name implies, represents a context.
  * @ref: reference count.
@@ -870,11 +873,12 @@ struct intel_context {
 	int user_handle;
 	uint8_t remap_slice;
 	struct drm_i915_private *i915;
-	int flags;
 	struct drm_i915_file_private *file_priv;
 	struct i915_ctx_hang_stats hang_stats;
 	struct i915_hw_ppgtt *ppgtt;
 
+	unsigned flags;
+
 	/* Legacy ring buffer submission */
 	struct {
 		struct drm_i915_gem_object *rcs_state;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 900ffd044db8..d9998ab9d94d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -938,6 +938,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		else
 			args->value = to_i915(dev)->gtt.base.total;
 		break;
+	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
+		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -983,6 +986,16 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 			ctx->flags |= args->value ? CONTEXT_NO_ZEROMAP : 0;
 		}
 		break;
+	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
+		if (args->size) {
+			ret = -EINVAL;
+		} else {
+			if (args->value)
+				ctx->flags |= CONTEXT_NO_ERROR_CAPTURE;
+			else
+				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
+		}
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 3e137fc701cf..cb0d6f347d42 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -995,7 +995,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
 
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		struct intel_engine_cs *ring = &dev_priv->ring[i];
-		struct intel_ringbuffer *rbuf;
+		struct intel_ringbuffer *rbuf = NULL;
 
 		error->ring[i].pid = -1;
 
@@ -1039,23 +1039,15 @@ static void i915_gem_record_rings(struct drm_device *dev,
 				}
 				rcu_read_unlock();
 			}
-		}
 
-		if (i915.enable_execlists) {
-			/* TODO: This is only a small fix to keep basic error
-			 * capture working, but we need to add more information
-			 * for it to be useful (e.g. dump the context being
-			 * executed).
-			 */
-			if (request)
-				rbuf = request->ctx->engine[ring->id].ringbuf;
-			else
-				rbuf = ring->default_context->engine[ring->id].ringbuf;
-		} else
-			rbuf = ring->buffer;
+			error->simulated |= request->ctx->flags & CONTEXT_NO_ERROR_CAPTURE;
+			rbuf = request->ringbuf;
+		}
 
-		error->ring[i].cpu_ring_head = rbuf->head;
-		error->ring[i].cpu_ring_tail = rbuf->tail;
+		if (rbuf) {
+			error->ring[i].cpu_ring_head = rbuf->head;
+			error->ring[i].cpu_ring_tail = rbuf->tail;
+		}
 
 		error->ring[i].ringbuffer =
 			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
@@ -1345,12 +1337,14 @@ void i915_capture_error_state(struct drm_device *dev, bool wedged,
 	i915_error_capture_msg(dev, error, wedged, error_msg);
 	DRM_INFO("%s\n", error->error_msg);
 
-	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
-	if (dev_priv->gpu_error.first_error == NULL) {
-		dev_priv->gpu_error.first_error = error;
-		error = NULL;
+	if (!error->simulated) {
+		spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
+		if (dev_priv->gpu_error.first_error == NULL) {
+			dev_priv->gpu_error.first_error = error;
+			error = NULL;
+		}
+		spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
 	}
-	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
 
 	if (error) {
 		i915_error_state_free(&error->ref);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index acf21026c78a..7fee4416dcc7 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1140,6 +1140,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_BAN_PERIOD	0x1
 #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
+#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
 	__u64 value;
 };
 
-- 
2.6.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/i915: Allow userspace to request no-error-capture upon GPU hangs
  2015-12-11 22:18 [PATCH] drm/i915: Allow userspace to request no-error-capture upon GPU hangs Chris Wilson
@ 2015-12-16  8:54 ` Daniel Vetter
  2015-12-16 10:00   ` Chris Wilson
  0 siblings, 1 reply; 5+ messages in thread
From: Daniel Vetter @ 2015-12-16  8:54 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Fri, Dec 11, 2015 at 10:18:35PM +0000, Chris Wilson wrote:
> igt likes to inject GPU hangs into its command streams. However, as we
> expect these hangs, we don't actually want them recorded in the dmesg
> output or stored in the i915_error_state (usually). To accommodate this
> allow userspace to set a flag on the context that any hang emanating
> from that context will not be recorded. We still do the error capture
> (otherwise how do we find the guilty context and know its intent?) as
> part of the reason for random GPU hang injection is to exercise the race
> conditions between the error capture and normal execution.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Hm, I do like that we exercise the full paths all the time, increasing
chances for fireworks. What's the motivation here? Is there some
substantial speed-up?
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  8 ++++++--
>  drivers/gpu/drm/i915/i915_gem_context.c | 13 ++++++++++++
>  drivers/gpu/drm/i915/i915_gpu_error.c   | 36 ++++++++++++++-------------------
>  include/uapi/drm/i915_drm.h             |  1 +
>  4 files changed, 35 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b33091c2c39e..c511b3cbf9b2 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -493,6 +493,7 @@ struct drm_i915_error_state {
>  	struct timeval time;
>  
>  	char error_msg[128];
> +	bool simulated;
>  	int iommu;
>  	u32 reset_count;
>  	u32 suspend_count;
> @@ -845,7 +846,9 @@ struct i915_ctx_hang_stats {
>  /* This must match up with the value previously used for execbuf2.rsvd1. */
>  #define DEFAULT_CONTEXT_HANDLE 0
>  
> -#define CONTEXT_NO_ZEROMAP (1<<0)
> +#define CONTEXT_NO_ZEROMAP		(1<<0)
> +#define CONTEXT_NO_ERROR_CAPTURE	(1<<1)
> +
>  /**
>   * struct intel_context - as the name implies, represents a context.
>   * @ref: reference count.
> @@ -870,11 +873,12 @@ struct intel_context {
>  	int user_handle;
>  	uint8_t remap_slice;
>  	struct drm_i915_private *i915;
> -	int flags;
>  	struct drm_i915_file_private *file_priv;
>  	struct i915_ctx_hang_stats hang_stats;
>  	struct i915_hw_ppgtt *ppgtt;
>  
> +	unsigned flags;
> +
>  	/* Legacy ring buffer submission */
>  	struct {
>  		struct drm_i915_gem_object *rcs_state;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 900ffd044db8..d9998ab9d94d 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -938,6 +938,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  		else
>  			args->value = to_i915(dev)->gtt.base.total;
>  		break;
> +	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
> +		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -983,6 +986,16 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  			ctx->flags |= args->value ? CONTEXT_NO_ZEROMAP : 0;
>  		}
>  		break;
> +	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
> +		if (args->size) {
> +			ret = -EINVAL;
> +		} else {
> +			if (args->value)
> +				ctx->flags |= CONTEXT_NO_ERROR_CAPTURE;
> +			else
> +				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
> +		}
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 3e137fc701cf..cb0d6f347d42 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -995,7 +995,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
>  
>  	for (i = 0; i < I915_NUM_RINGS; i++) {
>  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> -		struct intel_ringbuffer *rbuf;
> +		struct intel_ringbuffer *rbuf = NULL;
>  
>  		error->ring[i].pid = -1;
>  
> @@ -1039,23 +1039,15 @@ static void i915_gem_record_rings(struct drm_device *dev,
>  				}
>  				rcu_read_unlock();
>  			}
> -		}
>  
> -		if (i915.enable_execlists) {
> -			/* TODO: This is only a small fix to keep basic error
> -			 * capture working, but we need to add more information
> -			 * for it to be useful (e.g. dump the context being
> -			 * executed).
> -			 */
> -			if (request)
> -				rbuf = request->ctx->engine[ring->id].ringbuf;
> -			else
> -				rbuf = ring->default_context->engine[ring->id].ringbuf;
> -		} else
> -			rbuf = ring->buffer;
> +			error->simulated |= request->ctx->flags & CONTEXT_NO_ERROR_CAPTURE;
> +			rbuf = request->ringbuf;
> +		}
>  
> -		error->ring[i].cpu_ring_head = rbuf->head;
> -		error->ring[i].cpu_ring_tail = rbuf->tail;
> +		if (rbuf) {
> +			error->ring[i].cpu_ring_head = rbuf->head;
> +			error->ring[i].cpu_ring_tail = rbuf->tail;
> +		}
>  
>  		error->ring[i].ringbuffer =
>  			i915_error_ggtt_object_create(dev_priv, rbuf->obj);
> @@ -1345,12 +1337,14 @@ void i915_capture_error_state(struct drm_device *dev, bool wedged,
>  	i915_error_capture_msg(dev, error, wedged, error_msg);
>  	DRM_INFO("%s\n", error->error_msg);
>  
> -	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> -	if (dev_priv->gpu_error.first_error == NULL) {
> -		dev_priv->gpu_error.first_error = error;
> -		error = NULL;
> +	if (!error->simulated) {
> +		spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> +		if (dev_priv->gpu_error.first_error == NULL) {
> +			dev_priv->gpu_error.first_error = error;
> +			error = NULL;
> +		}
> +		spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
>  	}
> -	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
>  
>  	if (error) {
>  		i915_error_state_free(&error->ref);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index acf21026c78a..7fee4416dcc7 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1140,6 +1140,7 @@ struct drm_i915_gem_context_param {
>  #define I915_CONTEXT_PARAM_BAN_PERIOD	0x1
>  #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
>  #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
> +#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
>  	__u64 value;
>  };
>  
> -- 
> 2.6.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/i915: Allow userspace to request no-error-capture upon GPU hangs
  2015-12-16  8:54 ` Daniel Vetter
@ 2015-12-16 10:00   ` Chris Wilson
  2015-12-16 10:09     ` Daniel Vetter
  0 siblings, 1 reply; 5+ messages in thread
From: Chris Wilson @ 2015-12-16 10:00 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Wed, Dec 16, 2015 at 09:54:47AM +0100, Daniel Vetter wrote:
> On Fri, Dec 11, 2015 at 10:18:35PM +0000, Chris Wilson wrote:
> > igt likes to inject GPU hangs into its command streams. However, as we
> > expect these hangs, we don't actually want them recorded in the dmesg
> > output or stored in the i915_error_state (usually). To accommodate this
> > allow userspace to set a flag on the context that any hang emanating
> > from that context will not be recorded. We still do the error capture
> > (otherwise how do we find the guilty context and know its intent?) as
> > part of the reason for random GPU hang injection is to exercise the race
> > conditions between the error capture and normal execution.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Hm, I do like that we exercise the full paths all the time, increasing
> chances for fireworks. What's the motivation here? Is there some
> substantial speed-up?

No, since we keep doing the error-capture (we have to, we haven't fixed
the bugs in it yet!), the only benefits are:

(a) Reduce dmesg spam during igt
(b) simulating hangs doesn't leave an error-state around, or rather, we
don't leave the simulated error state and igt doesn't eat a *genuine* hang
that occurred during or before the test.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/i915: Allow userspace to request no-error-capture upon GPU hangs
  2015-12-16 10:00   ` Chris Wilson
@ 2015-12-16 10:09     ` Daniel Vetter
  2015-12-16 17:30       ` Dave Gordon
  0 siblings, 1 reply; 5+ messages in thread
From: Daniel Vetter @ 2015-12-16 10:09 UTC (permalink / raw)
  To: Chris Wilson, Daniel Vetter, intel-gfx

On Wed, Dec 16, 2015 at 10:00:44AM +0000, Chris Wilson wrote:
> On Wed, Dec 16, 2015 at 09:54:47AM +0100, Daniel Vetter wrote:
> > On Fri, Dec 11, 2015 at 10:18:35PM +0000, Chris Wilson wrote:
> > > igt likes to inject GPU hangs into its command streams. However, as we
> > > expect these hangs, we don't actually want them recorded in the dmesg
> > > output or stored in the i915_error_state (usually). To accommodate this
> > > allow userspace to set a flag on the context that any hang emanating
> > > from that context will not be recorded. We still do the error capture
> > > (otherwise how do we find the guilty context and know its intent?) as
> > > part of the reason for random GPU hang injection is to exercise the race
> > > conditions between the error capture and normal execution.
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > Hm, I do like that we exercise the full paths all the time, increasing
> > chances for fireworks. What's the motivation here? Is there some
> > substantial speed-up?
> 
> No, since we keep doing the error-capture (we have to, we haven't fixed
> the bugs in it yet!), the only benefits are:
> 
> (a) Reduce dmesg spam during igt
> (b) simulating hangs doesn't leave an error-state around, or rather, we
> don't leave the simulated error state and igt doesn't eat a *genuine* hang
> that occurred during or before the test.

Oh, I should have waited for the coffee to kick in - I didn't realize that
all that code still runs, and the only thing that changes is whether we'll
store the captured error state in the global slot used by debugfs.

Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/i915: Allow userspace to request no-error-capture upon GPU hangs
  2015-12-16 10:09     ` Daniel Vetter
@ 2015-12-16 17:30       ` Dave Gordon
  0 siblings, 0 replies; 5+ messages in thread
From: Dave Gordon @ 2015-12-16 17:30 UTC (permalink / raw)
  To: intel-gfx

On 16/12/15 10:09, Daniel Vetter wrote:
> On Wed, Dec 16, 2015 at 10:00:44AM +0000, Chris Wilson wrote:
>> On Wed, Dec 16, 2015 at 09:54:47AM +0100, Daniel Vetter wrote:
>>> On Fri, Dec 11, 2015 at 10:18:35PM +0000, Chris Wilson wrote:
>>>> igt likes to inject GPU hangs into its command streams. However, as we
>>>> expect these hangs, we don't actually want them recorded in the dmesg
>>>> output or stored in the i915_error_state (usually). To accommodate this
>>>> allow userspace to set a flag on the context that any hang emanating
>>>> from that context will not be recorded. We still do the error capture
>>>> (otherwise how do we find the guilty context and know its intent?) as
>>>> part of the reason for random GPU hang injection is to exercise the race
>>>> conditions between the error capture and normal execution.
>>>>
>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>
>>> Hm, I do like that we exercise the full paths all the time, increasing
>>> chances for fireworks. What's the motivation here? Is there some
>>> substantial speed-up?
>>
>> No, since we keep doing the error-capture (we have to, we haven't fixed
>> the bugs in it yet!), the only benefits are:
>>
>> (a) Reduce dmesg spam during igt
>> (b) simulating hangs doesn't leave an error-state around, or rather, we
>> don't leave the simulated error state and igt doesn't eat a *genuine* hang
>> that occurred during or before the test.
>
> Oh, I should have waited for the coffee to kick in - I didn't realize that
> all that code still runs, and the only thing that changes is whether we'll
> store the captured error state in the global slot used by debugfs.
>
> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>

Note that this is the first version; it was obsoleted by the one Chris posted
41 minutes later, to which I have already given an R-B, with qualifications:

http://www.spinics.net/lists/intel-gfx/msg83235.html

.Dave.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-12-16 17:30 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-11 22:18 [PATCH] drm/i915: Allow userspace to request no-error-capture upon GPU hangs Chris Wilson
2015-12-16  8:54 ` Daniel Vetter
2015-12-16 10:00   ` Chris Wilson
2015-12-16 10:09     ` Daniel Vetter
2015-12-16 17:30       ` Dave Gordon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).