[PATCH] drm/i915: Decouple GPU error reporting from ring initialisation

public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed

* [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation
@ 2014-01-23 21:49 Chris Wilson
  2014-01-24 11:50 ` Ville Syrjälä
  0 siblings, 1 reply; 7+ messages in thread
From: Chris Wilson @ 2014-01-23 21:49 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

Currently we report through our error state only the rings that have
been initialised (as detected by ring->obj). This check is done after
the GPU reset and ring re-initialisation, which means that the software
state may not be the same as when we captured the hardware error and we
may not print out any of the vital information for debugging the hang.

This (and the implied object leak) is a regression from

commit 3d57e5bd1284f44e325f3a52d966259ed42f9e05
Author: Ben Widawsky <ben@bwidawsk.net>
Date:   Mon Oct 14 10:01:36 2013 -0700

    drm/i915: Do a fuller init after reset

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h       |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c45cbbecd66a..64a1aca7804d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -334,6 +334,7 @@ struct drm_i915_error_state {
 	struct timeval time;
 
 	struct drm_i915_error_ring {
+		int valid;
 		struct drm_i915_error_object {
 			int page_count;
 			u32 gtt_offset;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 260a215e3619..e2af1d490f8d 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -240,6 +240,9 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
 				  unsigned ring)
 {
 	BUG_ON(ring >= I915_NUM_RINGS); /* shut up confused gcc */
+	if (!error->ring[ring].valid)
+		return;
+
 	err_printf(m, "%s command stream:\n", ring_str(ring));
 	err_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
 	err_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
@@ -294,7 +297,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	struct drm_device *dev = error_priv->dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error = error_priv->error;
-	struct intel_ring_buffer *ring;
 	int i, j, page, offset, elt;
 
 	if (!error) {
@@ -329,7 +331,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	if (INTEL_INFO(dev)->gen == 7)
 		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
 
-	for_each_ring(ring, dev_priv, i)
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++)
 		i915_ring_error_state(m, dev, error, i);
 
 	for (i = 0; i < error->vm_count; i++) {
@@ -388,8 +390,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			}
 		}
 
-		obj = error->ring[i].ctx;
-		if (obj) {
+		if ((obj = error->ring[i].ctx)) {
 			err_printf(m, "%s --- HW Context = 0x%08x\n",
 				   dev_priv->ring[i].name,
 				   obj->gtt_offset);
@@ -826,11 +827,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
 				  struct drm_i915_error_state *error)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct intel_ring_buffer *ring;
 	struct drm_i915_gem_request *request;
 	int i, count;
 
-	for_each_ring(ring, dev_priv, i) {
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		struct intel_ring_buffer *ring = &dev_priv->ring[i];
+
+		if (ring->dev == NULL)
+			continue;
+
+		error->ring[i].valid = true;
+
 		i915_record_ring_state(dev, error, ring);
 
 		error->ring[i].batchbuffer =
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation
  2014-01-23 21:49 [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation Chris Wilson
@ 2014-01-24 11:50 ` Ville Syrjälä
  2014-01-24 11:55   ` Chris Wilson
  0 siblings, 1 reply; 7+ messages in thread
From: Ville Syrjälä @ 2014-01-24 11:50 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, Ben Widawsky

On Thu, Jan 23, 2014 at 09:49:43PM +0000, Chris Wilson wrote:
> Currently we report through our error state only the rings that have
> been initialised (as detected by ring->obj). This check is done after
> the GPU reset and ring re-initialisation, which means that the software
> state may not be the same as when we captured the hardware error and we
> may not print out any of the vital information for debugging the hang.
> 
> This (and the implied object leak) is a regression from
> 
> commit 3d57e5bd1284f44e325f3a52d966259ed42f9e05
> Author: Ben Widawsky <ben@bwidawsk.net>
> Date:   Mon Oct 14 10:01:36 2013 -0700
> 
>     drm/i915: Do a fuller init after reset
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  1 +
>  drivers/gpu/drm/i915/i915_gpu_error.c | 19 +++++++++++++------
>  2 files changed, 14 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c45cbbecd66a..64a1aca7804d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -334,6 +334,7 @@ struct drm_i915_error_state {
>  	struct timeval time;
>  
>  	struct drm_i915_error_ring {
> +		int valid;

bool

>  		struct drm_i915_error_object {
>  			int page_count;
>  			u32 gtt_offset;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 260a215e3619..e2af1d490f8d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -240,6 +240,9 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
>  				  unsigned ring)
>  {
>  	BUG_ON(ring >= I915_NUM_RINGS); /* shut up confused gcc */
> +	if (!error->ring[ring].valid)
> +		return;
> +
>  	err_printf(m, "%s command stream:\n", ring_str(ring));
>  	err_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
>  	err_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
> @@ -294,7 +297,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	struct drm_device *dev = error_priv->dev;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
>  	struct drm_i915_error_state *error = error_priv->error;
> -	struct intel_ring_buffer *ring;
>  	int i, j, page, offset, elt;
>  
>  	if (!error) {
> @@ -329,7 +331,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	if (INTEL_INFO(dev)->gen == 7)
>  		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
>  
> -	for_each_ring(ring, dev_priv, i)
> +	for (i = 0; i < ARRAY_SIZE(error->ring); i++)
>  		i915_ring_error_state(m, dev, error, i);
>  
>  	for (i = 0; i < error->vm_count; i++) {
> @@ -388,8 +390,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  			}
>  		}
>  
> -		obj = error->ring[i].ctx;
> -		if (obj) {
> +		if ((obj = error->ring[i].ctx)) {

Unrelated change. Although it does make this more consistent w/ the
surrounding code. But I admit to not being a fan of assignments inside
if statements.

>  			err_printf(m, "%s --- HW Context = 0x%08x\n",
>  				   dev_priv->ring[i].name,
>  				   obj->gtt_offset);
> @@ -826,11 +827,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
>  				  struct drm_i915_error_state *error)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> -	struct intel_ring_buffer *ring;
>  	struct drm_i915_gem_request *request;
>  	int i, count;
>  
> -	for_each_ring(ring, dev_priv, i) {
> +	for (i = 0; i < I915_NUM_RINGS; i++) {
> +		struct intel_ring_buffer *ring = &dev_priv->ring[i];
> +
> +		if (ring->dev == NULL)
> +			continue;
> +
> +		error->ring[i].valid = true;
> +

The code here runs before the reset, and it would actually oops if
ring->obj==NULL, so using for_each_ring() here looks appropriate.

>  		i915_record_ring_state(dev, error, ring);
>  
>  		error->ring[i].batchbuffer =
> -- 
> 1.8.5.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation
  2014-01-24 11:50 ` Ville Syrjälä
@ 2014-01-24 11:55   ` Chris Wilson
  2014-01-24 12:06     ` Ville Syrjälä
  0 siblings, 1 reply; 7+ messages in thread
From: Chris Wilson @ 2014-01-24 11:55 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: intel-gfx, Ben Widawsky

On Fri, Jan 24, 2014 at 01:50:25PM +0200, Ville Syrjälä wrote:
> On Thu, Jan 23, 2014 at 09:49:43PM +0000, Chris Wilson wrote:
> > Currently we report through our error state only the rings that have
> > been initialised (as detected by ring->obj). This check is done after
> > the GPU reset and ring re-initialisation, which means that the software
> > state may not be the same as when we captured the hardware error and we
> > may not print out any of the vital information for debugging the hang.
> > 
> > This (and the implied object leak) is a regression from
> > 
> > commit 3d57e5bd1284f44e325f3a52d966259ed42f9e05
> > Author: Ben Widawsky <ben@bwidawsk.net>
> > Date:   Mon Oct 14 10:01:36 2013 -0700
> > 
> >     drm/i915: Do a fuller init after reset
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Ben Widawsky <ben@bwidawsk.net>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h       |  1 +
> >  drivers/gpu/drm/i915/i915_gpu_error.c | 19 +++++++++++++------
> >  2 files changed, 14 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index c45cbbecd66a..64a1aca7804d 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -334,6 +334,7 @@ struct drm_i915_error_state {
> >  	struct timeval time;
> >  
> >  	struct drm_i915_error_ring {
> > +		int valid;
> 
> bool

in a struct? I tend to think it leads to laziness not to coalesce them
into bitfields.

> > -		obj = error->ring[i].ctx;
> > -		if (obj) {
> > +		if ((obj = error->ring[i].ctx)) {
> 
> Unrelated change. Although it does make this more consistent w/ the
> surrounding code. But I admit to not being a fan of assignments inside
> if statements.

The inconsistency was uglier.

> >  			err_printf(m, "%s --- HW Context = 0x%08x\n",
> >  				   dev_priv->ring[i].name,
> >  				   obj->gtt_offset);
> > @@ -826,11 +827,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
> >  				  struct drm_i915_error_state *error)
> >  {
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> > -	struct intel_ring_buffer *ring;
> >  	struct drm_i915_gem_request *request;
> >  	int i, count;
> >  
> > -	for_each_ring(ring, dev_priv, i) {
> > +	for (i = 0; i < I915_NUM_RINGS; i++) {
> > +		struct intel_ring_buffer *ring = &dev_priv->ring[i];
> > +
> > +		if (ring->dev == NULL)
> > +			continue;
> > +
> > +		error->ring[i].valid = true;
> > +
> 
> The code here runs before the reset, and it would actually oops if
> ring->obj==NULL, so using for_each_ring() here looks appropriate.

No, we need to record that ring->obj is NULL, especially if the ring
registers are still set...
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation
  2014-01-24 11:55   ` Chris Wilson
@ 2014-01-24 12:06     ` Ville Syrjälä
  2014-01-27 13:52       ` Chris Wilson
  0 siblings, 1 reply; 7+ messages in thread
From: Ville Syrjälä @ 2014-01-24 12:06 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, Ben Widawsky

On Fri, Jan 24, 2014 at 11:55:21AM +0000, Chris Wilson wrote:
> On Fri, Jan 24, 2014 at 01:50:25PM +0200, Ville Syrjälä wrote:
> > On Thu, Jan 23, 2014 at 09:49:43PM +0000, Chris Wilson wrote:
> > > Currently we report through our error state only the rings that have
> > > been initialised (as detected by ring->obj). This check is done after
> > > the GPU reset and ring re-initialisation, which means that the software
> > > state may not be the same as when we captured the hardware error and we
> > > may not print out any of the vital information for debugging the hang.
> > > 
> > > This (and the implied object leak) is a regression from
> > > 
> > > commit 3d57e5bd1284f44e325f3a52d966259ed42f9e05
> > > Author: Ben Widawsky <ben@bwidawsk.net>
> > > Date:   Mon Oct 14 10:01:36 2013 -0700
> > > 
> > >     drm/i915: Do a fuller init after reset
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Ben Widawsky <ben@bwidawsk.net>
> > > ---
> > >  drivers/gpu/drm/i915/i915_drv.h       |  1 +
> > >  drivers/gpu/drm/i915/i915_gpu_error.c | 19 +++++++++++++------
> > >  2 files changed, 14 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > > index c45cbbecd66a..64a1aca7804d 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > @@ -334,6 +334,7 @@ struct drm_i915_error_state {
> > >  	struct timeval time;
> > >  
> > >  	struct drm_i915_error_ring {
> > > +		int valid;
> > 
> > bool
> 
> in a struct? I tend to think it leads to laziness not to coalesce them
> into bitfields.

bool valid:1; then ;)

> 
> > > -		obj = error->ring[i].ctx;
> > > -		if (obj) {
> > > +		if ((obj = error->ring[i].ctx)) {
> > 
> > Unrelated change. Although it does make this more consistent w/ the
> > > surrounding code. But I admit to not being a fan of assignments inside
> > if statements.
> 
> The inconsistency was uglier.
> 
> > >  			err_printf(m, "%s --- HW Context = 0x%08x\n",
> > >  				   dev_priv->ring[i].name,
> > >  				   obj->gtt_offset);
> > > @@ -826,11 +827,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
> > >  				  struct drm_i915_error_state *error)
> > >  {
> > >  	struct drm_i915_private *dev_priv = dev->dev_private;
> > > -	struct intel_ring_buffer *ring;
> > >  	struct drm_i915_gem_request *request;
> > >  	int i, count;
> > >  
> > > -	for_each_ring(ring, dev_priv, i) {
> > > +	for (i = 0; i < I915_NUM_RINGS; i++) {
> > > +		struct intel_ring_buffer *ring = &dev_priv->ring[i];
> > > +
> > > +		if (ring->dev == NULL)
> > > +			continue;
> > > +
> > > +		error->ring[i].valid = true;
> > > +
> > 
> > The code here runs before the reset, and it would actually oops if
> > ring->obj==NULL, so using for_each_ring() here looks appropriate.
> 
> No, we need to record that ring->obj is NULL, especially if the ring
> registers are still set...

OK so we just need to actually fix the scratch.obj==NULL case, and then
I guess it's fine.

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation
  2014-01-24 12:06     ` Ville Syrjälä
@ 2014-01-27 13:52       ` Chris Wilson
  2014-01-27 14:05         ` Ville Syrjälä
  0 siblings, 1 reply; 7+ messages in thread
From: Chris Wilson @ 2014-01-27 13:52 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson, Ben Widawsky, Ville Syrjälä, stable

Currently we report through our error state only the rings that have
been initialised (as detected by ring->obj). This check is done after
the GPU reset and ring re-initialisation, which means that the software
state may not be the same as when we captured the hardware error and we
may not print out any of the vital information for debugging the hang.

This (and the implied object leak) is a regression from

commit 3d57e5bd1284f44e325f3a52d966259ed42f9e05
Author: Ben Widawsky <ben@bwidawsk.net>
Date:   Mon Oct 14 10:01:36 2013 -0700

    drm/i915: Do a fuller init after reset

Note that we are already starting to get bug reports with incomplete
error states from 3.13.

v2: Prevent a NULL dereference on 830gm/845g after a GPU reset where
    the scratch obj may be NULL.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Widawsky <ben@bwidawsk.net>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
References: https://bugs.freedesktop.org/show_bug.cgi?id=74094
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/i915/i915_drv.h       |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 22 +++++++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2e6c67d944eb..0249c9aa345a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -335,6 +335,7 @@ struct drm_i915_error_state {
 	struct timeval time;
 
 	struct drm_i915_error_ring {
+		bool valid;
 		struct drm_i915_error_object {
 			int page_count;
 			u32 gtt_offset;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 6832473bc386..96e945c3d44f 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -240,6 +240,9 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
 				  unsigned ring)
 {
 	BUG_ON(ring >= I915_NUM_RINGS); /* shut up confused gcc */
+	if (!error->ring[ring].valid)
+		return;
+
 	err_printf(m, "%s command stream:\n", ring_str(ring));
 	err_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
 	err_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
@@ -295,7 +298,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	struct drm_device *dev = error_priv->dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error = error_priv->error;
-	struct intel_ring_buffer *ring;
 	int i, j, page, offset, elt;
 
 	if (!error) {
@@ -330,7 +332,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	if (INTEL_INFO(dev)->gen == 7)
 		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
 
-	for_each_ring(ring, dev_priv, i)
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++)
 		i915_ring_error_state(m, dev, error, i);
 
 	for (i = 0; i < error->vm_count; i++) {
@@ -405,8 +407,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			}
 		}
 
-		obj = error->ring[i].ctx;
-		if (obj) {
+		if ((obj = error->ring[i].ctx)) {
 			err_printf(m, "%s --- HW Context = 0x%08x\n",
 				   dev_priv->ring[i].name,
 				   obj->gtt_offset);
@@ -730,7 +731,8 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 			return NULL;
 
 		obj = ring->scratch.obj;
-		if (acthd >= i915_gem_obj_ggtt_offset(obj) &&
+		if (obj != NULL &&
+		    acthd >= i915_gem_obj_ggtt_offset(obj) &&
 		    acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
 			return i915_error_ggtt_object_create(dev_priv, obj);
 	}
@@ -875,11 +877,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
 				  struct drm_i915_error_state *error)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct intel_ring_buffer *ring;
 	struct drm_i915_gem_request *request;
 	int i, count;
 
-	for_each_ring(ring, dev_priv, i) {
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		struct intel_ring_buffer *ring = &dev_priv->ring[i];
+
+		if (ring->dev == NULL)
+			continue;
+
+		error->ring[i].valid = true;
+
 		i915_record_ring_state(dev, error, ring);
 
 		error->ring[i].batchbuffer =
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation
  2014-01-27 13:52       ` Chris Wilson
@ 2014-01-27 14:05         ` Ville Syrjälä
  2014-01-27 16:13           ` [Intel-gfx] " Daniel Vetter
  0 siblings, 1 reply; 7+ messages in thread
From: Ville Syrjälä @ 2014-01-27 14:05 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, Ben Widawsky, stable

On Mon, Jan 27, 2014 at 01:52:34PM +0000, Chris Wilson wrote:
> Currently we report through our error state only the rings that have
> been initialised (as detected by ring->obj). This check is done after
> the GPU reset and ring re-initialisation, which means that the software
> state may not be the same as when we captured the hardware error and we
> may not print out any of the vital information for debugging the hang.
> 
> This (and the implied object leak) is a regression from
> 
> commit 3d57e5bd1284f44e325f3a52d966259ed42f9e05
> Author: Ben Widawsky <ben@bwidawsk.net>
> Date:   Mon Oct 14 10:01:36 2013 -0700
> 
>     drm/i915: Do a fuller init after reset
> 
> Note that we are already starting to get bug reports with incomplete
> error states from 3.13.
> 
> v2: Prevent a NULL dereference on 830gm/845g after a GPU reset where
>     the scratch obj may be NULL.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Ben Widawsky <ben@bwidawsk.net>
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=74094
> Cc: stable@vger.kernel.org

Looks OK to me.

Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  1 +
>  drivers/gpu/drm/i915/i915_gpu_error.c | 22 +++++++++++++++-------
>  2 files changed, 16 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2e6c67d944eb..0249c9aa345a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -335,6 +335,7 @@ struct drm_i915_error_state {
>  	struct timeval time;
>  
>  	struct drm_i915_error_ring {
> +		bool valid;
>  		struct drm_i915_error_object {
>  			int page_count;
>  			u32 gtt_offset;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 6832473bc386..96e945c3d44f 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -240,6 +240,9 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
>  				  unsigned ring)
>  {
>  	BUG_ON(ring >= I915_NUM_RINGS); /* shut up confused gcc */
> +	if (!error->ring[ring].valid)
> +		return;
> +
>  	err_printf(m, "%s command stream:\n", ring_str(ring));
>  	err_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
>  	err_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
> @@ -295,7 +298,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	struct drm_device *dev = error_priv->dev;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
>  	struct drm_i915_error_state *error = error_priv->error;
> -	struct intel_ring_buffer *ring;
>  	int i, j, page, offset, elt;
>  
>  	if (!error) {
> @@ -330,7 +332,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	if (INTEL_INFO(dev)->gen == 7)
>  		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
>  
> -	for_each_ring(ring, dev_priv, i)
> +	for (i = 0; i < ARRAY_SIZE(error->ring); i++)
>  		i915_ring_error_state(m, dev, error, i);
>  
>  	for (i = 0; i < error->vm_count; i++) {
> @@ -405,8 +407,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  			}
>  		}
>  
> -		obj = error->ring[i].ctx;
> -		if (obj) {
> +		if ((obj = error->ring[i].ctx)) {
>  			err_printf(m, "%s --- HW Context = 0x%08x\n",
>  				   dev_priv->ring[i].name,
>  				   obj->gtt_offset);
> @@ -730,7 +731,8 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			return NULL;
>  
>  		obj = ring->scratch.obj;
> -		if (acthd >= i915_gem_obj_ggtt_offset(obj) &&
> +		if (obj != NULL &&
> +		    acthd >= i915_gem_obj_ggtt_offset(obj) &&
>  		    acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
>  			return i915_error_ggtt_object_create(dev_priv, obj);
>  	}
> @@ -875,11 +877,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
>  				  struct drm_i915_error_state *error)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> -	struct intel_ring_buffer *ring;
>  	struct drm_i915_gem_request *request;
>  	int i, count;
>  
> -	for_each_ring(ring, dev_priv, i) {
> +	for (i = 0; i < I915_NUM_RINGS; i++) {
> +		struct intel_ring_buffer *ring = &dev_priv->ring[i];
> +
> +		if (ring->dev == NULL)
> +			continue;
> +
> +		error->ring[i].valid = true;
> +
>  		i915_record_ring_state(dev, error, ring);
>  
>  		error->ring[i].batchbuffer =
> -- 
> 1.8.5.3

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation
  2014-01-27 14:05         ` Ville Syrjälä
@ 2014-01-27 16:13           ` Daniel Vetter
  0 siblings, 0 replies; 7+ messages in thread
From: Daniel Vetter @ 2014-01-27 16:13 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: Chris Wilson, intel-gfx, Ben Widawsky, stable

On Mon, Jan 27, 2014 at 04:05:24PM +0200, Ville Syrjälä wrote:
> On Mon, Jan 27, 2014 at 01:52:34PM +0000, Chris Wilson wrote:
> > Currently we report through our error state only the rings that have
> > been initialised (as detected by ring->obj). This check is done after
> > the GPU reset and ring re-initialisation, which means that the software
> > state may not be the same as when we captured the hardware error and we
> > may not print out any of the vital information for debugging the hang.
> > 
> > This (and the implied object leak) is a regression from
> > 
> > commit 3d57e5bd1284f44e325f3a52d966259ed42f9e05
> > Author: Ben Widawsky <ben@bwidawsk.net>
> > Date:   Mon Oct 14 10:01:36 2013 -0700
> > 
> >     drm/i915: Do a fuller init after reset
> > 
> > Note that we are already starting to get bug reports with incomplete
> > error states from 3.13.
> > 
> > v2: Prevent a NULL dereference on 830gm/845g after a GPU reset where
> >     the scratch obj may be NULL.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Ben Widawsky <ben@bwidawsk.net>
> > Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=74094
> > Cc: stable@vger.kernel.org
> 
> Looks OK to me.
> 
> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>

Picked up for -fixes, thanks for the patch. I've also added a tag for the
stable team to fasttrack this one since it's a developer feature.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2014-01-27 16:13 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-23 21:49 [PATCH] drm/i915: Decouple GPU error reporting from ring initialisation Chris Wilson
2014-01-24 11:50 ` Ville Syrjälä
2014-01-24 11:55   ` Chris Wilson
2014-01-24 12:06     ` Ville Syrjälä
2014-01-27 13:52       ` Chris Wilson
2014-01-27 14:05         ` Ville Syrjälä
2014-01-27 16:13           ` [Intel-gfx] " Daniel Vetter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox