public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/i915: Tune down output when context is banned
@ 2014-01-17 14:20 Mika Kuoppala
  2014-01-17 14:20 ` [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring Mika Kuoppala
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Mika Kuoppala @ 2014-01-17 14:20 UTC (permalink / raw)
  To: intel-gfx

Demote DRM_ERROR to DRM_INFO when context is banned.

Reference: https://bugs.freedesktop.org/show_bug.cgi?id=73652
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5fcdb14..d270351 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2313,7 +2313,7 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
 		return true;
 
 	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
-		DRM_ERROR("context hanging too fast, declaring banned!\n");
+		DRM_INFO("context hanging too fast, declaring banned!\n");
 		return true;
 	}
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring
  2014-01-17 14:20 [PATCH 1/3] drm/i915: Tune down output when context is banned Mika Kuoppala
@ 2014-01-17 14:20 ` Mika Kuoppala
  2014-01-17 14:50   ` Mika Kuoppala
  2014-01-26 18:49   ` Ben Widawsky
  2014-01-17 14:20 ` [PATCH 3/3] drm/i915: Get rid of acthd based batch search on reset stats Mika Kuoppala
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 12+ messages in thread
From: Mika Kuoppala @ 2014-01-17 14:20 UTC (permalink / raw)
  To: intel-gfx

Instead of going through all the requests to find a batch that
hanged the machine, use hangcheck score and the fact that
first noncompleted request on hanged ring is, with great
probability, the guilty one. This also ensure that we get one
guilty batch per hang instead of possibly more (for each ring)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         |   19 ++++++++++---------
 drivers/gpu/drm/i915/i915_irq.c         |    3 +--
 drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d270351..27a97c3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2322,20 +2322,17 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
 
 static void i915_set_reset_status(struct intel_ring_buffer *ring,
 				  struct drm_i915_gem_request *request,
-				  u32 acthd)
+				  u32 acthd, const bool guilty)
 {
 	struct i915_ctx_hang_stats *hs = NULL;
-	bool inside, guilty;
+	bool inside;
 	unsigned long offset = 0;
 
-	/* Innocent until proven guilty */
-	guilty = false;
-
 	if (request->batch_obj)
 		offset = i915_gem_obj_offset(request->batch_obj,
 					     request_to_vm(request));
 
-	if (ring->hangcheck.action != HANGCHECK_WAIT &&
+	if (guilty &&
 	    i915_request_guilty(request, acthd, &inside)) {
 		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
 			  ring->name,
@@ -2343,8 +2340,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 			  offset,
 			  request->ctx ? request->ctx->id : 0,
 			  acthd);
-
-		guilty = true;
 	}
 
 	/* If contexts are disabled or this is the default context, use
@@ -2383,12 +2378,18 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
 	u32 completed_seqno = ring->get_seqno(ring, false);
 	u32 acthd = intel_ring_get_active_head(ring);
 	struct drm_i915_gem_request *request;
+	bool guilty = false;
 
 	list_for_each_entry(request, &ring->request_list, list) {
 		if (i915_seqno_passed(completed_seqno, request->seqno))
 			continue;
 
-		i915_set_reset_status(ring, request, acthd);
+		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
+			guilty = true;
+			i915_set_reset_status(ring, request, acthd, true);
+		} else {
+			i915_set_reset_status(ring, request, acthd, false);
+		}
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6d11e25..e24f9ef 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2473,7 +2473,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
-#define FIRE 30
 
 	if (!i915_enable_hangcheck)
 		return;
@@ -2557,7 +2556,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
 	}
 
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score > FIRE) {
+		if (ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
 			DRM_INFO("%s on %s\n",
 				 stuck[i] ? "stuck" : "no progress",
 				 ring->name);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 71a73f4..6018793 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
 	HANGCHECK_HUNG,
 };
 
+#define HANGCHECK_SCORE_GUILTY 31
+
 struct intel_ring_hangcheck {
 	bool deadlock;
 	u32 seqno;
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/3] drm/i915: Get rid of acthd based batch search on reset stats
  2014-01-17 14:20 [PATCH 1/3] drm/i915: Tune down output when context is banned Mika Kuoppala
  2014-01-17 14:20 ` [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring Mika Kuoppala
@ 2014-01-17 14:20 ` Mika Kuoppala
  2014-01-26 18:58   ` Ben Widawsky
  2014-01-17 14:27 ` [PATCH 1/3] drm/i915: Tune down output when context is banned Chris Wilson
  2014-01-22 15:41 ` [PATCH v2 1/3] drm/i915: Tune down debug " Mika Kuoppala
  3 siblings, 1 reply; 12+ messages in thread
From: Mika Kuoppala @ 2014-01-17 14:20 UTC (permalink / raw)
  To: intel-gfx

As we seek the guilty one using request ordering and hangcheck
score, these were needed only for debug output.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c |   87 ++-------------------------------------
 1 file changed, 3 insertions(+), 84 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 27a97c3..d796c7f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2241,70 +2241,6 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 	spin_unlock(&file_priv->mm.lock);
 }
 
-static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj,
-				    struct i915_address_space *vm)
-{
-	if (acthd >= i915_gem_obj_offset(obj, vm) &&
-	    acthd < i915_gem_obj_offset(obj, vm) + obj->base.size)
-		return true;
-
-	return false;
-}
-
-static bool i915_head_inside_request(const u32 acthd_unmasked,
-				     const u32 request_start,
-				     const u32 request_end)
-{
-	const u32 acthd = acthd_unmasked & HEAD_ADDR;
-
-	if (request_start < request_end) {
-		if (acthd >= request_start && acthd < request_end)
-			return true;
-	} else if (request_start > request_end) {
-		if (acthd >= request_start || acthd < request_end)
-			return true;
-	}
-
-	return false;
-}
-
-static struct i915_address_space *
-request_to_vm(struct drm_i915_gem_request *request)
-{
-	struct drm_i915_private *dev_priv = request->ring->dev->dev_private;
-	struct i915_address_space *vm;
-
-	if (request->ctx)
-		vm = request->ctx->vm;
-	else
-		vm = &dev_priv->gtt.base;
-
-	return vm;
-}
-
-static bool i915_request_guilty(struct drm_i915_gem_request *request,
-				const u32 acthd, bool *inside)
-{
-	/* There is a possibility that unmasked head address
-	 * pointing inside the ring, matches the batch_obj address range.
-	 * However this is extremely unlikely.
-	 */
-	if (request->batch_obj) {
-		if (i915_head_inside_object(acthd, request->batch_obj,
-					    request_to_vm(request))) {
-			*inside = true;
-			return true;
-		}
-	}
-
-	if (i915_head_inside_request(acthd, request->head, request->tail)) {
-		*inside = false;
-		return true;
-	}
-
-	return false;
-}
-
 static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
 {
 	const unsigned long elapsed = get_seconds() - hs->guilty_ts;
@@ -2322,25 +2258,9 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
 
 static void i915_set_reset_status(struct intel_ring_buffer *ring,
 				  struct drm_i915_gem_request *request,
-				  u32 acthd, const bool guilty)
+				  const bool guilty)
 {
 	struct i915_ctx_hang_stats *hs = NULL;
-	bool inside;
-	unsigned long offset = 0;
-
-	if (request->batch_obj)
-		offset = i915_gem_obj_offset(request->batch_obj,
-					     request_to_vm(request));
-
-	if (guilty &&
-	    i915_request_guilty(request, acthd, &inside)) {
-		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
-			  ring->name,
-			  inside ? "inside" : "flushing",
-			  offset,
-			  request->ctx ? request->ctx->id : 0,
-			  acthd);
-	}
 
 	/* If contexts are disabled or this is the default context, use
 	 * file_priv->reset_state
@@ -2376,7 +2296,6 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
 				       struct intel_ring_buffer *ring)
 {
 	u32 completed_seqno = ring->get_seqno(ring, false);
-	u32 acthd = intel_ring_get_active_head(ring);
 	struct drm_i915_gem_request *request;
 	bool guilty = false;
 
@@ -2386,9 +2305,9 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
 
 		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
 			guilty = true;
-			i915_set_reset_status(ring, request, acthd, true);
+			i915_set_reset_status(ring, request, true);
 		} else {
-			i915_set_reset_status(ring, request, acthd, false);
+			i915_set_reset_status(ring, request, false);
 		}
 	}
 }
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/3] drm/i915: Tune down output when context is banned
  2014-01-17 14:20 [PATCH 1/3] drm/i915: Tune down output when context is banned Mika Kuoppala
  2014-01-17 14:20 ` [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring Mika Kuoppala
  2014-01-17 14:20 ` [PATCH 3/3] drm/i915: Get rid of acthd based batch search on reset stats Mika Kuoppala
@ 2014-01-17 14:27 ` Chris Wilson
  2014-01-22 15:41 ` [PATCH v2 1/3] drm/i915: Tune down debug " Mika Kuoppala
  3 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2014-01-17 14:27 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Fri, Jan 17, 2014 at 04:20:29PM +0200, Mika Kuoppala wrote:
> Demote DRM_ERROR to DRM_INFO when context is banned.

I am happy with this as we have a separate error message for when the
GPU is truly wedged. However, I feel that if the global context is
banned, that does deserve an error, at least a warning. Ok, not
quite so happy. 
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring
  2014-01-17 14:20 ` [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring Mika Kuoppala
@ 2014-01-17 14:50   ` Mika Kuoppala
  2014-01-26 18:49   ` Ben Widawsky
  1 sibling, 0 replies; 12+ messages in thread
From: Mika Kuoppala @ 2014-01-17 14:50 UTC (permalink / raw)
  To: intel-gfx

Mika Kuoppala <mika.kuoppala@linux.intel.com> writes:

> Instead of going through all the requests to find a batch that
> hanged the machine, use hangcheck score and the fact that
> first noncompleted request on hanged ring is, with great
> probability, the guilty one. This also ensure that we get one
> guilty batch per hang instead of possibly more (for each ring)
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>

missing in here.
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |   19 ++++++++++---------
>  drivers/gpu/drm/i915/i915_irq.c         |    3 +--
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
>  3 files changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d270351..27a97c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2322,20 +2322,17 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>  
>  static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  				  struct drm_i915_gem_request *request,
> -				  u32 acthd)
> +				  u32 acthd, const bool guilty)
>  {
>  	struct i915_ctx_hang_stats *hs = NULL;
> -	bool inside, guilty;
> +	bool inside;
>  	unsigned long offset = 0;
>  
> -	/* Innocent until proven guilty */
> -	guilty = false;
> -
>  	if (request->batch_obj)
>  		offset = i915_gem_obj_offset(request->batch_obj,
>  					     request_to_vm(request));
>  
> -	if (ring->hangcheck.action != HANGCHECK_WAIT &&
> +	if (guilty &&
>  	    i915_request_guilty(request, acthd, &inside)) {
>  		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
>  			  ring->name,
> @@ -2343,8 +2340,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  			  offset,
>  			  request->ctx ? request->ctx->id : 0,
>  			  acthd);
> -
> -		guilty = true;
>  	}
>  
>  	/* If contexts are disabled or this is the default context, use
> @@ -2383,12 +2378,18 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  	u32 completed_seqno = ring->get_seqno(ring, false);
>  	u32 acthd = intel_ring_get_active_head(ring);
>  	struct drm_i915_gem_request *request;
> +	bool guilty = false;
>  
>  	list_for_each_entry(request, &ring->request_list, list) {
>  		if (i915_seqno_passed(completed_seqno, request->seqno))
>  			continue;
>  
> -		i915_set_reset_status(ring, request, acthd);
> +		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
> +			guilty = true;
> +			i915_set_reset_status(ring, request, acthd, true);
> +		} else {
> +			i915_set_reset_status(ring, request, acthd, false);
> +		}
>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6d11e25..e24f9ef 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2473,7 +2473,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  #define BUSY 1
>  #define KICK 5
>  #define HUNG 20
> -#define FIRE 30
>  
>  	if (!i915_enable_hangcheck)
>  		return;
> @@ -2557,7 +2556,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  	}
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.score > FIRE) {
> +		if (ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>  			DRM_INFO("%s on %s\n",
>  				 stuck[i] ? "stuck" : "no progress",
>  				 ring->name);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 71a73f4..6018793 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
>  	HANGCHECK_HUNG,
>  };
>  
> +#define HANGCHECK_SCORE_GUILTY 31
> +
>  struct intel_ring_hangcheck {
>  	bool deadlock;
>  	u32 seqno;
> -- 
> 1.7.9.5

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2 1/3] drm/i915: Tune down debug output when context is banned
  2014-01-17 14:20 [PATCH 1/3] drm/i915: Tune down output when context is banned Mika Kuoppala
                   ` (2 preceding siblings ...)
  2014-01-17 14:27 ` [PATCH 1/3] drm/i915: Tune down output when context is banned Chris Wilson
@ 2014-01-22 15:41 ` Mika Kuoppala
  2014-01-26 18:17   ` Ben Widawsky
  3 siblings, 1 reply; 12+ messages in thread
From: Mika Kuoppala @ 2014-01-22 15:41 UTC (permalink / raw)
  To: intel-gfx

If we have stopped rings then we know that test is running
so no need for spam. In addition, only spam when default
context gets banned.

v2: - make sure default context ban gets shown (Chris)
    - use helper for checking for default context, everywhere (Chris)

Reference: https://bugs.freedesktop.org/show_bug.cgi?id=73652
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |    5 +++++
 drivers/gpu/drm/i915/i915_gem.c         |   18 +++++++++++-------
 drivers/gpu/drm/i915/i915_gem_context.c |    9 ++-------
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f888fea..3a43388 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2267,6 +2267,11 @@ static inline void i915_gem_context_unreference(struct i915_hw_context *ctx)
 		kref_put(&ctx->ref, i915_gem_context_free);
 }
 
+static inline bool i915_gem_context_is_default(struct i915_hw_context *ctx)
+{
+	return (ctx->id == DEFAULT_CONTEXT_ID);
+}
+
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file);
 int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5fcdb14..36d18d8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2305,15 +2305,20 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
 	return false;
 }
 
-static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
+static bool i915_context_is_banned(const struct drm_i915_gem_request *request,
+				   const struct i915_ctx_hang_stats *hs)
 {
 	const unsigned long elapsed = get_seconds() - hs->guilty_ts;
+	struct drm_i915_private *dev_priv = request->ring->dev->dev_private;
 
 	if (hs->banned)
 		return true;
 
 	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
-		DRM_ERROR("context hanging too fast, declaring banned!\n");
+		if (dev_priv->gpu_error.stop_rings == 0 &&
+		    request->ctx && i915_gem_context_is_default(request->ctx))
+			DRM_ERROR("context hanging too fast, declaring banned!\n");
+
 		return true;
 	}
 
@@ -2347,17 +2352,16 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 		guilty = true;
 	}
 
-	/* If contexts are disabled or this is the default context, use
-	 * file_priv->reset_state
-	 */
-	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
+	/* For default context, we track reset statistics per file_priv
+	 * to allow more fine grained control */
+	if (request->ctx && !i915_gem_context_is_default(request->ctx))
 		hs = &request->ctx->hang_stats;
 	else if (request->file_priv)
 		hs = &request->file_priv->private_default_ctx->hang_stats;
 
 	if (hs) {
 		if (guilty) {
-			hs->banned = i915_context_is_banned(hs);
+			hs->banned = i915_context_is_banned(request, hs);
 			hs->batch_active++;
 			hs->guilty_ts = get_seconds();
 		} else {
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 112f865..4c55730 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -228,11 +228,6 @@ err_out:
 	return ERR_PTR(ret);
 }
 
-static inline bool is_default_context(struct i915_hw_context *ctx)
-{
-	return (ctx->id == DEFAULT_CONTEXT_ID);
-}
-
 /**
  * The default context needs to exist per ring that uses contexts. It stores the
  * context state of the GPU for applications that don't utilize HW contexts, as
@@ -474,7 +469,7 @@ static int context_idr_cleanup(int id, void *p, void *data)
 	struct i915_hw_context *ctx = p;
 
 	/* Ignore the default context because close will handle it */
-	if (is_default_context(ctx))
+	if (i915_gem_context_is_default(ctx))
 		return 0;
 
 	i915_gem_context_unreference(ctx);
@@ -649,7 +644,7 @@ static int do_switch(struct intel_ring_buffer *ring,
 		vma->bind_vma(vma, to->obj->cache_level, GLOBAL_BIND);
 	}
 
-	if (!to->is_initialized || is_default_context(to))
+	if (!to->is_initialized || i915_gem_context_is_default(to))
 		hw_flags |= MI_RESTORE_INHIBIT;
 
 	ret = mi_set_context(ring, to, hw_flags);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/3] drm/i915: Tune down debug output when context is banned
  2014-01-22 15:41 ` [PATCH v2 1/3] drm/i915: Tune down debug " Mika Kuoppala
@ 2014-01-26 18:17   ` Ben Widawsky
  2014-01-29 15:28     ` Mika Kuoppala
  0 siblings, 1 reply; 12+ messages in thread
From: Ben Widawsky @ 2014-01-26 18:17 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx


On Wed, Jan 22, 2014 at 05:41:29PM +0200, Mika Kuoppala wrote:
> If we have stopped rings then we know that test is running
> so no need for spam. In addition, only spam when default
> context gets banned.
> 
> v2: - make sure default context ban gets shown (Chris)
>     - use helper for checking for default context, everywhere (Chris)
> 

Overall, passing requests around this much to these functions seems
awkward to me. A lot of my comments are targeting that. If you're
totally of a different mind, I can reconsider.

> Reference: https://bugs.freedesktop.org/show_bug.cgi?id=73652
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |    5 +++++
>  drivers/gpu/drm/i915/i915_gem.c         |   18 +++++++++++-------
>  drivers/gpu/drm/i915/i915_gem_context.c |    9 ++-------
>  3 files changed, 18 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f888fea..3a43388 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2267,6 +2267,11 @@ static inline void i915_gem_context_unreference(struct i915_hw_context *ctx)
>  		kref_put(&ctx->ref, i915_gem_context_free);
>  }
>  
> +static inline bool i915_gem_context_is_default(struct i915_hw_context *ctx)
> +{
> +	return (ctx->id == DEFAULT_CONTEXT_ID);
> +}
> +

Drop the parenthesis - I know that it's my bad.

>  int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
>  				  struct drm_file *file);
>  int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 5fcdb14..36d18d8 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2305,15 +2305,20 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
>  	return false;
>  }
>  
> -static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
> +static bool i915_context_is_banned(const struct drm_i915_gem_request *request,
> +				   const struct i915_ctx_hang_stats *hs)
>  {
>  	const unsigned long elapsed = get_seconds() - hs->guilty_ts;
> +	struct drm_i915_private *dev_priv = request->ring->dev->dev_private;
>  
>  	if (hs->banned)
>  		return true;
>  
>  	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
> -		DRM_ERROR("context hanging too fast, declaring banned!\n");
> +		if (dev_priv->gpu_error.stop_rings == 0 &&
> +		    request->ctx && i915_gem_context_is_default(request->ctx))
> +			DRM_ERROR("context hanging too fast, declaring banned!\n");
> +

Correct me if I am wrong, but, request->ctx should always be non-NULL
when using full PPGTT, and hardware contexts (not sure if this patch is
only targeted for that). In case of the latter, HAS_HW_CONTEXTS should
be == request->ctx. Passing a request to a function which shouldn't care
about requests feels bad to me. As such, I'd vote for:

i915_context_is_banned(struct i915_hw_context *ctx) - you can similarly
thread the ctx up through the call stack, instead of the hs.

I'd also suggest making i915_gem_context_is_default() check regardless
of stop_rings. The last request is optional.

>  		return true;
>  	}
>  
> @@ -2347,17 +2352,16 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  		guilty = true;
>  	}
>  
> -	/* If contexts are disabled or this is the default context, use
> -	 * file_priv->reset_state
> -	 */
> -	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
> +	/* For default context, we track reset statistics per file_priv
> +	 * to allow more fine grained control */
> +	if (request->ctx && !i915_gem_context_is_default(request->ctx))

As I said above, you could replace request->ctx here if you wanted.

>  		hs = &request->ctx->hang_stats;
>  	else if (request->file_priv)
>  		hs = &request->file_priv->private_default_ctx->hang_stats;
>  
>  	if (hs) {
>  		if (guilty) {
> -			hs->banned = i915_context_is_banned(hs);
> +			hs->banned = i915_context_is_banned(request, hs);
>  			hs->batch_active++;
>  			hs->guilty_ts = get_seconds();
>  		} else {
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 112f865..4c55730 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -228,11 +228,6 @@ err_out:
>  	return ERR_PTR(ret);
>  }
>  
> -static inline bool is_default_context(struct i915_hw_context *ctx)
> -{
> -	return (ctx->id == DEFAULT_CONTEXT_ID);
> -}
> -
>  /**
>   * The default context needs to exist per ring that uses contexts. It stores the
>   * context state of the GPU for applications that don't utilize HW contexts, as
> @@ -474,7 +469,7 @@ static int context_idr_cleanup(int id, void *p, void *data)
>  	struct i915_hw_context *ctx = p;
>  
>  	/* Ignore the default context because close will handle it */
> -	if (is_default_context(ctx))
> +	if (i915_gem_context_is_default(ctx))
>  		return 0;
>  
>  	i915_gem_context_unreference(ctx);
> @@ -649,7 +644,7 @@ static int do_switch(struct intel_ring_buffer *ring,
>  		vma->bind_vma(vma, to->obj->cache_level, GLOBAL_BIND);
>  	}
>  
> -	if (!to->is_initialized || is_default_context(to))
> +	if (!to->is_initialized || i915_gem_context_is_default(to))
>  		hw_flags |= MI_RESTORE_INHIBIT;
>  
>  	ret = mi_set_context(ring, to, hw_flags);
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring
  2014-01-17 14:20 ` [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring Mika Kuoppala
  2014-01-17 14:50   ` Mika Kuoppala
@ 2014-01-26 18:49   ` Ben Widawsky
  2014-01-29 15:20     ` Mika Kuoppala
  1 sibling, 1 reply; 12+ messages in thread
From: Ben Widawsky @ 2014-01-26 18:49 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Fri, Jan 17, 2014 at 04:20:30PM +0200, Mika Kuoppala wrote:
> Instead of going through all the requests to find a batch that
> hanged the machine, use hangcheck score and the fact that
hung, hanged???
> first noncompleted request on hanged ring is, with great
> probability, the guilty one. This also ensure that we get one
> guilty batch per hang instead of possibly more (for each ring)
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |   19 ++++++++++---------
>  drivers/gpu/drm/i915/i915_irq.c         |    3 +--
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
>  3 files changed, 13 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d270351..27a97c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2322,20 +2322,17 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>  
>  static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  				  struct drm_i915_gem_request *request,
> -				  u32 acthd)
> +				  u32 acthd, const bool guilty)
>  {
>  	struct i915_ctx_hang_stats *hs = NULL;
> -	bool inside, guilty;
> +	bool inside;
>  	unsigned long offset = 0;
>  
> -	/* Innocent until proven guilty */
> -	guilty = false;
> -
>  	if (request->batch_obj)
>  		offset = i915_gem_obj_offset(request->batch_obj,
>  					     request_to_vm(request));
>  
> -	if (ring->hangcheck.action != HANGCHECK_WAIT &&
> +	if (guilty &&
>  	    i915_request_guilty(request, acthd, &inside)) {
>  		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
>  			  ring->name,
> @@ -2343,8 +2340,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  			  offset,
>  			  request->ctx ? request->ctx->id : 0,
>  			  acthd);
> -
> -		guilty = true;
>  	}
>  
>  	/* If contexts are disabled or this is the default context, use
> @@ -2383,12 +2378,18 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  	u32 completed_seqno = ring->get_seqno(ring, false);
>  	u32 acthd = intel_ring_get_active_head(ring);
>  	struct drm_i915_gem_request *request;
> +	bool guilty = false;
>  
>  	list_for_each_entry(request, &ring->request_list, list) {
>  		if (i915_seqno_passed(completed_seqno, request->seqno))
>  			continue;
>  
> -		i915_set_reset_status(ring, request, acthd);
> +		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {

checkpatch complains about the above.

> +			guilty = true;
> +			i915_set_reset_status(ring, request, acthd, true);
> +		} else {
> +			i915_set_reset_status(ring, request, acthd, false);
> +		}

I don't think the logic is correct. This will find the first request
(sequentially) that was hung, and not the first ring that hung.
Shouldn't we scan everything and take the first incomplete request with
the highest hangcheck.score?

Maybe I am crazy, but suppose we emit seqno 1 to ring X, and seqno 2 to
ring Y (the latter occurring after the first hang check)

event		RING X       RING Y
seqno 1 emit
seqno 2 emit	BUSY
		HUNG	      HUNG
		HUNG	      HUNG

The case here is somewhat academic since they both hung, but one might
argue that the first hang on ring Y is what caused the hang on ring X.

BTW, this change has convinced me that we really need to define
BUSY/KICK/HUNG as relative and not absolute values...

>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6d11e25..e24f9ef 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2473,7 +2473,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  #define BUSY 1
>  #define KICK 5
>  #define HUNG 20
> -#define FIRE 30
>  
>  	if (!i915_enable_hangcheck)
>  		return;
> @@ -2557,7 +2556,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  	}
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.score > FIRE) {
> +		if (ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>  			DRM_INFO("%s on %s\n",
>  				 stuck[i] ? "stuck" : "no progress",
>  				 ring->name);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 71a73f4..6018793 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
>  	HANGCHECK_HUNG,
>  };
>  
> +#define HANGCHECK_SCORE_GUILTY 31
> +
>  struct intel_ring_hangcheck {
>  	bool deadlock;
>  	u32 seqno;

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] drm/i915: Get rid of acthd based batch search on reset stats
  2014-01-17 14:20 ` [PATCH 3/3] drm/i915: Get rid of acthd based batch search on reset stats Mika Kuoppala
@ 2014-01-26 18:58   ` Ben Widawsky
  2014-01-28 16:30     ` Rodrigo Vivi
  0 siblings, 1 reply; 12+ messages in thread
From: Ben Widawsky @ 2014-01-26 18:58 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Fri, Jan 17, 2014 at 04:20:31PM +0200, Mika Kuoppala wrote:
> As we seek the guilty one using request ordering and hangcheck
> score, these were needed only for debug output.
> 

As it's only for debug output, I'd rather just leave this here until
we're completely convinced the new mechanism works.

However, the patch looks functionally correct to me, so I'll leave the
decision up to Daniel.

Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c |   87 ++-------------------------------------
>  1 file changed, 3 insertions(+), 84 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 27a97c3..d796c7f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2241,70 +2241,6 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>  	spin_unlock(&file_priv->mm.lock);
>  }
>  
> -static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj,
> -				    struct i915_address_space *vm)
> -{
> -	if (acthd >= i915_gem_obj_offset(obj, vm) &&
> -	    acthd < i915_gem_obj_offset(obj, vm) + obj->base.size)
> -		return true;
> -
> -	return false;
> -}
> -
> -static bool i915_head_inside_request(const u32 acthd_unmasked,
> -				     const u32 request_start,
> -				     const u32 request_end)
> -{
> -	const u32 acthd = acthd_unmasked & HEAD_ADDR;
> -
> -	if (request_start < request_end) {
> -		if (acthd >= request_start && acthd < request_end)
> -			return true;
> -	} else if (request_start > request_end) {
> -		if (acthd >= request_start || acthd < request_end)
> -			return true;
> -	}
> -
> -	return false;
> -}
> -
> -static struct i915_address_space *
> -request_to_vm(struct drm_i915_gem_request *request)
> -{
> -	struct drm_i915_private *dev_priv = request->ring->dev->dev_private;
> -	struct i915_address_space *vm;
> -
> -	if (request->ctx)
> -		vm = request->ctx->vm;
> -	else
> -		vm = &dev_priv->gtt.base;
> -
> -	return vm;
> -}
> -
> -static bool i915_request_guilty(struct drm_i915_gem_request *request,
> -				const u32 acthd, bool *inside)
> -{
> -	/* There is a possibility that unmasked head address
> -	 * pointing inside the ring, matches the batch_obj address range.
> -	 * However this is extremely unlikely.
> -	 */
> -	if (request->batch_obj) {
> -		if (i915_head_inside_object(acthd, request->batch_obj,
> -					    request_to_vm(request))) {
> -			*inside = true;
> -			return true;
> -		}
> -	}
> -
> -	if (i915_head_inside_request(acthd, request->head, request->tail)) {
> -		*inside = false;
> -		return true;
> -	}
> -
> -	return false;
> -}
> -
>  static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>  {
>  	const unsigned long elapsed = get_seconds() - hs->guilty_ts;
> @@ -2322,25 +2258,9 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>  
>  static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  				  struct drm_i915_gem_request *request,
> -				  u32 acthd, const bool guilty)
> +				  const bool guilty)
>  {
>  	struct i915_ctx_hang_stats *hs = NULL;
> -	bool inside;
> -	unsigned long offset = 0;
> -
> -	if (request->batch_obj)
> -		offset = i915_gem_obj_offset(request->batch_obj,
> -					     request_to_vm(request));
> -
> -	if (guilty &&
> -	    i915_request_guilty(request, acthd, &inside)) {
> -		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
> -			  ring->name,
> -			  inside ? "inside" : "flushing",
> -			  offset,
> -			  request->ctx ? request->ctx->id : 0,
> -			  acthd);
> -	}
>  
>  	/* If contexts are disabled or this is the default context, use
>  	 * file_priv->reset_state
> @@ -2376,7 +2296,6 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  				       struct intel_ring_buffer *ring)
>  {
>  	u32 completed_seqno = ring->get_seqno(ring, false);
> -	u32 acthd = intel_ring_get_active_head(ring);
>  	struct drm_i915_gem_request *request;
>  	bool guilty = false;
>  
> @@ -2386,9 +2305,9 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  
>  		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>  			guilty = true;
> -			i915_set_reset_status(ring, request, acthd, true);
> +			i915_set_reset_status(ring, request, true);
>  		} else {
> -			i915_set_reset_status(ring, request, acthd, false);
> +			i915_set_reset_status(ring, request, false);
>  		}
>  	}
>  }
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] drm/i915: Get rid of acthd based batch search on reset stats
  2014-01-26 18:58   ` Ben Widawsky
@ 2014-01-28 16:30     ` Rodrigo Vivi
  0 siblings, 0 replies; 12+ messages in thread
From: Rodrigo Vivi @ 2014-01-28 16:30 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

This series fixes: https://bugs.freedesktop.org/show_bug.cgi?id=73533

Tested-by: Rodrigo Vivi <rodrigo.vivi@gmail.com>

On Sun, Jan 26, 2014 at 4:58 PM, Ben Widawsky <ben@bwidawsk.net> wrote:
> On Fri, Jan 17, 2014 at 04:20:31PM +0200, Mika Kuoppala wrote:
>> As we seek the guilty one using request ordering and hangcheck
>> score, these were needed only for debug output.
>>
>
> As it's only for debug output, I'd rather just leave this here until
> we're completely convinced the new mechanism works.
>
> However, the patch looks functionally correct to me, so I'll leave the
> decision up to Daniel.
>
> Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_gem.c |   87 ++-------------------------------------
>>  1 file changed, 3 insertions(+), 84 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 27a97c3..d796c7f 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2241,70 +2241,6 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>>       spin_unlock(&file_priv->mm.lock);
>>  }
>>
>> -static bool i915_head_inside_object(u32 acthd, struct drm_i915_gem_object *obj,
>> -                                 struct i915_address_space *vm)
>> -{
>> -     if (acthd >= i915_gem_obj_offset(obj, vm) &&
>> -         acthd < i915_gem_obj_offset(obj, vm) + obj->base.size)
>> -             return true;
>> -
>> -     return false;
>> -}
>> -
>> -static bool i915_head_inside_request(const u32 acthd_unmasked,
>> -                                  const u32 request_start,
>> -                                  const u32 request_end)
>> -{
>> -     const u32 acthd = acthd_unmasked & HEAD_ADDR;
>> -
>> -     if (request_start < request_end) {
>> -             if (acthd >= request_start && acthd < request_end)
>> -                     return true;
>> -     } else if (request_start > request_end) {
>> -             if (acthd >= request_start || acthd < request_end)
>> -                     return true;
>> -     }
>> -
>> -     return false;
>> -}
>> -
>> -static struct i915_address_space *
>> -request_to_vm(struct drm_i915_gem_request *request)
>> -{
>> -     struct drm_i915_private *dev_priv = request->ring->dev->dev_private;
>> -     struct i915_address_space *vm;
>> -
>> -     if (request->ctx)
>> -             vm = request->ctx->vm;
>> -     else
>> -             vm = &dev_priv->gtt.base;
>> -
>> -     return vm;
>> -}
>> -
>> -static bool i915_request_guilty(struct drm_i915_gem_request *request,
>> -                             const u32 acthd, bool *inside)
>> -{
>> -     /* There is a possibility that unmasked head address
>> -      * pointing inside the ring, matches the batch_obj address range.
>> -      * However this is extremely unlikely.
>> -      */
>> -     if (request->batch_obj) {
>> -             if (i915_head_inside_object(acthd, request->batch_obj,
>> -                                         request_to_vm(request))) {
>> -                     *inside = true;
>> -                     return true;
>> -             }
>> -     }
>> -
>> -     if (i915_head_inside_request(acthd, request->head, request->tail)) {
>> -             *inside = false;
>> -             return true;
>> -     }
>> -
>> -     return false;
>> -}
>> -
>>  static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>>  {
>>       const unsigned long elapsed = get_seconds() - hs->guilty_ts;
>> @@ -2322,25 +2258,9 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>>
>>  static void i915_set_reset_status(struct intel_ring_buffer *ring,
>>                                 struct drm_i915_gem_request *request,
>> -                               u32 acthd, const bool guilty)
>> +                               const bool guilty)
>>  {
>>       struct i915_ctx_hang_stats *hs = NULL;
>> -     bool inside;
>> -     unsigned long offset = 0;
>> -
>> -     if (request->batch_obj)
>> -             offset = i915_gem_obj_offset(request->batch_obj,
>> -                                          request_to_vm(request));
>> -
>> -     if (guilty &&
>> -         i915_request_guilty(request, acthd, &inside)) {
>> -             DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
>> -                       ring->name,
>> -                       inside ? "inside" : "flushing",
>> -                       offset,
>> -                       request->ctx ? request->ctx->id : 0,
>> -                       acthd);
>> -     }
>>
>>       /* If contexts are disabled or this is the default context, use
>>        * file_priv->reset_state
>> @@ -2376,7 +2296,6 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>>                                      struct intel_ring_buffer *ring)
>>  {
>>       u32 completed_seqno = ring->get_seqno(ring, false);
>> -     u32 acthd = intel_ring_get_active_head(ring);
>>       struct drm_i915_gem_request *request;
>>       bool guilty = false;
>>
>> @@ -2386,9 +2305,9 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>>
>>               if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>>                       guilty = true;
>> -                     i915_set_reset_status(ring, request, acthd, true);
>> +                     i915_set_reset_status(ring, request, true);
>>               } else {
>> -                     i915_set_reset_status(ring, request, acthd, false);
>> +                     i915_set_reset_status(ring, request, false);
>>               }
>>       }
>>  }
>> --
>> 1.7.9.5
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
> --
> Ben Widawsky, Intel Open Source Technology Center
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx



-- 
Rodrigo Vivi
Blog: http://blog.vivi.eng.br

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring
  2014-01-26 18:49   ` Ben Widawsky
@ 2014-01-29 15:20     ` Mika Kuoppala
  0 siblings, 0 replies; 12+ messages in thread
From: Mika Kuoppala @ 2014-01-29 15:20 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

Ben Widawsky <ben@bwidawsk.net> writes:

> On Fri, Jan 17, 2014 at 04:20:30PM +0200, Mika Kuoppala wrote:
>> Instead of going through all the requests to find a batch that
>> hanged the machine, use hangcheck score and the fact that
> hung, hanged???
>> first noncompleted request on hanged ring is, with great
>> probability, the guilty one. This also ensure that we get one
>> guilty batch per hang instead of possibly more (for each ring)
>> 
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_gem.c         |   19 ++++++++++---------
>>  drivers/gpu/drm/i915/i915_irq.c         |    3 +--
>>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
>>  3 files changed, 13 insertions(+), 11 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index d270351..27a97c3 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2322,20 +2322,17 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>>  
>>  static void i915_set_reset_status(struct intel_ring_buffer *ring,
>>  				  struct drm_i915_gem_request *request,
>> -				  u32 acthd)
>> +				  u32 acthd, const bool guilty)
>>  {
>>  	struct i915_ctx_hang_stats *hs = NULL;
>> -	bool inside, guilty;
>> +	bool inside;
>>  	unsigned long offset = 0;
>>  
>> -	/* Innocent until proven guilty */
>> -	guilty = false;
>> -
>>  	if (request->batch_obj)
>>  		offset = i915_gem_obj_offset(request->batch_obj,
>>  					     request_to_vm(request));
>>  
>> -	if (ring->hangcheck.action != HANGCHECK_WAIT &&
>> +	if (guilty &&
>>  	    i915_request_guilty(request, acthd, &inside)) {
>>  		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
>>  			  ring->name,
>> @@ -2343,8 +2340,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>>  			  offset,
>>  			  request->ctx ? request->ctx->id : 0,
>>  			  acthd);
>> -
>> -		guilty = true;
>>  	}
>>  
>>  	/* If contexts are disabled or this is the default context, use
>> @@ -2383,12 +2378,18 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>>  	u32 completed_seqno = ring->get_seqno(ring, false);
>>  	u32 acthd = intel_ring_get_active_head(ring);
>>  	struct drm_i915_gem_request *request;
>> +	bool guilty = false;
>>  
>>  	list_for_each_entry(request, &ring->request_list, list) {
>>  		if (i915_seqno_passed(completed_seqno, request->seqno))
>>  			continue;
>>  
>> -		i915_set_reset_status(ring, request, acthd);
>> +		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>
> checkpatch complains about the above.
>> +			guilty = true;
>> +			i915_set_reset_status(ring, request, acthd, true);
>> +		} else {
>> +			i915_set_reset_status(ring, request, acthd, false);
>> +		}
>
> I don't think the logic is correct. This will find the first request
> (sequentially) that was hung, and not the first ring that hung.
> Shouldn't we scan everything and take the first incomplete request with
> the highest hangcheck.score?
>
> Maybe I am crazy, but suppose we emit seqno 1 to ring X, and seqno 2 to
> ring Y (the latter occurring after the first hang check)
>
> event		RING X       RING Y
> seqno 1 emit
> seqno 2 emit	BUSY
> 		HUNG	      HUNG
> 		HUNG	      HUNG
>
> The case here is somewhat academic since they both hung, but one might
> argue that the first hang on ring Y is what caused the hang on ring X.
> 

Logic was wrong. Based on above and the feedback I got from Chris
and Daniel from my test RFC thread this is the reworked version:

1391007939-5741-4-git-send-email-mika.kuoppala@intel.com

We can get, depending ofcourse of hangcheck triggering timing, multiple
guilty batches for one hang if there were multiple rings involved.

> BTW, this change has convinced me that we really need to define
> BUSY/KICK/HUNG as relative and not absolute values...
>

I hope the reworked version of the patch voids your concern.
If not, could you elaborate this a bit more.

-Mika

>>  	}
>>  }
>>  
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index 6d11e25..e24f9ef 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -2473,7 +2473,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
>>  #define BUSY 1
>>  #define KICK 5
>>  #define HUNG 20
>> -#define FIRE 30
>>  
>>  	if (!i915_enable_hangcheck)
>>  		return;
>> @@ -2557,7 +2556,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>>  	}
>>  
>>  	for_each_ring(ring, dev_priv, i) {
>> -		if (ring->hangcheck.score > FIRE) {
>> +		if (ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>>  			DRM_INFO("%s on %s\n",
>>  				 stuck[i] ? "stuck" : "no progress",
>>  				 ring->name);
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index 71a73f4..6018793 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
>>  	HANGCHECK_HUNG,
>>  };
>>  
>> +#define HANGCHECK_SCORE_GUILTY 31
>> +
>>  struct intel_ring_hangcheck {
>>  	bool deadlock;
>>  	u32 seqno;
>
> -- 
> Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/3] drm/i915: Tune down debug output when context is banned
  2014-01-26 18:17   ` Ben Widawsky
@ 2014-01-29 15:28     ` Mika Kuoppala
  0 siblings, 0 replies; 12+ messages in thread
From: Mika Kuoppala @ 2014-01-29 15:28 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

Ben Widawsky <ben@bwidawsk.net> writes:

> On Wed, Jan 22, 2014 at 05:41:29PM +0200, Mika Kuoppala wrote:
>> If we have stopped rings then we know that test is running
>> so no need for spam. In addition, only spam when default
>> context gets banned.
>> 
>> v2: - make sure default context ban gets shown (Chris)
>>     - use helper for checking for default context, everywhere (Chris)
>> 
>
> Overall, passing requests around this much to these functions seems
> awkward to me. A lot of my comments are targeting that. If you're
> totally of a different mind, I can reconsider.
>
>> Reference: https://bugs.freedesktop.org/show_bug.cgi?id=73652
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h         |    5 +++++
>>  drivers/gpu/drm/i915/i915_gem.c         |   18 +++++++++++-------
>>  drivers/gpu/drm/i915/i915_gem_context.c |    9 ++-------
>>  3 files changed, 18 insertions(+), 14 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index f888fea..3a43388 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -2267,6 +2267,11 @@ static inline void i915_gem_context_unreference(struct i915_hw_context *ctx)
>>  		kref_put(&ctx->ref, i915_gem_context_free);
>>  }
>>  
>> +static inline bool i915_gem_context_is_default(struct i915_hw_context *ctx)
>> +{
>> +	return (ctx->id == DEFAULT_CONTEXT_ID);
>> +}
>> +
>
> Drop the parenthesis - I know that it's my bad.
>
>>  int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
>>  				  struct drm_file *file);
>>  int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 5fcdb14..36d18d8 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2305,15 +2305,20 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
>>  	return false;
>>  }
>>  
>> -static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>> +static bool i915_context_is_banned(const struct drm_i915_gem_request *request,
>> +				   const struct i915_ctx_hang_stats *hs)
>>  {
>>  	const unsigned long elapsed = get_seconds() - hs->guilty_ts;
>> +	struct drm_i915_private *dev_priv = request->ring->dev->dev_private;
>>  
>>  	if (hs->banned)
>>  		return true;
>>  
>>  	if (elapsed <= DRM_I915_CTX_BAN_PERIOD) {
>> -		DRM_ERROR("context hanging too fast, declaring banned!\n");
>> +		if (dev_priv->gpu_error.stop_rings == 0 &&
>> +		    request->ctx && i915_gem_context_is_default(request->ctx))
>> +			DRM_ERROR("context hanging too fast, declaring banned!\n");
>> +
>
> Correct me if I am wrong, but, request->ctx should always be non-NULL
> when using full PPGTT, and hardware contexts (not sure if this patch is
> only targeted for that). In case of the latter, HAS_HW_CONTEXTS should
> be == request->ctx. Passing a request to a function which shouldn't care
> about requests feels bad to me. As such, I'd vote for:
>
> i915_context_is_banned(struct i915_hw_context *ctx) - you can similarly
> thread the ctx up through the call stack, instead of the hs.
>

I tried to take advantage of above and came up with:
1391007939-5741-2-git-send-email-mika.kuoppala@intel.com

With 4/4 applied on that series, we only pass i915_hw_context *'s around.

> I'd also suggest making i915_gem_context_is_default() check regardless
> of stop_rings. The last request is optional.
>

Hmm. I think logic is correct here.

Only output if tests are not running and if so, only if we are
dealing with default context.

-Mika

>>  		return true;
>>  	}
>>  
>> @@ -2347,17 +2352,16 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>>  		guilty = true;
>>  	}
>>  
>> -	/* If contexts are disabled or this is the default context, use
>> -	 * file_priv->reset_state
>> -	 */
>> -	if (request->ctx && request->ctx->id != DEFAULT_CONTEXT_ID)
>> +	/* For default context, we track reset statistics per file_priv
>> +	 * to allow more fine grained control */
>> +	if (request->ctx && !i915_gem_context_is_default(request->ctx))
>
> As I said above, you could replace request->ctx here if you wanted.
>
>>  		hs = &request->ctx->hang_stats;
>>  	else if (request->file_priv)
>>  		hs = &request->file_priv->private_default_ctx->hang_stats;
>>  
>>  	if (hs) {
>>  		if (guilty) {
>> -			hs->banned = i915_context_is_banned(hs);
>> +			hs->banned = i915_context_is_banned(request, hs);
>>  			hs->batch_active++;
>>  			hs->guilty_ts = get_seconds();
>>  		} else {
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index 112f865..4c55730 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -228,11 +228,6 @@ err_out:
>>  	return ERR_PTR(ret);
>>  }
>>  
>> -static inline bool is_default_context(struct i915_hw_context *ctx)
>> -{
>> -	return (ctx->id == DEFAULT_CONTEXT_ID);
>> -}
>> -
>>  /**
>>   * The default context needs to exist per ring that uses contexts. It stores the
>>   * context state of the GPU for applications that don't utilize HW contexts, as
>> @@ -474,7 +469,7 @@ static int context_idr_cleanup(int id, void *p, void *data)
>>  	struct i915_hw_context *ctx = p;
>>  
>>  	/* Ignore the default context because close will handle it */
>> -	if (is_default_context(ctx))
>> +	if (i915_gem_context_is_default(ctx))
>>  		return 0;
>>  
>>  	i915_gem_context_unreference(ctx);
>> @@ -649,7 +644,7 @@ static int do_switch(struct intel_ring_buffer *ring,
>>  		vma->bind_vma(vma, to->obj->cache_level, GLOBAL_BIND);
>>  	}
>>  
>> -	if (!to->is_initialized || is_default_context(to))
>> +	if (!to->is_initialized || i915_gem_context_is_default(to))
>>  		hw_flags |= MI_RESTORE_INHIBIT;
>>  
>>  	ret = mi_set_context(ring, to, hw_flags);
>> -- 
>> 1.7.9.5
>> 
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
> -- 
> Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-01-29 15:29 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-01-17 14:20 [PATCH 1/3] drm/i915: Tune down output when context is banned Mika Kuoppala
2014-01-17 14:20 ` [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring Mika Kuoppala
2014-01-17 14:50   ` Mika Kuoppala
2014-01-26 18:49   ` Ben Widawsky
2014-01-29 15:20     ` Mika Kuoppala
2014-01-17 14:20 ` [PATCH 3/3] drm/i915: Get rid of acthd based batch search on reset stats Mika Kuoppala
2014-01-26 18:58   ` Ben Widawsky
2014-01-28 16:30     ` Rodrigo Vivi
2014-01-17 14:27 ` [PATCH 1/3] drm/i915: Tune down output when context is banned Chris Wilson
2014-01-22 15:41 ` [PATCH v2 1/3] drm/i915: Tune down debug " Mika Kuoppala
2014-01-26 18:17   ` Ben Widawsky
2014-01-29 15:28     ` Mika Kuoppala

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox