[PATCH 1/6] drm/i915: hangcheck robustification

public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed

* [PATCH 1/6] drm/i915: hangcheck robustification
@ 2011-10-11 14:39 Daniel Vetter
  2011-10-11 14:39 ` [PATCH 2/6] drm/i915: kicking rings stuck on semaphores considered harmful Daniel Vetter
                   ` (5 more replies)
  0 siblings, 6 replies; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 14:39 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

From: Ben Widawsky <ben@bwidawsk.net>

This was pulled out of the per ring error handling patch series as it
actually fixes two issues, and bikeshedding appears to be going on
there.

First, remove setting hangcheck_count when we do notify ring. While it
seems counterintuitive to be setting up a timer to catch hangcheck_count
greater than 0 with hangcheck_count already greater than 0, actually
when we go to check if the GPU is hung we clear that value if the gpu is
still alive. Leaving this is actually harmful as submitting work could
falsely clear the count while the hangcheck code is checking the count.
I can't think of a case where this doesn't just delay the inevitable
reset... but I didn't spend too much time thinking about it.

Second, for Gen5+ we have more information to be considered when
determining if the GPU is stuck, primarily the media ring (and blitter
ring in gen6). This patch will check all available rings, and also updates
error state with the new information. It theoretically can fix false
positives, but I haven't actually come across such a case.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
[danvet: remove remnants of an unrelated cleanup patch]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.h |    7 +-
 drivers/gpu/drm/i915/i915_irq.c |  148 +++++++++++++++++++++++++++-----------
 2 files changed, 108 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 15c0ca5..4e73a86 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -330,10 +330,9 @@ typedef struct drm_i915_private {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
 	struct timer_list hangcheck_timer;
-	int hangcheck_count;
-	uint32_t last_acthd;
-	uint32_t last_instdone;
-	uint32_t last_instdone1;
+	int hangcheck_count; /* Should only be modified in hanghceck timer */
+	uint32_t last_acthd[I915_NUM_RINGS];
+	uint64_t last_instdone[I915_NUM_RINGS];
 
 	unsigned long cfb_size;
 	unsigned int cfb_fb;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 012732b..2218d12 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -365,7 +365,6 @@ static void notify_ring(struct drm_device *dev,
 	ring->irq_seqno = seqno;
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->hangcheck_count = 0;
 		mod_timer(&dev_priv->hangcheck_timer,
 			  jiffies +
 			  msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD));
@@ -1656,6 +1655,91 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 	return false;
 }
 
+static bool
+instdone_stuck(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint64_t instdone = 0, instdone1 = 0;
+	uint64_t vcs_instdone = 0, bcs_instdone = 0;
+	bool stuck;
+
+	switch (INTEL_INFO(dev)->gen) {
+	case 7:
+	case 6:
+		bcs_instdone = I915_READ(BCS_INSTDONE);
+	case 5:
+		vcs_instdone = I915_READ(VCS_INSTDONE);
+	case 4:
+		instdone = I915_READ(INSTDONE_I965);
+		instdone1 = I915_READ(INSTDONE1);
+		break;
+	case 3:
+	case 2:
+		instdone = I915_READ(INSTDONE);
+		break;
+	}
+
+	stuck =
+	    (dev_priv->last_instdone[RCS] == ((instdone << 32) | instdone1)) &&
+	    (dev_priv->last_instdone[VCS] == vcs_instdone) &&
+	    (dev_priv->last_instdone[BCS] == bcs_instdone);
+
+	dev_priv->last_instdone[RCS] = (instdone << 32) | instdone1;
+	dev_priv->last_instdone[VCS] = vcs_instdone;
+	dev_priv->last_instdone[BCS] = bcs_instdone;
+
+	return stuck;
+}
+
+static bool
+acthd_stuck(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t acthd = 0, vcs_acthd = 0, bcs_acthd = 0;
+	bool stuck = false;
+
+	switch (INTEL_INFO(dev)->gen) {
+	case 7:
+	case 6:
+		bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]);
+	case 5:
+		vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]);
+	case 4:
+	case 3:
+	case 2:
+		acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]);
+		break;
+	}
+
+	stuck =
+	    (dev_priv->last_acthd[RCS] == acthd) &&
+	    (dev_priv->last_acthd[VCS] == vcs_acthd) &&
+	    (dev_priv->last_acthd[BCS] == bcs_acthd);
+
+	dev_priv->last_acthd[RCS] = acthd;
+	dev_priv->last_acthd[VCS] = vcs_acthd;
+	dev_priv->last_acthd[BCS] = bcs_acthd;
+
+	return stuck;
+}
+
+static bool gpu_stuck(struct drm_device *dev)
+{
+	#define NUM_HANGCHECKS_TO_RESET 1
+
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (!acthd_stuck(dev) || !instdone_stuck(dev))
+		dev_priv->hangcheck_count = 0;
+	else
+		dev_priv->hangcheck_count++;
+
+	if (dev_priv->hangcheck_count > NUM_HANGCHECKS_TO_RESET)
+		return true;
+
+	return false;
+}
+
 /**
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. The first time this is called we simply record
@@ -1666,13 +1750,11 @@ void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	uint32_t acthd, instdone, instdone1;
 	bool err = false;
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	/* If all work is done then ACTHD clearly hasn't advanced. */
 	if (i915_hangcheck_ring_idle(&dev_priv->ring[RCS], &err) &&
 	    i915_hangcheck_ring_idle(&dev_priv->ring[VCS], &err) &&
 	    i915_hangcheck_ring_idle(&dev_priv->ring[BCS], &err)) {
@@ -1682,50 +1764,30 @@ void i915_hangcheck_elapsed(unsigned long data)
 		return;
 	}
 
-	if (INTEL_INFO(dev)->gen < 4) {
-		acthd = I915_READ(ACTHD);
-		instdone = I915_READ(INSTDONE);
-		instdone1 = 0;
-	} else {
-		acthd = I915_READ(ACTHD_I965);
-		instdone = I915_READ(INSTDONE_I965);
-		instdone1 = I915_READ(INSTDONE1);
-	}
+	if (gpu_stuck(dev)) {
+		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
 
-	if (dev_priv->last_acthd == acthd &&
-	    dev_priv->last_instdone == instdone &&
-	    dev_priv->last_instdone1 == instdone1) {
-		if (dev_priv->hangcheck_count++ > 1) {
-			DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
-
-			if (!IS_GEN2(dev)) {
-				/* Is the chip hanging on a WAIT_FOR_EVENT?
-				 * If so we can simply poke the RB_WAIT bit
-				 * and break the hang. This should work on
-				 * all but the second generation chipsets.
-				 */
-
-				if (kick_ring(&dev_priv->ring[RCS]))
-					goto repeat;
+		if (!IS_GEN2(dev)) {
+			/* Is the chip hanging on a WAIT_FOR_EVENT?
+			 * If so we can simply poke the RB_WAIT bit
+			 * and break the hang. This should work on
+			 * all but the second generation chipsets.
+			 */
 
-				if (HAS_BSD(dev) &&
-				    kick_ring(&dev_priv->ring[VCS]))
-					goto repeat;
+			if (kick_ring(&dev_priv->ring[RCS]))
+				goto repeat;
 
-				if (HAS_BLT(dev) &&
-				    kick_ring(&dev_priv->ring[BCS]))
-					goto repeat;
-			}
+			if (HAS_BSD(dev) &&
+			    kick_ring(&dev_priv->ring[VCS]))
+				goto repeat;
 
-			i915_handle_error(dev, true);
-			return;
+			if (HAS_BLT(dev) &&
+			    kick_ring(&dev_priv->ring[BCS]))
+				goto repeat;
 		}
-	} else {
-		dev_priv->hangcheck_count = 0;
 
-		dev_priv->last_acthd = acthd;
-		dev_priv->last_instdone = instdone;
-		dev_priv->last_instdone1 = instdone1;
+		i915_handle_error(dev, true);
+		return;
 	}
 
 repeat:
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 2/6] drm/i915: kicking rings stuck on semaphores considered harmful
  2011-10-11 14:39 [PATCH 1/6] drm/i915: hangcheck robustification Daniel Vetter
@ 2011-10-11 14:39 ` Daniel Vetter
  2011-10-11 15:51   ` Chris Wilson
  2011-10-11 20:48   ` Ben Widawsky
  2011-10-11 14:39 ` [PATCH 3/6] drm/i915: don't bail out of intel_wait_ring_buffer too early Daniel Vetter
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 14:39 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

If our semaphore logic gets confused and we have a ring stuck waiting
for one, there's a decent chance it'll just execute garbage when being
kicked. Also, kicking the ring obscures the place where the error
first occurred, making error_state decoding much harder.

So drop this and let gpu reset handle this mess in a clean fashion.

In contrast, kicking rings stuck on MI_WAIT is rather harmless, at
worst there'll be a bit of screen-flickering. There's also old
broken userspace out there which needs this as a work-around.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_irq.c |    7 -------
 1 files changed, 0 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2218d12..3563a2e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1645,13 +1645,6 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 		I915_WRITE_CTL(ring, tmp);
 		return true;
 	}
-	if (IS_GEN6(dev) &&
-	    (tmp & RING_WAIT_SEMAPHORE)) {
-		DRM_ERROR("Kicking stuck semaphore on %s\n",
-			  ring->name);
-		I915_WRITE_CTL(ring, tmp);
-		return true;
-	}
 	return false;
 }
 
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 3/6] drm/i915: don't bail out of intel_wait_ring_buffer too early
  2011-10-11 14:39 [PATCH 1/6] drm/i915: hangcheck robustification Daniel Vetter
  2011-10-11 14:39 ` [PATCH 2/6] drm/i915: kicking rings stuck on semaphores considered harmful Daniel Vetter
@ 2011-10-11 14:39 ` Daniel Vetter
  2011-10-11 15:53   ` Chris Wilson
  2011-10-11 14:39 ` [PATCH 4/6] drm/i915: switch ring->id to be a real id Daniel Vetter
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 14:39 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

In the pre-gem days with non-existing hangcheck and gpu reset code,
this timeout of 3 seconds was pretty important to avoid stuck
processes.

But now we have the hangcheck code in gem that goes to great lengths
to ensure that the gpu is really dead before declaring it wedged.

So there's no need for this timeout anymore. Actually it's even harmful
because we can bail out too early (e.g. with xscreensaver slip)
when running giant batchbuffers. And our code isn't robust enough
to properly unroll any state-changes, we pretty much rely on the gpu
reset code cleaning up the mess (like cache tracking, fencing state,
active list/request tracking, ...).

With this change intel_begin_ring can only fail when the gpu is
wedged, and it will return -EAGAIN (like wait_request in case the
gpu reset is still outstanding).

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/intel_ringbuffer.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 0e99589..00f9eb5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1024,7 +1024,8 @@ int intel_wait_ring_buffer(struct intel_ring_buffer *ring, int n)
 		msleep(1);
 		if (atomic_read(&dev_priv->mm.wedged))
 			return -EAGAIN;
-	} while (!time_after(jiffies, end));
+	} while (drm_core_check_feature(dev, DRIVER_GEM)
+			|| !time_after(jiffies, end));
 	trace_i915_ring_wait_end(ring);
 	return -EBUSY;
 }
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 4/6] drm/i915: switch ring->id to be a real id
  2011-10-11 14:39 [PATCH 1/6] drm/i915: hangcheck robustification Daniel Vetter
  2011-10-11 14:39 ` [PATCH 2/6] drm/i915: kicking rings stuck on semaphores considered harmful Daniel Vetter
  2011-10-11 14:39 ` [PATCH 3/6] drm/i915: don't bail out of intel_wait_ring_buffer too early Daniel Vetter
@ 2011-10-11 14:39 ` Daniel Vetter
  2011-10-11 15:55   ` Chris Wilson
  2011-10-11 14:39 ` [PATCH 5/6] drm/i915: refactor ring error state capture to use arrays Daniel Vetter
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 14:39 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

... and add a helper function for the places where we want a flag.

This way we can use ring->id to index into arrays.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |    8 ++++----
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |    4 ++--
 drivers/gpu/drm/i915/i915_irq.c            |    2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c    |   14 +++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   20 ++++++++++----------
 5 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 8e95d66..75c5dd6 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -659,9 +659,9 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
 static const char *ring_str(int ring)
 {
 	switch (ring) {
-	case RING_RENDER: return " render";
-	case RING_BSD: return " bsd";
-	case RING_BLT: return " blt";
+	case RCS: return "render";
+	case VCS: return "bsd";
+	case BCS: return "blt";
 	default: return "";
 	}
 }
@@ -714,7 +714,7 @@ static void print_error_buffers(struct seq_file *m,
 			   tiling_flag(err->tiling),
 			   dirty_flag(err->dirty),
 			   purgeable_flag(err->purgeable),
-			   ring_str(err->ring),
+			   err->ring != -1 ? ring_str(err->ring) : "",
 			   cache_level_str(err->cache_level));
 
 		if (err->name)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 3693e83..926ed48 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -202,9 +202,9 @@ i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj,
 	cd->invalidate_domains |= invalidate_domains;
 	cd->flush_domains |= flush_domains;
 	if (flush_domains & I915_GEM_GPU_DOMAINS)
-		cd->flush_rings |= obj->ring->id;
+		cd->flush_rings |= intel_ring_flag(obj->ring);
 	if (invalidate_domains & I915_GEM_GPU_DOMAINS)
-		cd->flush_rings |= ring->id;
+		cd->flush_rings |= intel_ring_flag(ring);
 }
 
 struct eb_objects {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 3563a2e..6056e21 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -801,7 +801,7 @@ static u32 capture_bo_list(struct drm_i915_error_buffer *err,
 		err->tiling = obj->tiling_mode;
 		err->dirty = obj->dirty;
 		err->purgeable = obj->madv != I915_MADV_WILLNEED;
-		err->ring = obj->ring ? obj->ring->id : 0;
+		err->ring = obj->ring ? obj->ring->id : -1;
 		err->cache_level = obj->cache_level;
 
 		if (++i == count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 00f9eb5..7dffd26 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -613,13 +613,13 @@ void intel_ring_setup_status_page(struct intel_ring_buffer *ring)
 	 */
 	if (IS_GEN7(dev)) {
 		switch (ring->id) {
-		case RING_RENDER:
+		case RCS:
 			mmio = RENDER_HWS_PGA_GEN7;
 			break;
-		case RING_BLT:
+		case BCS:
 			mmio = BLT_HWS_PGA_GEN7;
 			break;
-		case RING_BSD:
+		case VCS:
 			mmio = BSD_HWS_PGA_GEN7;
 			break;
 		}
@@ -1064,7 +1064,7 @@ void intel_ring_advance(struct intel_ring_buffer *ring)
 
 static const struct intel_ring_buffer render_ring = {
 	.name			= "render ring",
-	.id			= RING_RENDER,
+	.id			= RCS,
 	.mmio_base		= RENDER_RING_BASE,
 	.size			= 32 * PAGE_SIZE,
 	.init			= init_render_ring,
@@ -1087,7 +1087,7 @@ static const struct intel_ring_buffer render_ring = {
 
 static const struct intel_ring_buffer bsd_ring = {
 	.name                   = "bsd ring",
-	.id			= RING_BSD,
+	.id			= VCS,
 	.mmio_base		= BSD_RING_BASE,
 	.size			= 32 * PAGE_SIZE,
 	.init			= init_ring_common,
@@ -1197,7 +1197,7 @@ gen6_bsd_ring_put_irq(struct intel_ring_buffer *ring)
 /* ring buffer for Video Codec for Gen6+ */
 static const struct intel_ring_buffer gen6_bsd_ring = {
 	.name			= "gen6 bsd ring",
-	.id			= RING_BSD,
+	.id			= VCS,
 	.mmio_base		= GEN6_BSD_RING_BASE,
 	.size			= 32 * PAGE_SIZE,
 	.init			= init_ring_common,
@@ -1332,7 +1332,7 @@ static void blt_ring_cleanup(struct intel_ring_buffer *ring)
 
 static const struct intel_ring_buffer gen6_blt_ring = {
 	.name			= "blt ring",
-	.id			= RING_BLT,
+	.id			= BCS,
 	.mmio_base		= BLT_RING_BASE,
 	.size			= 32 * PAGE_SIZE,
 	.init			= blt_ring_init,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 68281c9..c8b9cc0 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -1,13 +1,6 @@
 #ifndef _INTEL_RINGBUFFER_H_
 #define _INTEL_RINGBUFFER_H_
 
-enum {
-	RCS = 0x0,
-	VCS,
-	BCS,
-	I915_NUM_RINGS,
-};
-
 struct  intel_hw_status_page {
 	u32	__iomem	*page_addr;
 	unsigned int	gfx_addr;
@@ -36,10 +29,11 @@ struct  intel_hw_status_page {
 struct  intel_ring_buffer {
 	const char	*name;
 	enum intel_ring_id {
-		RING_RENDER = 0x1,
-		RING_BSD = 0x2,
-		RING_BLT = 0x4,
+		RCS = 0x0,
+		VCS,
+		BCS,
 	} id;
+#define I915_NUM_RINGS 3
 	u32		mmio_base;
 	void		__iomem *virtual_start;
 	struct		drm_device *dev;
@@ -119,6 +113,12 @@ struct  intel_ring_buffer {
 	void *private;
 };
 
+static inline unsigned
+intel_ring_flag(struct intel_ring_buffer *ring)
+{
+	return 1 << ring->id;
+}
+
 static inline u32
 intel_ring_sync_index(struct intel_ring_buffer *ring,
 		      struct intel_ring_buffer *other)
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 5/6] drm/i915: refactor ring error state capture to use arrays
  2011-10-11 14:39 [PATCH 1/6] drm/i915: hangcheck robustification Daniel Vetter
                   ` (2 preceding siblings ...)
  2011-10-11 14:39 ` [PATCH 4/6] drm/i915: switch ring->id to be a real id Daniel Vetter
@ 2011-10-11 14:39 ` Daniel Vetter
  2011-10-11 15:57   ` Chris Wilson
  2011-10-11 14:39 ` [PATCH 6/6] drm/i915: collect more per ring error state Daniel Vetter
  2011-10-19 11:32 ` [PATCH 1/6] drm/i915: hangcheck robustification Chris Wilson
  5 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 14:39 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

The code already got unwieldy and we want to dump more per-ring
registers.

Only functional change is that we now also capture the video
ring registers on ilk.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_debugfs.c |   55 +++++++++++++-----------
 drivers/gpu/drm/i915/i915_drv.h     |   20 ++-------
 drivers/gpu/drm/i915/i915_irq.c     |   79 ++++++++++++++++++----------------
 drivers/gpu/drm/i915/i915_reg.h     |   11 +----
 4 files changed, 79 insertions(+), 86 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 75c5dd6..ea7237d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -727,6 +727,26 @@ static void print_error_buffers(struct seq_file *m,
 	}
 }
 
+static void i915_ring_error_state(struct seq_file *m,
+				  struct drm_device *dev,
+				  struct drm_i915_error_state *error,
+				  unsigned ring)
+{
+	seq_printf(m, "%s command stream:\n", ring_str(ring));
+	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd[ring]);
+	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir[ring]);
+	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr[ring]);
+	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone[ring]);
+	if (ring == RCS) {
+		if (INTEL_INFO(dev)->gen >= 4) {
+			seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
+			seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
+		}
+		seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
+	}
+	seq_printf(m, "  seqno: 0x%08x\n", error->seqno[ring]);
+}
+
 static int i915_error_state(struct seq_file *m, void *unused)
 {
 	struct drm_info_node *node = (struct drm_info_node *) m->private;
@@ -749,36 +769,19 @@ static int i915_error_state(struct seq_file *m, void *unused)
 	seq_printf(m, "PCI ID: 0x%04x\n", dev->pci_device);
 	seq_printf(m, "EIR: 0x%08x\n", error->eir);
 	seq_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
-	if (INTEL_INFO(dev)->gen >= 6) {
-		seq_printf(m, "ERROR: 0x%08x\n", error->error);
-		seq_printf(m, "Blitter command stream:\n");
-		seq_printf(m, "  ACTHD:    0x%08x\n", error->bcs_acthd);
-		seq_printf(m, "  IPEIR:    0x%08x\n", error->bcs_ipeir);
-		seq_printf(m, "  IPEHR:    0x%08x\n", error->bcs_ipehr);
-		seq_printf(m, "  INSTDONE: 0x%08x\n", error->bcs_instdone);
-		seq_printf(m, "  seqno:    0x%08x\n", error->bcs_seqno);
-		seq_printf(m, "Video (BSD) command stream:\n");
-		seq_printf(m, "  ACTHD:    0x%08x\n", error->vcs_acthd);
-		seq_printf(m, "  IPEIR:    0x%08x\n", error->vcs_ipeir);
-		seq_printf(m, "  IPEHR:    0x%08x\n", error->vcs_ipehr);
-		seq_printf(m, "  INSTDONE: 0x%08x\n", error->vcs_instdone);
-		seq_printf(m, "  seqno:    0x%08x\n", error->vcs_seqno);
-	}
-	seq_printf(m, "Render command stream:\n");
-	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd);
-	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir);
-	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr);
-	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone);
-	if (INTEL_INFO(dev)->gen >= 4) {
-		seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
-		seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
-	}
-	seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
-	seq_printf(m, "  seqno: 0x%08x\n", error->seqno);
 
 	for (i = 0; i < dev_priv->num_fence_regs; i++)
 		seq_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
 
+	if (INTEL_INFO(dev)->gen >= 6) 
+		seq_printf(m, "ERROR: 0x%08x\n", error->error);
+
+	i915_ring_error_state(m, dev, error, RCS);
+	if (HAS_BLT(dev))
+		i915_ring_error_state(m, dev, error, BCS);
+	if (HAS_BSD(dev))
+		i915_ring_error_state(m, dev, error, VCS);
+
 	if (error->active_bo)
 		print_error_buffers(m, "Active",
 				    error->active_bo,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4e73a86..5f3ff9d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -149,25 +149,15 @@ struct drm_i915_error_state {
 	u32 eir;
 	u32 pgtbl_er;
 	u32 pipestat[I915_MAX_PIPES];
-	u32 ipeir;
-	u32 ipehr;
-	u32 instdone;
-	u32 acthd;
+	u32 ipeir[I915_NUM_RINGS];
+	u32 ipehr[I915_NUM_RINGS];
+	u32 instdone[I915_NUM_RINGS];
+	u32 acthd[I915_NUM_RINGS];
 	u32 error; /* gen6+ */
-	u32 bcs_acthd; /* gen6+ blt engine */
-	u32 bcs_ipehr;
-	u32 bcs_ipeir;
-	u32 bcs_instdone;
-	u32 bcs_seqno;
-	u32 vcs_acthd; /* gen6+ bsd engine */
-	u32 vcs_ipehr;
-	u32 vcs_ipeir;
-	u32 vcs_instdone;
-	u32 vcs_seqno;
 	u32 instpm;
 	u32 instps;
 	u32 instdone1;
-	u32 seqno;
+	u32 seqno[I915_NUM_RINGS];
 	u64 bbaddr;
 	u64 fence[16];
 	struct timeval time;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6056e21..79aba7f 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -872,6 +872,32 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 	return NULL;
 }
 
+static void i915_record_ring_state(struct drm_device *dev,
+				   struct drm_i915_error_state *error,
+				   struct intel_ring_buffer *ring)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (INTEL_INFO(dev)->gen >= 4) {
+		error->ipeir[ring->id] = I915_READ(RING_IPEIR(ring->mmio_base));
+		error->ipehr[ring->id] = I915_READ(RING_IPEHR(ring->mmio_base));
+		error->instdone[ring->id] = I915_READ(RING_INSTDONE(ring->mmio_base));
+		if (ring->id == RCS) {
+			error->instps = I915_READ(INSTPS);
+			error->instdone1 = I915_READ(INSTDONE1);
+			error->bbaddr = I915_READ64(BB_ADDR);
+		}
+	} else {
+		error->ipeir[ring->id] = I915_READ(IPEIR);
+		error->ipehr[ring->id] = I915_READ(IPEHR);
+		error->instdone[ring->id] = I915_READ(INSTDONE);
+		error->bbaddr = 0;
+	}
+
+	error->seqno[ring->id] = dev_priv->ring->get_seqno(ring);
+	error->acthd[ring->id] = intel_ring_get_active_head(ring);
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -905,47 +931,23 @@ static void i915_capture_error_state(struct drm_device *dev)
 	DRM_INFO("capturing error event; look for more information in /debug/dri/%d/i915_error_state\n",
 		 dev->primary->index);
 
-	error->seqno = dev_priv->ring[RCS].get_seqno(&dev_priv->ring[RCS]);
 	error->eir = I915_READ(EIR);
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 	for_each_pipe(pipe)
 		error->pipestat[pipe] = I915_READ(PIPESTAT(pipe));
 	error->instpm = I915_READ(INSTPM);
-	error->error = 0;
-	if (INTEL_INFO(dev)->gen >= 6) {
+
+	if (INTEL_INFO(dev)->gen >= 6)
 		error->error = I915_READ(ERROR_GEN6);
+	else
+		error->error = 0;
+
+	i915_record_ring_state(dev, error, &dev_priv->ring[RCS]);
+	if (HAS_BLT(dev))
+		i915_record_ring_state(dev, error, &dev_priv->ring[BCS]);
+	if (HAS_BSD(dev))
+		i915_record_ring_state(dev, error, &dev_priv->ring[VCS]);
 
-		error->bcs_acthd = I915_READ(BCS_ACTHD);
-		error->bcs_ipehr = I915_READ(BCS_IPEHR);
-		error->bcs_ipeir = I915_READ(BCS_IPEIR);
-		error->bcs_instdone = I915_READ(BCS_INSTDONE);
-		error->bcs_seqno = 0;
-		if (dev_priv->ring[BCS].get_seqno)
-			error->bcs_seqno = dev_priv->ring[BCS].get_seqno(&dev_priv->ring[BCS]);
-
-		error->vcs_acthd = I915_READ(VCS_ACTHD);
-		error->vcs_ipehr = I915_READ(VCS_IPEHR);
-		error->vcs_ipeir = I915_READ(VCS_IPEIR);
-		error->vcs_instdone = I915_READ(VCS_INSTDONE);
-		error->vcs_seqno = 0;
-		if (dev_priv->ring[VCS].get_seqno)
-			error->vcs_seqno = dev_priv->ring[VCS].get_seqno(&dev_priv->ring[VCS]);
-	}
-	if (INTEL_INFO(dev)->gen >= 4) {
-		error->ipeir = I915_READ(IPEIR_I965);
-		error->ipehr = I915_READ(IPEHR_I965);
-		error->instdone = I915_READ(INSTDONE_I965);
-		error->instps = I915_READ(INSTPS);
-		error->instdone1 = I915_READ(INSTDONE1);
-		error->acthd = I915_READ(ACTHD_I965);
-		error->bbaddr = I915_READ64(BB_ADDR);
-	} else {
-		error->ipeir = I915_READ(IPEIR);
-		error->ipehr = I915_READ(IPEHR);
-		error->instdone = I915_READ(INSTDONE);
-		error->acthd = I915_READ(ACTHD);
-		error->bbaddr = 0;
-	}
 	i915_gem_record_fences(dev, error);
 
 	/* Record the active batch and ring buffers */
@@ -1659,11 +1661,14 @@ instdone_stuck(struct drm_device *dev)
 	switch (INTEL_INFO(dev)->gen) {
 	case 7:
 	case 6:
-		bcs_instdone = I915_READ(BCS_INSTDONE);
+		bcs_instdone =
+		    I915_READ(RING_INSTDONE(dev_priv->ring[BCS].mmio_base));
 	case 5:
-		vcs_instdone = I915_READ(VCS_INSTDONE);
+		vcs_instdone =
+		    I915_READ(RING_INSTDONE(dev_priv->ring[VCS].mmio_base));
 	case 4:
-		instdone = I915_READ(INSTDONE_I965);
+		instdone =
+		    I915_READ(RING_INSTDONE(dev_priv->ring[RCS].mmio_base));
 		instdone1 = I915_READ(INSTDONE1);
 		break;
 	case 3:
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 138eae1..bf7923c 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -346,6 +346,9 @@
 #define IPEIR_I965	0x02064
 #define IPEHR_I965	0x02068
 #define INSTDONE_I965	0x0206c
+#define RING_IPEIR(base)	((base)+0x64)
+#define RING_IPEHR(base)	((base)+0x68)
+#define RING_INSTDONE(base)	((base)+0x6c)
 #define INSTPS		0x02070 /* 965+ only */
 #define INSTDONE1	0x0207c /* 965+ only */
 #define ACTHD_I965	0x02074
@@ -359,14 +362,6 @@
 #define INSTDONE	0x02090
 #define NOPID		0x02094
 #define HWSTAM		0x02098
-#define VCS_INSTDONE	0x1206C
-#define VCS_IPEIR	0x12064
-#define VCS_IPEHR	0x12068
-#define VCS_ACTHD	0x12074
-#define BCS_INSTDONE	0x2206C
-#define BCS_IPEIR	0x22064
-#define BCS_IPEHR	0x22068
-#define BCS_ACTHD	0x22074
 
 #define ERROR_GEN6	0x040a0
 
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 6/6] drm/i915: collect more per ring error state
  2011-10-11 14:39 [PATCH 1/6] drm/i915: hangcheck robustification Daniel Vetter
                   ` (3 preceding siblings ...)
  2011-10-11 14:39 ` [PATCH 5/6] drm/i915: refactor ring error state capture to use arrays Daniel Vetter
@ 2011-10-11 14:39 ` Daniel Vetter
  2011-10-11 16:01   ` Chris Wilson
  2011-10-19 11:32 ` [PATCH 1/6] drm/i915: hangcheck robustification Chris Wilson
  5 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 14:39 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

Based on a patch by Ben Widawsky, but with different colors
for the bikeshed.

In contrast to Ben's patch this one doesn't add the fault regs.
Afaics they're for the optional page fault support which
- we're not enabling
- and which seems to be unsupported by the hw team. Recent bspec
  lacks tons of information about this that the public docs released
  half a year back still contain.

Also dump ring HEAD/TAIL registers - I've recently seen a few
error_state where just guessing these is not good enough.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>

dump head/tail into error_state
---
 drivers/gpu/drm/i915/i915_debugfs.c |    5 ++++-
 drivers/gpu/drm/i915/i915_drv.h     |    5 ++++-
 drivers/gpu/drm/i915/i915_irq.c     |    7 ++++++-
 drivers/gpu/drm/i915/i915_reg.h     |    2 ++
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index ea7237d..5d73ae5 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -733,17 +733,20 @@ static void i915_ring_error_state(struct seq_file *m,
 				  unsigned ring)
 {
 	seq_printf(m, "%s command stream:\n", ring_str(ring));
+	seq_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
+	seq_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
 	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd[ring]);
 	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir[ring]);
 	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr[ring]);
 	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone[ring]);
+	seq_printf(m, "  INSTPS: 0x%08x\n", error->instps[ring]);
 	if (ring == RCS) {
 		if (INTEL_INFO(dev)->gen >= 4) {
 			seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
-			seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
 		}
 		seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
 	}
+	seq_printf(m, "  FADDR: 0x%08x\n", error->faddr[ring]);
 	seq_printf(m, "  seqno: 0x%08x\n", error->seqno[ring]);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5f3ff9d..9f6987f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -149,16 +149,19 @@ struct drm_i915_error_state {
 	u32 eir;
 	u32 pgtbl_er;
 	u32 pipestat[I915_MAX_PIPES];
+	u32 tail[I915_MAX_PIPES];
+	u32 head[I915_MAX_PIPES];
 	u32 ipeir[I915_NUM_RINGS];
 	u32 ipehr[I915_NUM_RINGS];
 	u32 instdone[I915_NUM_RINGS];
 	u32 acthd[I915_NUM_RINGS];
 	u32 error; /* gen6+ */
 	u32 instpm;
-	u32 instps;
+	u32 instps[I915_NUM_RINGS];
 	u32 instdone1;
 	u32 seqno[I915_NUM_RINGS];
 	u64 bbaddr;
+	u32 faddr[I915_NUM_RINGS];
 	u64 fence[16];
 	struct timeval time;
 	struct drm_i915_error_object {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 79aba7f..7b87599 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -878,12 +878,15 @@ static void i915_record_ring_state(struct drm_device *dev,
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
+	if (INTEL_INFO(dev)->gen >= 6) {
+		error->faddr[ring->id] = I915_READ(RING_DMA_FADD(ring->mmio_base));
+	}
 	if (INTEL_INFO(dev)->gen >= 4) {
 		error->ipeir[ring->id] = I915_READ(RING_IPEIR(ring->mmio_base));
 		error->ipehr[ring->id] = I915_READ(RING_IPEHR(ring->mmio_base));
 		error->instdone[ring->id] = I915_READ(RING_INSTDONE(ring->mmio_base));
+		error->instps[ring->id] = I915_READ(RING_INSTPS(ring->mmio_base));
 		if (ring->id == RCS) {
-			error->instps = I915_READ(INSTPS);
 			error->instdone1 = I915_READ(INSTDONE1);
 			error->bbaddr = I915_READ64(BB_ADDR);
 		}
@@ -896,6 +899,8 @@ static void i915_record_ring_state(struct drm_device *dev,
 
 	error->seqno[ring->id] = dev_priv->ring->get_seqno(ring);
 	error->acthd[ring->id] = intel_ring_get_active_head(ring);
+	error->head[ring->id] = I915_READ_HEAD(ring);
+	error->tail[ring->id] = I915_READ_TAIL(ring);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index bf7923c..34a4692 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -349,6 +349,8 @@
 #define RING_IPEIR(base)	((base)+0x64)
 #define RING_IPEHR(base)	((base)+0x68)
 #define RING_INSTDONE(base)	((base)+0x6c)
+#define RING_INSTPS(base)	((base)+0x70)
+#define RING_DMA_FADD(base)	((base)+0x78)
 #define INSTPS		0x02070 /* 965+ only */
 #define INSTDONE1	0x0207c /* 965+ only */
 #define ACTHD_I965	0x02074
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/6] drm/i915: kicking rings stuck on semaphores considered harmful
  2011-10-11 14:39 ` [PATCH 2/6] drm/i915: kicking rings stuck on semaphores considered harmful Daniel Vetter
@ 2011-10-11 15:51   ` Chris Wilson
  2011-10-11 20:48   ` Ben Widawsky
  1 sibling, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2011-10-11 15:51 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Tue, 11 Oct 2011 16:39:10 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> If our semaphore logic gets confused and we have a ring stuck waiting
> for one, there's a decent chance it'll just execute garbage when being
> kicked. Also, kicking the ring obscures the place where the error
> first occurred, making error_state decoding much harder.
> 
> So drop this and let gpu reset handle this mess in a clean fashion.
> 
> In contrast, kicking rings stuck on MI_WAIT is rather harmless, at
> worst there'll be a bit of screen-flickering. There's also old
> broken userspace out there which needs this as a work-around.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/6] drm/i915: don't bail out of intel_wait_ring_buffer too early
  2011-10-11 14:39 ` [PATCH 3/6] drm/i915: don't bail out of intel_wait_ring_buffer too early Daniel Vetter
@ 2011-10-11 15:53   ` Chris Wilson
  2011-10-11 17:25     ` [PATCH] " Daniel Vetter
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2011-10-11 15:53 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Tue, 11 Oct 2011 16:39:11 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> In the pre-gem days with non-existing hangcheck and gpu reset code,
> this timeout of 3 seconds was pretty important to avoid stuck
> processes.
> 
> But now we have the hangcheck code in gem that goes to great lengths
> to ensure that the gpu is really dead before declaring it wedged.
> 
> So there's no need for this timeout anymore. Actually it's even harmful
> because we can bail out too early (e.g. with xscreensaver slip)
> when running giant batchbuffers. And our code isn't robust enough
> to properly unroll any state-changes, we pretty much rely on the gpu
> reset code cleaning up the mess (like cache tracking, fencing state,
> active list/request tracking, ...).
> 
> With this change intel_begin_ring can only fail when the gpu is
> wedged, and it will return -EAGAIN (like wait_request in case the
> gpu reset is still outstanding).
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>

This makes me nervous, as it wasn't very long ago that we hit this
timeout during resume. (A bug nevertheless, but promoting such to an
infinite loop is worse...)

Can we just make the timeout insanely large in the HAS_GEM case?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/6] drm/i915: switch ring->id to be a real id
  2011-10-11 14:39 ` [PATCH 4/6] drm/i915: switch ring->id to be a real id Daniel Vetter
@ 2011-10-11 15:55   ` Chris Wilson
  2011-10-11 17:27     ` [PATCH] drm/i915: don't bail out of intel_wait_ring_buffer too early Daniel Vetter
  2011-10-11 17:29     ` [PATCH] drm/i915: switch ring->id to be a real id Daniel Vetter
  0 siblings, 2 replies; 26+ messages in thread
From: Chris Wilson @ 2011-10-11 15:55 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Tue, 11 Oct 2011 16:39:12 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> ... and add a helper function for the places where we want a flag.
> 
> This way we can use ring->id to index into arrays.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c        |    8 ++++----
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |    4 ++--
>  drivers/gpu/drm/i915/i915_irq.c            |    2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |   14 +++++++-------
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |   20 ++++++++++----------
>  5 files changed, 24 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 8e95d66..75c5dd6 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -659,9 +659,9 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
>  static const char *ring_str(int ring)
>  {
>  	switch (ring) {
> -	case RING_RENDER: return " render";
> -	case RING_BSD: return " bsd";
> -	case RING_BLT: return " blt";
> +	case RCS: return "render";
> +	case VCS: return "bsd";
> +	case BCS: return "blt";
>  	default: return "";

You dropped the beautifying space. On purpose?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/6] drm/i915: refactor ring error state capture to use arrays
  2011-10-11 14:39 ` [PATCH 5/6] drm/i915: refactor ring error state capture to use arrays Daniel Vetter
@ 2011-10-11 15:57   ` Chris Wilson
  0 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2011-10-11 15:57 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Tue, 11 Oct 2011 16:39:13 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> The code already got unwindy and we want to dump more per-ring
                       ^ unwieldy
> registers.
> 
> Only functional change is that we now also capture the video
> ring registers on ilk.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>

Ah, I see why now the space had to go. But we still need that space at the
other callsite.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 6/6] drm/i915: collect more per ring error state
  2011-10-11 14:39 ` [PATCH 6/6] drm/i915: collect more per ring error state Daniel Vetter
@ 2011-10-11 16:01   ` Chris Wilson
  2011-10-11 17:30     ` [PATCH] " Daniel Vetter
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2011-10-11 16:01 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Tue, 11 Oct 2011 16:39:14 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> Based on a patch by Ben Widawsky, but with different colors
> for the bikeshed.
> 
> In contrast to Ben's patch this one doesn't add the fault regs.
> Afaics they're for the optional page fault support which
> - we're not enabling
> - and which seems to be unsupported by the hw team. Recent bspec
>   lacks tons of information about this that the public docs released
>   half a year back still contain.
> 
> Also dump ring HEAD/TAIL registers - I've recently seen a few
> error_state where just guessing these is not good enough.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> 
> dump head/tail into error_state
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c |    5 ++++-
>  drivers/gpu/drm/i915/i915_drv.h     |    5 ++++-
>  drivers/gpu/drm/i915/i915_irq.c     |    7 ++++++-
>  drivers/gpu/drm/i915/i915_reg.h     |    2 ++
>  4 files changed, 16 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index ea7237d..5d73ae5 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -733,17 +733,20 @@ static void i915_ring_error_state(struct seq_file *m,
>  				  unsigned ring)
>  {
>  	seq_printf(m, "%s command stream:\n", ring_str(ring));
> +	seq_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
> +	seq_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
>  	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd[ring]);
>  	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir[ring]);
>  	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr[ring]);
>  	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone[ring]);
> +	seq_printf(m, "  INSTPS: 0x%08x\n", error->instps[ring]);
>  	if (ring == RCS) {
>  		if (INTEL_INFO(dev)->gen >= 4) {
>  			seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
> -			seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
>  		}
>  		seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);

Hmm, isn't INSTPM also per-ring?

Can you also add the DMA fetch addresses whilst you are here? I've never
found an instance where they were more informative than the ACTHD, but
that was only for a very small sampling of local errors.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH] drm/i915: don't bail out of intel_wait_ring_buffer too early
  2011-10-11 15:53   ` Chris Wilson
@ 2011-10-11 17:25     ` Daniel Vetter
  2011-10-18 15:24       ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 17:25 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson; +Cc: Daniel Vetter

In the pre-gem days with non-existing hangcheck and gpu reset code,
this timeout of 3 seconds was pretty important to avoid stuck
processes.

But now we have the hangcheck code in gem that goes to great lengths
to ensure that the gpu is really dead before declaring it wedged.

So there's no need for this timeout anymore. Actually it's even harmful
because we can bail out too early (e.g. with xscreensaver slip)
when running giant batchbuffers. And our code isn't robust enough
to properly unroll any state-changes, we pretty much rely on the gpu
reset code cleaning up the mess (like cache tracking, fencing state,
active list/request tracking, ...).

With this change intel_begin_ring can only fail when the gpu is
wedged, and it will return -EAGAIN (like wait_request in case the
gpu reset is still outstanding).

v2: Chris Wilson noted that on resume timers aren't running and hence
we won't ever get kicked out of this loop by the hangcheck code. Use
an insanely large timeout instead for the HAS_GEM case to prevent
resume bugs from totally hanging the machine.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/intel_ringbuffer.c |   11 ++++++++++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 0e99589..d33cbf7 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1006,7 +1006,16 @@ int intel_wait_ring_buffer(struct intel_ring_buffer *ring, int n)
 	}

 	trace_i915_ring_wait_begin(ring);
-	end = jiffies + 3 * HZ;
+	if (drm_core_check_feature(dev, DRIVER_GEM))
+		/* With GEM the hangcheck timer should kick us out of the loop,
+		 * leaving it early runs the risk of corrupting GEM state (due
+		 * to running on almost untested codepaths). But on resume
+		 * timers don't work yet, so prevent a complete hang in that
+		 * case by choosing an insanely large timeout. */
+		end = jiffies + 60 * HZ;
+	else
+		end = jiffies + 3 * HZ;
+
 	do {
 		ring->head = I915_READ_HEAD(ring);
 		ring->space = ring_space(ring);
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] drm/i915: don't bail out of intel_wait_ring_buffer too early
  2011-10-11 15:55   ` Chris Wilson
@ 2011-10-11 17:27     ` Daniel Vetter
  2011-10-11 19:31       ` Daniel Vetter
  2011-10-11 17:29     ` [PATCH] drm/i915: switch ring->id to be a real id Daniel Vetter
  1 sibling, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 17:27 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson; +Cc: Daniel Vetter

In the pre-gem days with non-existing hangcheck and gpu reset code,
this timeout of 3 seconds was pretty important to avoid stuck
processes.

But now we have the hangcheck code in gem that goes to great lengths
to ensure that the gpu is really dead before declaring it wedged.

So there's no need for this timeout anymore. Actually it's even harmful
because we can bail out too early (e.g. with xscreensaver slip)
when running giant batchbuffers. And our code isn't robust enough
to properly unroll any state-changes, we pretty much rely on the gpu
reset code cleaning up the mess (like cache tracking, fencing state,
active list/request tracking, ...).

With this change intel_begin_ring can only fail when the gpu is
wedged, and it will return -EAGAIN (like wait_request in case the
gpu reset is still outstanding).

v2: Chris Wilson noted that on resume timers aren't running and hence
we won't ever get kicked out of this loop by the hangcheck code. Use
an insanely large timeout instead for the HAS_GEM case to prevent
resume bugs from totally hanging the machine.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/intel_ringbuffer.c |   11 ++++++++++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 0e99589..d33cbf7 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1006,7 +1006,16 @@ int intel_wait_ring_buffer(struct intel_ring_buffer *ring, int n)
 	}

 	trace_i915_ring_wait_begin(ring);
-	end = jiffies + 3 * HZ;
+	if (drm_core_check_feature(dev, DRIVER_GEM))
+		/* With GEM the hangcheck timer should kick us out of the loop,
+		 * leaving it early runs the risk of corrupting GEM state (due
+		 * to running on almost untested codepaths). But on resume
+		 * timers don't work yet, so prevent a complete hang in that
+		 * case by choosing an insanely large timeout. */
+		end = jiffies + 60 * HZ;
+	else
+		end = jiffies + 3 * HZ;
+
 	do {
 		ring->head = I915_READ_HEAD(ring);
 		ring->space = ring_space(ring);
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] drm/i915: switch ring->id to be a real id
  2011-10-11 15:55   ` Chris Wilson
  2011-10-11 17:27     ` [PATCH] drm/i915: don't bail out of intel_wait_ring_buffer too early Daniel Vetter
@ 2011-10-11 17:29     ` Daniel Vetter
  2011-10-18 15:27       ` Chris Wilson
  1 sibling, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 17:29 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson; +Cc: Daniel Vetter

... and add a helper function for the places where we want a flag.

This way we can use ring->id to index into arrays.

v2: Resurrect the missing beautification-space Chris Wilson noted.
I'm moving this space around because I'll reuse ring_str in the next
patch.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |    9 +++++----
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |    4 ++--
 drivers/gpu/drm/i915/i915_irq.c            |    2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c    |   14 +++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   20 ++++++++++----------
 5 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 8e95d66..0ad7d10 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -659,9 +659,9 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
 static const char *ring_str(int ring)
 {
 	switch (ring) {
-	case RING_RENDER: return " render";
-	case RING_BSD: return " bsd";
-	case RING_BLT: return " blt";
+	case RCS: return "render";
+	case VCS: return "bsd";
+	case BCS: return "blt";
 	default: return "";
 	}
 }
@@ -704,7 +704,7 @@ static void print_error_buffers(struct seq_file *m,
 	seq_printf(m, "%s [%d]:\n", name, count);
 
 	while (count--) {
-		seq_printf(m, "  %08x %8u %04x %04x %08x%s%s%s%s%s%s",
+		seq_printf(m, "  %08x %8u %04x %04x %08x%s%s%s%s%s%s%s",
 			   err->gtt_offset,
 			   err->size,
 			   err->read_domains,
@@ -714,6 +714,7 @@ static void print_error_buffers(struct seq_file *m,
 			   tiling_flag(err->tiling),
 			   dirty_flag(err->dirty),
 			   purgeable_flag(err->purgeable),
+			   err->ring != -1 ? " " : "",
 			   ring_str(err->ring),
 			   cache_level_str(err->cache_level));
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 3693e83..926ed48 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -202,9 +202,9 @@ i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj,
 	cd->invalidate_domains |= invalidate_domains;
 	cd->flush_domains |= flush_domains;
 	if (flush_domains & I915_GEM_GPU_DOMAINS)
-		cd->flush_rings |= obj->ring->id;
+		cd->flush_rings |= intel_ring_flag(obj->ring);
 	if (invalidate_domains & I915_GEM_GPU_DOMAINS)
-		cd->flush_rings |= ring->id;
+		cd->flush_rings |= intel_ring_flag(ring);
 }
 
 struct eb_objects {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 3563a2e..6056e21 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -801,7 +801,7 @@ static u32 capture_bo_list(struct drm_i915_error_buffer *err,
 		err->tiling = obj->tiling_mode;
 		err->dirty = obj->dirty;
 		err->purgeable = obj->madv != I915_MADV_WILLNEED;
-		err->ring = obj->ring ? obj->ring->id : 0;
+		err->ring = obj->ring ? obj->ring->id : -1;
 		err->cache_level = obj->cache_level;
 
 		if (++i == count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index d33cbf7..a056ecf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -613,13 +613,13 @@ void intel_ring_setup_status_page(struct intel_ring_buffer *ring)
 	 */
 	if (IS_GEN7(dev)) {
 		switch (ring->id) {
-		case RING_RENDER:
+		case RCS:
 			mmio = RENDER_HWS_PGA_GEN7;
 			break;
-		case RING_BLT:
+		case BCS:
 			mmio = BLT_HWS_PGA_GEN7;
 			break;
-		case RING_BSD:
+		case VCS:
 			mmio = BSD_HWS_PGA_GEN7;
 			break;
 		}
@@ -1072,7 +1072,7 @@ void intel_ring_advance(struct intel_ring_buffer *ring)
 
 static const struct intel_ring_buffer render_ring = {
 	.name			= "render ring",
-	.id			= RING_RENDER,
+	.id			= RCS,
 	.mmio_base		= RENDER_RING_BASE,
 	.size			= 32 * PAGE_SIZE,
 	.init			= init_render_ring,
@@ -1095,7 +1095,7 @@ static const struct intel_ring_buffer render_ring = {
 
 static const struct intel_ring_buffer bsd_ring = {
 	.name                   = "bsd ring",
-	.id			= RING_BSD,
+	.id			= VCS,
 	.mmio_base		= BSD_RING_BASE,
 	.size			= 32 * PAGE_SIZE,
 	.init			= init_ring_common,
@@ -1205,7 +1205,7 @@ gen6_bsd_ring_put_irq(struct intel_ring_buffer *ring)
 /* ring buffer for Video Codec for Gen6+ */
 static const struct intel_ring_buffer gen6_bsd_ring = {
 	.name			= "gen6 bsd ring",
-	.id			= RING_BSD,
+	.id			= VCS,
 	.mmio_base		= GEN6_BSD_RING_BASE,
 	.size			= 32 * PAGE_SIZE,
 	.init			= init_ring_common,
@@ -1340,7 +1340,7 @@ static void blt_ring_cleanup(struct intel_ring_buffer *ring)
 
 static const struct intel_ring_buffer gen6_blt_ring = {
 	.name			= "blt ring",
-	.id			= RING_BLT,
+	.id			= BCS,
 	.mmio_base		= BLT_RING_BASE,
 	.size			= 32 * PAGE_SIZE,
 	.init			= blt_ring_init,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 68281c9..c8b9cc0 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -1,13 +1,6 @@
 #ifndef _INTEL_RINGBUFFER_H_
 #define _INTEL_RINGBUFFER_H_
 
-enum {
-	RCS = 0x0,
-	VCS,
-	BCS,
-	I915_NUM_RINGS,
-};
-
 struct  intel_hw_status_page {
 	u32	__iomem	*page_addr;
 	unsigned int	gfx_addr;
@@ -36,10 +29,11 @@ struct  intel_hw_status_page {
 struct  intel_ring_buffer {
 	const char	*name;
 	enum intel_ring_id {
-		RING_RENDER = 0x1,
-		RING_BSD = 0x2,
-		RING_BLT = 0x4,
+		RCS = 0x0,
+		VCS,
+		BCS,
 	} id;
+#define I915_NUM_RINGS 3
 	u32		mmio_base;
 	void		__iomem *virtual_start;
 	struct		drm_device *dev;
@@ -119,6 +113,12 @@ struct  intel_ring_buffer {
 	void *private;
 };
 
+static inline unsigned
+intel_ring_flag(struct intel_ring_buffer *ring)
+{
+	return 1 << ring->id;
+}
+
 static inline u32
 intel_ring_sync_index(struct intel_ring_buffer *ring,
 		      struct intel_ring_buffer *other)
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] drm/i915: collect more per ring error state
  2011-10-11 16:01   ` Chris Wilson
@ 2011-10-11 17:30     ` Daniel Vetter
  2011-10-11 19:23       ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 17:30 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson; +Cc: Daniel Vetter

Based on a patch by Ben Widawsky, but with different colors
for the bikeshed.

In contrast to Ben's patch this one doesn't add the fault regs.
Afaics they're for the optional page fault support which
- we're not enabling
- and which seems to be unsupported by the hw team. Recent bspec
  lacks tons of information about this that the public docs released
  half a year back still contain.

Also dump ring HEAD/TAIL registers - I've recently seen a few
error_state where just guessing these is not good enough.

v2: Also dump INSTPM for every ring.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_debugfs.c |   13 +++++++------
 drivers/gpu/drm/i915/i915_drv.h     |    7 +++++--
 drivers/gpu/drm/i915/i915_irq.c     |    9 +++++++--
 drivers/gpu/drm/i915/i915_reg.h     |    3 +++
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index d618f78..043fc52 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -734,17 +734,18 @@ static void i915_ring_error_state(struct seq_file *m,
 				  unsigned ring)
 {
 	seq_printf(m, "%s command stream:\n", ring_str(ring));
+	seq_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
+	seq_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
 	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd[ring]);
 	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir[ring]);
 	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr[ring]);
 	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone[ring]);
-	if (ring == RCS) {
-		if (INTEL_INFO(dev)->gen >= 4) {
-			seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
-			seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
-		}
-		seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
+	if (ring == RCS && INTEL_INFO(dev)->gen >= 4) {
+		seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
 	}
+	seq_printf(m, "  INSTPS: 0x%08x\n", error->instps[ring]);
+	seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm[ring]);
+	seq_printf(m, "  FADDR: 0x%08x\n", error->faddr[ring]);
 	seq_printf(m, "  seqno: 0x%08x\n", error->seqno[ring]);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5f3ff9d..2abd6fd 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -149,16 +149,19 @@ struct drm_i915_error_state {
 	u32 eir;
 	u32 pgtbl_er;
 	u32 pipestat[I915_MAX_PIPES];
+	u32 tail[I915_MAX_PIPES];
+	u32 head[I915_MAX_PIPES];
 	u32 ipeir[I915_NUM_RINGS];
 	u32 ipehr[I915_NUM_RINGS];
 	u32 instdone[I915_NUM_RINGS];
 	u32 acthd[I915_NUM_RINGS];
 	u32 error; /* gen6+ */
-	u32 instpm;
-	u32 instps;
+	u32 instpm[I915_NUM_RINGS];
+	u32 instps[I915_NUM_RINGS];
 	u32 instdone1;
 	u32 seqno[I915_NUM_RINGS];
 	u64 bbaddr;
+	u32 faddr[I915_NUM_RINGS];
 	u64 fence[16];
 	struct timeval time;
 	struct drm_i915_error_object {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 79aba7f..aa70981 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -878,12 +878,15 @@ static void i915_record_ring_state(struct drm_device *dev,
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
+	if (INTEL_INFO(dev)->gen >= 6) {
+		error->faddr[ring->id] = I915_READ(RING_DMA_FADD(ring->mmio_base));
+	}
 	if (INTEL_INFO(dev)->gen >= 4) {
 		error->ipeir[ring->id] = I915_READ(RING_IPEIR(ring->mmio_base));
 		error->ipehr[ring->id] = I915_READ(RING_IPEHR(ring->mmio_base));
 		error->instdone[ring->id] = I915_READ(RING_INSTDONE(ring->mmio_base));
+		error->instps[ring->id] = I915_READ(RING_INSTPS(ring->mmio_base));
 		if (ring->id == RCS) {
-			error->instps = I915_READ(INSTPS);
 			error->instdone1 = I915_READ(INSTDONE1);
 			error->bbaddr = I915_READ64(BB_ADDR);
 		}
@@ -894,8 +897,11 @@ static void i915_record_ring_state(struct drm_device *dev,
 		error->bbaddr = 0;
 	}
 
+	error->instpm[ring->id] = I915_READ(RING_INSTPM(ring->mmio_base));
 	error->seqno[ring->id] = dev_priv->ring->get_seqno(ring);
 	error->acthd[ring->id] = intel_ring_get_active_head(ring);
+	error->head[ring->id] = I915_READ_HEAD(ring);
+	error->tail[ring->id] = I915_READ_TAIL(ring);
 }
 
 /**
@@ -935,7 +941,6 @@ static void i915_capture_error_state(struct drm_device *dev)
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 	for_each_pipe(pipe)
 		error->pipestat[pipe] = I915_READ(PIPESTAT(pipe));
-	error->instpm = I915_READ(INSTPM);
 
 	if (INTEL_INFO(dev)->gen >= 6)
 		error->error = I915_READ(ERROR_GEN6);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index bf7923c..d228fab 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -349,6 +349,9 @@
 #define RING_IPEIR(base)	((base)+0x64)
 #define RING_IPEHR(base)	((base)+0x68)
 #define RING_INSTDONE(base)	((base)+0x6c)
+#define RING_INSTPS(base)	((base)+0x70)
+#define RING_DMA_FADD(base)	((base)+0x78)
+#define RING_INSTPM(base)	((base)+0xc0)
 #define INSTPS		0x02070 /* 965+ only */
 #define INSTDONE1	0x0207c /* 965+ only */
 #define ACTHD_I965	0x02074
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] drm/i915: collect more per ring error state
  2011-10-11 19:23       ` Chris Wilson
@ 2011-10-11 19:20         ` Daniel Vetter
  2011-10-30 18:39           ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 19:20 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

Based on a patch by Ben Widawsky, but with different colors
for the bikeshed.

In contrast to Ben's patch this one doesn't add the fault regs.
Afaics they're for the optional page fault support which
- we're not enabling
- and which seems to be unsupported by the hw team. Recent bspec
  lacks tons of information about this that the public docs released
  half a year back still contain.

Also dump ring HEAD/TAIL registers - I've recently seen a few
error_state where just guessing these is not good enough.

v2: Also dump INSTPM for every ring.

v3: Fix a few really silly goof-ups spotted by Chris Wilson.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_debugfs.c |   16 ++++++++++------
 drivers/gpu/drm/i915/i915_drv.h     |    7 +++++--
 drivers/gpu/drm/i915/i915_irq.c     |    9 +++++++--
 drivers/gpu/drm/i915/i915_reg.h     |    3 +++
 4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index d618f78..5f5eb5b 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -734,17 +734,21 @@ static void i915_ring_error_state(struct seq_file *m,
 				  unsigned ring)
 {
 	seq_printf(m, "%s command stream:\n", ring_str(ring));
+	seq_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
+	seq_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
 	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd[ring]);
 	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir[ring]);
 	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr[ring]);
 	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone[ring]);
-	if (ring == RCS) {
-		if (INTEL_INFO(dev)->gen >= 4) {
-			seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
-			seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
-		}
-		seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
+	if (ring == RCS && INTEL_INFO(dev)->gen >= 4) {
+		seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
+		seq_printf(m, "  BBADDR: 0x%08llx\n", error->bbaddr);
 	}
+	if (INTEL_INFO(dev)->gen >= 4)
+		seq_printf(m, "  INSTPS: 0x%08x\n", error->instps[ring]);
+	seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm[ring]);
+	if (INTEL_INFO(dev)->gen >= 6)
+		seq_printf(m, "  FADDR: 0x%08x\n", error->faddr[ring]);
 	seq_printf(m, "  seqno: 0x%08x\n", error->seqno[ring]);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5f3ff9d..3701369 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -149,16 +149,19 @@ struct drm_i915_error_state {
 	u32 eir;
 	u32 pgtbl_er;
 	u32 pipestat[I915_MAX_PIPES];
+	u32 tail[I915_NUM_RINGS];
+	u32 head[I915_NUM_RINGS];
 	u32 ipeir[I915_NUM_RINGS];
 	u32 ipehr[I915_NUM_RINGS];
 	u32 instdone[I915_NUM_RINGS];
 	u32 acthd[I915_NUM_RINGS];
 	u32 error; /* gen6+ */
-	u32 instpm;
-	u32 instps;
+	u32 instpm[I915_NUM_RINGS];
+	u32 instps[I915_NUM_RINGS];
 	u32 instdone1;
 	u32 seqno[I915_NUM_RINGS];
 	u64 bbaddr;
+	u32 faddr[I915_NUM_RINGS];
 	u64 fence[16];
 	struct timeval time;
 	struct drm_i915_error_object {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 79aba7f..ea1721f 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -878,12 +878,15 @@ static void i915_record_ring_state(struct drm_device *dev,
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
+	if (INTEL_INFO(dev)->gen >= 6) 
+		error->faddr[ring->id] = I915_READ(RING_DMA_FADD(ring->mmio_base));
+
 	if (INTEL_INFO(dev)->gen >= 4) {
 		error->ipeir[ring->id] = I915_READ(RING_IPEIR(ring->mmio_base));
 		error->ipehr[ring->id] = I915_READ(RING_IPEHR(ring->mmio_base));
 		error->instdone[ring->id] = I915_READ(RING_INSTDONE(ring->mmio_base));
+		error->instps[ring->id] = I915_READ(RING_INSTPS(ring->mmio_base));
 		if (ring->id == RCS) {
-			error->instps = I915_READ(INSTPS);
 			error->instdone1 = I915_READ(INSTDONE1);
 			error->bbaddr = I915_READ64(BB_ADDR);
 		}
@@ -894,8 +897,11 @@ static void i915_record_ring_state(struct drm_device *dev,
 		error->bbaddr = 0;
 	}
 
+	error->instpm[ring->id] = I915_READ(RING_INSTPM(ring->mmio_base));
 	error->seqno[ring->id] = dev_priv->ring->get_seqno(ring);
 	error->acthd[ring->id] = intel_ring_get_active_head(ring);
+	error->head[ring->id] = I915_READ_HEAD(ring);
+	error->tail[ring->id] = I915_READ_TAIL(ring);
 }
 
 /**
@@ -935,7 +941,6 @@ static void i915_capture_error_state(struct drm_device *dev)
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 	for_each_pipe(pipe)
 		error->pipestat[pipe] = I915_READ(PIPESTAT(pipe));
-	error->instpm = I915_READ(INSTPM);
 
 	if (INTEL_INFO(dev)->gen >= 6)
 		error->error = I915_READ(ERROR_GEN6);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index bf7923c..d228fab 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -349,6 +349,9 @@
 #define RING_IPEIR(base)	((base)+0x64)
 #define RING_IPEHR(base)	((base)+0x68)
 #define RING_INSTDONE(base)	((base)+0x6c)
+#define RING_INSTPS(base)	((base)+0x70)
+#define RING_DMA_FADD(base)	((base)+0x78)
+#define RING_INSTPM(base)	((base)+0xc0)
 #define INSTPS		0x02070 /* 965+ only */
 #define INSTDONE1	0x0207c /* 965+ only */
 #define ACTHD_I965	0x02074
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] drm/i915: collect more per ring error state
  2011-10-11 17:30     ` [PATCH] " Daniel Vetter
@ 2011-10-11 19:23       ` Chris Wilson
  2011-10-11 19:20         ` Daniel Vetter
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2011-10-11 19:23 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Tue, 11 Oct 2011 19:30:12 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> Based on a patch by Ben Widawsky, but with different colors
> for the bikeshed.
> 
> In contrast to Ben's patch this one doesn't add the fault regs.
> Afaics they're for the optional page fault support which
> - we're not enabling
> - and which seems to be unsupported by the hw team. Recent bspec
>   lacks tons of information about this that the public docs released
>   half a year back still contain.
> 
> Also dump ring HEAD/TAIL registers - I've recently seen a few
> error_state where just guessing these is not good enough.
> 
> v2: Also dump INSTPM for every ring.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>

Close...

> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5f3ff9d..2abd6fd 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -149,16 +149,19 @@ struct drm_i915_error_state {
>  	u32 eir;
>  	u32 pgtbl_er;
>  	u32 pipestat[I915_MAX_PIPES];
> +	u32 tail[I915_MAX_PIPES];
> +	u32 head[I915_MAX_PIPES];

itym I915_NUM_RINGS

>  	u32 ipeir[I915_NUM_RINGS];
>  	u32 ipehr[I915_NUM_RINGS];
>  	u32 instdone[I915_NUM_RINGS];
>  	u32 acthd[I915_NUM_RINGS];
>  	u32 error; /* gen6+ */
> -	u32 instpm;
> -	u32 instps;
> +	u32 instpm[I915_NUM_RINGS];
> +	u32 instps[I915_NUM_RINGS];
>  	u32 instdone1;
>  	u32 seqno[I915_NUM_RINGS];
>  	u64 bbaddr;
> +	u32 faddr[I915_NUM_RINGS];
>  	u64 fence[16];
>  	struct timeval time;
>  	struct drm_i915_error_object {
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 79aba7f..aa70981 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -878,12 +878,15 @@ static void i915_record_ring_state(struct drm_device *dev,
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  
> +	if (INTEL_INFO(dev)->gen >= 6) {
> +		error->faddr[ring->id] = I915_READ(RING_DMA_FADD(ring->mmio_base));
> +	}
checkpatch hates you. Hates us all really. But we can keep it quiet here
by not including the braces around the single line.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] drm/i915: don't bail out of intel_wait_ring_buffer too early
  2011-10-11 17:27     ` [PATCH] drm/i915: don't bail out of intel_wait_ring_buffer too early Daniel Vetter
@ 2011-10-11 19:31       ` Daniel Vetter
  0 siblings, 0 replies; 26+ messages in thread
From: Daniel Vetter @ 2011-10-11 19:31 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson; +Cc: Daniel Vetter

Oops, git send-email fail, please ignore this double-post.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/6] drm/i915: kicking rings stuck on semaphores considered harmful
  2011-10-11 14:39 ` [PATCH 2/6] drm/i915: kicking rings stuck on semaphores considered harmful Daniel Vetter
  2011-10-11 15:51   ` Chris Wilson
@ 2011-10-11 20:48   ` Ben Widawsky
  1 sibling, 0 replies; 26+ messages in thread
From: Ben Widawsky @ 2011-10-11 20:48 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Tue, 11 Oct 2011 16:39:10 +0200
Daniel Vetter <daniel.vetter@ffwll.ch> wrote:

> If our semaphore logic gets confused and we have a ring stuck waiting
> for one, there's a decent chance it'll just execute garbage when being
> kicked. Also, kicking the ring obscures the place where the error
> first occurred, making error_state decoding much harder.
> 
> So drop this and let gpu reset handle this mess in a clean fashion.
> 
> In contrast, kicking rings stuck on MI_WAIT is rather harmless, at
> worst there'll be a bit of screen-flickering. There's also old
> broken userspace out there which needs this as a work-around.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/i915_irq.c |    7 -------
>  1 files changed, 0 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 2218d12..3563a2e 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1645,13 +1645,6 @@ static bool kick_ring(struct intel_ring_buffer *ring)
>  		I915_WRITE_CTL(ring, tmp);
>  		return true;
>  	}
> -	if (IS_GEN6(dev) &&
> -	    (tmp & RING_WAIT_SEMAPHORE)) {
> -		DRM_ERROR("Kicking stuck semaphore on %s\n",
> -			  ring->name);
> -		I915_WRITE_CTL(ring, tmp);
> -		return true;
> -	}
>  	return false;
>  }
>  

If you see this before redoing the patch series, I think a DRM_INFO
might be handy when we notice the stuck semaphore.

Either way:
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] drm/i915: don't bail out of intel_wait_ring_buffer too early
  2011-10-11 17:25     ` [PATCH] " Daniel Vetter
@ 2011-10-18 15:24       ` Chris Wilson
  0 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2011-10-18 15:24 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Tue, 11 Oct 2011 19:25:57 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> In the pre-gem days with non-existing hangcheck and gpu reset code,
> this timeout of 3 seconds was pretty important to avoid stuck
> processes.
> 
> But now we have the hangcheck code in gem that goes to great length
> to ensure that the gpu is really dead before declaring it wedged.
> 
> So there's no need for this timeout anymore. Actually it's even harmful
> because we can bail out too early (e.g. with xscreensaver slip)
> when running giant batchbuffers. And our code isn't robust enough
> to properly unroll any state-changes, we pretty much rely on the gpu
> reset code cleaning up the mess (like cache tracking, fencing state,
> active list/request tracking, ...).
> 
> With this change intel_begin_ring can only fail when the gpu is
> wedged, and it will return -EAGAIN (like wait_request in case the
> gpu reset is still outstanding).
> 
> v2: Chris Wilson noted that on resume timers aren't running and hence
> we won't ever get kicked out of this loop by the hangcheck code. Use
> an insanely large timeout instead for the HAS_GEM case to prevent
> resume bugs from totally hanging the machine.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Personally I would have done:
 end = jiffies;
 if (has_gem)
	end += infinity;
 else
        end += 3s;
but that is one nitpick too far ;-)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] drm/i915: switch ring->id to be a real id
  2011-10-11 17:29     ` [PATCH] drm/i915: switch ring->id to be a real id Daniel Vetter
@ 2011-10-18 15:27       ` Chris Wilson
  0 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2011-10-18 15:27 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Tue, 11 Oct 2011 19:29:27 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> ... and add a helper function for the places where we want a flag.
> 
> This way we can use ring->id to index into arrays.
> 
> v2: Resurrect the missing beautification-space Chris Wilson noted.
> I'm moving this space around because I'll reuse ring_str in the next
> patch.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
No neater solution presents itself, so
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/6] drm/i915: hangcheck robustification
  2011-10-11 14:39 [PATCH 1/6] drm/i915: hangcheck robustification Daniel Vetter
                   ` (4 preceding siblings ...)
  2011-10-11 14:39 ` [PATCH 6/6] drm/i915: collect more per ring error state Daniel Vetter
@ 2011-10-19 11:32 ` Chris Wilson
  2011-10-19 15:02   ` Ben Widawsky
  5 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2011-10-19 11:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

On Tue, 11 Oct 2011 16:39:09 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> From: Ben Widawsky <ben@bwidawsk.net>
> 
> This was pulled out of the per ring error handling patch series as it
> actually fixes two issues, and bikeshedding appears to be going on
> there.
> 
> First, remove setting hangcheck_count when we do notify ring. While it
> seems counterintuitive to be setting up a timer to catch hangcheck_count
> greater than 0 with hangcheck_count already greater than 0, actually
> when we go to check if the GPU is hung we clear that value if the gpu is
> still alive. Leaving this is actually harmful as submitting work could
> falsely clear the count while the hangcheck code is checking the count.
> I can't think of case where this doesn't just delay the inevitable
> reset... but I didn't spend too much time thinking about it.
> 
> Second, for Gen5+ we have more information to be considered when
> determining if the GPU is stuck, primarily the media ring (and blitter
> ring in gen6). This patch will check all available rings, and also updates
> error state with the new information. It theoretically cant fix false
> positives, but I haven't actually come across such a case.
> 
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> [danvet: remove remnants of a unrelated cleanup patch]
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>

NAK: This failed to detect a hang, leaving my box frozen. I suspect that
the value of INSTDONE was fluctuating on the render ring even though we
had no requests pending and so could assume that it was idle.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/6] drm/i915: hangcheck robustification
  2011-10-19 11:32 ` [PATCH 1/6] drm/i915: hangcheck robustification Chris Wilson
@ 2011-10-19 15:02   ` Ben Widawsky
  2011-10-19 15:48     ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2011-10-19 15:02 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Daniel Vetter, intel-gfx

On Wed, 19 Oct 2011 12:32:25 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> On Tue, 11 Oct 2011 16:39:09 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> > From: Ben Widawsky <ben@bwidawsk.net>
> > 
> > This was pulled out of the per ring error handling patch series as it
> > actually fixes two issues, and bikeshedding appears to be going on
> > there.
> > 
> > First, remove setting hangcheck_count when we do notify ring. While it
> > seems counterintuitive to be setting up a timer to catch hangcheck_count
> > greater than 0 with hangcheck_count already greater than 0, actually
> > when we go to check if the GPU is hung we clear that value if the gpu is
> > still alive. Leaving this is actually harmful as submitting work could
> > falsely clear the count while the hangcheck code is checking the count.
> > I can't think of case where this doesn't just delay the inevitable
> > reset... but I didn't spend too much time thinking about it.
> > 
> > Second, for Gen5+ we have more information to be considered when
> > determining if the GPU is stuck, primarily the media ring (and blitter
> > ring in gen6). This patch will check all available rings, and also updates
> > error state with the new information. It theoretically cant fix false
> > positives, but I haven't actually come across such a case.
> > 
> > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> > [danvet: remove remnants of a unrelated cleanup patch]
> > Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> 
> NAK: This failed to detect a hang, leaving my box frozen. I suspect that
> the value of INSTDONE was fluctuating on the render ring even though we
> had no requests pending and so could assume that it was idle.
> -Chris
> 
How is that different than the previous behavior? We checked instdone on
the render ring before this patch too.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/6] drm/i915: hangcheck robustification
  2011-10-19 15:02   ` Ben Widawsky
@ 2011-10-19 15:48     ` Chris Wilson
  0 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2011-10-19 15:48 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Daniel Vetter, intel-gfx

On Wed, 19 Oct 2011 08:02:57 -0700, Ben Widawsky <ben@bwidawsk.net> wrote:
> On Wed, 19 Oct 2011 12:32:25 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > NAK: This failed to detect a hang, leaving my box frozen. I suspect that
> > the value of INSTDONE was fluctuating on the render ring even though we
> > had no requests pending and so could assume that it was idle.
> > -Chris
> > 
> How is that different than the previous behavior? We checked instdone on
> the render ring before this patch too.

As mentioned on irc, that is probably a wild goose chase. This is the
first false negative I have observed with hangcheck, so I blame the
robustification patch (as being a self-aggrandizing patch ;-). A false
negative has much worse consequences than a false positive - so we need
to be sure we can avoid such at any cost.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] drm/i915: collect more per ring error state
  2011-10-11 19:20         ` Daniel Vetter
@ 2011-10-30 18:39           ` Chris Wilson
  2011-10-30 18:46             ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2011-10-30 18:39 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Tue, 11 Oct 2011 21:20:21 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> Based on a patch by Ben Widawsky, but with different colors
> for the bikeshed.
> 
> In contrast to Ben's patch this one doesn't add the fault regs.
> Afaics they're for the optional page fault support which
> - we're not enabling
> - and which seems to be unsupported by the hw team. Recent bspec
>   lacks tons of information about this that the public docs released
>   half a year back still contain.
> 
> Also dump ring HEAD/TAIL registers - I've recently seen a few
> error_state where just guessing these is not good enough.
> 
> v2: Also dump INSTPM for every ring.
> 
> v3: Fix a few really silly goof-ups spotted by Chris Wilson.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>

This is independent of the "hangcheck robustification" (which I feel is
false advertising ;-) and adds some mildly useful debug information.
The patch itself is fine and worthy of a reviewed-by, there is just one
nearby bug that may as well be squashed in with this cleanup...

> ---
>  drivers/gpu/drm/i915/i915_debugfs.c |   16 ++++++++++------
>  drivers/gpu/drm/i915/i915_drv.h     |    7 +++++--
>  drivers/gpu/drm/i915/i915_irq.c     |    9 +++++++--
>  drivers/gpu/drm/i915/i915_reg.h     |    3 +++
>  4 files changed, 25 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index d618f78..5f5eb5b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -734,17 +734,21 @@ static void i915_ring_error_state(struct seq_file *m,
>  				  unsigned ring)
>  {
>  	seq_printf(m, "%s command stream:\n", ring_str(ring));
> +	seq_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
> +	seq_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
>  	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd[ring]);
>  	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir[ring]);
>  	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr[ring]);
>  	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone[ring]);
> -	if (ring == RCS) {
> -		if (INTEL_INFO(dev)->gen >= 4) {
> -			seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
> -			seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
> -		}
> -		seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
> +	if (ring == RCS && INTEL_INFO(dev)->gen >= 4) {
> +		seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
> +		seq_printf(m, "  BBADDR: 0x%08llx\n", error->bbaddr);
>  	}
> +	if (INTEL_INFO(dev)->gen >= 4)
> +		seq_printf(m, "  INSTPS: 0x%08x\n", error->instps[ring]);
> +	seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm[ring]);
> +	if (INTEL_INFO(dev)->gen >= 6)
> +		seq_printf(m, "  FADDR: 0x%08x\n", error->faddr[ring]);
>  	seq_printf(m, "  seqno: 0x%08x\n", error->seqno[ring]);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5f3ff9d..3701369 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -149,16 +149,19 @@ struct drm_i915_error_state {
>  	u32 eir;
>  	u32 pgtbl_er;
>  	u32 pipestat[I915_MAX_PIPES];
> +	u32 tail[I915_NUM_RINGS];
> +	u32 head[I915_NUM_RINGS];
>  	u32 ipeir[I915_NUM_RINGS];
>  	u32 ipehr[I915_NUM_RINGS];
>  	u32 instdone[I915_NUM_RINGS];
>  	u32 acthd[I915_NUM_RINGS];
>  	u32 error; /* gen6+ */
> -	u32 instpm;
> -	u32 instps;
> +	u32 instpm[I915_NUM_RINGS];
> +	u32 instps[I915_NUM_RINGS];
>  	u32 instdone1;
>  	u32 seqno[I915_NUM_RINGS];
>  	u64 bbaddr;
> +	u32 faddr[I915_NUM_RINGS];
>  	u64 fence[16];
>  	struct timeval time;
>  	struct drm_i915_error_object {
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 79aba7f..ea1721f 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -878,12 +878,15 @@ static void i915_record_ring_state(struct drm_device *dev,
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  
> +	if (INTEL_INFO(dev)->gen >= 6) 
> +		error->faddr[ring->id] = I915_READ(RING_DMA_FADD(ring->mmio_base));
> +
>  	if (INTEL_INFO(dev)->gen >= 4) {
>  		error->ipeir[ring->id] = I915_READ(RING_IPEIR(ring->mmio_base));
>  		error->ipehr[ring->id] = I915_READ(RING_IPEHR(ring->mmio_base));
>  		error->instdone[ring->id] = I915_READ(RING_INSTDONE(ring->mmio_base));
> +		error->instps[ring->id] = I915_READ(RING_INSTPS(ring->mmio_base));
>  		if (ring->id == RCS) {
> -			error->instps = I915_READ(INSTPS);
>  			error->instdone1 = I915_READ(INSTDONE1);
>  			error->bbaddr = I915_READ64(BB_ADDR);
>  		}
> @@ -894,8 +897,11 @@ static void i915_record_ring_state(struct drm_device *dev,
>  		error->bbaddr = 0;
>  	}
>  
> +	error->instpm[ring->id] = I915_READ(RING_INSTPM(ring->mmio_base));
>  	error->seqno[ring->id] = dev_priv->ring->get_seqno(ring);
Whilst you are here, you may as well fix this ^.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] drm/i915: collect more per ring error state
  2011-10-30 18:39           ` Chris Wilson
@ 2011-10-30 18:46             ` Chris Wilson
  0 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2011-10-30 18:46 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

On Sun, 30 Oct 2011 18:39:03 +0000, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Tue, 11 Oct 2011 21:20:21 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> > Based on a patch by Ben Widawsky, but with different colors
> > for the bikeshed.
> > 
> > In contrast to Ben's patch this one doesn't add the fault regs.
> > Afaics they're for the optional page fault support which
> > - we're not enabling
> > - and which seems to be unsupported by the hw team. Recent bspec
> >   lacks tons of information about this that the public docs released
> >   half a year back still contain.
> > 
> > Also dump ring HEAD/TAIL registers - I've recently seen a few
> > error_state where just guessing these is not good enough.
> > 
> > v2: Also dump INSTPM for every ring.
> > 
> > v3: Fix a few really silly goof-ups spotted by Chris Wilson.
> > 
> > Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> 
> This is independent of the "hangcheck robustification" (which I feel is
> false advertising ;-) and adds some mildly useful debug information.
> The patch itself is fine and worthy of a reviewed-by, there is just one
> nearby bug that may as well be squashed in with this cleanup...
> 
> > ---
> >  drivers/gpu/drm/i915/i915_debugfs.c |   16 ++++++++++------
> >  drivers/gpu/drm/i915/i915_drv.h     |    7 +++++--
> >  drivers/gpu/drm/i915/i915_irq.c     |    9 +++++++--
> >  drivers/gpu/drm/i915/i915_reg.h     |    3 +++
> >  4 files changed, 25 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index d618f78..5f5eb5b 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -734,17 +734,21 @@ static void i915_ring_error_state(struct seq_file *m,
> >  				  unsigned ring)
> >  {
> >  	seq_printf(m, "%s command stream:\n", ring_str(ring));
> > +	seq_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
> > +	seq_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
> >  	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd[ring]);
> >  	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir[ring]);
> >  	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr[ring]);
> >  	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone[ring]);
> > -	if (ring == RCS) {
> > -		if (INTEL_INFO(dev)->gen >= 4) {
> > -			seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
> > -			seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
> > -		}
> > -		seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
> > +	if (ring == RCS && INTEL_INFO(dev)->gen >= 4) {
> > +		seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
> > +		seq_printf(m, "  BBADDR: 0x%08llx\n", error->bbaddr);
> >  	}
> > +	if (INTEL_INFO(dev)->gen >= 4)
> > +		seq_printf(m, "  INSTPS: 0x%08x\n", error->instps[ring]);
> > +	seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm[ring]);
> > +	if (INTEL_INFO(dev)->gen >= 6)
> > +		seq_printf(m, "  FADDR: 0x%08x\n", error->faddr[ring]);
> >  	seq_printf(m, "  seqno: 0x%08x\n", error->seqno[ring]);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 5f3ff9d..3701369 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -149,16 +149,19 @@ struct drm_i915_error_state {
> >  	u32 eir;
> >  	u32 pgtbl_er;
> >  	u32 pipestat[I915_MAX_PIPES];
> > +	u32 tail[I915_NUM_RINGS];
> > +	u32 head[I915_NUM_RINGS];
> >  	u32 ipeir[I915_NUM_RINGS];
> >  	u32 ipehr[I915_NUM_RINGS];
> >  	u32 instdone[I915_NUM_RINGS];
> >  	u32 acthd[I915_NUM_RINGS];
> >  	u32 error; /* gen6+ */
> > -	u32 instpm;
> > -	u32 instps;
> > +	u32 instpm[I915_NUM_RINGS];
> > +	u32 instps[I915_NUM_RINGS];
> >  	u32 instdone1;
> >  	u32 seqno[I915_NUM_RINGS];
> >  	u64 bbaddr;
> > +	u32 faddr[I915_NUM_RINGS];
> >  	u64 fence[16];
> >  	struct timeval time;
> >  	struct drm_i915_error_object {
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index 79aba7f..ea1721f 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -878,12 +878,15 @@ static void i915_record_ring_state(struct drm_device *dev,
> >  {
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >  
> > +	if (INTEL_INFO(dev)->gen >= 6) 
> > +		error->faddr[ring->id] = I915_READ(RING_DMA_FADD(ring->mmio_base));
> > +
> >  	if (INTEL_INFO(dev)->gen >= 4) {
> >  		error->ipeir[ring->id] = I915_READ(RING_IPEIR(ring->mmio_base));
> >  		error->ipehr[ring->id] = I915_READ(RING_IPEHR(ring->mmio_base));
> >  		error->instdone[ring->id] = I915_READ(RING_INSTDONE(ring->mmio_base));
> > +		error->instps[ring->id] = I915_READ(RING_INSTPS(ring->mmio_base));
> >  		if (ring->id == RCS) {
> > -			error->instps = I915_READ(INSTPS);
> >  			error->instdone1 = I915_READ(INSTDONE1);
> >  			error->bbaddr = I915_READ64(BB_ADDR);
> >  		}
> > @@ -894,8 +897,11 @@ static void i915_record_ring_state(struct drm_device *dev,
> >  		error->bbaddr = 0;
> >  	}
> >  
> > +	error->instpm[ring->id] = I915_READ(RING_INSTPM(ring->mmio_base));
> >  	error->seqno[ring->id] = dev_priv->ring->get_seqno(ring);
> Whilst you are here, you may as well fix this ^.

Daniel pointed out that I already reviewed a patch to fix this, so I
hereby review this patch to add the extra debug information!
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2011-10-30 18:46 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-10-11 14:39 [PATCH 1/6] drm/i915: hangcheck robustification Daniel Vetter
2011-10-11 14:39 ` [PATCH 2/6] drm/i915: kicking rings stuck on semaphores considered harmful Daniel Vetter
2011-10-11 15:51   ` Chris Wilson
2011-10-11 20:48   ` Ben Widawsky
2011-10-11 14:39 ` [PATCH 3/6] drm/i915: don't bail out of intel_wait_ring_buffer too early Daniel Vetter
2011-10-11 15:53   ` Chris Wilson
2011-10-11 17:25     ` [PATCH] " Daniel Vetter
2011-10-18 15:24       ` Chris Wilson
2011-10-11 14:39 ` [PATCH 4/6] drm/i915: switch ring->id to be a real id Daniel Vetter
2011-10-11 15:55   ` Chris Wilson
2011-10-11 17:27     ` [PATCH] drm/i915: don't bail out of intel_wait_ring_buffer too early Daniel Vetter
2011-10-11 19:31       ` Daniel Vetter
2011-10-11 17:29     ` [PATCH] drm/i915: switch ring->id to be a real id Daniel Vetter
2011-10-18 15:27       ` Chris Wilson
2011-10-11 14:39 ` [PATCH 5/6] drm/i915: refactor ring error state capture to use arrays Daniel Vetter
2011-10-11 15:57   ` Chris Wilson
2011-10-11 14:39 ` [PATCH 6/6] drm/i915: collect more per ring error state Daniel Vetter
2011-10-11 16:01   ` Chris Wilson
2011-10-11 17:30     ` [PATCH] " Daniel Vetter
2011-10-11 19:23       ` Chris Wilson
2011-10-11 19:20         ` Daniel Vetter
2011-10-30 18:39           ` Chris Wilson
2011-10-30 18:46             ` Chris Wilson
2011-10-19 11:32 ` [PATCH 1/6] drm/i915: hangcheck robustification Chris Wilson
2011-10-19 15:02   ` Ben Widawsky
2011-10-19 15:48     ` Chris Wilson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox