* [PATCH 1/3] drm/i915: collect per ring page fault info on error
2011-10-02 2:15 [PATCH 0/3] some hangcheck cleanups Ben Widawsky
@ 2011-10-02 2:15 ` Ben Widawsky
2011-10-02 2:15 ` [PATCH 2/3] drm/i915: check acthd for all rings Ben Widawsky
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Ben Widawsky @ 2011-10-02 2:15 UTC (permalink / raw)
To: intel-gfx; +Cc: Ben Widawsky
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
drivers/gpu/drm/i915/i915_debugfs.c | 3 +++
drivers/gpu/drm/i915/i915_drv.h | 1 +
drivers/gpu/drm/i915/i915_irq.c | 3 +++
drivers/gpu/drm/i915/i915_reg.h | 3 +++
4 files changed, 10 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 8e95d66..1f02971 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -750,6 +750,9 @@ static int i915_error_state(struct seq_file *m, void *unused)
seq_printf(m, "EIR: 0x%08x\n", error->eir);
seq_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
if (INTEL_INFO(dev)->gen >= 6) {
+ seq_printf(m, "GFX Page Fault: 0x%08x\n", error->page_fault[RCS]);
+ seq_printf(m, "Media Page Fault: 0x%08x\n", error->page_fault[VCS]);
+ seq_printf(m, "Blitter Page Fault: 0x%08x\n", error->page_fault[BCS]);
seq_printf(m, "ERROR: 0x%08x\n", error->error);
seq_printf(m, "Blitter command stream:\n");
seq_printf(m, " ACTHD: 0x%08x\n", error->bcs_acthd);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 15c0ca5..279560e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -153,6 +153,7 @@ struct drm_i915_error_state {
u32 ipehr;
u32 instdone;
u32 acthd;
+ u32 page_fault[I915_NUM_RINGS];
u32 error; /* gen6+ */
u32 bcs_acthd; /* gen6+ blt engine */
u32 bcs_ipehr;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 012732b..990abda 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -914,6 +914,9 @@ static void i915_capture_error_state(struct drm_device *dev)
error->instpm = I915_READ(INSTPM);
error->error = 0;
if (INTEL_INFO(dev)->gen >= 6) {
+ error->page_fault[RCS] = I915_READ(GEN6_GFX_FAULT);
+ error->page_fault[VCS] = I915_READ(GEN6_MED_FAULT);
+ error->page_fault[BCS] = I915_READ(GEN6_BLT_FAULT);
error->error = I915_READ(ERROR_GEN6);
error->bcs_acthd = I915_READ(BCS_ACTHD);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 138eae1..4fd736e 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -368,6 +368,9 @@
#define BCS_IPEHR 0x22068
#define BCS_ACTHD 0x22074
+#define GEN6_GFX_FAULT 0x04094
+#define GEN6_MED_FAULT 0x04194
+#define GEN6_BLT_FAULT 0x04294
#define ERROR_GEN6 0x040a0
/* GM45+ chicken bits -- debug workaround bits that may be required
--
1.7.6.4
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 2/3] drm/i915: check acthd for all rings
2011-10-02 2:15 [PATCH 0/3] some hangcheck cleanups Ben Widawsky
2011-10-02 2:15 ` [PATCH 1/3] drm/i915: collect per ring page fault info on error Ben Widawsky
@ 2011-10-02 2:15 ` Ben Widawsky
2011-10-03 16:59 ` Ben Widawsky
2011-10-02 2:15 ` [PATCH 3/3] drm/i915: kicking rings considered harmful Ben Widawsky
2011-10-03 9:11 ` [PATCH 0/3] some hangcheck cleanups Daniel Vetter
3 siblings, 1 reply; 9+ messages in thread
From: Ben Widawsky @ 2011-10-02 2:15 UTC (permalink / raw)
To: intel-gfx; +Cc: Ben Widawsky
On Gen6+ we have other rings which may be in use. We haven't hung if the
blit or media ring is still going
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
drivers/gpu/drm/i915/i915_debugfs.c | 6 +-
drivers/gpu/drm/i915/i915_drv.h | 9 +--
drivers/gpu/drm/i915/i915_irq.c | 146 ++++++++++++++++++++++++-----------
3 files changed, 107 insertions(+), 54 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1f02971..c00dee5 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -755,20 +755,20 @@ static int i915_error_state(struct seq_file *m, void *unused)
seq_printf(m, "Blitter Page Fault: 0x%08x\n", error->page_fault[BCS]);
seq_printf(m, "ERROR: 0x%08x\n", error->error);
seq_printf(m, "Blitter command stream:\n");
- seq_printf(m, " ACTHD: 0x%08x\n", error->bcs_acthd);
+ seq_printf(m, " ACTHD: 0x%08x\n", error->acthd[BCS]);
seq_printf(m, " IPEIR: 0x%08x\n", error->bcs_ipeir);
seq_printf(m, " IPEHR: 0x%08x\n", error->bcs_ipehr);
seq_printf(m, " INSTDONE: 0x%08x\n", error->bcs_instdone);
seq_printf(m, " seqno: 0x%08x\n", error->bcs_seqno);
seq_printf(m, "Video (BSD) command stream:\n");
- seq_printf(m, " ACTHD: 0x%08x\n", error->vcs_acthd);
+ seq_printf(m, " ACTHD: 0x%08x\n", error->acthd[VCS]);
seq_printf(m, " IPEIR: 0x%08x\n", error->vcs_ipeir);
seq_printf(m, " IPEHR: 0x%08x\n", error->vcs_ipehr);
seq_printf(m, " INSTDONE: 0x%08x\n", error->vcs_instdone);
seq_printf(m, " seqno: 0x%08x\n", error->vcs_seqno);
}
seq_printf(m, "Render command stream:\n");
- seq_printf(m, " ACTHD: 0x%08x\n", error->acthd);
+ seq_printf(m, " ACTHD: 0x%08x\n", error->acthd[RCS]);
seq_printf(m, " IPEIR: 0x%08x\n", error->ipeir);
seq_printf(m, " IPEHR: 0x%08x\n", error->ipehr);
seq_printf(m, " INSTDONE: 0x%08x\n", error->instdone);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 279560e..d4e8d42 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -152,15 +152,13 @@ struct drm_i915_error_state {
u32 ipeir;
u32 ipehr;
u32 instdone;
- u32 acthd;
+ u32 acthd[I915_NUM_RINGS];
u32 page_fault[I915_NUM_RINGS];
u32 error; /* gen6+ */
- u32 bcs_acthd; /* gen6+ blt engine */
u32 bcs_ipehr;
u32 bcs_ipeir;
u32 bcs_instdone;
u32 bcs_seqno;
- u32 vcs_acthd; /* gen6+ bsd engine */
u32 vcs_ipehr;
u32 vcs_ipeir;
u32 vcs_instdone;
@@ -332,9 +330,8 @@ typedef struct drm_i915_private {
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
struct timer_list hangcheck_timer;
int hangcheck_count;
- uint32_t last_acthd;
- uint32_t last_instdone;
- uint32_t last_instdone1;
+ uint32_t last_acthd[I915_NUM_RINGS];
+ uint64_t last_instdone[I915_NUM_RINGS];
unsigned long cfb_size;
unsigned int cfb_fb;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 990abda..7f228ec 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -919,7 +919,7 @@ static void i915_capture_error_state(struct drm_device *dev)
error->page_fault[BCS] = I915_READ(GEN6_BLT_FAULT);
error->error = I915_READ(ERROR_GEN6);
- error->bcs_acthd = I915_READ(BCS_ACTHD);
+ error->acthd[BCS] = I915_READ(BCS_ACTHD);
error->bcs_ipehr = I915_READ(BCS_IPEHR);
error->bcs_ipeir = I915_READ(BCS_IPEIR);
error->bcs_instdone = I915_READ(BCS_INSTDONE);
@@ -927,7 +927,7 @@ static void i915_capture_error_state(struct drm_device *dev)
if (dev_priv->ring[BCS].get_seqno)
error->bcs_seqno = dev_priv->ring[BCS].get_seqno(&dev_priv->ring[BCS]);
- error->vcs_acthd = I915_READ(VCS_ACTHD);
+ error->acthd[VCS] = I915_READ(VCS_ACTHD);
error->vcs_ipehr = I915_READ(VCS_IPEHR);
error->vcs_ipeir = I915_READ(VCS_IPEIR);
error->vcs_instdone = I915_READ(VCS_INSTDONE);
@@ -941,13 +941,13 @@ static void i915_capture_error_state(struct drm_device *dev)
error->instdone = I915_READ(INSTDONE_I965);
error->instps = I915_READ(INSTPS);
error->instdone1 = I915_READ(INSTDONE1);
- error->acthd = I915_READ(ACTHD_I965);
+ error->acthd[RCS] = I915_READ(ACTHD_I965);
error->bbaddr = I915_READ64(BB_ADDR);
} else {
error->ipeir = I915_READ(IPEIR);
error->ipehr = I915_READ(IPEHR);
error->instdone = I915_READ(INSTDONE);
- error->acthd = I915_READ(ACTHD);
+ error->acthd[RCS] = I915_READ(ACTHD);
error->bbaddr = 0;
}
i915_gem_record_fences(dev, error);
@@ -1659,6 +1659,83 @@ static bool kick_ring(struct intel_ring_buffer *ring)
return false;
}
+static bool
+instdone_stuck(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ uint64_t instdone = 0, instdone1 = 0, vcs_instdone = 0, bcs_instdone = 0;
+ bool stuck;
+
+ switch (INTEL_INFO(dev)->gen) {
+ case 7:
+ case 6:
+ bcs_instdone = I915_READ(BCS_INSTDONE);
+ case 5:
+ vcs_instdone = I915_READ(VCS_INSTDONE);
+ case 4:
+ instdone = I915_READ(INSTDONE_I965);
+ instdone1 = I915_READ(INSTDONE1);
+ break;
+ case 3:
+ case 2:
+ instdone = I915_READ(INSTDONE);
+ break;
+ }
+
+ stuck =
+ (dev_priv->last_instdone[RCS] == ((instdone << 32) | instdone1)) &&
+ (dev_priv->last_instdone[VCS] == vcs_instdone) &&
+ (dev_priv->last_instdone[BCS] == bcs_instdone);
+
+ dev_priv->last_instdone[RCS] = (instdone << 32) | instdone1;
+ dev_priv->last_instdone[VCS] = vcs_instdone;
+ dev_priv->last_instdone[BCS] = bcs_instdone;
+
+ return stuck;
+}
+
+static bool
+acthd_stuck(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ uint32_t acthd = 0, vcs_acthd = 0, bcs_acthd = 0;
+ bool stuck = false;
+
+ switch (INTEL_INFO(dev)->gen) {
+ case 7:
+ case 6:
+ bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]);
+ case 5:
+ vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]);
+ case 4:
+ case 3:
+ case 2:
+ acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]);
+ break;
+ }
+
+ stuck =
+ (dev_priv->last_acthd[RCS] == acthd) &&
+ (dev_priv->last_acthd[VCS] == vcs_acthd) &&
+ (dev_priv->last_acthd[BCS] == bcs_acthd);
+
+ dev_priv->last_acthd[RCS] = acthd;
+ dev_priv->last_acthd[VCS] = vcs_acthd;
+ dev_priv->last_acthd[BCS] = bcs_acthd;
+
+ return stuck;
+}
+
+static bool gpu_stuck(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+
+ if (dev_priv->hangcheck_count++ == 0)
+ return false;
+
+ return acthd_stuck(dev) || instdone_stuck(dev);
+}
+
/**
* This is called when the chip hasn't reported back with completed
* batchbuffers in a long time. The first time this is called we simply record
@@ -1669,7 +1746,6 @@ void i915_hangcheck_elapsed(unsigned long data)
{
struct drm_device *dev = (struct drm_device *)data;
drm_i915_private_t *dev_priv = dev->dev_private;
- uint32_t acthd, instdone, instdone1;
bool err = false;
if (!i915_enable_hangcheck)
@@ -1685,50 +1761,30 @@ void i915_hangcheck_elapsed(unsigned long data)
return;
}
- if (INTEL_INFO(dev)->gen < 4) {
- acthd = I915_READ(ACTHD);
- instdone = I915_READ(INSTDONE);
- instdone1 = 0;
- } else {
- acthd = I915_READ(ACTHD_I965);
- instdone = I915_READ(INSTDONE_I965);
- instdone1 = I915_READ(INSTDONE1);
- }
-
- if (dev_priv->last_acthd == acthd &&
- dev_priv->last_instdone == instdone &&
- dev_priv->last_instdone1 == instdone1) {
- if (dev_priv->hangcheck_count++ > 1) {
- DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
+ if (gpu_stuck(dev)) {
+ DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
- if (!IS_GEN2(dev)) {
- /* Is the chip hanging on a WAIT_FOR_EVENT?
- * If so we can simply poke the RB_WAIT bit
- * and break the hang. This should work on
- * all but the second generation chipsets.
- */
-
- if (kick_ring(&dev_priv->ring[RCS]))
- goto repeat;
+ if (!IS_GEN2(dev)) {
+ /* Is the chip hanging on a WAIT_FOR_EVENT?
+ * If so we can simply poke the RB_WAIT bit
+ * and break the hang. This should work on
+ * all but the second generation chipsets.
+ */
- if (HAS_BSD(dev) &&
- kick_ring(&dev_priv->ring[VCS]))
- goto repeat;
+ if (kick_ring(&dev_priv->ring[RCS]))
+ goto repeat;
- if (HAS_BLT(dev) &&
- kick_ring(&dev_priv->ring[BCS]))
- goto repeat;
- }
+ if (HAS_BSD(dev) &&
+ kick_ring(&dev_priv->ring[VCS]))
+ goto repeat;
- i915_handle_error(dev, true);
- return;
+ if (HAS_BLT(dev) &&
+ kick_ring(&dev_priv->ring[BCS]))
+ goto repeat;
}
- } else {
- dev_priv->hangcheck_count = 0;
- dev_priv->last_acthd = acthd;
- dev_priv->last_instdone = instdone;
- dev_priv->last_instdone1 = instdone1;
+ i915_handle_error(dev, true);
+ return;
}
repeat:
--
1.7.6.4
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 3/3] drm/i915: kicking rings considered harmful
2011-10-02 2:15 [PATCH 0/3] some hangcheck cleanups Ben Widawsky
2011-10-02 2:15 ` [PATCH 1/3] drm/i915: collect per ring page fault info on error Ben Widawsky
2011-10-02 2:15 ` [PATCH 2/3] drm/i915: check acthd for all rings Ben Widawsky
@ 2011-10-02 2:15 ` Ben Widawsky
2011-10-02 3:59 ` Eric Anholt
2011-10-03 9:11 ` [PATCH 0/3] some hangcheck cleanups Daniel Vetter
3 siblings, 1 reply; 9+ messages in thread
From: Ben Widawsky @ 2011-10-02 2:15 UTC (permalink / raw)
To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Only do it in the hope of resurrecting the gpu. Disable when reset is
disabled because it seems to tremendously increases our changes to
actually capture an error_state before the system goes all belly-up.
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
drivers/gpu/drm/i915/i915_drv.c | 2 +-
drivers/gpu/drm/i915/i915_drv.h | 1 +
drivers/gpu/drm/i915/i915_irq.c | 2 +-
3 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index b79c6f1..ad85c13 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -91,7 +91,7 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
"Override selection of SDVO panel mode in the VBT "
"(default: auto)");
-static bool i915_try_reset __read_mostly = true;
+bool i915_try_reset __read_mostly = true;
module_param_named(reset, i915_try_reset, bool, 0600);
MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d4e8d42..1b14a62 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1001,6 +1001,7 @@ extern unsigned int i915_semaphores __read_mostly;
extern unsigned int i915_lvds_downclock __read_mostly;
extern unsigned int i915_panel_use_ssc __read_mostly;
extern int i915_vbt_sdvo_panel_type __read_mostly;
+extern bool i915_try_reset __read_mostly;
extern unsigned int i915_enable_rc6 __read_mostly;
extern unsigned int i915_enable_fbc __read_mostly;
extern bool i915_enable_hangcheck __read_mostly;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 7f228ec..b15fc4a 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1764,7 +1764,7 @@ void i915_hangcheck_elapsed(unsigned long data)
if (gpu_stuck(dev)) {
DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
- if (!IS_GEN2(dev)) {
+ if (!IS_GEN2(dev) && i915_try_reset) {
/* Is the chip hanging on a WAIT_FOR_EVENT?
* If so we can simply poke the RB_WAIT bit
* and break the hang. This should work on
--
1.7.6.4
^ permalink raw reply related [flat|nested] 9+ messages in thread