* [PATCH 0/3] some hangcheck cleanups
@ 2011-10-02 2:15 Ben Widawsky
2011-10-02 2:15 ` [PATCH 1/3] drm/i915: collect per ring page fault info on error Ben Widawsky
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Ben Widawsky @ 2011-10-02 2:15 UTC (permalink / raw)
To: intel-gfx; +Cc: Ben Widawsky
Do hangchecks for all rings instead of just RCS + cleanup.
A bit of extra info on error.
Don't kick the rings when doing debugging.
Ben Widawsky (2):
drm/i915: collect per ring page fault info on error
drm/i915: check acthd for all rings
Daniel Vetter (1):
drm/i915: kicking rings considered harmful
drivers/gpu/drm/i915/i915_debugfs.c | 9 ++-
drivers/gpu/drm/i915/i915_drv.c | 2 +-
drivers/gpu/drm/i915/i915_drv.h | 11 +--
drivers/gpu/drm/i915/i915_irq.c | 149 ++++++++++++++++++++++++-----------
drivers/gpu/drm/i915/i915_reg.h | 3 +
5 files changed, 119 insertions(+), 55 deletions(-)
--
1.7.6.4
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 1/3] drm/i915: collect per ring page fault info on error
2011-10-02 2:15 [PATCH 0/3] some hangcheck cleanups Ben Widawsky
@ 2011-10-02 2:15 ` Ben Widawsky
2011-10-02 2:15 ` [PATCH 2/3] drm/i915: check acthd for all rings Ben Widawsky
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Ben Widawsky @ 2011-10-02 2:15 UTC (permalink / raw)
To: intel-gfx; +Cc: Ben Widawsky
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
drivers/gpu/drm/i915/i915_debugfs.c | 3 +++
drivers/gpu/drm/i915/i915_drv.h | 1 +
drivers/gpu/drm/i915/i915_irq.c | 3 +++
drivers/gpu/drm/i915/i915_reg.h | 3 +++
4 files changed, 10 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 8e95d66..1f02971 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -750,6 +750,9 @@ static int i915_error_state(struct seq_file *m, void *unused)
seq_printf(m, "EIR: 0x%08x\n", error->eir);
seq_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
if (INTEL_INFO(dev)->gen >= 6) {
+ seq_printf(m, "GFX Page Fault: 0x%08x\n", error->page_fault[RCS]);
+ seq_printf(m, "Media Page Fault: 0x%08x\n", error->page_fault[VCS]);
+ seq_printf(m, "Blitter Page Fault: 0x%08x\n", error->page_fault[BCS]);
seq_printf(m, "ERROR: 0x%08x\n", error->error);
seq_printf(m, "Blitter command stream:\n");
seq_printf(m, " ACTHD: 0x%08x\n", error->bcs_acthd);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 15c0ca5..279560e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -153,6 +153,7 @@ struct drm_i915_error_state {
u32 ipehr;
u32 instdone;
u32 acthd;
+ u32 page_fault[I915_NUM_RINGS];
u32 error; /* gen6+ */
u32 bcs_acthd; /* gen6+ blt engine */
u32 bcs_ipehr;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 012732b..990abda 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -914,6 +914,9 @@ static void i915_capture_error_state(struct drm_device *dev)
error->instpm = I915_READ(INSTPM);
error->error = 0;
if (INTEL_INFO(dev)->gen >= 6) {
+ error->page_fault[RCS] = I915_READ(GEN6_GFX_FAULT);
+ error->page_fault[VCS] = I915_READ(GEN6_MED_FAULT);
+ error->page_fault[BCS] = I915_READ(GEN6_BLT_FAULT);
error->error = I915_READ(ERROR_GEN6);
error->bcs_acthd = I915_READ(BCS_ACTHD);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 138eae1..4fd736e 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -368,6 +368,9 @@
#define BCS_IPEHR 0x22068
#define BCS_ACTHD 0x22074
+#define GEN6_GFX_FAULT 0x04094
+#define GEN6_MED_FAULT 0x04194
+#define GEN6_BLT_FAULT 0x04294
#define ERROR_GEN6 0x040a0
/* GM45+ chicken bits -- debug workaround bits that may be required
--
1.7.6.4
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 2/3] drm/i915: check acthd for all rings
2011-10-02 2:15 [PATCH 0/3] some hangcheck cleanups Ben Widawsky
2011-10-02 2:15 ` [PATCH 1/3] drm/i915: collect per ring page fault info on error Ben Widawsky
@ 2011-10-02 2:15 ` Ben Widawsky
2011-10-03 16:59 ` Ben Widawsky
2011-10-02 2:15 ` [PATCH 3/3] drm/i915: kicking rings considered harmful Ben Widawsky
2011-10-03 9:11 ` [PATCH 0/3] some hangcheck cleanups Daniel Vetter
3 siblings, 1 reply; 9+ messages in thread
From: Ben Widawsky @ 2011-10-02 2:15 UTC (permalink / raw)
To: intel-gfx; +Cc: Ben Widawsky
On Gen6+ we have other rings which may be in use. We haven't hung if the
blit or media ring is still going
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
drivers/gpu/drm/i915/i915_debugfs.c | 6 +-
drivers/gpu/drm/i915/i915_drv.h | 9 +--
drivers/gpu/drm/i915/i915_irq.c | 146 ++++++++++++++++++++++++-----------
3 files changed, 107 insertions(+), 54 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1f02971..c00dee5 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -755,20 +755,20 @@ static int i915_error_state(struct seq_file *m, void *unused)
seq_printf(m, "Blitter Page Fault: 0x%08x\n", error->page_fault[BCS]);
seq_printf(m, "ERROR: 0x%08x\n", error->error);
seq_printf(m, "Blitter command stream:\n");
- seq_printf(m, " ACTHD: 0x%08x\n", error->bcs_acthd);
+ seq_printf(m, " ACTHD: 0x%08x\n", error->acthd[BCS]);
seq_printf(m, " IPEIR: 0x%08x\n", error->bcs_ipeir);
seq_printf(m, " IPEHR: 0x%08x\n", error->bcs_ipehr);
seq_printf(m, " INSTDONE: 0x%08x\n", error->bcs_instdone);
seq_printf(m, " seqno: 0x%08x\n", error->bcs_seqno);
seq_printf(m, "Video (BSD) command stream:\n");
- seq_printf(m, " ACTHD: 0x%08x\n", error->vcs_acthd);
+ seq_printf(m, " ACTHD: 0x%08x\n", error->acthd[VCS]);
seq_printf(m, " IPEIR: 0x%08x\n", error->vcs_ipeir);
seq_printf(m, " IPEHR: 0x%08x\n", error->vcs_ipehr);
seq_printf(m, " INSTDONE: 0x%08x\n", error->vcs_instdone);
seq_printf(m, " seqno: 0x%08x\n", error->vcs_seqno);
}
seq_printf(m, "Render command stream:\n");
- seq_printf(m, " ACTHD: 0x%08x\n", error->acthd);
+ seq_printf(m, " ACTHD: 0x%08x\n", error->acthd[RCS]);
seq_printf(m, " IPEIR: 0x%08x\n", error->ipeir);
seq_printf(m, " IPEHR: 0x%08x\n", error->ipehr);
seq_printf(m, " INSTDONE: 0x%08x\n", error->instdone);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 279560e..d4e8d42 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -152,15 +152,13 @@ struct drm_i915_error_state {
u32 ipeir;
u32 ipehr;
u32 instdone;
- u32 acthd;
+ u32 acthd[I915_NUM_RINGS];
u32 page_fault[I915_NUM_RINGS];
u32 error; /* gen6+ */
- u32 bcs_acthd; /* gen6+ blt engine */
u32 bcs_ipehr;
u32 bcs_ipeir;
u32 bcs_instdone;
u32 bcs_seqno;
- u32 vcs_acthd; /* gen6+ bsd engine */
u32 vcs_ipehr;
u32 vcs_ipeir;
u32 vcs_instdone;
@@ -332,9 +330,8 @@ typedef struct drm_i915_private {
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
struct timer_list hangcheck_timer;
int hangcheck_count;
- uint32_t last_acthd;
- uint32_t last_instdone;
- uint32_t last_instdone1;
+ uint32_t last_acthd[I915_NUM_RINGS];
+ uint64_t last_instdone[I915_NUM_RINGS];
unsigned long cfb_size;
unsigned int cfb_fb;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 990abda..7f228ec 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -919,7 +919,7 @@ static void i915_capture_error_state(struct drm_device *dev)
error->page_fault[BCS] = I915_READ(GEN6_BLT_FAULT);
error->error = I915_READ(ERROR_GEN6);
- error->bcs_acthd = I915_READ(BCS_ACTHD);
+ error->acthd[BCS] = I915_READ(BCS_ACTHD);
error->bcs_ipehr = I915_READ(BCS_IPEHR);
error->bcs_ipeir = I915_READ(BCS_IPEIR);
error->bcs_instdone = I915_READ(BCS_INSTDONE);
@@ -927,7 +927,7 @@ static void i915_capture_error_state(struct drm_device *dev)
if (dev_priv->ring[BCS].get_seqno)
error->bcs_seqno = dev_priv->ring[BCS].get_seqno(&dev_priv->ring[BCS]);
- error->vcs_acthd = I915_READ(VCS_ACTHD);
+ error->acthd[VCS] = I915_READ(VCS_ACTHD);
error->vcs_ipehr = I915_READ(VCS_IPEHR);
error->vcs_ipeir = I915_READ(VCS_IPEIR);
error->vcs_instdone = I915_READ(VCS_INSTDONE);
@@ -941,13 +941,13 @@ static void i915_capture_error_state(struct drm_device *dev)
error->instdone = I915_READ(INSTDONE_I965);
error->instps = I915_READ(INSTPS);
error->instdone1 = I915_READ(INSTDONE1);
- error->acthd = I915_READ(ACTHD_I965);
+ error->acthd[RCS] = I915_READ(ACTHD_I965);
error->bbaddr = I915_READ64(BB_ADDR);
} else {
error->ipeir = I915_READ(IPEIR);
error->ipehr = I915_READ(IPEHR);
error->instdone = I915_READ(INSTDONE);
- error->acthd = I915_READ(ACTHD);
+ error->acthd[RCS] = I915_READ(ACTHD);
error->bbaddr = 0;
}
i915_gem_record_fences(dev, error);
@@ -1659,6 +1659,83 @@ static bool kick_ring(struct intel_ring_buffer *ring)
return false;
}
+static bool
+instdone_stuck(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ uint64_t instdone = 0, instdone1 = 0, vcs_instdone = 0, bcs_instdone = 0;
+ bool stuck;
+
+ switch (INTEL_INFO(dev)->gen) {
+ case 7:
+ case 6:
+ bcs_instdone = I915_READ(BCS_INSTDONE);
+ case 5:
+ vcs_instdone = I915_READ(VCS_INSTDONE);
+ case 4:
+ instdone = I915_READ(INSTDONE_I965);
+ instdone1 = I915_READ(INSTDONE1);
+ break;
+ case 3:
+ case 2:
+ instdone = I915_READ(INSTDONE);
+ break;
+ }
+
+ stuck =
+ (dev_priv->last_instdone[RCS] == ((instdone << 32) | instdone1)) &&
+ (dev_priv->last_instdone[VCS] == vcs_instdone) &&
+ (dev_priv->last_instdone[BCS] == bcs_instdone);
+
+ dev_priv->last_instdone[RCS] = (instdone << 32) | instdone1;
+ dev_priv->last_instdone[VCS] = vcs_instdone;
+ dev_priv->last_instdone[BCS] = bcs_instdone;
+
+ return stuck;
+}
+
+static bool
+acthd_stuck(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ uint32_t acthd = 0, vcs_acthd = 0, bcs_acthd = 0;
+ bool stuck = false;
+
+ switch (INTEL_INFO(dev)->gen) {
+ case 7:
+ case 6:
+ bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]);
+ case 5:
+ vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]);
+ case 4:
+ case 3:
+ case 2:
+ acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]);
+ break;
+ }
+
+ stuck =
+ (dev_priv->last_acthd[RCS] == acthd) &&
+ (dev_priv->last_acthd[VCS] == vcs_acthd) &&
+ (dev_priv->last_acthd[BCS] == bcs_acthd);
+
+ dev_priv->last_acthd[RCS] = acthd;
+ dev_priv->last_acthd[VCS] = vcs_acthd;
+ dev_priv->last_acthd[BCS] = bcs_acthd;
+
+ return stuck;
+}
+
+static bool gpu_stuck(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+
+ if (dev_priv->hangcheck_count++ == 0)
+ return false;
+
+ return acthd_stuck(dev) || instdone_stuck(dev);
+}
+
/**
* This is called when the chip hasn't reported back with completed
* batchbuffers in a long time. The first time this is called we simply record
@@ -1669,7 +1746,6 @@ void i915_hangcheck_elapsed(unsigned long data)
{
struct drm_device *dev = (struct drm_device *)data;
drm_i915_private_t *dev_priv = dev->dev_private;
- uint32_t acthd, instdone, instdone1;
bool err = false;
if (!i915_enable_hangcheck)
@@ -1685,50 +1761,30 @@ void i915_hangcheck_elapsed(unsigned long data)
return;
}
- if (INTEL_INFO(dev)->gen < 4) {
- acthd = I915_READ(ACTHD);
- instdone = I915_READ(INSTDONE);
- instdone1 = 0;
- } else {
- acthd = I915_READ(ACTHD_I965);
- instdone = I915_READ(INSTDONE_I965);
- instdone1 = I915_READ(INSTDONE1);
- }
-
- if (dev_priv->last_acthd == acthd &&
- dev_priv->last_instdone == instdone &&
- dev_priv->last_instdone1 == instdone1) {
- if (dev_priv->hangcheck_count++ > 1) {
- DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
+ if (gpu_stuck(dev)) {
+ DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
- if (!IS_GEN2(dev)) {
- /* Is the chip hanging on a WAIT_FOR_EVENT?
- * If so we can simply poke the RB_WAIT bit
- * and break the hang. This should work on
- * all but the second generation chipsets.
- */
-
- if (kick_ring(&dev_priv->ring[RCS]))
- goto repeat;
+ if (!IS_GEN2(dev)) {
+ /* Is the chip hanging on a WAIT_FOR_EVENT?
+ * If so we can simply poke the RB_WAIT bit
+ * and break the hang. This should work on
+ * all but the second generation chipsets.
+ */
- if (HAS_BSD(dev) &&
- kick_ring(&dev_priv->ring[VCS]))
- goto repeat;
+ if (kick_ring(&dev_priv->ring[RCS]))
+ goto repeat;
- if (HAS_BLT(dev) &&
- kick_ring(&dev_priv->ring[BCS]))
- goto repeat;
- }
+ if (HAS_BSD(dev) &&
+ kick_ring(&dev_priv->ring[VCS]))
+ goto repeat;
- i915_handle_error(dev, true);
- return;
+ if (HAS_BLT(dev) &&
+ kick_ring(&dev_priv->ring[BCS]))
+ goto repeat;
}
- } else {
- dev_priv->hangcheck_count = 0;
- dev_priv->last_acthd = acthd;
- dev_priv->last_instdone = instdone;
- dev_priv->last_instdone1 = instdone1;
+ i915_handle_error(dev, true);
+ return;
}
repeat:
--
1.7.6.4
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 3/3] drm/i915: kicking rings considered harmful
2011-10-02 2:15 [PATCH 0/3] some hangcheck cleanups Ben Widawsky
2011-10-02 2:15 ` [PATCH 1/3] drm/i915: collect per ring page fault info on error Ben Widawsky
2011-10-02 2:15 ` [PATCH 2/3] drm/i915: check acthd for all rings Ben Widawsky
@ 2011-10-02 2:15 ` Ben Widawsky
2011-10-02 3:59 ` Eric Anholt
2011-10-03 9:11 ` [PATCH 0/3] some hangcheck cleanups Daniel Vetter
3 siblings, 1 reply; 9+ messages in thread
From: Ben Widawsky @ 2011-10-02 2:15 UTC (permalink / raw)
To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Only do it in the hope of resurrecting the gpu. Disable when reset is
disabled because it seems to tremendously increases our changes to
actually capture an error_state before the system goes all belly-up.
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
drivers/gpu/drm/i915/i915_drv.c | 2 +-
drivers/gpu/drm/i915/i915_drv.h | 1 +
drivers/gpu/drm/i915/i915_irq.c | 2 +-
3 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index b79c6f1..ad85c13 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -91,7 +91,7 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
"Override selection of SDVO panel mode in the VBT "
"(default: auto)");
-static bool i915_try_reset __read_mostly = true;
+bool i915_try_reset __read_mostly = true;
module_param_named(reset, i915_try_reset, bool, 0600);
MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d4e8d42..1b14a62 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1001,6 +1001,7 @@ extern unsigned int i915_semaphores __read_mostly;
extern unsigned int i915_lvds_downclock __read_mostly;
extern unsigned int i915_panel_use_ssc __read_mostly;
extern int i915_vbt_sdvo_panel_type __read_mostly;
+extern bool i915_try_reset __read_mostly;
extern unsigned int i915_enable_rc6 __read_mostly;
extern unsigned int i915_enable_fbc __read_mostly;
extern bool i915_enable_hangcheck __read_mostly;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 7f228ec..b15fc4a 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1764,7 +1764,7 @@ void i915_hangcheck_elapsed(unsigned long data)
if (gpu_stuck(dev)) {
DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
- if (!IS_GEN2(dev)) {
+ if (!IS_GEN2(dev) && i915_try_reset) {
/* Is the chip hanging on a WAIT_FOR_EVENT?
* If so we can simply poke the RB_WAIT bit
* and break the hang. This should work on
--
1.7.6.4
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 3/3] drm/i915: kicking rings considered harmful
2011-10-02 2:15 ` [PATCH 3/3] drm/i915: kicking rings considered harmful Ben Widawsky
@ 2011-10-02 3:59 ` Eric Anholt
2011-10-02 6:09 ` Ben Widawsky
0 siblings, 1 reply; 9+ messages in thread
From: Eric Anholt @ 2011-10-02 3:59 UTC (permalink / raw)
To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky
[-- Attachment #1.1: Type: text/plain, Size: 554 bytes --]
On Sat, 1 Oct 2011 19:15:19 -0700, Ben Widawsky <ben@bwidawsk.net> wrote:
> From: Daniel Vetter <daniel.vetter@ffwll.ch>
>
> Only do it in the hope of resurrecting the gpu. Disable when reset is
> disabled because it seems to tremendously increases our changes to
> actually capture an error_state before the system goes all belly-up.
This patch needs a better subject, like "Disable stuck semaphore kick when
GPU reset is disabled", instead of something suggesting that kicking
rings is bad in general and going to be removed by the patch.
[-- Attachment #1.2: Type: application/pgp-signature, Size: 197 bytes --]
[-- Attachment #2: Type: text/plain, Size: 159 bytes --]
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 3/3] drm/i915: kicking rings considered harmful
2011-10-02 3:59 ` Eric Anholt
@ 2011-10-02 6:09 ` Ben Widawsky
2011-10-03 8:58 ` Daniel Vetter
0 siblings, 1 reply; 9+ messages in thread
From: Ben Widawsky @ 2011-10-02 6:09 UTC (permalink / raw)
To: Eric Anholt; +Cc: Daniel Vetter, intel-gfx
On Sat, 01 Oct 2011 20:59:28 -0700
Eric Anholt <eric@anholt.net> wrote:
> On Sat, 1 Oct 2011 19:15:19 -0700, Ben Widawsky <ben@bwidawsk.net> wrote:
> > From: Daniel Vetter <daniel.vetter@ffwll.ch>
> >
> > Only do it in the hope of resurrecting the gpu. Disable when reset is
> > disabled because it seems to tremendously increases our changes to
> > actually capture an error_state before the system goes all belly-up.
>
> This patch needs a better subject, like "Disable stuck semaphore kick when
> GPU reset is disabled", instead of something suggesting that kicking
> rings is bad in general and going to be removed by the patch.
I'm in favor of removing the ring kick code. I've never seen it actually work,
and Chris mentioned something on IRC about how it's no longer quite as
necessary. I'll let Daniel handle the authoritative response though.
Ben
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 3/3] drm/i915: kicking rings considered harmful
2011-10-02 6:09 ` Ben Widawsky
@ 2011-10-03 8:58 ` Daniel Vetter
0 siblings, 0 replies; 9+ messages in thread
From: Daniel Vetter @ 2011-10-03 8:58 UTC (permalink / raw)
To: Ben Widawsky; +Cc: intel-gfx, Daniel Vetter
On Sat, Oct 01, 2011 at 11:09:33PM -0700, Ben Widawsky wrote:
> On Sat, 01 Oct 2011 20:59:28 -0700
> Eric Anholt <eric@anholt.net> wrote:
>
> > On Sat, 1 Oct 2011 19:15:19 -0700, Ben Widawsky <ben@bwidawsk.net> wrote:
> > > From: Daniel Vetter <daniel.vetter@ffwll.ch>
> > >
> > > Only do it in the hope of resurrecting the gpu. Disable when reset is
> > > disabled because it seems to tremendously increases our changes to
> > > actually capture an error_state before the system goes all belly-up.
> >
> > This patch needs a better subject, like "Disable stuck semaphore kick when
> > GPU reset is disabled", instead of something suggesting that kicking
> > rings is bad in general and going to be removed by the patch.
>
> I'm in favor of removing the ring kick code. I've never seen it actually work,
> and Chris mentioned something on IRC about how it's no longer quite as
> necessary. I'll let Daniel handle the authoritative response though.
Yeah, just kill the stuck semaphore ring kick code. If we deadlock on
semaphores or something like that, we likely have a massive problem and a
full reset sounds like the best course of action. Otoh I think we need to
retain the kick ring stuck in MI_WAIT code, at least to keep on supporting
old (broken) userspace.
Cheers, Daniel
--
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 0/3] some hangcheck cleanups
2011-10-02 2:15 [PATCH 0/3] some hangcheck cleanups Ben Widawsky
` (2 preceding siblings ...)
2011-10-02 2:15 ` [PATCH 3/3] drm/i915: kicking rings considered harmful Ben Widawsky
@ 2011-10-03 9:11 ` Daniel Vetter
3 siblings, 0 replies; 9+ messages in thread
From: Daniel Vetter @ 2011-10-03 9:11 UTC (permalink / raw)
To: Ben Widawsky; +Cc: intel-gfx
On Sat, Oct 01, 2011 at 07:15:16PM -0700, Ben Widawsky wrote:
> Do hangchecks for all rings instead of just RCS + cleanup.
> A bit of extra info on error.
> Don't kick the rings when doing debugging.
>
> Ben Widawsky (2):
> drm/i915: collect per ring page fault info on error
> drm/i915: check acthd for all rings
These two are:
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
--
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/3] drm/i915: check acthd for all rings
2011-10-02 2:15 ` [PATCH 2/3] drm/i915: check acthd for all rings Ben Widawsky
@ 2011-10-03 16:59 ` Ben Widawsky
0 siblings, 0 replies; 9+ messages in thread
From: Ben Widawsky @ 2011-10-03 16:59 UTC (permalink / raw)
To: intel-gfx
On Sat, Oct 01, 2011 at 07:15:18PM -0700, Ben Widawsky wrote:
> On Gen6+ we have other rings which may be in use. We haven't hung if the
> blit or media ring is still going
>
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 6 +-
> drivers/gpu/drm/i915/i915_drv.h | 9 +--
> drivers/gpu/drm/i915/i915_irq.c | 146 ++++++++++++++++++++++++-----------
> 3 files changed, 107 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 1f02971..c00dee5 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -755,20 +755,20 @@ static int i915_error_state(struct seq_file *m, void *unused)
> seq_printf(m, "Blitter Page Fault: 0x%08x\n", error->page_fault[BCS]);
> seq_printf(m, "ERROR: 0x%08x\n", error->error);
> seq_printf(m, "Blitter command stream:\n");
> - seq_printf(m, " ACTHD: 0x%08x\n", error->bcs_acthd);
> + seq_printf(m, " ACTHD: 0x%08x\n", error->acthd[BCS]);
> seq_printf(m, " IPEIR: 0x%08x\n", error->bcs_ipeir);
> seq_printf(m, " IPEHR: 0x%08x\n", error->bcs_ipehr);
> seq_printf(m, " INSTDONE: 0x%08x\n", error->bcs_instdone);
> seq_printf(m, " seqno: 0x%08x\n", error->bcs_seqno);
> seq_printf(m, "Video (BSD) command stream:\n");
> - seq_printf(m, " ACTHD: 0x%08x\n", error->vcs_acthd);
> + seq_printf(m, " ACTHD: 0x%08x\n", error->acthd[VCS]);
> seq_printf(m, " IPEIR: 0x%08x\n", error->vcs_ipeir);
> seq_printf(m, " IPEHR: 0x%08x\n", error->vcs_ipehr);
> seq_printf(m, " INSTDONE: 0x%08x\n", error->vcs_instdone);
> seq_printf(m, " seqno: 0x%08x\n", error->vcs_seqno);
> }
> seq_printf(m, "Render command stream:\n");
> - seq_printf(m, " ACTHD: 0x%08x\n", error->acthd);
> + seq_printf(m, " ACTHD: 0x%08x\n", error->acthd[RCS]);
> seq_printf(m, " IPEIR: 0x%08x\n", error->ipeir);
> seq_printf(m, " IPEHR: 0x%08x\n", error->ipehr);
> seq_printf(m, " INSTDONE: 0x%08x\n", error->instdone);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 279560e..d4e8d42 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -152,15 +152,13 @@ struct drm_i915_error_state {
> u32 ipeir;
> u32 ipehr;
> u32 instdone;
> - u32 acthd;
> + u32 acthd[I915_NUM_RINGS];
> u32 page_fault[I915_NUM_RINGS];
> u32 error; /* gen6+ */
> - u32 bcs_acthd; /* gen6+ blt engine */
> u32 bcs_ipehr;
> u32 bcs_ipeir;
> u32 bcs_instdone;
> u32 bcs_seqno;
> - u32 vcs_acthd; /* gen6+ bsd engine */
> u32 vcs_ipehr;
> u32 vcs_ipeir;
> u32 vcs_instdone;
> @@ -332,9 +330,8 @@ typedef struct drm_i915_private {
> #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
> struct timer_list hangcheck_timer;
> int hangcheck_count;
> - uint32_t last_acthd;
> - uint32_t last_instdone;
> - uint32_t last_instdone1;
> + uint32_t last_acthd[I915_NUM_RINGS];
> + uint64_t last_instdone[I915_NUM_RINGS];
>
> unsigned long cfb_size;
> unsigned int cfb_fb;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 990abda..7f228ec 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -919,7 +919,7 @@ static void i915_capture_error_state(struct drm_device *dev)
> error->page_fault[BCS] = I915_READ(GEN6_BLT_FAULT);
> error->error = I915_READ(ERROR_GEN6);
>
> - error->bcs_acthd = I915_READ(BCS_ACTHD);
> + error->acthd[BCS] = I915_READ(BCS_ACTHD);
> error->bcs_ipehr = I915_READ(BCS_IPEHR);
> error->bcs_ipeir = I915_READ(BCS_IPEIR);
> error->bcs_instdone = I915_READ(BCS_INSTDONE);
> @@ -927,7 +927,7 @@ static void i915_capture_error_state(struct drm_device *dev)
> if (dev_priv->ring[BCS].get_seqno)
> error->bcs_seqno = dev_priv->ring[BCS].get_seqno(&dev_priv->ring[BCS]);
>
> - error->vcs_acthd = I915_READ(VCS_ACTHD);
> + error->acthd[VCS] = I915_READ(VCS_ACTHD);
> error->vcs_ipehr = I915_READ(VCS_IPEHR);
> error->vcs_ipeir = I915_READ(VCS_IPEIR);
> error->vcs_instdone = I915_READ(VCS_INSTDONE);
> @@ -941,13 +941,13 @@ static void i915_capture_error_state(struct drm_device *dev)
> error->instdone = I915_READ(INSTDONE_I965);
> error->instps = I915_READ(INSTPS);
> error->instdone1 = I915_READ(INSTDONE1);
> - error->acthd = I915_READ(ACTHD_I965);
> + error->acthd[RCS] = I915_READ(ACTHD_I965);
> error->bbaddr = I915_READ64(BB_ADDR);
> } else {
> error->ipeir = I915_READ(IPEIR);
> error->ipehr = I915_READ(IPEHR);
> error->instdone = I915_READ(INSTDONE);
> - error->acthd = I915_READ(ACTHD);
> + error->acthd[RCS] = I915_READ(ACTHD);
> error->bbaddr = 0;
> }
> i915_gem_record_fences(dev, error);
> @@ -1659,6 +1659,83 @@ static bool kick_ring(struct intel_ring_buffer *ring)
> return false;
> }
>
> +static bool
> +instdone_stuck(struct drm_device *dev)
> +{
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + uint64_t instdone = 0, instdone1 = 0, vcs_instdone = 0, bcs_instdone = 0;
> + bool stuck;
> +
> + switch (INTEL_INFO(dev)->gen) {
> + case 7:
> + case 6:
> + bcs_instdone = I915_READ(BCS_INSTDONE);
> + case 5:
> + vcs_instdone = I915_READ(VCS_INSTDONE);
> + case 4:
> + instdone = I915_READ(INSTDONE_I965);
> + instdone1 = I915_READ(INSTDONE1);
> + break;
> + case 3:
> + case 2:
> + instdone = I915_READ(INSTDONE);
> + break;
> + }
> +
> + stuck =
> + (dev_priv->last_instdone[RCS] == ((instdone << 32) | instdone1)) &&
> + (dev_priv->last_instdone[VCS] == vcs_instdone) &&
> + (dev_priv->last_instdone[BCS] == bcs_instdone);
> +
> + dev_priv->last_instdone[RCS] = (instdone << 32) | instdone1;
> + dev_priv->last_instdone[VCS] = vcs_instdone;
> + dev_priv->last_instdone[BCS] = bcs_instdone;
> +
> + return stuck;
> +}
> +
> +static bool
> +acthd_stuck(struct drm_device *dev)
> +{
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + uint32_t acthd = 0, vcs_acthd = 0, bcs_acthd = 0;
> + bool stuck = false;
> +
> + switch (INTEL_INFO(dev)->gen) {
> + case 7:
> + case 6:
> + bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]);
> + case 5:
> + vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]);
> + case 4:
> + case 3:
> + case 2:
> + acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]);
> + break;
> + }
> +
> + stuck =
> + (dev_priv->last_acthd[RCS] == acthd) &&
> + (dev_priv->last_acthd[VCS] == vcs_acthd) &&
> + (dev_priv->last_acthd[BCS] == bcs_acthd);
> +
> + dev_priv->last_acthd[RCS] = acthd;
> + dev_priv->last_acthd[VCS] = vcs_acthd;
> + dev_priv->last_acthd[BCS] = bcs_acthd;
> +
> + return stuck;
> +}
> +
> +static bool gpu_stuck(struct drm_device *dev)
> +{
> + struct drm_i915_private *dev_priv = dev->dev_private;
> +
> + if (dev_priv->hangcheck_count++ == 0)
> + return false;
> +
> + return acthd_stuck(dev) || instdone_stuck(dev);
> +}
> +
This should be:
return acthd_stuck(dev) && instdone_stuck(dev);
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2011-10-03 17:00 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-10-02 2:15 [PATCH 0/3] some hangcheck cleanups Ben Widawsky
2011-10-02 2:15 ` [PATCH 1/3] drm/i915: collect per ring page fault info on error Ben Widawsky
2011-10-02 2:15 ` [PATCH 2/3] drm/i915: check acthd for all rings Ben Widawsky
2011-10-03 16:59 ` Ben Widawsky
2011-10-02 2:15 ` [PATCH 3/3] drm/i915: kicking rings considered harmful Ben Widawsky
2011-10-02 3:59 ` Eric Anholt
2011-10-02 6:09 ` Ben Widawsky
2011-10-03 8:58 ` Daniel Vetter
2011-10-03 9:11 ` [PATCH 0/3] some hangcheck cleanups Daniel Vetter
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.