From: Carlos Santa <carlos.santa@intel.com>
To: intel-gfx@lists.freedesktop.org
Subject: drm/i915: Replace global_seqno with a hangcheck heartbeat seqno
Date: Wed, 20 Feb 2019 18:58:20 -0800 [thread overview]
Message-ID: <20190221025820.28447-7-carlos.santa@intel.com> (raw)
In-Reply-To: <20190221025820.28447-1-carlos.santa@intel.com>
From: Chris Wilson <chris@chris-wilson.co.uk>
To determine whether an engine has 'stuck', we simply check whether or
not is still on the same seqno for several seconds. To keep this simple
mechanism intact over the loss of a global seqno, we can simply add a
new global heartbeat seqno instead. As we cannot know the sequence in
which requests will then be completed, we use a primitive random number
generator instead (with a cycle long enough to not matter over an
interval of a few thousand requests between hangcheck samples).
The alternative to using a dedicated seqno on every request is to issue
a heartbeat request and query its progress through the system. Sadly
this requires us to reduce struct_mutex so that we can issue requests
without requiring that bkl.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_debugfs.c | 7 ++++---
drivers/gpu/drm/i915/intel_engine_cs.c | 5 +++--
drivers/gpu/drm/i915/intel_hangcheck.c | 6 +++---
drivers/gpu/drm/i915/intel_lrc.c | 15 +++++++++++++++
drivers/gpu/drm/i915/intel_ringbuffer.c | 20 ++++++++++++++++++--
drivers/gpu/drm/i915/intel_ringbuffer.h | 19 ++++++++++++++++++-
6 files changed, 61 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index ea15e6336515..a6cbd7dfa64c 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1298,7 +1298,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
with_intel_runtime_pm(dev_priv, wakeref) {
for_each_engine(engine, dev_priv, id) {
acthd[id] = intel_engine_get_active_head(engine);
- seqno[id] = intel_engine_get_seqno(engine);
+ seqno[id] = intel_engine_get_hangcheck_seqno(engine);
}
intel_engine_get_instdone(dev_priv->engine[RCS], &instdone);
@@ -1318,8 +1318,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
for_each_engine(engine, dev_priv, id) {
seq_printf(m, "%s:\n", engine->name);
seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
- engine->hangcheck.seqno, seqno[id],
- intel_engine_last_submit(engine),
+ engine->hangcheck.last_seqno,
+ seqno[id],
+ engine->hangcheck.next_seqno,
jiffies_to_msecs(jiffies -
engine->hangcheck.action_timestamp));
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 4b4004d56e53..fe29ec0c008b 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1498,10 +1498,11 @@ void intel_engine_dump(struct intel_engine_cs *engine,
if (i915_terminally_wedged(&engine->i915->gpu_error))
drm_printf(m, "*** WEDGED ***\n");
- drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
+ drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x/%x [%d ms]\n",
intel_engine_get_seqno(engine),
intel_engine_last_submit(engine),
- engine->hangcheck.seqno,
+ engine->hangcheck.last_seqno,
+ engine->hangcheck.next_seqno,
jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
drm_printf(m, "\tReset count: %d (global %d)\n",
i915_reset_engine_count(error, engine),
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index a219c796e56d..e04b2560369e 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -133,21 +133,21 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
struct hangcheck *hc)
{
hc->acthd = intel_engine_get_active_head(engine);
- hc->seqno = intel_engine_get_seqno(engine);
+ hc->seqno = intel_engine_get_hangcheck_seqno(engine);
}
static void hangcheck_store_sample(struct intel_engine_cs *engine,
const struct hangcheck *hc)
{
engine->hangcheck.acthd = hc->acthd;
- engine->hangcheck.seqno = hc->seqno;
+ engine->hangcheck.last_seqno = hc->seqno;
}
static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs *engine,
const struct hangcheck *hc)
{
- if (engine->hangcheck.seqno != hc->seqno)
+ if (engine->hangcheck.last_seqno != hc->seqno)
return ENGINE_ACTIVE_SEQNO;
if (intel_engine_is_idle(engine))
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 4fcee493dddb..3c8b11f6e830 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -180,6 +180,12 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
I915_GEM_HWS_INDEX_ADDR);
}
+static inline u32 intel_hws_hangcheck_address(struct intel_engine_cs *engine)
+{
+ return (i915_ggtt_offset(engine->status_page.vma) +
+ I915_GEM_HWS_HANGCHECK_ADDR);
+}
+
static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
return rb_entry(rb, struct i915_priolist, node);
@@ -2209,6 +2215,10 @@ static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
request->fence.seqno,
request->timeline->hwsp_offset);
+ cs = gen8_emit_ggtt_write(cs,
+ intel_engine_next_hangcheck_seqno(request->engine),
+ intel_hws_hangcheck_address(request->engine));
+
cs = gen8_emit_ggtt_write(cs,
request->global_seqno,
intel_hws_seqno_address(request->engine));
@@ -2233,6 +2243,11 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
PIPE_CONTROL_FLUSH_ENABLE |
PIPE_CONTROL_CS_STALL);
+ cs = gen8_emit_ggtt_write_rcs(cs,
+ intel_engine_next_hangcheck_seqno(request->engine),
+ intel_hws_hangcheck_address(request->engine),
+ PIPE_CONTROL_CS_STALL);
+
cs = gen8_emit_ggtt_write_rcs(cs,
request->global_seqno,
intel_hws_seqno_address(request->engine),
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index b889b27f8aeb..a1c85a338d50 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -460,12 +460,15 @@ static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
*cs++ = rq->fence.seqno;
+ *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+ *cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
+ *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
*cs++ = rq->global_seqno;
*cs++ = MI_USER_INTERRUPT;
- *cs++ = MI_NOOP;
rq->tail = intel_ring_offset(rq, cs);
assert_ring_tail_valid(rq->ring, rq->tail);
@@ -485,6 +488,10 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
*cs++ = rq->fence.seqno;
+ *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+ *cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
+ *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
*cs++ = rq->global_seqno;
@@ -500,6 +507,7 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
*cs++ = 0;
*cs++ = MI_USER_INTERRUPT;
+ *cs++ = MI_NOOP;
rq->tail = intel_ring_offset(rq, cs);
assert_ring_tail_valid(rq->ring, rq->tail);
@@ -943,11 +951,16 @@ static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
*cs++ = I915_GEM_HWS_SEQNO_ADDR;
*cs++ = rq->fence.seqno;
+ *cs++ = MI_STORE_DWORD_INDEX;
+ *cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
+ *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
*cs++ = MI_STORE_DWORD_INDEX;
*cs++ = I915_GEM_HWS_INDEX_ADDR;
*cs++ = rq->global_seqno;
*cs++ = MI_USER_INTERRUPT;
+ *cs++ = MI_NOOP;
rq->tail = intel_ring_offset(rq, cs);
assert_ring_tail_valid(rq->ring, rq->tail);
@@ -969,6 +982,10 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
*cs++ = I915_GEM_HWS_SEQNO_ADDR;
*cs++ = rq->fence.seqno;
+ *cs++ = MI_STORE_DWORD_INDEX;
+ *cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
+ *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
BUILD_BUG_ON(GEN5_WA_STORES < 1);
for (i = 0; i < GEN5_WA_STORES; i++) {
*cs++ = MI_STORE_DWORD_INDEX;
@@ -977,7 +994,6 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
}
*cs++ = MI_USER_INTERRUPT;
- *cs++ = MI_NOOP;
rq->tail = intel_ring_offset(rq, cs);
assert_ring_tail_valid(rq->ring, rq->tail);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 8bbdf9fba196..752794cd0fb5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -6,6 +6,7 @@
#include <linux/hashtable.h>
#include <linux/irq_work.h>
+#include <linux/random.h>
#include <linux/seqlock.h>
#include "i915_gem_batch_pool.h"
@@ -119,7 +120,8 @@ struct intel_instdone {
struct intel_engine_hangcheck {
u64 acthd;
- u32 seqno;
+ u32 last_seqno;
+ u32 next_seqno;
unsigned long action_timestamp;
struct intel_instdone instdone;
};
@@ -712,6 +714,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
#define I915_GEM_HWS_INDEX_ADDR (I915_GEM_HWS_INDEX * sizeof(u32))
#define I915_GEM_HWS_PREEMPT 0x32
#define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT * sizeof(u32))
+#define I915_GEM_HWS_HANGCHECK 0x34
+#define I915_GEM_HWS_HANGCHECK_ADDR (I915_GEM_HWS_HANGCHECK * sizeof(u32))
#define I915_GEM_HWS_SEQNO 0x40
#define I915_GEM_HWS_SEQNO_ADDR (I915_GEM_HWS_SEQNO * sizeof(u32))
#define I915_GEM_HWS_SCRATCH 0x80
@@ -1060,4 +1064,17 @@ static inline bool inject_preempt_hang(struct intel_engine_execlists *execlists)
#endif
+static inline u32 intel_engine_next_hangcheck_seqno(struct intel_engine_cs *engine)
+{
+ engine->hangcheck.next_seqno =
+ next_pseudo_random32(engine->hangcheck.next_seqno);
+
+ return engine->hangcheck.next_seqno;
+}
+
+static inline u32 intel_engine_get_hangcheck_seqno(struct intel_engine_cs *engine)
+{
+ return intel_read_status_page(engine, I915_GEM_HWS_HANGCHECK);
+}
+
#endif /* _INTEL_RINGBUFFER_H_ */
--
2.17.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
next prev parent reply other threads:[~2019-02-21 2:59 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-02-21 2:58 [PATCH v4 0/5] GEN8+ GPU Watchdog Reset Support Carlos Santa
2019-02-21 2:58 ` [PATCH v4 1/5] drm/i915: Add engine reset count in get-reset-stats ioctl Carlos Santa
2019-02-25 13:34 ` Tvrtko Ursulin
2019-03-06 23:08 ` Carlos Santa
2019-03-07 7:27 ` Tvrtko Ursulin
2019-02-21 2:58 ` [PATCH v4 2/5] drm/i915: Watchdog timeout: IRQ handler for gen8+ Carlos Santa
2019-02-28 17:38 ` Tvrtko Ursulin
2019-03-01 1:51 ` Carlos Santa
2019-03-01 9:36 ` Chris Wilson
2019-03-02 2:08 ` Carlos Santa
2019-03-08 3:16 ` Carlos Santa
2019-03-11 10:39 ` Tvrtko Ursulin
2019-03-18 0:15 ` Carlos Santa
2019-03-19 12:39 ` Tvrtko Ursulin
2019-03-19 12:46 ` Tvrtko Ursulin
2019-03-19 17:52 ` Carlos Santa
2019-02-21 2:58 ` [PATCH v4 3/5] drm/i915: Watchdog timeout: Ringbuffer command emission " Carlos Santa
2019-02-21 2:58 ` [PATCH v4 4/5] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout Carlos Santa
2019-02-28 17:22 ` Tvrtko Ursulin
2019-02-21 2:58 ` [PATCH v4 5/5] drm/i915: Watchdog timeout: Include threshold value in error state Carlos Santa
2019-02-21 2:58 ` Carlos Santa [this message]
2019-02-21 3:24 ` ✗ Fi.CI.BAT: failure for drm/i915: Replace global_seqno with a hangcheck heartbeat seqno (rev3) Patchwork
2019-03-11 11:54 ` [PATCH v4 0/5] GEN8+ GPU Watchdog Reset Support Chris Wilson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190221025820.28447-7-carlos.santa@intel.com \
--to=carlos.santa@intel.com \
--cc=intel-gfx@lists.freedesktop.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox