* [PATCH 1/2] drm/i915: Detach hangcheck from request lists
@ 2015-05-08 13:39 Mika Kuoppala
2015-05-08 13:39 ` [PATCH 2/2] drm/i915: Make hangcheck logging more compact Mika Kuoppala
2015-05-19 11:03 ` [PATCH 1/2] drm/i915: Detach hangcheck from request lists Tomas Elf
0 siblings, 2 replies; 6+ messages in thread
From: Mika Kuoppala @ 2015-05-08 13:39 UTC (permalink / raw)
To: intel-gfx; +Cc: miku
Hangcheck tries to peek into request list to see
if the ring was busy or not. But that leads to race
against the list addition in request submission.
And hangcheck saw a ring being idle, when in fact work was
just being submitted.
There is strong desire to keep hangcheck without
locks of any kind as it is our last line of defense
against failures that surpass our imagination.
Rework the hangcheck logic so that we find out
the ring busyness by inspecting hw engine state
and omit the request->list inspection.
v2: start is u32, push for 80 cols (Chris)
References: https://bugs.freedesktop.org/show_bug.cgi?id=89931
References: https://bugs.freedesktop.org/show_bug.cgi?id=89644
References: https://bugs.freedesktop.org/show_bug.cgi?id=88984
References: https://bugs.freedesktop.org/show_bug.cgi?id=88981
References: https://bugs.freedesktop.org/show_bug.cgi?id=87730
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 10 +-
drivers/gpu/drm/i915/i915_gpu_error.c | 4 +-
drivers/gpu/drm/i915/i915_irq.c | 214 ++++++++++++++++++++------------
drivers/gpu/drm/i915/intel_ringbuffer.h | 3 +
4 files changed, 149 insertions(+), 82 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index adbbdda..8c647bf 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1295,6 +1295,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
struct drm_i915_private *dev_priv = dev->dev_private;
struct intel_engine_cs *ring;
u64 acthd[I915_NUM_RINGS];
+ u32 start[I915_NUM_RINGS];
u32 seqno[I915_NUM_RINGS];
int i;
@@ -1308,6 +1309,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
for_each_ring(ring, dev_priv, i) {
seqno[i] = ring->get_seqno(ring, false);
acthd[i] = intel_ring_get_active_head(ring);
+ start[i] = I915_READ_START(ring);
}
intel_runtime_pm_put(dev_priv);
@@ -1328,8 +1330,14 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
(long long)acthd[i]);
seq_printf(m, "\tmax ACTHD = 0x%08llx\n",
(long long)ring->hangcheck.max_acthd);
+ seq_printf(m, "\tSTART = 0x%08x [current 0x%08x]\n",
+ ring->hangcheck.start,
+ start[i]);
+
seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
- seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
+ seq_printf(m, "\taction = %s [%d]\n",
+ i915_hangcheck_action_to_str(ring->hangcheck.action),
+ ring->hangcheck.action);
}
return 0;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index a3e330d..9c0db19 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -220,7 +220,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
}
}
-static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
+const char *i915_hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
{
switch (a) {
case HANGCHECK_IDLE:
@@ -301,7 +301,7 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
err_printf(m, " ring->head: 0x%08x\n", ring->cpu_ring_head);
err_printf(m, " ring->tail: 0x%08x\n", ring->cpu_ring_tail);
err_printf(m, " hangcheck: %s [%d]\n",
- hangcheck_action_to_str(ring->hangcheck_action),
+ i915_hangcheck_action_to_str(ring->hangcheck_action),
ring->hangcheck_score);
}
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 9da955e..a3244bd 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2685,20 +2685,6 @@ static void gen8_disable_vblank(struct drm_device *dev, int pipe)
spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags);
}
-static struct drm_i915_gem_request *
-ring_last_request(struct intel_engine_cs *ring)
-{
- return list_entry(ring->request_list.prev,
- struct drm_i915_gem_request, list);
-}
-
-static bool
-ring_idle(struct intel_engine_cs *ring)
-{
- return (list_empty(&ring->request_list) ||
- i915_gem_request_completed(ring_last_request(ring), false));
-}
-
static bool
ipehr_is_semaphore_wait(struct drm_device *dev, u32 ipehr)
{
@@ -2882,6 +2868,109 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
return HANGCHECK_HUNG;
}
+static bool engine_busy(struct intel_engine_cs *ring,
+ const u64 acthd, const u32 start)
+{
+ struct drm_i915_private *dev_priv = to_i915(ring->dev);
+ u32 tail, head;
+
+ if (ring->hangcheck.acthd != acthd)
+ return true;
+
+ if (ring->hangcheck.start != start)
+ return true;
+
+ head = I915_READ_HEAD(ring) & HEAD_ADDR;
+ tail = I915_READ_TAIL(ring) & TAIL_ADDR;
+
+ if (head != tail)
+ return true;
+
+ /* Stop rings mechanism keeps head==tail even if
+ * there is work to be done.
+ */
+ if (ring->buffer &&
+ ring->buffer->tail != tail &&
+ waitqueue_active(&ring->irq_queue))
+ return true;
+
+ return false;
+}
+
+#define engine_idle(ring, acthd, start) (!engine_busy((ring), (acthd), (start)))
+
+static bool check_for_missed_irq(struct intel_engine_cs *ring)
+{
+ struct drm_i915_private *dev_priv = to_i915(ring->dev);
+ bool irq_missing;
+
+ if (!waitqueue_active(&ring->irq_queue))
+ return false;
+
+ irq_missing = !test_and_set_bit(ring->id,
+ &dev_priv->gpu_error.missed_irq_rings);
+
+ if (irq_missing) {
+ const bool irq_test =
+ dev_priv->gpu_error.test_irq_rings &
+ intel_ring_flag(ring);
+
+ if (irq_test)
+ DRM_INFO("Fake missed irq on %s\n", ring->name);
+ else
+ DRM_ERROR("Missed irq on %s\n", ring->name);
+ }
+
+ return true;
+}
+
+static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
+{
+#define BUSY 1
+#define KICK 5
+#define HUNG 20
+
+ struct intel_ring_hangcheck *hc = &ring->hangcheck;
+ bool there_is_hope = true;
+
+ /* We always increment the hangcheck score
+ * if the ring is busy and still processing
+ * the same request, so that no single request
+ * can run indefinitely (such as a chain of
+ * batches). If we detect a loop in acthd,
+ * we increment the busyness twice as fast.
+ * If this ring is in a legitimate wait for another
+ * ring, we omit incrementing the score. In that case
+ * the waiting ring is a victim and we want to be sure we
+ * catch the right culprit. Then every time we do kick
+ * the ring, add a small increment to the
+ * score so that we can catch a batch that is
+ * being repeatedly kicked and so responsible
+ * for stalling the machine.
+ */
+ hc->action = ring_stuck(ring, acthd);
+
+ switch (hc->action) {
+ case HANGCHECK_IDLE:
+ case HANGCHECK_WAIT:
+ case HANGCHECK_ACTIVE:
+ hc->score += BUSY;
+ break;
+ case HANGCHECK_ACTIVE_LOOP:
+ hc->score += 2 * BUSY;
+ break;
+ case HANGCHECK_KICK:
+ hc->score += KICK;
+ break;
+ case HANGCHECK_HUNG:
+ hc->score += HUNG;
+ there_is_hope = false;
+ break;
+ }
+
+ return there_is_hope;
+}
+
/*
* This is called when the chip hasn't reported back with completed
* batchbuffers in a long time. We keep track per ring seqno progress and
@@ -2900,15 +2989,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
int i;
int busy_count = 0, rings_hung = 0;
bool stuck[I915_NUM_RINGS] = { 0 };
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
if (!i915.enable_hangcheck)
return;
for_each_ring(ring, dev_priv, i) {
+ struct intel_ring_hangcheck *hc = &ring->hangcheck;
u64 acthd;
+ u32 start;
u32 seqno;
bool busy = true;
@@ -2916,76 +3004,44 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
seqno = ring->get_seqno(ring, false);
acthd = intel_ring_get_active_head(ring);
+ start = I915_READ_START(ring);
- if (ring->hangcheck.seqno == seqno) {
- if (ring_idle(ring)) {
- ring->hangcheck.action = HANGCHECK_IDLE;
-
- if (waitqueue_active(&ring->irq_queue)) {
- /* Issue a wake-up to catch stuck h/w. */
- if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
- if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
- DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
- ring->name);
- else
- DRM_INFO("Fake missed irq on %s\n",
- ring->name);
- wake_up_all(&ring->irq_queue);
- }
- /* Safeguard against driver failure */
- ring->hangcheck.score += BUSY;
- } else
- busy = false;
- } else {
- /* We always increment the hangcheck score
- * if the ring is busy and still processing
- * the same request, so that no single request
- * can run indefinitely (such as a chain of
- * batches). The only time we do not increment
- * the hangcheck score on this ring, if this
- * ring is in a legitimate wait for another
- * ring. In that case the waiting ring is a
- * victim and we want to be sure we catch the
- * right culprit. Then every time we do kick
- * the ring, add a small increment to the
- * score so that we can catch a batch that is
- * being repeatedly kicked and so responsible
- * for stalling the machine.
- */
- ring->hangcheck.action = ring_stuck(ring,
- acthd);
-
- switch (ring->hangcheck.action) {
- case HANGCHECK_IDLE:
- case HANGCHECK_WAIT:
- case HANGCHECK_ACTIVE:
- break;
- case HANGCHECK_ACTIVE_LOOP:
- ring->hangcheck.score += BUSY;
- break;
- case HANGCHECK_KICK:
- ring->hangcheck.score += KICK;
- break;
- case HANGCHECK_HUNG:
- ring->hangcheck.score += HUNG;
- stuck[i] = true;
- break;
- }
- }
- } else {
- ring->hangcheck.action = HANGCHECK_ACTIVE;
+ if (hc->seqno != seqno) {
+ hc->action = HANGCHECK_ACTIVE;
/* Gradually reduce the count so that we catch DoS
* attempts across multiple batches.
*/
- if (ring->hangcheck.score > 0)
- ring->hangcheck.score--;
+ if (hc->score > 0)
+ hc->score--;
- ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
+ hc->max_acthd = 0;
+ goto engine_check_done;
}
- ring->hangcheck.seqno = seqno;
- ring->hangcheck.acthd = acthd;
+ if (engine_idle(ring, acthd, start)) {
+ busy = check_for_missed_irq(ring);
+ if (busy) {
+ /* Start waking up irrespective of
+ our missed_irq bookkeeping */
+ if (hc->score >= KICK)
+ wake_up_all(&ring->irq_queue);
+
+ hc->score += KICK;
+ hc->action = HANGCHECK_ACTIVE;
+ } else {
+ hc->score = 0;
+ hc->action = HANGCHECK_IDLE;
+ }
+ goto engine_check_done;
+ }
+
+ hangcheck_handle_stuck_ring(ring, acthd);
+
+engine_check_done:
+ hc->seqno = seqno;
+ hc->acthd = acthd;
+ hc->start = start;
busy_count += busy;
}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 39f6dfc..83edcef 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -83,11 +83,14 @@ enum intel_ring_hangcheck_action {
HANGCHECK_HUNG,
};
+const char *i915_hangcheck_action_to_str(enum intel_ring_hangcheck_action a);
+
#define HANGCHECK_SCORE_RING_HUNG 31
struct intel_ring_hangcheck {
u64 acthd;
u64 max_acthd;
+ u32 start;
u32 seqno;
int score;
enum intel_ring_hangcheck_action action;
--
1.9.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 2/2] drm/i915: Make hangcheck logging more compact
2015-05-08 13:39 [PATCH 1/2] drm/i915: Detach hangcheck from request lists Mika Kuoppala
@ 2015-05-08 13:39 ` Mika Kuoppala
2015-05-08 23:55 ` shuang.he
2015-05-19 11:25 ` Tomas Elf
2015-05-19 11:03 ` [PATCH 1/2] drm/i915: Detach hangcheck from request lists Tomas Elf
1 sibling, 2 replies; 6+ messages in thread
From: Mika Kuoppala @ 2015-05-08 13:39 UTC (permalink / raw)
To: intel-gfx; +Cc: miku
With commit aaecdf611a05 ("drm/i915: Stop gathering error
states for CS error interrupts") we only call i915_handle_error()
on call sites where there is a stuck/hung gpu. So there is
no more need to carry around extra information into dmesg.
Emit one loud bang into dmesg with first hanging ring as
culprit. Rest of the details will be in error state.
Based-on-patch-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
drivers/gpu/drm/i915/i915_gpu_error.c | 4 +---
drivers/gpu/drm/i915/i915_irq.c | 26 ++++++++------------------
2 files changed, 9 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 9c0db19..292cf1f 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1251,9 +1251,7 @@ static void i915_error_capture_msg(struct drm_device *dev,
error->ring[ring_id].pid);
scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
- ", reason: %s, action: %s",
- error_msg,
- wedged ? "reset" : "continue");
+ ", %s", error_msg);
}
static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index a3244bd..a3b5001 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2924,14 +2924,12 @@ static bool check_for_missed_irq(struct intel_engine_cs *ring)
return true;
}
-static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
+static void hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
{
#define BUSY 1
#define KICK 5
#define HUNG 20
-
struct intel_ring_hangcheck *hc = &ring->hangcheck;
- bool there_is_hope = true;
/* We always increment the hangcheck score
* if the ring is busy and still processing
@@ -2964,11 +2962,8 @@ static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
break;
case HANGCHECK_HUNG:
hc->score += HUNG;
- there_is_hope = false;
break;
}
-
- return there_is_hope;
}
/*
@@ -2987,8 +2982,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
struct drm_device *dev = dev_priv->dev;
struct intel_engine_cs *ring;
int i;
- int busy_count = 0, rings_hung = 0;
- bool stuck[I915_NUM_RINGS] = { 0 };
+ int busy_count = 0, ring_hung = -1;
if (!i915.enable_hangcheck)
return;
@@ -3043,19 +3037,15 @@ engine_check_done:
hc->acthd = acthd;
hc->start = start;
busy_count += busy;
- }
- for_each_ring(ring, dev_priv, i) {
- if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
- DRM_INFO("%s on %s\n",
- stuck[i] ? "stuck" : "no progress",
- ring->name);
- rings_hung++;
- }
+ if (ring_hung == -1 &&
+ ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
+ ring_hung = i;
}
- if (rings_hung)
- return i915_handle_error(dev, true, "Ring hung");
+ if (ring_hung != -1)
+ return i915_handle_error(dev, true, "%s hung",
+ dev_priv->ring[ring_hung].name);
if (busy_count)
/* Reset timer case chip hangs without another request
--
1.9.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH 2/2] drm/i915: Make hangcheck logging more compact
2015-05-08 13:39 ` [PATCH 2/2] drm/i915: Make hangcheck logging more compact Mika Kuoppala
@ 2015-05-08 23:55 ` shuang.he
2015-05-19 11:25 ` Tomas Elf
1 sibling, 0 replies; 6+ messages in thread
From: shuang.he @ 2015-05-08 23:55 UTC (permalink / raw)
To: shuang.he, ethan.gao, intel-gfx, mika.kuoppala
Tested-By: Intel Graphics QA PRTS (Patch Regression Test System Contact: shuang.he@intel.com)
Task id: 6357
-------------------------------------Summary-------------------------------------
Platform Delta drm-intel-nightly Series Applied
PNV 276/276 276/276
ILK -1 302/302 301/302
SNB 316/316 316/316
IVB 342/342 342/342
BYT 286/286 286/286
BDW 321/321 321/321
-------------------------------------Detailed-------------------------------------
Platform Test drm-intel-nightly Series Applied
*ILK igt@kms_flip@vblank-vs-hang PASS(2) TIMEOUT(2)
Note: You need to pay more attention to line start with '*'
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 1/2] drm/i915: Detach hangcheck from request lists
2015-05-08 13:39 [PATCH 1/2] drm/i915: Detach hangcheck from request lists Mika Kuoppala
2015-05-08 13:39 ` [PATCH 2/2] drm/i915: Make hangcheck logging more compact Mika Kuoppala
@ 2015-05-19 11:03 ` Tomas Elf
2015-05-19 13:36 ` Chris Wilson
1 sibling, 1 reply; 6+ messages in thread
From: Tomas Elf @ 2015-05-19 11:03 UTC (permalink / raw)
To: Mika Kuoppala, intel-gfx@lists.freedesktop.org; +Cc: miku@iki.fi
On 08/05/2015 14:39, Mika Kuoppala wrote:
> Hangcheck tries to peek into request list to see
> if the ring was busy or not. But that leads to race
> against the list addition in request submission.
> And hangcheck saw a ring being idle, when in fact work was
> just being submitted.
>
> There is strong desire to keep hangcheck without
> locks of any kind as it is our last line of defense
> against failures that surpass our imagination.
>
> Rework the hangcheck logic so that we find out
> the ring busyness by inspecting hw engine state
> and omit the request->list inspection.
>
> v2: start is u32, push for 80 cols (Chris)
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=89931
> References: https://bugs.freedesktop.org/show_bug.cgi?id=89644
> References: https://bugs.freedesktop.org/show_bug.cgi?id=88984
> References: https://bugs.freedesktop.org/show_bug.cgi?id=88981
> References: https://bugs.freedesktop.org/show_bug.cgi?id=87730
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 10 +-
> drivers/gpu/drm/i915/i915_gpu_error.c | 4 +-
> drivers/gpu/drm/i915/i915_irq.c | 214 ++++++++++++++++++++------------
> drivers/gpu/drm/i915/intel_ringbuffer.h | 3 +
> 4 files changed, 149 insertions(+), 82 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index adbbdda..8c647bf 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1295,6 +1295,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
> struct drm_i915_private *dev_priv = dev->dev_private;
> struct intel_engine_cs *ring;
> u64 acthd[I915_NUM_RINGS];
> + u32 start[I915_NUM_RINGS];
> u32 seqno[I915_NUM_RINGS];
> int i;
>
> @@ -1308,6 +1309,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
> for_each_ring(ring, dev_priv, i) {
> seqno[i] = ring->get_seqno(ring, false);
> acthd[i] = intel_ring_get_active_head(ring);
> + start[i] = I915_READ_START(ring);
> }
>
> intel_runtime_pm_put(dev_priv);
> @@ -1328,8 +1330,14 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
> (long long)acthd[i]);
> seq_printf(m, "\tmax ACTHD = 0x%08llx\n",
> (long long)ring->hangcheck.max_acthd);
> + seq_printf(m, "\tSTART = 0x%08x [current 0x%08x]\n",
> + ring->hangcheck.start,
> + start[i]);
> +
> seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
> - seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
> + seq_printf(m, "\taction = %s [%d]\n",
> + i915_hangcheck_action_to_str(ring->hangcheck.action),
> + ring->hangcheck.action);
> }
>
Based on feedback I have seen in the past it seems like we want to keep
unrelated changes to separate patches. So in this case maybe we should
move the debugfs changes into its own patch rather keep it together with
the hangcheck changes?
> return 0;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a3e330d..9c0db19 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -220,7 +220,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
> }
> }
>
> -static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
> +const char *i915_hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
> {
> switch (a) {
> case HANGCHECK_IDLE:
> @@ -301,7 +301,7 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
> err_printf(m, " ring->head: 0x%08x\n", ring->cpu_ring_head);
> err_printf(m, " ring->tail: 0x%08x\n", ring->cpu_ring_tail);
> err_printf(m, " hangcheck: %s [%d]\n",
> - hangcheck_action_to_str(ring->hangcheck_action),
> + i915_hangcheck_action_to_str(ring->hangcheck_action),
> ring->hangcheck_score);
See above. Maybe you could put this change together with the debugfs
changes in a separate patch seeing as it's more about presenting the
hangcheck action in a more informative way rather than anything related
to the hangcheck logic itself.
> }
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 9da955e..a3244bd 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2685,20 +2685,6 @@ static void gen8_disable_vblank(struct drm_device *dev, int pipe)
> spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags);
> }
>
> -static struct drm_i915_gem_request *
> -ring_last_request(struct intel_engine_cs *ring)
> -{
> - return list_entry(ring->request_list.prev,
> - struct drm_i915_gem_request, list);
> -}
> -
> -static bool
> -ring_idle(struct intel_engine_cs *ring)
> -{
> - return (list_empty(&ring->request_list) ||
> - i915_gem_request_completed(ring_last_request(ring), false));
> -}
> -
> static bool
> ipehr_is_semaphore_wait(struct drm_device *dev, u32 ipehr)
> {
> @@ -2882,6 +2868,109 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
> return HANGCHECK_HUNG;
> }
>
> +static bool engine_busy(struct intel_engine_cs *ring,
> + const u64 acthd, const u32 start)
> +{
> + struct drm_i915_private *dev_priv = to_i915(ring->dev);
> + u32 tail, head;
> +
> + if (ring->hangcheck.acthd != acthd)
> + return true;
> +
> + if (ring->hangcheck.start != start)
> + return true;
> +
> + head = I915_READ_HEAD(ring) & HEAD_ADDR;
> + tail = I915_READ_TAIL(ring) & TAIL_ADDR;
> +
> + if (head != tail)
> + return true;
> +
> + /* Stop rings mechanism keeps head==tail even if
> + * there is work to be done.
> + */
1. The preferred style for multi-line comments is:
/* (empty line)
* (first line)
* (second line)
*/
2. I don't quite understand the comment. I know there is such a thing as
stop_rings, which we use for simulating hangs, but how does that factor
in here? If stop_rings affects the state of MMIO tail register vs.
ringbuffer tail then a small remark explaining that would be helpful.
> + if (ring->buffer &&
> + ring->buffer->tail != tail &&
> + waitqueue_active(&ring->irq_queue))
> + return true;
> +
1. For some reason going from one waitqueue_active() check in
i915_hangcheck_elapsed() to two separate calls in two separate functions
does not sit perfectly well with me. Maybe it's not that important but
would it make sense to take the body of check_for_missed_irq() and
integrate it in engine_idle(), call waitqueue_active() once and use the
result twice: first in the check in the block above and then in the
missing irq check that follows immediately?
2. In the block above where we check waitqueue_active() together with
tail: Are there no cases when we might be interested in the waitqueue
being active and MMIO tail == ringbuffer tail? Are there no cases where
some client might hang on indefinitely even when the tail has caught up
that we might want to do something about? Do we know for certain that
when tail has caught up the state of waitqueue_active() is irrelevant? I
think I'm missing something here - please enlighten me. (and maybe
consider adding a comment explaining this in no uncertain terms)
3. Doing the ring->buffer check ensures that it won't run in execlist
mode since ring->buffer is only set in ringbuffer submission mode.
That's a pity since it's useful to check waitqueue_active().
> + return false;
> +}
> +
> +#define engine_idle(ring, acthd, start) (!engine_busy((ring), (acthd), (start)))
engine_idle() has one caller. Why not just skip this #define and add a !
in front of the calling statement instead of adding one #define for one
function for its sole caller? It's not like you're making this rewrite
any smoother by retaining parts of the semantics of the old
i915_hangcheck_elapsed() implementation (by letting the check be on
idleness rather than busyness) since you're renaming it from ring_idle()
to engine_idle() and adding more parameters. Is it just a question of
personal taste or is there anything else to justify this?
> +
> +static bool check_for_missed_irq(struct intel_engine_cs *ring)
> +{
> + struct drm_i915_private *dev_priv = to_i915(ring->dev);
> + bool irq_missing;
> +
> + if (!waitqueue_active(&ring->irq_queue))
> + return false;
> +
> + irq_missing = !test_and_set_bit(ring->id,
> + &dev_priv->gpu_error.missed_irq_rings);
> +
> + if (irq_missing) {
> + const bool irq_test =
> + dev_priv->gpu_error.test_irq_rings &
> + intel_ring_flag(ring);
> +
> + if (irq_test)
> + DRM_INFO("Fake missed irq on %s\n", ring->name);
> + else
> + DRM_ERROR("Missed irq on %s\n", ring->name);
> + }
> +
> + return true;
> +}
> +
> +static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
> +{
> +#define BUSY 1
> +#define KICK 5
> +#define HUNG 20
> +
> + struct intel_ring_hangcheck *hc = &ring->hangcheck;
> + bool there_is_hope = true;
> +
> + /* We always increment the hangcheck score
> + * if the ring is busy and still processing
> + * the same request, so that no single request
> + * can run indefinitely (such as a chain of
> + * batches). If we detect a loop in acthd,
> + * we increment the busyness twice as fast.
> + * If this ring is in a legitimate wait for another
> + * ring, we omit incrementing the score. In that case
> + * the waiting ring is a victim and we want to be sure we
> + * catch the right culprit. Then every time we do kick
> + * the ring, add a small increment to the
> + * score so that we can catch a batch that is
> + * being repeatedly kicked and so responsible
> + * for stalling the machine.
> + */
See preferred multi-line comment format.
> + hc->action = ring_stuck(ring, acthd);
> +
> + switch (hc->action) {
> + case HANGCHECK_IDLE:
> + case HANGCHECK_WAIT:
> + case HANGCHECK_ACTIVE:
> + hc->score += BUSY;
> + break;
> + case HANGCHECK_ACTIVE_LOOP:
> + hc->score += 2 * BUSY;
> + break;
> + case HANGCHECK_KICK:
> + hc->score += KICK;
> + break;
> + case HANGCHECK_HUNG:
> + hc->score += HUNG;
> + there_is_hope = false;
> + break;
> + }
> +
> + return there_is_hope;
> +}
> +
> /*
> * This is called when the chip hasn't reported back with completed
> * batchbuffers in a long time. We keep track per ring seqno progress and
> @@ -2900,15 +2989,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> int i;
> int busy_count = 0, rings_hung = 0;
> bool stuck[I915_NUM_RINGS] = { 0 };
> -#define BUSY 1
> -#define KICK 5
> -#define HUNG 20
>
> if (!i915.enable_hangcheck)
> return;
>
> for_each_ring(ring, dev_priv, i) {
> + struct intel_ring_hangcheck *hc = &ring->hangcheck;
> u64 acthd;
> + u32 start;
> u32 seqno;
> bool busy = true;
>
> @@ -2916,76 +3004,44 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>
> seqno = ring->get_seqno(ring, false);
> acthd = intel_ring_get_active_head(ring);
> + start = I915_READ_START(ring);
>
> - if (ring->hangcheck.seqno == seqno) {
> - if (ring_idle(ring)) {
> - ring->hangcheck.action = HANGCHECK_IDLE;
> -
> - if (waitqueue_active(&ring->irq_queue)) {
> - /* Issue a wake-up to catch stuck h/w. */
> - if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
> - if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
> - DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> - ring->name);
> - else
> - DRM_INFO("Fake missed irq on %s\n",
> - ring->name);
> - wake_up_all(&ring->irq_queue);
> - }
> - /* Safeguard against driver failure */
> - ring->hangcheck.score += BUSY;
> - } else
> - busy = false;
> - } else {
> - /* We always increment the hangcheck score
> - * if the ring is busy and still processing
> - * the same request, so that no single request
> - * can run indefinitely (such as a chain of
> - * batches). The only time we do not increment
> - * the hangcheck score on this ring, if this
> - * ring is in a legitimate wait for another
> - * ring. In that case the waiting ring is a
> - * victim and we want to be sure we catch the
> - * right culprit. Then every time we do kick
> - * the ring, add a small increment to the
> - * score so that we can catch a batch that is
> - * being repeatedly kicked and so responsible
> - * for stalling the machine.
> - */
> - ring->hangcheck.action = ring_stuck(ring,
> - acthd);
> -
> - switch (ring->hangcheck.action) {
> - case HANGCHECK_IDLE:
> - case HANGCHECK_WAIT:
> - case HANGCHECK_ACTIVE:
> - break;
> - case HANGCHECK_ACTIVE_LOOP:
> - ring->hangcheck.score += BUSY;
> - break;
> - case HANGCHECK_KICK:
> - ring->hangcheck.score += KICK;
> - break;
> - case HANGCHECK_HUNG:
> - ring->hangcheck.score += HUNG;
> - stuck[i] = true;
> - break;
> - }
> - }
> - } else {
> - ring->hangcheck.action = HANGCHECK_ACTIVE;
> + if (hc->seqno != seqno) {
> + hc->action = HANGCHECK_ACTIVE;
>
> /* Gradually reduce the count so that we catch DoS
> * attempts across multiple batches.
> */
> - if (ring->hangcheck.score > 0)
> - ring->hangcheck.score--;
> + if (hc->score > 0)
> + hc->score--;
>
> - ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
> + hc->max_acthd = 0;
> + goto engine_check_done;
> }
>
> - ring->hangcheck.seqno = seqno;
> - ring->hangcheck.acthd = acthd;
> + if (engine_idle(ring, acthd, start)) {
> + busy = check_for_missed_irq(ring);
> + if (busy) {
> + /* Start waking up irrespective of
> + our missed_irq bookkeeping */
See preferred multi-line comment format.
> + if (hc->score >= KICK)
> + wake_up_all(&ring->irq_queue);
> +
> + hc->score += KICK;
> + hc->action = HANGCHECK_ACTIVE;
> + } else {
> + hc->score = 0;
> + hc->action = HANGCHECK_IDLE;
> + }
> + goto engine_check_done;
> + }
> +
> + hangcheck_handle_stuck_ring(ring, acthd);
> +
> +engine_check_done:
> + hc->seqno = seqno;
> + hc->acthd = acthd;
> + hc->start = start;
> busy_count += busy;
> }
>
You've rearranged quite a few things, especially the general flow of the
if-statements and replaced several branches with goto engine_check_done.
Maybe it would make things even more clearer if you would add 1-2
intermediate patches where you introduce these structural changes first
before you start changing logic, such as:
1. Replacing request checks with MMIO register checks to determine
engine idleness.
2. Setting score 0 and action IDLE upon engine idle and
check_for_missed_irq non-busy.
I'm sure I could do a deeper analysis of all of the changes here but I
think it might be a good idea to start by splitting things up a bit. Or
am I being overzealous here?
Thanks,
Tomas
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 39f6dfc..83edcef 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -83,11 +83,14 @@ enum intel_ring_hangcheck_action {
> HANGCHECK_HUNG,
> };
>
> +const char *i915_hangcheck_action_to_str(enum intel_ring_hangcheck_action a);
> +
> #define HANGCHECK_SCORE_RING_HUNG 31
>
> struct intel_ring_hangcheck {
> u64 acthd;
> u64 max_acthd;
> + u32 start;
> u32 seqno;
> int score;
> enum intel_ring_hangcheck_action action;
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 2/2] drm/i915: Make hangcheck logging more compact
2015-05-08 13:39 ` [PATCH 2/2] drm/i915: Make hangcheck logging more compact Mika Kuoppala
2015-05-08 23:55 ` shuang.he
@ 2015-05-19 11:25 ` Tomas Elf
1 sibling, 0 replies; 6+ messages in thread
From: Tomas Elf @ 2015-05-19 11:25 UTC (permalink / raw)
To: Mika Kuoppala, intel-gfx@lists.freedesktop.org; +Cc: miku@iki.fi
On 08/05/2015 14:39, Mika Kuoppala wrote:
> With commit aaecdf611a05 ("drm/i915: Stop gathering error
> states for CS error interrupts") we only call i915_handle_error()
> on call sites where there is a stuck/hung gpu. So there is
> no more need to carry around extra information into dmesg.
>
> Emit one loud bang into dmesg with first hanging ring as
> culprit. Rest of the details will be in error state.
>
> Based-on-patch-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
> drivers/gpu/drm/i915/i915_gpu_error.c | 4 +---
> drivers/gpu/drm/i915/i915_irq.c | 26 ++++++++------------------
> 2 files changed, 9 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 9c0db19..292cf1f 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1251,9 +1251,7 @@ static void i915_error_capture_msg(struct drm_device *dev,
> error->ring[ring_id].pid);
>
> scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
> - ", reason: %s, action: %s",
> - error_msg,
> - wedged ? "reset" : "continue");
> + ", %s", error_msg);
> }
>
Once you've removed the reference to the wedged parameter from the
scnprintf statement I can't see any other references to it anywhere else
in the function. How about we remove that parameter entirely from the
function signature?
Thanks,
Tomas
> static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index a3244bd..a3b5001 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2924,14 +2924,12 @@ static bool check_for_missed_irq(struct intel_engine_cs *ring)
> return true;
> }
>
> -static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
> +static void hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
> {
> #define BUSY 1
> #define KICK 5
> #define HUNG 20
> -
> struct intel_ring_hangcheck *hc = &ring->hangcheck;
> - bool there_is_hope = true;
>
> /* We always increment the hangcheck score
> * if the ring is busy and still processing
> @@ -2964,11 +2962,8 @@ static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
> break;
> case HANGCHECK_HUNG:
> hc->score += HUNG;
> - there_is_hope = false;
> break;
> }
> -
> - return there_is_hope;
> }
>
> /*
> @@ -2987,8 +2982,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> struct drm_device *dev = dev_priv->dev;
> struct intel_engine_cs *ring;
> int i;
> - int busy_count = 0, rings_hung = 0;
> - bool stuck[I915_NUM_RINGS] = { 0 };
> + int busy_count = 0, ring_hung = -1;
>
> if (!i915.enable_hangcheck)
> return;
> @@ -3043,19 +3037,15 @@ engine_check_done:
> hc->acthd = acthd;
> hc->start = start;
> busy_count += busy;
> - }
>
> - for_each_ring(ring, dev_priv, i) {
> - if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
> - DRM_INFO("%s on %s\n",
> - stuck[i] ? "stuck" : "no progress",
> - ring->name);
> - rings_hung++;
> - }
> + if (ring_hung == -1 &&
> + ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
> + ring_hung = i;
> }
>
> - if (rings_hung)
> - return i915_handle_error(dev, true, "Ring hung");
> + if (ring_hung != -1)
> + return i915_handle_error(dev, true, "%s hung",
> + dev_priv->ring[ring_hung].name);
>
> if (busy_count)
> /* Reset timer case chip hangs without another request
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 1/2] drm/i915: Detach hangcheck from request lists
2015-05-19 11:03 ` [PATCH 1/2] drm/i915: Detach hangcheck from request lists Tomas Elf
@ 2015-05-19 13:36 ` Chris Wilson
0 siblings, 0 replies; 6+ messages in thread
From: Chris Wilson @ 2015-05-19 13:36 UTC (permalink / raw)
To: Tomas Elf; +Cc: intel-gfx@lists.freedesktop.org, miku@iki.fi
On Tue, May 19, 2015 at 12:03:44PM +0100, Tomas Elf wrote:
> >+ if (ring->buffer &&
> >+ ring->buffer->tail != tail &&
> >+ waitqueue_active(&ring->irq_queue))
> >+ return true;
> >+
>
> 1. For some reason going from one waitqueue_active() check in
> i915_hangcheck_elapsed() to two separate calls in two separate
> functions does not sit perfectly well with me. Maybe it's not that
> important but would it make sense to take the body of
> check_for_missed_irq() and integrate it in engine_idle(), call
> waitqueue_active() once and use the result twice: first in the check
> in the block above and then in the missing irq check that follows
> immediately?
No it is just that stop_rings is the wrong mechanism and that has lead
to this kerfuffle with using waitqueue_active() as a test for engine
busyness. That is plainly wrong.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2015-05-19 13:36 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-05-08 13:39 [PATCH 1/2] drm/i915: Detach hangcheck from request lists Mika Kuoppala
2015-05-08 13:39 ` [PATCH 2/2] drm/i915: Make hangcheck logging more compact Mika Kuoppala
2015-05-08 23:55 ` shuang.he
2015-05-19 11:25 ` Tomas Elf
2015-05-19 11:03 ` [PATCH 1/2] drm/i915: Detach hangcheck from request lists Tomas Elf
2015-05-19 13:36 ` Chris Wilson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox