From: Ben Widawsky <ben@bwidawsk.net>
To: Chris Wilson <chris@chris-wilson.co.uk>
Cc: intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/i915: Don't count semaphore waits towards a stuck ring
Date: Sun, 9 Jun 2013 18:12:50 -0700 [thread overview]
Message-ID: <20130610011250.GA1622@bwidawsk.net> (raw)
In-Reply-To: <1370595351-24981-1-git-send-email-chris@chris-wilson.co.uk>
On Fri, Jun 07, 2013 at 09:55:51AM +0100, Chris Wilson wrote:
> If we detect a ring is in a valid wait for another, just let it be.
> Eventually it will either begin to progress again, or the entire system
> will come grinding to a halt and then hangcheck will fire as soon as the
> deadlock is detected.
>
> This error was foretold by Ben in
> commit 05407ff889ceebe383aa5907219f86582ef96b72
> Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Date: Thu May 30 09:04:29 2013 +0300
>
> drm/i915: detect hang using per ring hangcheck_score
>
> "If ring B is waiting on ring A via semaphore, and ring A is making
> progress, albeit slowly - the hangcheck will fire. The check will
> determine that A is moving, however ring B will appear hung because
> the ACTHD doesn't move. I honestly can't say if that's actually a
> realistic problem to hit it probably implies the timeout value is too
> low."
>
> v2: Make sure we don't even incur the KICK cost whilst waiting.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Ben Widawsky <ben@bwidawsk.net>
> ---
> drivers/gpu/drm/i915/i915_irq.c | 95 ++++++++++++++++++++++---------
> drivers/gpu/drm/i915/intel_ringbuffer.h | 1 +
> 2 files changed, 70 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 2d1890d..05f8f75 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2331,21 +2331,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
> i915_seqno_passed(seqno, ring_last_seqno(ring)));
> }
>
> -static bool semaphore_passed(struct intel_ring_buffer *ring)
> +static struct intel_ring_buffer *
> +semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
> {
> struct drm_i915_private *dev_priv = ring->dev->dev_private;
> - u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
> - struct intel_ring_buffer *signaller;
> - u32 cmd, ipehr, acthd_min;
> + u32 cmd, ipehr, acthd, acthd_min;
>
> ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
> if ((ipehr & ~(0x3 << 16)) !=
> (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
> - return false;
> + return NULL;
>
> /* ACTHD is likely pointing to the dword after the actual command,
> * so scan backwards until we find the MBOX.
> */
> + acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
> acthd_min = max((int)acthd - 3 * 4, 0);
> do {
> cmd = ioread32(ring->virtual_start + acthd);
> @@ -2354,22 +2354,53 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
>
> acthd -= 4;
> if (acthd < acthd_min)
> - return false;
> + return NULL;
> } while (1);
>
> - signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
> - return i915_seqno_passed(signaller->get_seqno(signaller, false),
> - ioread32(ring->virtual_start+acthd+4)+1);
> + *seqno = ioread32(ring->virtual_start+acthd+4)+1;
> + return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
> +}
> +
>
I hated these since they were introduced... does this work still with
VECS? Seems at the very least %3 should be NUM_RINGS?
>
> +static int semaphore_passed(struct intel_ring_buffer *ring)
> +{
> + struct drm_i915_private *dev_priv = ring->dev->dev_private;
> + struct intel_ring_buffer *signaller;
> + u32 seqno, ctl;
> +
> + ring->hangcheck.deadlock = true;
> +
> + signaller = semaphore_waits_for(ring, &seqno);
> + if (signaller == NULL || signaller->hangcheck.deadlock)
> + return -1;
> +
> + /* cursory check for an unkickable deadlock */
> + ctl = I915_READ_CTL(signaller);
> + if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
> + return -1;
> +
> + return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
> }
>
> -static bool ring_hung(struct intel_ring_buffer *ring)
> +static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
> +{
> + struct intel_ring_buffer *ring;
> + int i;
> +
> + for_each_ring(ring, dev_priv, i)
> + ring->hangcheck.deadlock = false;
> +}
> +
> +static enum { pass, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
> {
> struct drm_device *dev = ring->dev;
> struct drm_i915_private *dev_priv = dev->dev_private;
> u32 tmp;
>
> + if (ring->hangcheck.acthd != acthd)
> + return pass;
> +
> if (IS_GEN2(dev))
> - return true;
> + return hung;
>
> /* Is the chip hanging on a WAIT_FOR_EVENT?
> * If so we can simply poke the RB_WAIT bit
> @@ -2381,19 +2412,24 @@ static bool ring_hung(struct intel_ring_buffer *ring)
> DRM_ERROR("Kicking stuck wait on %s\n",
> ring->name);
> I915_WRITE_CTL(ring, tmp);
> - return false;
> - }
> -
> - if (INTEL_INFO(dev)->gen >= 6 &&
> - tmp & RING_WAIT_SEMAPHORE &&
> - semaphore_passed(ring)) {
> - DRM_ERROR("Kicking stuck semaphore on %s\n",
> - ring->name);
> - I915_WRITE_CTL(ring, tmp);
> - return false;
> + return kick;
> + }
> +
> + if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
> + switch (semaphore_passed(ring)) {
> + default:
> + return hung;
> + case 1:
> + DRM_ERROR("Kicking stuck semaphore on %s\n",
> + ring->name);
> + I915_WRITE_CTL(ring, tmp);
> + return kick;
> + case 0:
> + return pass;
> + }
> }
>
> - return true;
> + return hung;
> }
>
> /**
> @@ -2422,6 +2458,8 @@ void i915_hangcheck_elapsed(unsigned long data)
> for_each_ring(ring, dev_priv, i) {
> u32 seqno, acthd;
>
> + semaphore_clear_deadlocks(dev_priv);
> +
> seqno = ring->get_seqno(ring, false);
> acthd = intel_ring_get_active_head(ring);
>
> @@ -2436,16 +2474,21 @@ void i915_hangcheck_elapsed(unsigned long data)
> busy_count++;
> }
> } else {
> - stuck[i] = ring->hangcheck.acthd == acthd;
> - if (stuck[i]) {
> + switch (ring_stuck(ring, acthd)) {
> + case pass:
> + break;
> + case kick:
> /* Every time we kick the ring, add a
> * small increment to the hangcheck
> * score so that we can catch a
> * batch that is repeatedly kicked.
> */
> ring->hangcheck.score += KICK;
> - if (ring_hung(ring))
> - ring->hangcheck.score += HUNG;
> + break;
> + case hung:
> + ring->hangcheck.score += HUNG;
> + stuck[i] = true;
> + break;
> }
> busy_count++;
> }
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index efc403d..a3e9610 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -38,6 +38,7 @@ struct intel_hw_status_page {
> #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
>
> struct intel_ring_hangcheck {
> + bool deadlock;
> u32 seqno;
> u32 acthd;
> int score;
> --
> 1.7.10.4
>
>
I'm going to give this another look tomorrow if Daniel doesn't merge it
before then. The patch is a bit hard for me to read, but the idea seems
like how I'd want it to behave.
--
Ben Widawsky, Intel Open Source Technology Center
next prev parent reply other threads:[~2013-06-10 1:12 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-06-06 9:45 [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring Chris Wilson
2013-06-06 9:45 ` [PATCH 2/2] drm/i915: Don't count semaphore waits towards a stuck ring Chris Wilson
2013-06-07 8:55 ` [PATCH] " Chris Wilson
2013-06-10 1:12 ` Ben Widawsky [this message]
2013-06-06 14:41 ` [PATCH 1/2] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring Mika Kuoppala
2013-06-07 3:58 ` Ben Widawsky
2013-06-07 7:33 ` Chris Wilson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130610011250.GA1622@bwidawsk.net \
--to=ben@bwidawsk.net \
--cc=chris@chris-wilson.co.uk \
--cc=intel-gfx@lists.freedesktop.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.