Re: [Intel-gfx] [PATCH RFC] drm/i915/gt: Retry RING_HEAD reset until it sticks

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Rodrigo Vivi <rodrigo.vivi@intel.com>
To: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: David Airlie <airlied@linux.ie>,
	intel-gfx@lists.freedesktop.org, linux-kernel@vger.kernel.org,
	dri-devel@lists.freedesktop.org,
	Chris Wilson <chris@chris-wilson.co.uk>,
	Matthew Auld <matthew.auld@intel.com>,
	Andrzej Hajda <andrzej.hajda@intel.com>
Subject: Re: [Intel-gfx] [PATCH RFC] drm/i915/gt: Retry RING_HEAD reset until it sticks
Date: Fri, 15 Jul 2022 16:28:09 -0400	[thread overview]
Message-ID: <YtHN2ZXvpuIWYBpo@intel.com> (raw)
In-Reply-To: <2378da383d043de17172d928e59da0ec423cae76.1657873550.git.mchehab@kernel.org>

On Fri, Jul 15, 2022 at 09:26:16AM +0100, Mauro Carvalho Chehab wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> On Haswell, in particular, we see an issue where resets fails because

Can we then make this platform specific?
Only because some older hw doesn't behave like expected we shouldn't
make this a default & global workaround.

> the engine resumes from an incorrect RING_HEAD. Since the RING_HEAD
> doesn't point to the remaining requests to re-run, but may instead point
> into the uninitialised portion of the ring, the GPU may be then fed
> invalid instructions from a privileged context, oft pushing the GPU into
> an unrecoverable hang.
> 
> If at first the write doesn't succeed, try, try again.
> 
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/5432
> Testcase: igt/i915_selftest/hangcheck
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Andrzej Hajda <andrzej.hajda@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
> ---
>  .../gpu/drm/i915/gt/intel_ring_submission.c   | 44 +++++++++++++------
>  drivers/gpu/drm/i915/i915_utils.h             | 10 +++++
>  2 files changed, 40 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index d5d6f1fadcae..cc53feb1f8ed 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -190,6 +190,7 @@ static bool stop_ring(struct intel_engine_cs *engine)
>  static int xcs_resume(struct intel_engine_cs *engine)
>  {
>  	struct intel_ring *ring = engine->legacy.ring;
> +	ktime_t kt;
>  
>  	ENGINE_TRACE(engine, "ring:{HEAD:%04x, TAIL:%04x}\n",
>  		     ring->head, ring->tail);
> @@ -228,9 +229,20 @@ static int xcs_resume(struct intel_engine_cs *engine)
>  	set_pp_dir(engine);
>  
>  	/* First wake the ring up to an empty/idle ring */
> -	ENGINE_WRITE_FW(engine, RING_HEAD, ring->head);
> +	until_timeout_ns(kt, 2 * NSEC_PER_MSEC) {
> +		ENGINE_WRITE_FW(engine, RING_HEAD, ring->head);
> +		if (ENGINE_READ_FW(engine, RING_HEAD) == ring->head)
> +			break;
> +	}
> +
>  	ENGINE_WRITE_FW(engine, RING_TAIL, ring->head);
> -	ENGINE_POSTING_READ(engine, RING_TAIL);
> +	if (ENGINE_READ_FW(engine, RING_HEAD) != ENGINE_READ_FW(engine, RING_TAIL)) {
> +		ENGINE_TRACE(engine, "failed to reset empty ring: [%x, %x]: %x\n",
> +			     ENGINE_READ_FW(engine, RING_HEAD),
> +			     ENGINE_READ_FW(engine, RING_TAIL),
> +			     ring->head);
> +		goto err;
> +	}

commit message mentions until this point I'm afraid... everything below
(except the new until_timeout_ns) looks like a different patch to me,
or deserves some mention in the commit msg.

>  
>  	ENGINE_WRITE_FW(engine, RING_CTL,
>  			RING_CTL_SIZE(ring->size) | RING_VALID);
> @@ -239,12 +251,16 @@ static int xcs_resume(struct intel_engine_cs *engine)
>  	if (__intel_wait_for_register_fw(engine->uncore,
>  					 RING_CTL(engine->mmio_base),
>  					 RING_VALID, RING_VALID,
> -					 5000, 0, NULL))
> +					 5000, 0, NULL)) {
> +		ENGINE_TRACE(engine, "failed to restart\n");
>  		goto err;
> +	}
>  
> -	if (GRAPHICS_VER(engine->i915) > 2)
> +	if (GRAPHICS_VER(engine->i915) > 2) {
>  		ENGINE_WRITE_FW(engine,
>  				RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
> +		ENGINE_POSTING_READ(engine, RING_MI_MODE);
> +	}
>  
>  	/* Now awake, let it get started */
>  	if (ring->tail != ring->head) {
> @@ -257,16 +273,16 @@ static int xcs_resume(struct intel_engine_cs *engine)
>  	return 0;
>  
>  err:
> -	drm_err(&engine->i915->drm,
> -		"%s initialization failed; "
> -		"ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
> -		engine->name,
> -		ENGINE_READ(engine, RING_CTL),
> -		ENGINE_READ(engine, RING_CTL) & RING_VALID,
> -		ENGINE_READ(engine, RING_HEAD), ring->head,
> -		ENGINE_READ(engine, RING_TAIL), ring->tail,
> -		ENGINE_READ(engine, RING_START),
> -		i915_ggtt_offset(ring->vma));
> +	ENGINE_TRACE(engine,
> +		     "initialization failed; "
> +		     "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
> +		     ENGINE_READ(engine, RING_CTL),
> +		     ENGINE_READ(engine, RING_CTL) & RING_VALID,
> +		     ENGINE_READ(engine, RING_HEAD), ring->head,
> +		     ENGINE_READ(engine, RING_TAIL), ring->tail,
> +		     ENGINE_READ(engine, RING_START),
> +		     i915_ggtt_offset(ring->vma));
> +	GEM_TRACE_DUMP();
>  	return -EIO;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> index c10d68cdc3ca..717fb6b9cc15 100644
> --- a/drivers/gpu/drm/i915/i915_utils.h
> +++ b/drivers/gpu/drm/i915/i915_utils.h
> @@ -256,6 +256,16 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
>  	}
>  }
>  
> +/**
> + * until_timeout_ns - Keep retrying (busy spin) until the duration has passed
> + * @end: temporary var to be used to track the spent time
> + * @timeout_ns: Maximum timeout, in nanosseconds
> + */
> +#define until_timeout_ns(end, timeout_ns) \
> +	for ((end) = ktime_get() + (timeout_ns); \
> +	     ktime_before(ktime_get(), (end)); \
> +	     cpu_relax())
> +

why do we need yet another timeout macro and cannot use any of the existent ways?

>  /**
>   * __wait_for - magic wait macro
>   *
> -- 
> 2.36.1
> 
>

WARNING: multiple messages have this Message-ID (diff)

From: Rodrigo Vivi <rodrigo.vivi@intel.com>
To: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Andrzej Hajda <andrzej.hajda@intel.com>,
	David Airlie <airlied@linux.ie>,
	<dri-devel@lists.freedesktop.org>, <linux-kernel@vger.kernel.org>,
	"Chris Wilson" <chris@chris-wilson.co.uk>,
	<intel-gfx@lists.freedesktop.org>,
	"Matthew Auld" <matthew.auld@intel.com>
Subject: Re: [Intel-gfx] [PATCH RFC] drm/i915/gt: Retry RING_HEAD reset until it sticks
Date: Fri, 15 Jul 2022 16:28:09 -0400	[thread overview]
Message-ID: <YtHN2ZXvpuIWYBpo@intel.com> (raw)
In-Reply-To: <2378da383d043de17172d928e59da0ec423cae76.1657873550.git.mchehab@kernel.org>

On Fri, Jul 15, 2022 at 09:26:16AM +0100, Mauro Carvalho Chehab wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> On Haswell, in particular, we see an issue where resets fails because

Can we then make this platform specific?
Only because some older hw doesn't behave like expected we shouldn't
make this a default & global workaround.

> the engine resumes from an incorrect RING_HEAD. Since the RING_HEAD
> doesn't point to the remaining requests to re-run, but may instead point
> into the uninitialised portion of the ring, the GPU may be then fed
> invalid instructions from a privileged context, oft pushing the GPU into
> an unrecoverable hang.
> 
> If at first the write doesn't succeed, try, try again.
> 
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/5432
> Testcase: igt/i915_selftest/hangcheck
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Andrzej Hajda <andrzej.hajda@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
> ---
>  .../gpu/drm/i915/gt/intel_ring_submission.c   | 44 +++++++++++++------
>  drivers/gpu/drm/i915/i915_utils.h             | 10 +++++
>  2 files changed, 40 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index d5d6f1fadcae..cc53feb1f8ed 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -190,6 +190,7 @@ static bool stop_ring(struct intel_engine_cs *engine)
>  static int xcs_resume(struct intel_engine_cs *engine)
>  {
>  	struct intel_ring *ring = engine->legacy.ring;
> +	ktime_t kt;
>  
>  	ENGINE_TRACE(engine, "ring:{HEAD:%04x, TAIL:%04x}\n",
>  		     ring->head, ring->tail);
> @@ -228,9 +229,20 @@ static int xcs_resume(struct intel_engine_cs *engine)
>  	set_pp_dir(engine);
>  
>  	/* First wake the ring up to an empty/idle ring */
> -	ENGINE_WRITE_FW(engine, RING_HEAD, ring->head);
> +	until_timeout_ns(kt, 2 * NSEC_PER_MSEC) {
> +		ENGINE_WRITE_FW(engine, RING_HEAD, ring->head);
> +		if (ENGINE_READ_FW(engine, RING_HEAD) == ring->head)
> +			break;
> +	}
> +
>  	ENGINE_WRITE_FW(engine, RING_TAIL, ring->head);
> -	ENGINE_POSTING_READ(engine, RING_TAIL);
> +	if (ENGINE_READ_FW(engine, RING_HEAD) != ENGINE_READ_FW(engine, RING_TAIL)) {
> +		ENGINE_TRACE(engine, "failed to reset empty ring: [%x, %x]: %x\n",
> +			     ENGINE_READ_FW(engine, RING_HEAD),
> +			     ENGINE_READ_FW(engine, RING_TAIL),
> +			     ring->head);
> +		goto err;
> +	}

commit message mentions until this point I'm afraid... everything below
(except the new until_timeout_ns) looks like a different patch to me,
or deserves some mention in the commit msg.

>  
>  	ENGINE_WRITE_FW(engine, RING_CTL,
>  			RING_CTL_SIZE(ring->size) | RING_VALID);
> @@ -239,12 +251,16 @@ static int xcs_resume(struct intel_engine_cs *engine)
>  	if (__intel_wait_for_register_fw(engine->uncore,
>  					 RING_CTL(engine->mmio_base),
>  					 RING_VALID, RING_VALID,
> -					 5000, 0, NULL))
> +					 5000, 0, NULL)) {
> +		ENGINE_TRACE(engine, "failed to restart\n");
>  		goto err;
> +	}
>  
> -	if (GRAPHICS_VER(engine->i915) > 2)
> +	if (GRAPHICS_VER(engine->i915) > 2) {
>  		ENGINE_WRITE_FW(engine,
>  				RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
> +		ENGINE_POSTING_READ(engine, RING_MI_MODE);
> +	}
>  
>  	/* Now awake, let it get started */
>  	if (ring->tail != ring->head) {
> @@ -257,16 +273,16 @@ static int xcs_resume(struct intel_engine_cs *engine)
>  	return 0;
>  
>  err:
> -	drm_err(&engine->i915->drm,
> -		"%s initialization failed; "
> -		"ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
> -		engine->name,
> -		ENGINE_READ(engine, RING_CTL),
> -		ENGINE_READ(engine, RING_CTL) & RING_VALID,
> -		ENGINE_READ(engine, RING_HEAD), ring->head,
> -		ENGINE_READ(engine, RING_TAIL), ring->tail,
> -		ENGINE_READ(engine, RING_START),
> -		i915_ggtt_offset(ring->vma));
> +	ENGINE_TRACE(engine,
> +		     "initialization failed; "
> +		     "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
> +		     ENGINE_READ(engine, RING_CTL),
> +		     ENGINE_READ(engine, RING_CTL) & RING_VALID,
> +		     ENGINE_READ(engine, RING_HEAD), ring->head,
> +		     ENGINE_READ(engine, RING_TAIL), ring->tail,
> +		     ENGINE_READ(engine, RING_START),
> +		     i915_ggtt_offset(ring->vma));
> +	GEM_TRACE_DUMP();
>  	return -EIO;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> index c10d68cdc3ca..717fb6b9cc15 100644
> --- a/drivers/gpu/drm/i915/i915_utils.h
> +++ b/drivers/gpu/drm/i915/i915_utils.h
> @@ -256,6 +256,16 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
>  	}
>  }
>  
> +/**
> + * until_timeout_ns - Keep retrying (busy spin) until the duration has passed
> + * @end: temporary var to be used to track the spent time
> + * @timeout_ns: Maximum timeout, in nanosseconds
> + */
> +#define until_timeout_ns(end, timeout_ns) \
> +	for ((end) = ktime_get() + (timeout_ns); \
> +	     ktime_before(ktime_get(), (end)); \
> +	     cpu_relax())
> +

why do we need yet another timeout macro and cannot use any of the existent ways?

>  /**
>   * __wait_for - magic wait macro
>   *
> -- 
> 2.36.1
> 
>

next prev parent reply	other threads:[~2022-07-16 14:41 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-15  8:26 [Intel-gfx] [PATCH RFC] drm/i915/gt: Retry RING_HEAD reset until it sticks Mauro Carvalho Chehab
2022-07-15  8:26 ` Mauro Carvalho Chehab
2022-07-15  8:26 ` Mauro Carvalho Chehab
2022-07-15  9:27 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for " Patchwork
2022-07-15  9:46 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2022-07-15 10:11 ` [Intel-gfx] [PATCH RFC] " Andrzej Hajda
2022-07-15 10:11   ` Andrzej Hajda
2022-07-15 12:20 ` [Intel-gfx] ✗ Fi.CI.IGT: failure for " Patchwork
2022-07-15 20:28 ` Rodrigo Vivi [this message]
2022-07-15 20:28   ` [Intel-gfx] [PATCH RFC] " Rodrigo Vivi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YtHN2ZXvpuIWYBpo@intel.com \
    --to=rodrigo.vivi@intel.com \
    --cc=airlied@linux.ie \
    --cc=andrzej.hajda@intel.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=intel-gfx@lists.freedesktop.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=matthew.auld@intel.com \
    --cc=mchehab@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.