public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: fix HW lockup due to missing RPS IRQ workaround on GEN6
@ 2014-12-19  9:53 Imre Deak
  2014-12-19 10:21 ` Daniel Vetter
  0 siblings, 1 reply; 3+ messages in thread
From: Imre Deak @ 2014-12-19  9:53 UTC (permalink / raw)
  To: intel-gfx

In

commit dbea3cea69508e9d548ed4a6be13de35492e5d15
Author: Imre Deak <imre.deak@intel.com>
Date:   Mon Dec 15 18:59:28 2014 +0200

    drm/i915: sanitize RPS resetting during GPU reset

we disable RPS interrupts during GPU resetting, but don't apply the
necessary GEN6 HW workaround. This leads to a HW lockup during a
subsequent "looping batchbuffer" workload. This is triggered by the
testcase that submits exactly this kind of workload after a simulated
GPU reset. I'm not sure how likely the bug would have triggered
otherwise, since we would have applied the workaround anyway shortly
after the GPU reset, when enabling GT powersaving from the deferred
work.

This may also fix unrelated issues, since during driver loading /
suspending we also disable RPS interrupts and so we also had a short
window during the rest of the loading / resuming where a similar
workload could run without the workaround applied.

At the same time also keep the IRQ routing to display enabled at all
times and everywhere on GEN8+, since there isn't ever a reason to route
them to GT. For instance this way we could catch spurious interrupts on
the CPU instead of allowing some undefined behaviour on the GT (due to
the same spurious interrupt routed there).

Testcase: igt/gem_reset_stats/ban-ctx-render
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=87429
Signed-off-by: Imre Deak <imre.deak@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c  | 17 +++++++++++++++--
 drivers/gpu/drm/i915/intel_drv.h |  1 +
 drivers/gpu/drm/i915/intel_pm.c  | 11 +----------
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index aa3180c..12acc40 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -296,6 +296,20 @@ void gen6_enable_rps_interrupts(struct drm_device *dev)
 	spin_unlock_irq(&dev_priv->irq_lock);
 }
 
+u32 gen6_sanitize_rps_pm_mask(struct drm_i915_private *dev_priv, u32 mask)
+{
+	/* IVB and SNB hard hangs on looping batchbuffer
+	 * if GEN6_PM_UP_EI_EXPIRED is masked.
+	 */
+	if (INTEL_INFO(dev_priv->dev)->gen <= 7 && !IS_HASWELL(dev_priv->dev))
+		mask &= ~GEN6_PM_RP_UP_EI_EXPIRED;
+
+	if (INTEL_INFO(dev_priv->dev)->gen >= 8)
+		mask &= ~GEN8_PMINTR_REDIRECT_TO_NON_DISP;
+
+	return mask;
+}
+
 void gen6_disable_rps_interrupts(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -308,8 +322,7 @@ void gen6_disable_rps_interrupts(struct drm_device *dev)
 
 	spin_lock_irq(&dev_priv->irq_lock);
 
-	I915_WRITE(GEN6_PMINTRMSK, INTEL_INFO(dev_priv)->gen >= 8 ?
-		   ~GEN8_PMINTR_REDIRECT_TO_NON_DISP : ~0);
+	I915_WRITE(GEN6_PMINTRMSK, gen6_sanitize_rps_pm_mask(dev_priv, ~0));
 
 	__gen6_disable_pm_irq(dev_priv, dev_priv->pm_rps_events);
 	I915_WRITE(gen6_pm_ier(dev_priv), I915_READ(gen6_pm_ier(dev_priv)) &
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 588b618..bb871f3 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -795,6 +795,7 @@ void gen6_disable_pm_irq(struct drm_i915_private *dev_priv, uint32_t mask);
 void gen6_reset_rps_interrupts(struct drm_device *dev);
 void gen6_enable_rps_interrupts(struct drm_device *dev);
 void gen6_disable_rps_interrupts(struct drm_device *dev);
+u32 gen6_sanitize_rps_pm_mask(struct drm_i915_private *dev_priv, u32 mask);
 void intel_runtime_pm_disable_interrupts(struct drm_i915_private *dev_priv);
 void intel_runtime_pm_enable_interrupts(struct drm_i915_private *dev_priv);
 static inline bool intel_irqs_enabled(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index a3ebaa8..4bd1b8b 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -3745,16 +3745,7 @@ static u32 gen6_rps_pm_mask(struct drm_i915_private *dev_priv, u8 val)
 	mask |= dev_priv->pm_rps_events & (GEN6_PM_RP_DOWN_EI_EXPIRED | GEN6_PM_RP_UP_EI_EXPIRED);
 	mask &= dev_priv->pm_rps_events;
 
-	/* IVB and SNB hard hangs on looping batchbuffer
-	 * if GEN6_PM_UP_EI_EXPIRED is masked.
-	 */
-	if (INTEL_INFO(dev_priv->dev)->gen <= 7 && !IS_HASWELL(dev_priv->dev))
-		mask |= GEN6_PM_RP_UP_EI_EXPIRED;
-
-	if (IS_GEN8(dev_priv->dev))
-		mask |= GEN8_PMINTR_REDIRECT_TO_NON_DISP;
-
-	return ~mask;
+	return gen6_sanitize_rps_pm_mask(dev_priv, ~mask);
 }
 
 /* gen6_set_rps is called to update the frequency request, but should also be
-- 
1.8.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/i915: fix HW lockup due to missing RPS IRQ workaround on GEN6
  2014-12-19  9:53 [PATCH] drm/i915: fix HW lockup due to missing RPS IRQ workaround on GEN6 Imre Deak
@ 2014-12-19 10:21 ` Daniel Vetter
  2014-12-19 12:30   ` Mika Kuoppala
  0 siblings, 1 reply; 3+ messages in thread
From: Daniel Vetter @ 2014-12-19 10:21 UTC (permalink / raw)
  To: Imre Deak; +Cc: intel-gfx

On Fri, Dec 19, 2014 at 11:53:21AM +0200, Imre Deak wrote:
> In
> 
> commit dbea3cea69508e9d548ed4a6be13de35492e5d15
> Author: Imre Deak <imre.deak@intel.com>
> Date:   Mon Dec 15 18:59:28 2014 +0200
> 
>     drm/i915: sanitize RPS resetting during GPU reset
> 
> we disable RPS interrupts during GPU resetting, but don't apply the
> necessary GEN6 HW workaround. This leads to a HW lockup during a
> subsequent "looping batchbuffer" workload. This is triggered by the
> testcase that submits exactly this kind of workload after a simulated
> GPU reset. I'm not sure how likely the bug would have triggered
> otherwise, since we would have applied the workaround anyway shortly
> after the GPU reset, when enabling GT powersaving from the deferred
> work.
> 
> This may also fix unrelated issues, since during driver loading /
> suspending we also disable RPS interrupts and so we also had a short
> window during the rest of the loading / resuming where a similar
> workload could run without the workaround applied.

Hm this might explain why the reset tests randomly hard-crashed snb every
once in a while. So symptoms matches. Mika, can you please give this a
spin?
 
> At the same time also keep the IRQ routing to display enabled at all
> times and everywhere on GEN8+, since there isn't ever a reason to route
> them to GT. For instance this way we could catch spurious interrupts on
> the CPU instead of allowing some undefined behaviour on the GT (due to
> the same spurious interrupt routed there).

I don't really grok that part, but it does feel like a separate patch
(especially since you also change from IS_GEN8 to gen >= 8). Or maybe I'm
just really confused.
-Daniel

> 
> Testcase: igt/gem_reset_stats/ban-ctx-render
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=87429
> Signed-off-by: Imre Deak <imre.deak@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_irq.c  | 17 +++++++++++++++--
>  drivers/gpu/drm/i915/intel_drv.h |  1 +
>  drivers/gpu/drm/i915/intel_pm.c  | 11 +----------
>  3 files changed, 17 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index aa3180c..12acc40 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -296,6 +296,20 @@ void gen6_enable_rps_interrupts(struct drm_device *dev)
>  	spin_unlock_irq(&dev_priv->irq_lock);
>  }
>  
> +u32 gen6_sanitize_rps_pm_mask(struct drm_i915_private *dev_priv, u32 mask)
> +{
> +	/* IVB and SNB hard hangs on looping batchbuffer
> +	 * if GEN6_PM_UP_EI_EXPIRED is masked.
> +	 */
> +	if (INTEL_INFO(dev_priv->dev)->gen <= 7 && !IS_HASWELL(dev_priv->dev))
> +		mask &= ~GEN6_PM_RP_UP_EI_EXPIRED;
> +
> +	if (INTEL_INFO(dev_priv->dev)->gen >= 8)
> +		mask &= ~GEN8_PMINTR_REDIRECT_TO_NON_DISP;
> +
> +	return mask;
> +}
> +
>  void gen6_disable_rps_interrupts(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> @@ -308,8 +322,7 @@ void gen6_disable_rps_interrupts(struct drm_device *dev)
>  
>  	spin_lock_irq(&dev_priv->irq_lock);
>  
> -	I915_WRITE(GEN6_PMINTRMSK, INTEL_INFO(dev_priv)->gen >= 8 ?
> -		   ~GEN8_PMINTR_REDIRECT_TO_NON_DISP : ~0);
> +	I915_WRITE(GEN6_PMINTRMSK, gen6_sanitize_rps_pm_mask(dev_priv, ~0));
>  
>  	__gen6_disable_pm_irq(dev_priv, dev_priv->pm_rps_events);
>  	I915_WRITE(gen6_pm_ier(dev_priv), I915_READ(gen6_pm_ier(dev_priv)) &
> diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
> index 588b618..bb871f3 100644
> --- a/drivers/gpu/drm/i915/intel_drv.h
> +++ b/drivers/gpu/drm/i915/intel_drv.h
> @@ -795,6 +795,7 @@ void gen6_disable_pm_irq(struct drm_i915_private *dev_priv, uint32_t mask);
>  void gen6_reset_rps_interrupts(struct drm_device *dev);
>  void gen6_enable_rps_interrupts(struct drm_device *dev);
>  void gen6_disable_rps_interrupts(struct drm_device *dev);
> +u32 gen6_sanitize_rps_pm_mask(struct drm_i915_private *dev_priv, u32 mask);
>  void intel_runtime_pm_disable_interrupts(struct drm_i915_private *dev_priv);
>  void intel_runtime_pm_enable_interrupts(struct drm_i915_private *dev_priv);
>  static inline bool intel_irqs_enabled(struct drm_i915_private *dev_priv)
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index a3ebaa8..4bd1b8b 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -3745,16 +3745,7 @@ static u32 gen6_rps_pm_mask(struct drm_i915_private *dev_priv, u8 val)
>  	mask |= dev_priv->pm_rps_events & (GEN6_PM_RP_DOWN_EI_EXPIRED | GEN6_PM_RP_UP_EI_EXPIRED);
>  	mask &= dev_priv->pm_rps_events;
>  
> -	/* IVB and SNB hard hangs on looping batchbuffer
> -	 * if GEN6_PM_UP_EI_EXPIRED is masked.
> -	 */
> -	if (INTEL_INFO(dev_priv->dev)->gen <= 7 && !IS_HASWELL(dev_priv->dev))
> -		mask |= GEN6_PM_RP_UP_EI_EXPIRED;
> -
> -	if (IS_GEN8(dev_priv->dev))
> -		mask |= GEN8_PMINTR_REDIRECT_TO_NON_DISP;
> -
> -	return ~mask;
> +	return gen6_sanitize_rps_pm_mask(dev_priv, ~mask);
>  }
>  
>  /* gen6_set_rps is called to update the frequency request, but should also be
> -- 
> 1.8.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/i915: fix HW lockup due to missing RPS IRQ workaround on GEN6
  2014-12-19 10:21 ` Daniel Vetter
@ 2014-12-19 12:30   ` Mika Kuoppala
  0 siblings, 0 replies; 3+ messages in thread
From: Mika Kuoppala @ 2014-12-19 12:30 UTC (permalink / raw)
  To: Daniel Vetter, Imre Deak; +Cc: intel-gfx

Daniel Vetter <daniel@ffwll.ch> writes:

> On Fri, Dec 19, 2014 at 11:53:21AM +0200, Imre Deak wrote:
>> In
>> 
>> commit dbea3cea69508e9d548ed4a6be13de35492e5d15
>> Author: Imre Deak <imre.deak@intel.com>
>> Date:   Mon Dec 15 18:59:28 2014 +0200
>> 
>>     drm/i915: sanitize RPS resetting during GPU reset
>> 
>> we disable RPS interrupts during GPU resetting, but don't apply the
>> necessary GEN6 HW workaround. This leads to a HW lockup during a
>> subsequent "looping batchbuffer" workload. This is triggered by the
>> testcase that submits exactly this kind of workload after a simulated
>> GPU reset. I'm not sure how likely the bug would have triggered
>> otherwise, since we would have applied the workaround anyway shortly
>> after the GPU reset, when enabling GT powersaving from the deferred
>> work.
>> 
>> This may also fix unrelated issues, since during driver loading /
>> suspending we also disable RPS interrupts and so we also had a short
>> window during the rest of the loading / resuming where a similar
>> workload could run without the workaround applied.
>
> Hm this might explain why the reset tests randomly hard-crashed snb every
> once in a while. So symptoms matches. Mika, can you please give this a
> spin?

307 loops of gem_reset_stats/ban-ctx-render on snb without issues.

-Mika

>> At the same time also keep the IRQ routing to display enabled at all
>> times and everywhere on GEN8+, since there isn't ever a reason to route
>> them to GT. For instance this way we could catch spurious interrupts on
>> the CPU instead of allowing some undefined behaviour on the GT (due to
>> the same spurious interrupt routed there).
>
> I don't really grok that part, but it does feel like a separate patch
> (especially since you also change from IS_GEN8 to gen >= 8). Or maybe I'm
> just really confused.
> -Daniel
>
>> 
>> Testcase: igt/gem_reset_stats/ban-ctx-render
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=87429
>> Signed-off-by: Imre Deak <imre.deak@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_irq.c  | 17 +++++++++++++++--
>>  drivers/gpu/drm/i915/intel_drv.h |  1 +
>>  drivers/gpu/drm/i915/intel_pm.c  | 11 +----------
>>  3 files changed, 17 insertions(+), 12 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index aa3180c..12acc40 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -296,6 +296,20 @@ void gen6_enable_rps_interrupts(struct drm_device *dev)
>>  	spin_unlock_irq(&dev_priv->irq_lock);
>>  }
>>  
>> +u32 gen6_sanitize_rps_pm_mask(struct drm_i915_private *dev_priv, u32 mask)
>> +{
>> +	/* IVB and SNB hard hangs on looping batchbuffer
>> +	 * if GEN6_PM_UP_EI_EXPIRED is masked.
>> +	 */
>> +	if (INTEL_INFO(dev_priv->dev)->gen <= 7 && !IS_HASWELL(dev_priv->dev))
>> +		mask &= ~GEN6_PM_RP_UP_EI_EXPIRED;
>> +
>> +	if (INTEL_INFO(dev_priv->dev)->gen >= 8)
>> +		mask &= ~GEN8_PMINTR_REDIRECT_TO_NON_DISP;
>> +
>> +	return mask;
>> +}
>> +
>>  void gen6_disable_rps_interrupts(struct drm_device *dev)
>>  {
>>  	struct drm_i915_private *dev_priv = dev->dev_private;
>> @@ -308,8 +322,7 @@ void gen6_disable_rps_interrupts(struct drm_device *dev)
>>  
>>  	spin_lock_irq(&dev_priv->irq_lock);
>>  
>> -	I915_WRITE(GEN6_PMINTRMSK, INTEL_INFO(dev_priv)->gen >= 8 ?
>> -		   ~GEN8_PMINTR_REDIRECT_TO_NON_DISP : ~0);
>> +	I915_WRITE(GEN6_PMINTRMSK, gen6_sanitize_rps_pm_mask(dev_priv, ~0));
>>  
>>  	__gen6_disable_pm_irq(dev_priv, dev_priv->pm_rps_events);
>>  	I915_WRITE(gen6_pm_ier(dev_priv), I915_READ(gen6_pm_ier(dev_priv)) &
>> diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
>> index 588b618..bb871f3 100644
>> --- a/drivers/gpu/drm/i915/intel_drv.h
>> +++ b/drivers/gpu/drm/i915/intel_drv.h
>> @@ -795,6 +795,7 @@ void gen6_disable_pm_irq(struct drm_i915_private *dev_priv, uint32_t mask);
>>  void gen6_reset_rps_interrupts(struct drm_device *dev);
>>  void gen6_enable_rps_interrupts(struct drm_device *dev);
>>  void gen6_disable_rps_interrupts(struct drm_device *dev);
>> +u32 gen6_sanitize_rps_pm_mask(struct drm_i915_private *dev_priv, u32 mask);
>>  void intel_runtime_pm_disable_interrupts(struct drm_i915_private *dev_priv);
>>  void intel_runtime_pm_enable_interrupts(struct drm_i915_private *dev_priv);
>>  static inline bool intel_irqs_enabled(struct drm_i915_private *dev_priv)
>> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
>> index a3ebaa8..4bd1b8b 100644
>> --- a/drivers/gpu/drm/i915/intel_pm.c
>> +++ b/drivers/gpu/drm/i915/intel_pm.c
>> @@ -3745,16 +3745,7 @@ static u32 gen6_rps_pm_mask(struct drm_i915_private *dev_priv, u8 val)
>>  	mask |= dev_priv->pm_rps_events & (GEN6_PM_RP_DOWN_EI_EXPIRED | GEN6_PM_RP_UP_EI_EXPIRED);
>>  	mask &= dev_priv->pm_rps_events;
>>  
>> -	/* IVB and SNB hard hangs on looping batchbuffer
>> -	 * if GEN6_PM_UP_EI_EXPIRED is masked.
>> -	 */
>> -	if (INTEL_INFO(dev_priv->dev)->gen <= 7 && !IS_HASWELL(dev_priv->dev))
>> -		mask |= GEN6_PM_RP_UP_EI_EXPIRED;
>> -
>> -	if (IS_GEN8(dev_priv->dev))
>> -		mask |= GEN8_PMINTR_REDIRECT_TO_NON_DISP;
>> -
>> -	return ~mask;
>> +	return gen6_sanitize_rps_pm_mask(dev_priv, ~mask);
>>  }
>>  
>>  /* gen6_set_rps is called to update the frequency request, but should also be
>> -- 
>> 1.8.4
>> 
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2014-12-19 12:31 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-12-19  9:53 [PATCH] drm/i915: fix HW lockup due to missing RPS IRQ workaround on GEN6 Imre Deak
2014-12-19 10:21 ` Daniel Vetter
2014-12-19 12:30   ` Mika Kuoppala

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox