From: Arun Siluvery <arun.siluvery@linux.intel.com>
To: intel-gfx@lists.freedesktop.org
Cc: Ian Lister <ian.lister@intel.com>, Tomas Elf <tomas.elf@intel.com>
Subject: [PATCH 07/20] drm/i915: Watchdog timeout: Hang detection integration into error handler
Date: Wed, 13 Jan 2016 17:28:19 +0000 [thread overview]
Message-ID: <1452706112-8617-8-git-send-email-arun.siluvery@linux.intel.com> (raw)
In-Reply-To: <1452706112-8617-1-git-send-email-arun.siluvery@linux.intel.com>
From: Tomas Elf <tomas.elf@intel.com>
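This patch enables watchdog timeout hang detection as an entry point into the
driver error handler. This form of hang detection bypasses the promotion logic
normally used by the periodic hang checker (which escalates to a full GPU reset
when an engine was reset too recently) and instead goes directly to the
per-engine hang recovery path. To support this, i915_handle_error() gains a new
"watchdog" parameter; all call sites updated in this patch pass false.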
NOTE: I don't know if Ben Widawsky had any part in this code from 3 years
ago. There have been so many people involved in this already that I am in no
position to know. If I've missed anyone's Signed-off-by line, please let me know.
Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@intel.com>
Signed-off-by: Ian Lister <ian.lister@intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 2 +-
drivers/gpu/drm/i915/i915_drv.h | 6 +++---
drivers/gpu/drm/i915/i915_irq.c | 43 ++++++++++++++++++++++---------------
3 files changed, 30 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 6d1b6c3..dabddda 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4720,7 +4720,7 @@ i915_wedged_set(void *data, u64 val)
intel_runtime_pm_get(dev_priv);
- i915_handle_error(dev, 0x0, val,
+ i915_handle_error(dev, 0x0, false, val,
"Manually setting wedged to %llu", val);
intel_runtime_pm_put(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 072ca37..80e6d01 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2766,9 +2766,9 @@ static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine)
/* i915_irq.c */
void i915_queue_hangcheck(struct drm_device *dev);
-__printf(4, 5)
-void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
- const char *fmt, ...);
+__printf(5, 6)
+void i915_handle_error(struct drm_device *dev, u32 engine_mask,
+ bool watchdog, bool wedged, const char *fmt, ...);
extern void intel_irq_init(struct drm_i915_private *dev_priv);
int intel_irq_install(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 8937c82..0710724 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2726,6 +2726,7 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
* If a previous engine reset was attempted too recently
* or if one of the current engine resets fails we fall
* back to legacy full GPU reset.
+ * @watchdog: true = Engine hang detected by hardware watchdog.
* @wedged: true = Hang detected, invoke hang recovery.
* @fmt, ...: Error message describing reason for error.
*
@@ -2737,8 +2738,8 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
* reset the associated engine. Failing that, try to fall back to legacy
* full GPU reset recovery mode.
*/
-void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
- const char *fmt, ...)
+void i915_handle_error(struct drm_device *dev, u32 engine_mask,
+ bool watchdog, bool wedged, const char *fmt, ...)
{
struct drm_i915_private *dev_priv = dev->dev_private;
va_list args;
@@ -2776,20 +2777,27 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
u32 i;
for_each_ring(engine, dev_priv, i) {
- u32 now, last_engine_reset_timediff;
if (!(intel_ring_flag(engine) & engine_mask))
continue;
- /* Measure the time since this engine was last reset */
- now = get_seconds();
- last_engine_reset_timediff =
- now - engine->hangcheck.last_engine_reset_time;
-
- full_reset = last_engine_reset_timediff <
- i915.gpu_reset_promotion_time;
-
- engine->hangcheck.last_engine_reset_time = now;
+ if (!watchdog) {
+ /* Measure the time since this engine was last reset */
+ u32 now = get_seconds();
+ u32 last_engine_reset_timediff =
+ now - engine->hangcheck.last_engine_reset_time;
+
+ full_reset = last_engine_reset_timediff <
+ i915.gpu_reset_promotion_time;
+
+ engine->hangcheck.last_engine_reset_time = now;
+ } else {
+ /*
+ * Watchdog timeout always results
+ * in engine reset.
+ */
+ full_reset = false;
+ }
/*
* This engine was not reset too recently - go ahead
@@ -2800,10 +2808,11 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
* This can still be overridden by a global
* reset e.g. if per-engine reset fails.
*/
- if (!full_reset)
+ if (watchdog || !full_reset)
atomic_or(I915_ENGINE_RESET_IN_PROGRESS,
&engine->hangcheck.flags);
- else
+
+ if (full_reset)
break;
} /* for_each_ring */
@@ -3187,7 +3196,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
*/
tmp = I915_READ_CTL(ring);
if (tmp & RING_WAIT) {
- i915_handle_error(dev, intel_ring_flag(ring), false,
+ i915_handle_error(dev, intel_ring_flag(ring), false, false,
"Kicking stuck wait on %s",
ring->name);
I915_WRITE_CTL(ring, tmp);
@@ -3199,7 +3208,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
default:
return HANGCHECK_HUNG;
case 1:
- i915_handle_error(dev, intel_ring_flag(ring), false,
+ i915_handle_error(dev, intel_ring_flag(ring), false, false,
"Kicking stuck semaphore on %s",
ring->name);
I915_WRITE_CTL(ring, tmp);
@@ -3349,7 +3358,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
}
if (engine_mask) {
- i915_handle_error(dev, engine_mask, true, "Ring hung (0x%02x)", engine_mask);
+ i915_handle_error(dev, engine_mask, false, true, "Ring hung (0x%02x)", engine_mask);
goto out;
}
--
1.9.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
next prev parent reply other threads:[~2016-01-13 17:28 UTC|newest]
Thread overview: 31+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-01-13 17:28 [PATCH 00/20] TDR/watchdog support for gen8 Arun Siluvery
2016-01-13 17:28 ` [PATCH 01/20] drm/i915: Make i915_gem_reset_ring_status() public Arun Siluvery
2016-01-13 17:28 ` [PATCH 02/20] drm/i915: Generalise common GPU engine reset request/unrequest code Arun Siluvery
2016-01-22 11:24 ` Mika Kuoppala
2016-01-13 17:28 ` [PATCH 03/20] drm/i915: TDR / per-engine hang recovery support for gen8 Arun Siluvery
2016-01-13 21:16 ` Chris Wilson
2016-01-13 21:21 ` Chris Wilson
2016-01-29 14:16 ` Mika Kuoppala
2016-01-13 17:28 ` [PATCH 04/20] drm/i915: TDR / per-engine hang detection Arun Siluvery
2016-01-13 20:37 ` Chris Wilson
2016-01-13 17:28 ` [PATCH 05/20] drm/i915: Extending i915_gem_check_wedge to check engine reset in progress Arun Siluvery
2016-01-13 20:49 ` Chris Wilson
2016-01-13 17:28 ` [PATCH 06/20] drm/i915: Reinstate hang recovery work queue Arun Siluvery
2016-01-13 21:01 ` Chris Wilson
2016-01-13 17:28 ` Arun Siluvery [this message]
2016-01-13 21:13 ` [PATCH 07/20] drm/i915: Watchdog timeout: Hang detection integration into error handler Chris Wilson
2016-01-13 17:28 ` [PATCH 08/20] drm/i915: Watchdog timeout: IRQ handler for gen8 Arun Siluvery
2016-01-13 17:28 ` [PATCH 09/20] drm/i915: Watchdog timeout: Ringbuffer command emission " Arun Siluvery
2016-01-13 17:28 ` [PATCH 10/20] drm/i915: Watchdog timeout: DRM kernel interface enablement Arun Siluvery
2016-01-13 17:28 ` [PATCH 11/20] drm/i915: Fake lost context event interrupts through forced CSB checking Arun Siluvery
2016-01-13 17:28 ` [PATCH 12/20] drm/i915: Debugfs interface for per-engine hang recovery Arun Siluvery
2016-01-13 17:28 ` [PATCH 13/20] drm/i915: Test infrastructure for context state inconsistency simulation Arun Siluvery
2016-01-13 17:28 ` [PATCH 14/20] drm/i915: TDR/watchdog trace points Arun Siluvery
2016-01-13 17:28 ` [PATCH 15/20] drm/i915: Port of Added scheduler support to __wait_request() calls Arun Siluvery
2016-01-13 17:28 ` [PATCH 16/20] drm/i915: Fix __i915_wait_request() behaviour during hang detection Arun Siluvery
2016-01-13 17:28 ` [PATCH 17/20] drm/i915: Extended error state with TDR count, watchdog count and engine reset count Arun Siluvery
2016-01-13 17:28 ` [PATCH 18/20] drm/i915: TDR / per-engine hang recovery kernel docs Arun Siluvery
2016-01-13 17:28 ` [PATCH 19/20] drm/i915: drm/i915 changes to simulated hangs Arun Siluvery
2016-01-13 17:28 ` [PATCH 20/20] drm/i915: Enable TDR / per-engine hang recovery Arun Siluvery
2016-01-14 8:30 ` ✗ failure: Fi.CI.BAT Patchwork
-- strict thread matches above, loose matches on Subject: below --
2015-10-23 1:32 [PATCH 00/20] TDR/watchdog support for gen8 Tomas Elf
2015-10-23 1:32 ` [PATCH 07/20] drm/i915: Watchdog timeout: Hang detection integration into error handler Tomas Elf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1452706112-8617-8-git-send-email-arun.siluvery@linux.intel.com \
--to=arun.siluvery@linux.intel.com \
--cc=ian.lister@intel.com \
--cc=intel-gfx@lists.freedesktop.org \
--cc=tomas.elf@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).