intel-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
From: Arun Siluvery <arun.siluvery@linux.intel.com>
To: intel-gfx@lists.freedesktop.org
Cc: Ian Lister <ian.lister@intel.com>, Tomas Elf <tomas.elf@intel.com>
Subject: [PATCH 05/20] drm/i915: Extending i915_gem_check_wedge to check engine reset in progress
Date: Wed, 13 Jan 2016 17:28:17 +0000	[thread overview]
Message-ID: <1452706112-8617-6-git-send-email-arun.siluvery@linux.intel.com> (raw)
In-Reply-To: <1452706112-8617-1-git-send-email-arun.siluvery@linux.intel.com>

From: Tomas Elf <tomas.elf@intel.com>

i915_gem_wedge now returns a non-zero result in three different cases:

1. Legacy: A hang has been detected and full GPU reset is in progress.

2. Per-engine recovery:

	a. A single engine reference can be passed to the function, in which
	case only that engine will be checked. If that particular engine is
	detected to be hung and is to be reset this will yield a non-zero
	result but not if reset is in progress for any other engine.

	b. No engine reference is passed to the function, in which case all
	engines are checked for ongoing per-engine hang recovery.

Also, i915_wait_request was updated to take advantage of this new
functionality. This is important since the TDR hang recovery mechanism needs a
way to force waiting threads that hold the struct_mutex to give up the
struct_mutex and try again after the hang recovery has completed. If
i915_wait_request does not take per-engine hang recovery into account there is
no way for a waiting thread to know that a per-engine recovery is about to
happen and that it needs to back off.

Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@intel.com>
Signed-off-by: Ian Lister <ian.lister@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  3 +-
 drivers/gpu/drm/i915/i915_gem.c         | 60 +++++++++++++++++++++++++++------
 drivers/gpu/drm/i915/intel_lrc.c        |  4 +--
 drivers/gpu/drm/i915/intel_ringbuffer.c |  4 ++-
 4 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 85cf692..5be7d3e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3033,7 +3033,8 @@ i915_gem_find_active_request(struct intel_engine_cs *ring);
 
 bool i915_gem_retire_requests(struct drm_device *dev);
 void i915_gem_retire_requests_ring(struct intel_engine_cs *ring);
-int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
+int __must_check i915_gem_check_wedge(struct drm_i915_private *dev_priv,
+				      struct intel_engine_cs *engine,
 				      bool interruptible);
 
 static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e3cfed2..e6eb45d 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -80,12 +80,38 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 	spin_unlock(&dev_priv->mm.object_stat_lock);
 }
 
+static inline int
+i915_engine_reset_in_progress(struct drm_i915_private *dev_priv,
+	struct intel_engine_cs *engine)
+{
+	int ret = 0;
+
+	if (engine) {
+		ret = !!(atomic_read(&dev_priv->ring[engine->id].hangcheck.flags)
+			& I915_ENGINE_RESET_IN_PROGRESS);
+	} else {
+		int i;
+
+		for (i = 0; i < I915_NUM_RINGS; i++)
+			if (atomic_read(&dev_priv->ring[i].hangcheck.flags)
+				& I915_ENGINE_RESET_IN_PROGRESS) {
+
+				ret = 1;
+				break;
+			}
+	}
+
+	return ret;
+}
+
 static int
-i915_gem_wait_for_error(struct i915_gpu_error *error)
+i915_gem_wait_for_error(struct drm_i915_private *dev_priv)
 {
 	int ret;
+	struct i915_gpu_error *error = &dev_priv->gpu_error;
 
 #define EXIT_COND (!i915_reset_in_progress(error) || \
+		   !i915_engine_reset_in_progress(dev_priv, NULL) || \
 		   i915_terminally_wedged(error))
 	if (EXIT_COND)
 		return 0;
@@ -114,7 +140,7 @@ int i915_mutex_lock_interruptible(struct drm_device *dev)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
+	ret = i915_gem_wait_for_error(dev_priv);
 	if (ret)
 		return ret;
 
@@ -1110,10 +1136,15 @@ put_rpm:
 }
 
 int
-i915_gem_check_wedge(struct i915_gpu_error *error,
+i915_gem_check_wedge(struct drm_i915_private *dev_priv,
+		     struct intel_engine_cs *engine,
 		     bool interruptible)
 {
-	if (i915_reset_in_progress(error)) {
+	struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+	if (i915_reset_in_progress(error) ||
+	    i915_engine_reset_in_progress(dev_priv, engine)) {
+
 		/* Non-interruptible callers can't handle -EAGAIN, hence return
 		 * -EIO unconditionally for these. */
 		if (!interruptible)
@@ -1253,6 +1284,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	unsigned long timeout_expire;
 	s64 before, now;
 	int ret;
+	int reset_in_progress = 0;
 
 	WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
 
@@ -1297,11 +1329,17 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 
 		/* We need to check whether any gpu reset happened in between
 		 * the caller grabbing the seqno and now ... */
-		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
+		reset_in_progress =
+			i915_gem_check_wedge(ring->dev->dev_private, NULL, interruptible);
+
+		if ((reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) ||
+		     reset_in_progress) {
+
 			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
 			 * is truely gone. */
-			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
-			if (ret == 0)
+			if (reset_in_progress)
+				ret = reset_in_progress;
+			else
 				ret = -EAGAIN;
 			break;
 		}
@@ -1470,7 +1508,7 @@ i915_wait_request(struct drm_i915_gem_request *req)
 
 	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+	ret = i915_gem_check_wedge(dev_priv, NULL, interruptible);
 	if (ret)
 		return ret;
 
@@ -1560,7 +1598,7 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 	if (!obj->active)
 		return 0;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, true);
+	ret = i915_gem_check_wedge(dev_priv, NULL, true);
 	if (ret)
 		return ret;
 
@@ -4104,11 +4142,11 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	unsigned reset_counter;
 	int ret;
 
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
+	ret = i915_gem_wait_for_error(dev_priv);
 	if (ret)
 		return ret;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, false);
+	ret = i915_gem_check_wedge(dev_priv, NULL, false);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index fcec476..a2e56d4 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1066,8 +1066,8 @@ int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 
 	WARN_ON(req == NULL);
 	dev_priv = req->ring->dev->dev_private;
-
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+	ret = i915_gem_check_wedge(dev_priv,
+				   req->ring,
 				   dev_priv->mm.interruptible);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index def0dcf..f959326 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2501,8 +2501,10 @@ int intel_ring_begin(struct drm_i915_gem_request *req,
 	ring = req->ring;
 	dev_priv = ring->dev->dev_private;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+	ret = i915_gem_check_wedge(dev_priv,
+				   ring,
 				   dev_priv->mm.interruptible);
+
 	if (ret)
 		return ret;
 
-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

  parent reply	other threads:[~2016-01-13 17:28 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-01-13 17:28 [PATCH 00/20] TDR/watchdog support for gen8 Arun Siluvery
2016-01-13 17:28 ` [PATCH 01/20] drm/i915: Make i915_gem_reset_ring_status() public Arun Siluvery
2016-01-13 17:28 ` [PATCH 02/20] drm/i915: Generalise common GPU engine reset request/unrequest code Arun Siluvery
2016-01-22 11:24   ` Mika Kuoppala
2016-01-13 17:28 ` [PATCH 03/20] drm/i915: TDR / per-engine hang recovery support for gen8 Arun Siluvery
2016-01-13 21:16   ` Chris Wilson
2016-01-13 21:21   ` Chris Wilson
2016-01-29 14:16   ` Mika Kuoppala
2016-01-13 17:28 ` [PATCH 04/20] drm/i915: TDR / per-engine hang detection Arun Siluvery
2016-01-13 20:37   ` Chris Wilson
2016-01-13 17:28 ` Arun Siluvery [this message]
2016-01-13 20:49   ` [PATCH 05/20] drm/i915: Extending i915_gem_check_wedge to check engine reset in progress Chris Wilson
2016-01-13 17:28 ` [PATCH 06/20] drm/i915: Reinstate hang recovery work queue Arun Siluvery
2016-01-13 21:01   ` Chris Wilson
2016-01-13 17:28 ` [PATCH 07/20] drm/i915: Watchdog timeout: Hang detection integration into error handler Arun Siluvery
2016-01-13 21:13   ` Chris Wilson
2016-01-13 17:28 ` [PATCH 08/20] drm/i915: Watchdog timeout: IRQ handler for gen8 Arun Siluvery
2016-01-13 17:28 ` [PATCH 09/20] drm/i915: Watchdog timeout: Ringbuffer command emission " Arun Siluvery
2016-01-13 17:28 ` [PATCH 10/20] drm/i915: Watchdog timeout: DRM kernel interface enablement Arun Siluvery
2016-01-13 17:28 ` [PATCH 11/20] drm/i915: Fake lost context event interrupts through forced CSB checking Arun Siluvery
2016-01-13 17:28 ` [PATCH 12/20] drm/i915: Debugfs interface for per-engine hang recovery Arun Siluvery
2016-01-13 17:28 ` [PATCH 13/20] drm/i915: Test infrastructure for context state inconsistency simulation Arun Siluvery
2016-01-13 17:28 ` [PATCH 14/20] drm/i915: TDR/watchdog trace points Arun Siluvery
2016-01-13 17:28 ` [PATCH 15/20] drm/i915: Port of Added scheduler support to __wait_request() calls Arun Siluvery
2016-01-13 17:28 ` [PATCH 16/20] drm/i915: Fix __i915_wait_request() behaviour during hang detection Arun Siluvery
2016-01-13 17:28 ` [PATCH 17/20] drm/i915: Extended error state with TDR count, watchdog count and engine reset count Arun Siluvery
2016-01-13 17:28 ` [PATCH 18/20] drm/i915: TDR / per-engine hang recovery kernel docs Arun Siluvery
2016-01-13 17:28 ` [PATCH 19/20] drm/i915: drm/i915 changes to simulated hangs Arun Siluvery
2016-01-13 17:28 ` [PATCH 20/20] drm/i915: Enable TDR / per-engine hang recovery Arun Siluvery
2016-01-14  8:30 ` ✗ failure: Fi.CI.BAT Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2015-10-23  1:32 [PATCH 00/20] TDR/watchdog support for gen8 Tomas Elf
2015-10-23  1:32 ` [PATCH 05/20] drm/i915: Extending i915_gem_check_wedge to check engine reset in progress Tomas Elf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1452706112-8617-6-git-send-email-arun.siluvery@linux.intel.com \
    --to=arun.siluvery@linux.intel.com \
    --cc=ian.lister@intel.com \
    --cc=intel-gfx@lists.freedesktop.org \
    --cc=tomas.elf@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).