public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed
* [PATCH 0/2] Reset GuC and retry on fw load failure
@ 2016-03-01 17:14 Arun Siluvery
  2016-03-01 17:14 ` [PATCH 1/2] drm/i915/tdr: Add helper function to perform Engine reset Arun Siluvery
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Arun Siluvery @ 2016-03-01 17:14 UTC (permalink / raw)
  To: intel-gfx

The changes below add a mechanism to reset the GuC and retry fw loading if
the initial load fails. There are certain HW issues because of which the fw
load can fail, and the WA is to retry after resetting the GuC.

A patch from the engine reset series (which is under review) is included
here, as this change reuses some of the functionality in that patch.

Arun Siluvery (2):
  drm/i915/tdr: Add helper function to perform Engine reset
  drm/i915/guc: Reset GuC and retry on firmware load failure

 drivers/gpu/drm/i915/i915_drv.h         |  2 +
 drivers/gpu/drm/i915/i915_guc_reg.h     |  1 +
 drivers/gpu/drm/i915/i915_reg.h         |  3 ++
 drivers/gpu/drm/i915/intel_guc_loader.c | 49 ++++++++++++++++++-
 drivers/gpu/drm/i915/intel_uncore.c     | 84 +++++++++++++++++++++++++++++++++
 5 files changed, 137 insertions(+), 2 deletions(-)

-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/2] drm/i915/tdr: Add helper function to perform Engine reset
  2016-03-01 17:14 [PATCH 0/2] Reset GuC and retry on fw load failure Arun Siluvery
@ 2016-03-01 17:14 ` Arun Siluvery
  2016-03-01 17:14 ` [PATCH 2/2] drm/i915/guc: Reset GuC and retry on firmware load failure Arun Siluvery
  2016-03-02  7:27 ` ✗ Fi.CI.BAT: warning for Reset GuC and retry on fw " Patchwork
  2 siblings, 0 replies; 4+ messages in thread
From: Arun Siluvery @ 2016-03-01 17:14 UTC (permalink / raw)
  To: intel-gfx; +Cc: Tomas Elf

This patch only adds a function to reset an engine and it is in preparation
for the complete engine reset feature.

At the moment this is only made available from Gen8 onwards.

v2: use indexed initialization and keep everything under forcewake (Dave)

Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  1 +
 drivers/gpu/drm/i915/i915_reg.h     |  2 ++
 drivers/gpu/drm/i915/intel_uncore.c | 67 +++++++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 09b85b2..55dadfc 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2741,6 +2741,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 			      unsigned long arg);
 #endif
 extern int intel_gpu_reset(struct drm_device *dev);
+extern int intel_engine_reset(struct intel_engine_cs *engine);
 extern bool intel_has_gpu_reset(struct drm_device *dev);
 extern int i915_reset(struct drm_device *dev);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index f76cbf3..a798e40 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -164,6 +164,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN6_GRDOM_RENDER		(1 << 1)
 #define  GEN6_GRDOM_MEDIA		(1 << 2)
 #define  GEN6_GRDOM_BLT			(1 << 3)
+#define  GEN6_GRDOM_VECS		(1 << 4)
+#define  GEN8_GRDOM_MEDIA2		(1 << 7)
 
 #define RING_PP_DIR_BASE(ring)		_MMIO((ring)->mmio_base+0x228)
 #define RING_PP_DIR_BASE_READ(ring)	_MMIO((ring)->mmio_base+0x518)
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 436d8f2..d003b78 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1616,6 +1616,73 @@ bool intel_has_gpu_reset(struct drm_device *dev)
 	return intel_get_gpu_reset(dev) != NULL;
 }
 
+static int wait_for_engine_reset(struct drm_i915_private *dev_priv,
+				 unsigned int grdom)
+{
+	int ret;
+
+#define _CND ((__raw_i915_read32(dev_priv, GEN6_GDRST) & grdom) == 0)
+
+	/*
+	 * Spin waiting for the device to ack the reset request.
+	 * Times out after 500 us
+	 */
+	ret = wait_for_atomic_us(_CND, 500);
+#undef _CND
+
+	return ret;
+}
+
+static int gen8_do_engine_reset(struct intel_engine_cs *engine)
+{
+	struct drm_device *dev = engine->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 reset_ctl;
+	int ret;
+	int engine_mask[I915_NUM_RINGS] = {
+		[RCS] = GEN6_GRDOM_RENDER,
+		[BCS] = GEN6_GRDOM_BLT,
+		[VCS] = GEN6_GRDOM_MEDIA,
+		[VCS2] = GEN8_GRDOM_MEDIA2,
+		[VECS] = GEN6_GRDOM_VECS,
+	};
+
+	if (WARN_ON_ONCE(!intel_ring_initialized(engine)))
+		return -EINVAL;
+
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+	/* reset engine */
+	__raw_i915_write32(dev_priv, GEN6_GDRST, engine_mask[engine->id]);
+
+	ret = wait_for_engine_reset(dev_priv, engine_mask[engine->id]);
+	if (ret)
+		goto out;
+
+	/* Confirm that reset control register is back to normal after reset */
+	reset_ctl = I915_READ(RING_RESET_CTL(engine->mmio_base));
+	WARN((reset_ctl & (RESET_CTL_REQUEST_RESET | RESET_CTL_READY_TO_RESET)),
+	     "%s reset control still active after reset !! (0x%08x)\n",
+	     engine->name, reset_ctl);
+
+out:
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+	return ret;
+}
+
+int intel_engine_reset(struct intel_engine_cs *engine)
+{
+	struct drm_device *dev = engine->dev;
+
+	if (INTEL_INFO(dev)->gen < 8) {
+		DRM_ERROR("Engine Reset not supported on Gen%d\n",
+			  INTEL_INFO(dev)->gen);
+		return -EINVAL;
+	}
+
+	return gen8_do_engine_reset(engine);
+}
+
 bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
 {
 	return check_for_unclaimed_mmio(dev_priv);
-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/2] drm/i915/guc: Reset GuC and retry on firmware load failure
  2016-03-01 17:14 [PATCH 0/2] Reset GuC and retry on fw load failure Arun Siluvery
  2016-03-01 17:14 ` [PATCH 1/2] drm/i915/tdr: Add helper function to perform Engine reset Arun Siluvery
@ 2016-03-01 17:14 ` Arun Siluvery
  2016-03-02  7:27 ` ✗ Fi.CI.BAT: warning for Reset GuC and retry on fw " Patchwork
  2 siblings, 0 replies; 4+ messages in thread
From: Arun Siluvery @ 2016-03-01 17:14 UTC (permalink / raw)
  To: intel-gfx

Due to timing issues in the HW, some of the status bits required for GuC
authentication occasionally don't get set. When that happens, the GuC cannot
be initialized and we are left with a wedged GPU. The suggested WA is to
perform a soft reset of the GuC and attempt to reload the fw a few times
before giving up.

As the failure is dependent on timing, tests performed by triggering a manual
full gpu reset (i915_wedged) showed that we could sometimes hit this after
several thousand iterations, while at other times the tests ran even longer
without any issues. The reset and reload mechanism proved helpful when we did
hit a fw load failure, so it is better to include it to improve driver
stability.

This change implements the following WAs:

WaEnableuKernelHeaderValidFix:skl,bxt
WaEnableGuCBootHashCheckNotSet:skl,bxt

Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: Alex Dai <yu.dai@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_guc_reg.h     |  1 +
 drivers/gpu/drm/i915/i915_reg.h         |  1 +
 drivers/gpu/drm/i915/intel_guc_loader.c | 49 +++++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_uncore.c     | 17 ++++++++++++
 5 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 55dadfc..3e5a2e5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2742,6 +2742,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 #endif
 extern int intel_gpu_reset(struct drm_device *dev);
 extern int intel_engine_reset(struct intel_engine_cs *engine);
+extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern bool intel_has_gpu_reset(struct drm_device *dev);
 extern int i915_reset(struct drm_device *dev);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_guc_reg.h b/drivers/gpu/drm/i915/i915_guc_reg.h
index e4ba582..94ceee5 100644
--- a/drivers/gpu/drm/i915/i915_guc_reg.h
+++ b/drivers/gpu/drm/i915/i915_guc_reg.h
@@ -27,6 +27,7 @@
 /* Definitions of GuC H/W registers, bits, etc */
 
 #define GUC_STATUS			_MMIO(0xc000)
+#define   GS_MIA_IN_RESET		(1 << 0)
 #define   GS_BOOTROM_SHIFT		1
 #define   GS_BOOTROM_MASK		  (0x7F << GS_BOOTROM_SHIFT)
 #define   GS_BOOTROM_RSA_FAILED		  (0x50 << GS_BOOTROM_SHIFT)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a798e40..4496fc7 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -166,6 +166,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN6_GRDOM_BLT			(1 << 3)
 #define  GEN6_GRDOM_VECS		(1 << 4)
 #define  GEN8_GRDOM_MEDIA2		(1 << 7)
+#define  GEN9_GRDOM_GUC		        (1 << 5)
 
 #define RING_PP_DIR_BASE(ring)		_MMIO((ring)->mmio_base+0x228)
 #define RING_PP_DIR_BASE_READ(ring)	_MMIO((ring)->mmio_base+0x518)
diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c
index 82a3c03..f9cb814 100644
--- a/drivers/gpu/drm/i915/intel_guc_loader.c
+++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -353,6 +353,24 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv)
 	return ret;
 }
 
+static int i915_reset_guc(struct drm_i915_private *dev_priv)
+{
+	int ret;
+	u32 guc_status;
+
+	ret = intel_guc_reset(dev_priv);
+	if (ret) {
+		DRM_ERROR("GuC reset failed, ret = %d\n", ret);
+		return ret;
+	}
+
+	guc_status = I915_READ(GUC_STATUS);
+	WARN(!(guc_status & GS_MIA_IN_RESET),
+	     "GuC status: 0x%x, MIA core expected to be in reset\n", guc_status);
+
+	return ret;
+}
+
 /**
  * intel_guc_ucode_load() - load GuC uCode into the device
  * @dev:	drm device
@@ -417,9 +435,36 @@ int intel_guc_ucode_load(struct drm_device *dev)
 	if (err)
 		goto fail;
 
+	/*
+	 * WaEnableuKernelHeaderValidFix:skl,bxt
+	 * For BXT, this is only upto B0 but below WA is required for later
+	 * steppings also so this is extended as well.
+	 */
+	/* WaEnableGuCBootHashCheckNotSet:skl,bxt */
 	err = guc_ucode_xfer(dev_priv);
-	if (err)
-		goto fail;
+	if (err) {
+		int retries = 3;
+
+		DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err);
+
+		while (retries--) {
+			err = i915_reset_guc(dev_priv);
+			if (err)
+				break;
+
+			err = guc_ucode_xfer(dev_priv);
+			if (!err) {
+				DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n");
+				break;
+			}
+			DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries);
+		}
+
+		if (err) {
+			DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err);
+			goto fail;
+		}
+	}
 
 	guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS;
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index d003b78..19220b9 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1683,6 +1683,23 @@ int intel_engine_reset(struct intel_engine_cs *engine)
 	return gen8_do_engine_reset(engine);
 }
 
+int intel_guc_reset(struct drm_i915_private *dev_priv)
+{
+	int ret;
+
+	if (!i915.enable_guc_submission)
+		return -EINVAL;
+
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+	__raw_i915_write32(dev_priv, GEN6_GDRST, GEN9_GRDOM_GUC);
+	ret = wait_for_engine_reset(dev_priv, GEN9_GRDOM_GUC);
+
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+
+	return ret;
+}
+
 bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
 {
 	return check_for_unclaimed_mmio(dev_priv);
-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* ✗ Fi.CI.BAT: warning for Reset GuC and retry on fw load failure
  2016-03-01 17:14 [PATCH 0/2] Reset GuC and retry on fw load failure Arun Siluvery
  2016-03-01 17:14 ` [PATCH 1/2] drm/i915/tdr: Add helper function to perform Engine reset Arun Siluvery
  2016-03-01 17:14 ` [PATCH 2/2] drm/i915/guc: Reset GuC and retry on firmware load failure Arun Siluvery
@ 2016-03-02  7:27 ` Patchwork
  2 siblings, 0 replies; 4+ messages in thread
From: Patchwork @ 2016-03-02  7:27 UTC (permalink / raw)
  To: arun.siluvery; +Cc: intel-gfx

== Series Details ==

Series: Reset GuC and retry on fw load failure
URL   : https://patchwork.freedesktop.org/series/3985/
State : warning

== Summary ==

Series 3985v1 Reset GuC and retry on fw load failure
http://patchwork.freedesktop.org/api/1.0/series/3985/revisions/1/mbox/

Test gem_pread:
        Subgroup basic:
                pass       -> DMESG-WARN (skl-i5k-2)
Test gem_sync:
        Subgroup basic-default:
                pass       -> DMESG-WARN (skl-i5k-2)
Test kms_addfb_basic:
        Subgroup bad-pitch-1024:
                pass       -> DMESG-WARN (skl-i5k-2)
Test kms_flip:
        Subgroup basic-flip-vs-modeset:
                dmesg-warn -> PASS       (ilk-hp8440p) UNSTABLE
        Subgroup basic-flip-vs-wf_vblank:
                dmesg-warn -> PASS       (hsw-gt2)
                pass       -> DMESG-WARN (hsw-brixbox)
Test kms_pipe_crc_basic:
        Subgroup hang-read-crc-pipe-b:
                pass       -> DMESG-WARN (snb-x220t)
        Subgroup suspend-read-crc-pipe-c:
                dmesg-warn -> PASS       (bsw-nuc-2)
Test pm_rpm:
        Subgroup basic-pci-d3-state:
                dmesg-fail -> FAIL       (snb-x220t)
                pass       -> DMESG-WARN (snb-dellxps)
        Subgroup basic-rte:
                pass       -> DMESG-WARN (snb-x220t)
                dmesg-warn -> PASS       (snb-dellxps)
                pass       -> DMESG-WARN (byt-nuc) UNSTABLE

bdw-nuci7        total:169  pass:158  dwarn:0   dfail:0   fail:0   skip:11 
bdw-ultra        total:169  pass:155  dwarn:0   dfail:0   fail:0   skip:14 
bsw-nuc-2        total:169  pass:138  dwarn:0   dfail:0   fail:1   skip:30 
byt-nuc          total:169  pass:143  dwarn:1   dfail:0   fail:0   skip:25 
hsw-brixbox      total:169  pass:153  dwarn:1   dfail:0   fail:0   skip:15 
hsw-gt2          total:169  pass:158  dwarn:0   dfail:0   fail:1   skip:10 
ilk-hp8440p      total:169  pass:118  dwarn:0   dfail:0   fail:1   skip:50 
ivb-t430s        total:169  pass:153  dwarn:0   dfail:0   fail:1   skip:15 
skl-i5k-2        total:169  pass:150  dwarn:3   dfail:0   fail:0   skip:16 
skl-i7k-2        total:169  pass:153  dwarn:0   dfail:0   fail:0   skip:16 
snb-dellxps      total:169  pass:144  dwarn:1   dfail:0   fail:1   skip:23 
snb-x220t        total:169  pass:142  dwarn:2   dfail:0   fail:3   skip:22 

Results at /archive/results/CI_IGT_test/Patchwork_1508/

f9cadb616ff17d482312fba07db772b6604ce799 drm-intel-nightly: 2016y-03m-01d-17h-16m-32s UTC integration manifest
b3df55875684f4f83670a346fec6942c3d7cb2ed drm/i915/guc: Reset GuC and retry on firmware load failure
810e6933da3f6fa9cf41ecfe08321a1259118fa7 drm/i915/tdr: Add helper function to perform Engine reset

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2016-03-02  7:27 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-01 17:14 [PATCH 0/2] Reset GuC and retry on fw load failure Arun Siluvery
2016-03-01 17:14 ` [PATCH 1/2] drm/i915/tdr: Add helper function to perform Engine reset Arun Siluvery
2016-03-01 17:14 ` [PATCH 2/2] drm/i915/guc: Reset GuC and retry on firmware load failure Arun Siluvery
2016-03-02  7:27 ` ✗ Fi.CI.BAT: warning for Reset GuC and retry on fw " Patchwork

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox