Intel-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error
@ 2023-06-05 20:54 John.C.Harrison
  2023-06-06  2:43 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for " Patchwork
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: John.C.Harrison @ 2023-06-05 20:54 UTC (permalink / raw)
  To: Intel-GFX; +Cc: DRI-Devel

From: John Harrison <John.C.Harrison@Intel.com>

If GuC hits an internal error (and survives long enough to report it
to the KMD), it is basically toast and will stop until a GT reset and
subsequent GuC reload is performed. Previously, the KMD just printed
an error message and then waited for the heartbeat to eventually kick
in and trigger a reset (assuming the heartbeat had not been disabled).
Instead, force the reset immediately to guarantee that it happens and
to eliminate the very long heartbeat delay. The captured error state
is also more likely to be useful if captured at the time of the error
rather than many seconds later.

Note that it is not possible to trigger a reset from with the G2H
handler itself. The reset prepare process involves flushing
outstanding G2H contents. So a deadlock could result. Instead, the G2H
handler queues a worker thread to do the reset asynchronously.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc.c    | 26 +++++++++++++++++++++++
 drivers/gpu/drm/i915/gt/uc/intel_guc.h    |  9 ++++++++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  6 +-----
 3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 2eb891b270aec..c35cf10f52b56 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -159,6 +159,13 @@ static void gen11_disable_guc_interrupts(struct intel_guc *guc)
 	gen11_reset_guc_interrupts(guc);
 }
 
+static void guc_dead_worker_func(struct work_struct *w)
+{
+	struct intel_guc *guc = container_of(w, struct intel_guc, dead_guc_worker);
+
+	intel_gt_handle_error(guc_to_gt(guc), ALL_ENGINES, I915_ERROR_CAPTURE, "dead GuC");
+}
+
 void intel_guc_init_early(struct intel_guc *guc)
 {
 	struct intel_gt *gt = guc_to_gt(guc);
@@ -171,6 +178,8 @@ void intel_guc_init_early(struct intel_guc *guc)
 	intel_guc_slpc_init_early(&guc->slpc);
 	intel_guc_rc_init_early(guc);
 
+	INIT_WORK(&guc->dead_guc_worker, guc_dead_worker_func);
+
 	mutex_init(&guc->send_mutex);
 	spin_lock_init(&guc->irq_lock);
 	if (GRAPHICS_VER(i915) >= 11) {
@@ -585,6 +594,20 @@ int intel_guc_send_mmio(struct intel_guc *guc, const u32 *request, u32 len,
 	return ret;
 }
 
+int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action)
+{
+	if (action == INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED)
+		guc_err(guc, "Crash dump notification\n");
+	else if (action == INTEL_GUC_ACTION_NOTIFY_EXCEPTION)
+		guc_err(guc, "Exception notification\n");
+	else
+		guc_err(guc, "Unknown crash notification\n");
+
+	queue_work(system_unbound_wq, &guc->dead_guc_worker);
+
+	return 0;
+}
+
 int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
 				       const u32 *payload, u32 len)
 {
@@ -601,6 +624,9 @@ int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
 	if (msg & INTEL_GUC_RECV_MSG_EXCEPTION)
 		guc_err(guc, "Received early exception notification!\n");
 
+	if (msg & (INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED | INTEL_GUC_RECV_MSG_EXCEPTION))
+		queue_work(system_unbound_wq, &guc->dead_guc_worker);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 8dc291ff00935..0b54eec95fc00 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -266,6 +266,14 @@ struct intel_guc {
 		unsigned long last_stat_jiffies;
 	} timestamp;
 
+	/**
+	 * @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset.
+	 * Specifically used when the G2H handler wants to issue a reset. Resets
+	 * require flushing the G2H queue. So, the G2H processing itself must not
+	 * trigger a reset directly. Instead, go via this worker.
+	 */
+	struct work_struct dead_guc_worker;
+
 #ifdef CONFIG_DRM_I915_SELFTEST
 	/**
 	 * @number_guc_id_stolen: The number of guc_ids that have been stolen
@@ -476,6 +484,7 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
 					 const u32 *msg, u32 len);
 int intel_guc_error_capture_process_msg(struct intel_guc *guc,
 					const u32 *msg, u32 len);
+int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action);
 
 struct intel_engine_cs *
 intel_guc_lookup_engine(struct intel_guc *guc, u8 guc_class, u8 instance);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index f28a3a83742dc..7b09ad6931021 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -1116,12 +1116,8 @@ static int ct_process_request(struct intel_guc_ct *ct, struct ct_incoming_msg *r
 		ret = 0;
 		break;
 	case INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED:
-		CT_ERROR(ct, "Received GuC crash dump notification!\n");
-		ret = 0;
-		break;
 	case INTEL_GUC_ACTION_NOTIFY_EXCEPTION:
-		CT_ERROR(ct, "Received GuC exception notification!\n");
-		ret = 0;
+		ret = intel_guc_crash_process_msg(guc, action);
 		break;
 	default:
 		ret = -EOPNOTSUPP;
-- 
2.39.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [Intel-gfx] ✗ Fi.CI.SPARSE: warning for drm/i915/guc: Force a reset on internal GuC error
  2023-06-05 20:54 [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error John.C.Harrison
@ 2023-06-06  2:43 ` Patchwork
  2023-06-06  2:54 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Patchwork @ 2023-06-06  2:43 UTC (permalink / raw)
  To: john.c.harrison; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/guc: Force a reset on internal GuC error
URL   : https://patchwork.freedesktop.org/series/118890/
State : warning

== Summary ==

Error: dim sparse failed
Sparse version: v0.6.2
Fast mode used, each commit won't be checked separately.



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/guc: Force a reset on internal GuC error
  2023-06-05 20:54 [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error John.C.Harrison
  2023-06-06  2:43 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for " Patchwork
@ 2023-06-06  2:54 ` Patchwork
  2023-06-06 23:48 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
  2023-06-07  0:15 ` [Intel-gfx] [PATCH] " Ceraolo Spurio, Daniele
  3 siblings, 0 replies; 5+ messages in thread
From: Patchwork @ 2023-06-06  2:54 UTC (permalink / raw)
  To: john.c.harrison; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 4913 bytes --]

== Series Details ==

Series: drm/i915/guc: Force a reset on internal GuC error
URL   : https://patchwork.freedesktop.org/series/118890/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_13234 -> Patchwork_118890v1
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/index.html

Participating hosts (38 -> 36)
------------------------------

  Missing    (2): fi-kbl-soraka fi-snb-2520m 

Known issues
------------

  Here are the changes found in Patchwork_118890v1 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_selftest@live@slpc:
    - bat-rpls-2:         NOTRUN -> [DMESG-WARN][1] ([i915#6367])
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/bat-rpls-2/igt@i915_selftest@live@slpc.html

  * igt@i915_suspend@basic-s2idle-without-i915:
    - bat-rpls-2:         NOTRUN -> [ABORT][2] ([i915#6687])
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/bat-rpls-2/igt@i915_suspend@basic-s2idle-without-i915.html

  * igt@kms_pipe_crc_basic@nonblocking-crc-frame-sequence@pipe-d-dp-1:
    - bat-dg2-8:          [PASS][3] -> [FAIL][4] ([i915#7932])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/bat-dg2-8/igt@kms_pipe_crc_basic@nonblocking-crc-frame-sequence@pipe-d-dp-1.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/bat-dg2-8/igt@kms_pipe_crc_basic@nonblocking-crc-frame-sequence@pipe-d-dp-1.html

  
#### Possible fixes ####

  * igt@i915_pm_rpm@basic-pci-d3-state:
    - {bat-mtlp-8}:       [ABORT][5] ([i915#7953]) -> [PASS][6]
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/bat-mtlp-8/igt@i915_pm_rpm@basic-pci-d3-state.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/bat-mtlp-8/igt@i915_pm_rpm@basic-pci-d3-state.html

  * igt@i915_selftest@live@reset:
    - bat-rpls-2:         [ABORT][7] ([i915#4983] / [i915#7461] / [i915#7913] / [i915#7981] / [i915#8347]) -> [PASS][8]
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/bat-rpls-2/igt@i915_selftest@live@reset.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/bat-rpls-2/igt@i915_selftest@live@reset.html

  * igt@i915_selftest@live@slpc:
    - bat-rpls-1:         [DMESG-WARN][9] ([i915#6367]) -> [PASS][10]
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/bat-rpls-1/igt@i915_selftest@live@slpc.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/bat-rpls-1/igt@i915_selftest@live@slpc.html

  * {igt@kms_pipe_crc_basic@compare-crc-sanitycheck-xr24@pipe-d-dp-5}:
    - {bat-adlp-11}:      [ABORT][11] ([i915#4423]) -> [PASS][12]
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/bat-adlp-11/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-xr24@pipe-d-dp-5.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/bat-adlp-11/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-xr24@pipe-d-dp-5.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [i915#3708]: https://gitlab.freedesktop.org/drm/intel/issues/3708
  [i915#4077]: https://gitlab.freedesktop.org/drm/intel/issues/4077
  [i915#4423]: https://gitlab.freedesktop.org/drm/intel/issues/4423
  [i915#4613]: https://gitlab.freedesktop.org/drm/intel/issues/4613
  [i915#4983]: https://gitlab.freedesktop.org/drm/intel/issues/4983
  [i915#6367]: https://gitlab.freedesktop.org/drm/intel/issues/6367
  [i915#6621]: https://gitlab.freedesktop.org/drm/intel/issues/6621
  [i915#6645]: https://gitlab.freedesktop.org/drm/intel/issues/6645
  [i915#6687]: https://gitlab.freedesktop.org/drm/intel/issues/6687
  [i915#7461]: https://gitlab.freedesktop.org/drm/intel/issues/7461
  [i915#7828]: https://gitlab.freedesktop.org/drm/intel/issues/7828
  [i915#7913]: https://gitlab.freedesktop.org/drm/intel/issues/7913
  [i915#7932]: https://gitlab.freedesktop.org/drm/intel/issues/7932
  [i915#7953]: https://gitlab.freedesktop.org/drm/intel/issues/7953
  [i915#7981]: https://gitlab.freedesktop.org/drm/intel/issues/7981
  [i915#8347]: https://gitlab.freedesktop.org/drm/intel/issues/8347


Build changes
-------------

  * Linux: CI_DRM_13234 -> Patchwork_118890v1

  CI-20190529: 20190529
  CI_DRM_13234: cb7bb5b791053c0ff10e314d24e6752795283803 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_7319: 2e1bcd49944452b5f9516eecee48e1fa3ae6a636 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_118890v1: cb7bb5b791053c0ff10e314d24e6752795283803 @ git://anongit.freedesktop.org/gfx-ci/linux


### Linux commits

4ea24f81628a drm/i915/guc: Force a reset on internal GuC error

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/index.html

[-- Attachment #2: Type: text/html, Size: 5347 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Intel-gfx] ✓ Fi.CI.IGT: success for drm/i915/guc: Force a reset on internal GuC error
  2023-06-05 20:54 [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error John.C.Harrison
  2023-06-06  2:43 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for " Patchwork
  2023-06-06  2:54 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
@ 2023-06-06 23:48 ` Patchwork
  2023-06-07  0:15 ` [Intel-gfx] [PATCH] " Ceraolo Spurio, Daniele
  3 siblings, 0 replies; 5+ messages in thread
From: Patchwork @ 2023-06-06 23:48 UTC (permalink / raw)
  To: John Harrison; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 8479 bytes --]

== Series Details ==

Series: drm/i915/guc: Force a reset on internal GuC error
URL   : https://patchwork.freedesktop.org/series/118890/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_13234_full -> Patchwork_118890v1_full
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  

Participating hosts (7 -> 7)
------------------------------

  No changes in participating hosts

Known issues
------------

  Here are the changes found in Patchwork_118890v1_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_exec_fair@basic-pace-share@rcs0:
    - shard-glk:          [PASS][1] -> [FAIL][2] ([i915#2842])
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-glk5/igt@gem_exec_fair@basic-pace-share@rcs0.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-glk4/igt@gem_exec_fair@basic-pace-share@rcs0.html

  * igt@kms_flip@flip-vs-expired-vblank-interruptible@c-dp1:
    - shard-apl:          [PASS][3] -> [FAIL][4] ([i915#79])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-apl1/igt@kms_flip@flip-vs-expired-vblank-interruptible@c-dp1.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-apl7/igt@kms_flip@flip-vs-expired-vblank-interruptible@c-dp1.html

  * igt@kms_frontbuffer_tracking@fbc-2p-primscrn-pri-shrfb-draw-mmap-cpu:
    - shard-snb:          [PASS][5] -> [SKIP][6] ([fdo#109271])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-snb1/igt@kms_frontbuffer_tracking@fbc-2p-primscrn-pri-shrfb-draw-mmap-cpu.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-snb5/igt@kms_frontbuffer_tracking@fbc-2p-primscrn-pri-shrfb-draw-mmap-cpu.html

  * igt@kms_hdr@bpc-switch-dpms@pipe-a-dp-1:
    - shard-apl:          [PASS][7] -> [FAIL][8] ([i915#1188])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-apl2/igt@kms_hdr@bpc-switch-dpms@pipe-a-dp-1.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-apl1/igt@kms_hdr@bpc-switch-dpms@pipe-a-dp-1.html

  * igt@kms_plane_scaling@plane-downscale-with-rotation-factor-0-75@pipe-b-hdmi-a-1:
    - shard-snb:          NOTRUN -> [SKIP][9] ([fdo#109271] / [i915#4579]) +5 similar issues
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-snb1/igt@kms_plane_scaling@plane-downscale-with-rotation-factor-0-75@pipe-b-hdmi-a-1.html

  * igt@kms_tv_load_detect@load-detect:
    - shard-snb:          NOTRUN -> [SKIP][10] ([fdo#109271]) +37 similar issues
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-snb2/igt@kms_tv_load_detect@load-detect.html

  
#### Possible fixes ####

  * igt@gem_exec_fair@basic-none@vecs0:
    - {shard-rkl}:        [FAIL][11] ([i915#2842]) -> [PASS][12] +2 similar issues
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-rkl-2/igt@gem_exec_fair@basic-none@vecs0.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-rkl-7/igt@gem_exec_fair@basic-none@vecs0.html

  * igt@gem_exec_fair@basic-throttle@rcs0:
    - {shard-tglu}:       [FAIL][13] ([i915#2842]) -> [PASS][14]
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-tglu-9/igt@gem_exec_fair@basic-throttle@rcs0.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-tglu-9/igt@gem_exec_fair@basic-throttle@rcs0.html

  * igt@i915_pm_rpm@dpms-mode-unset-lpsp:
    - {shard-rkl}:        [SKIP][15] ([i915#1397]) -> [PASS][16] +2 similar issues
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-rkl-2/igt@i915_pm_rpm@dpms-mode-unset-lpsp.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-rkl-7/igt@i915_pm_rpm@dpms-mode-unset-lpsp.html

  * igt@i915_pm_rps@reset:
    - shard-snb:          [DMESG-FAIL][17] ([i915#8319]) -> [PASS][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-snb4/igt@i915_pm_rps@reset.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-snb2/igt@i915_pm_rps@reset.html

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size:
    - shard-apl:          [FAIL][19] ([i915#2346]) -> [PASS][20]
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-apl1/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-apl7/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html

  * igt@kms_draw_crc@draw-method-blt@xrgb2101010-ytiled:
    - shard-glk:          [DMESG-WARN][21] ([i915#7936]) -> [PASS][22]
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-glk7/igt@kms_draw_crc@draw-method-blt@xrgb2101010-ytiled.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-glk6/igt@kms_draw_crc@draw-method-blt@xrgb2101010-ytiled.html

  * igt@kms_fbcon_fbt@fbc-suspend:
    - shard-apl:          [FAIL][23] ([i915#4767]) -> [PASS][24]
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-apl6/igt@kms_fbcon_fbt@fbc-suspend.html
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-apl2/igt@kms_fbcon_fbt@fbc-suspend.html

  * igt@kms_flip@2x-flip-vs-expired-vblank@ab-hdmi-a1-hdmi-a2:
    - shard-glk:          [FAIL][25] ([i915#2122]) -> [PASS][26] +1 similar issue
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13234/shard-glk9/igt@kms_flip@2x-flip-vs-expired-vblank@ab-hdmi-a1-hdmi-a2.html
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/shard-glk7/igt@kms_flip@2x-flip-vs-expired-vblank@ab-hdmi-a1-hdmi-a2.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109300]: https://bugs.freedesktop.org/show_bug.cgi?id=109300
  [i915#1072]: https://gitlab.freedesktop.org/drm/intel/issues/1072
  [i915#1188]: https://gitlab.freedesktop.org/drm/intel/issues/1188
  [i915#1397]: https://gitlab.freedesktop.org/drm/intel/issues/1397
  [i915#1937]: https://gitlab.freedesktop.org/drm/intel/issues/1937
  [i915#2122]: https://gitlab.freedesktop.org/drm/intel/issues/2122
  [i915#2346]: https://gitlab.freedesktop.org/drm/intel/issues/2346
  [i915#2681]: https://gitlab.freedesktop.org/drm/intel/issues/2681
  [i915#2842]: https://gitlab.freedesktop.org/drm/intel/issues/2842
  [i915#3591]: https://gitlab.freedesktop.org/drm/intel/issues/3591
  [i915#4070]: https://gitlab.freedesktop.org/drm/intel/issues/4070
  [i915#4078]: https://gitlab.freedesktop.org/drm/intel/issues/4078
  [i915#4579]: https://gitlab.freedesktop.org/drm/intel/issues/4579
  [i915#4767]: https://gitlab.freedesktop.org/drm/intel/issues/4767
  [i915#4816]: https://gitlab.freedesktop.org/drm/intel/issues/4816
  [i915#5176]: https://gitlab.freedesktop.org/drm/intel/issues/5176
  [i915#5235]: https://gitlab.freedesktop.org/drm/intel/issues/5235
  [i915#6268]: https://gitlab.freedesktop.org/drm/intel/issues/6268
  [i915#7116]: https://gitlab.freedesktop.org/drm/intel/issues/7116
  [i915#7461]: https://gitlab.freedesktop.org/drm/intel/issues/7461
  [i915#7742]: https://gitlab.freedesktop.org/drm/intel/issues/7742
  [i915#79]: https://gitlab.freedesktop.org/drm/intel/issues/79
  [i915#7936]: https://gitlab.freedesktop.org/drm/intel/issues/7936
  [i915#8292]: https://gitlab.freedesktop.org/drm/intel/issues/8292
  [i915#8304]: https://gitlab.freedesktop.org/drm/intel/issues/8304
  [i915#8311]: https://gitlab.freedesktop.org/drm/intel/issues/8311
  [i915#8319]: https://gitlab.freedesktop.org/drm/intel/issues/8319
  [i915#8502]: https://gitlab.freedesktop.org/drm/intel/issues/8502


Build changes
-------------

  * Linux: CI_DRM_13234 -> Patchwork_118890v1

  CI-20190529: 20190529
  CI_DRM_13234: cb7bb5b791053c0ff10e314d24e6752795283803 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_7319: 2e1bcd49944452b5f9516eecee48e1fa3ae6a636 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_118890v1: cb7bb5b791053c0ff10e314d24e6752795283803 @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v1/index.html

[-- Attachment #2: Type: text/html, Size: 8473 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error
  2023-06-05 20:54 [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error John.C.Harrison
                   ` (2 preceding siblings ...)
  2023-06-06 23:48 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
@ 2023-06-07  0:15 ` Ceraolo Spurio, Daniele
  3 siblings, 0 replies; 5+ messages in thread
From: Ceraolo Spurio, Daniele @ 2023-06-07  0:15 UTC (permalink / raw)
  To: John.C.Harrison, Intel-GFX; +Cc: DRI-Devel



On 6/5/2023 1:54 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> If GuC hits an internal error (and survives long enough to report it
> to the KMD), it is basically toast and will stop until a GT reset and
> subsequent GuC reload is performed. Previously, the KMD just printed
> an error message and then waited for the heartbeat to eventually kick
> in and trigger a reset (assuming the heartbeat had not been disabled).
> Instead, force the reset immediately to guarantee that it happens and
> to eliminate the very long heartbeat delay. The captured error state
> is also more likely to be useful if captured at the time of the error
> rather than many seconds later.
>
> Note that it is not possible to trigger a reset from with the G2H
> handler itself. The reset prepare process involves flushing
> outstanding G2H contents. So a deadlock could result. Instead, the G2H
> handler queues a worker thread to do the reset asynchronously.
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc.c    | 26 +++++++++++++++++++++++
>   drivers/gpu/drm/i915/gt/uc/intel_guc.h    |  9 ++++++++
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  6 +-----
>   3 files changed, 36 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> index 2eb891b270aec..c35cf10f52b56 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> @@ -159,6 +159,13 @@ static void gen11_disable_guc_interrupts(struct intel_guc *guc)
>   	gen11_reset_guc_interrupts(guc);
>   }
>   
> +static void guc_dead_worker_func(struct work_struct *w)
> +{
> +	struct intel_guc *guc = container_of(w, struct intel_guc, dead_guc_worker);
> +
> +	intel_gt_handle_error(guc_to_gt(guc), ALL_ENGINES, I915_ERROR_CAPTURE, "dead GuC");
> +}
> +
>   void intel_guc_init_early(struct intel_guc *guc)
>   {
>   	struct intel_gt *gt = guc_to_gt(guc);
> @@ -171,6 +178,8 @@ void intel_guc_init_early(struct intel_guc *guc)
>   	intel_guc_slpc_init_early(&guc->slpc);
>   	intel_guc_rc_init_early(guc);
>   
> +	INIT_WORK(&guc->dead_guc_worker, guc_dead_worker_func);
> +
>   	mutex_init(&guc->send_mutex);
>   	spin_lock_init(&guc->irq_lock);
>   	if (GRAPHICS_VER(i915) >= 11) {
> @@ -585,6 +594,20 @@ int intel_guc_send_mmio(struct intel_guc *guc, const u32 *request, u32 len,
>   	return ret;
>   }
>   
> +int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action)
> +{
> +	if (action == INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED)
> +		guc_err(guc, "Crash dump notification\n");
> +	else if (action == INTEL_GUC_ACTION_NOTIFY_EXCEPTION)
> +		guc_err(guc, "Exception notification\n");
> +	else
> +		guc_err(guc, "Unknown crash notification\n");
> +
> +	queue_work(system_unbound_wq, &guc->dead_guc_worker);

Do we need to flush the worker on suspend/unload?

> +
> +	return 0;
> +}
> +
>   int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
>   				       const u32 *payload, u32 len)
>   {
> @@ -601,6 +624,9 @@ int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
>   	if (msg & INTEL_GUC_RECV_MSG_EXCEPTION)
>   		guc_err(guc, "Received early exception notification!\n");
>   
> +	if (msg & (INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED | INTEL_GUC_RECV_MSG_EXCEPTION))
> +		queue_work(system_unbound_wq, &guc->dead_guc_worker);

I'm a bit worried about queuing this for a failure during the init flow. 
If we have a systemic issue (e.g. bad memory) we could end up trying and 
failing to reset & reload multiple times, until the wedge code manages 
to set the wedge on init flag.

Daniele

> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index 8dc291ff00935..0b54eec95fc00 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -266,6 +266,14 @@ struct intel_guc {
>   		unsigned long last_stat_jiffies;
>   	} timestamp;
>   
> +	/**
> +	 * @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset.
> +	 * Specifically used when the G2H handler wants to issue a reset. Resets
> +	 * require flushing the G2H queue. So, the G2H processing itself must not
> +	 * trigger a reset directly. Instead, go via this worker.
> +	 */
> +	struct work_struct dead_guc_worker;
> +
>   #ifdef CONFIG_DRM_I915_SELFTEST
>   	/**
>   	 * @number_guc_id_stolen: The number of guc_ids that have been stolen
> @@ -476,6 +484,7 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
>   					 const u32 *msg, u32 len);
>   int intel_guc_error_capture_process_msg(struct intel_guc *guc,
>   					const u32 *msg, u32 len);
> +int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action);
>   
>   struct intel_engine_cs *
>   intel_guc_lookup_engine(struct intel_guc *guc, u8 guc_class, u8 instance);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index f28a3a83742dc..7b09ad6931021 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -1116,12 +1116,8 @@ static int ct_process_request(struct intel_guc_ct *ct, struct ct_incoming_msg *r
>   		ret = 0;
>   		break;
>   	case INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED:
> -		CT_ERROR(ct, "Received GuC crash dump notification!\n");
> -		ret = 0;
> -		break;
>   	case INTEL_GUC_ACTION_NOTIFY_EXCEPTION:
> -		CT_ERROR(ct, "Received GuC exception notification!\n");
> -		ret = 0;
> +		ret = intel_guc_crash_process_msg(guc, action);
>   		break;
>   	default:
>   		ret = -EOPNOTSUPP;


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-06-07  0:16 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-06-05 20:54 [Intel-gfx] [PATCH] drm/i915/guc: Force a reset on internal GuC error John.C.Harrison
2023-06-06  2:43 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for " Patchwork
2023-06-06  2:54 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2023-06-06 23:48 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
2023-06-07  0:15 ` [Intel-gfx] [PATCH] " Ceraolo Spurio, Daniele

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox