All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Don't touch fence->error when resetting an innocent request
@ 2017-07-20 13:24 Chris Wilson
  2017-07-20 13:39 ` Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Chris Wilson @ 2017-07-20 13:24 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

If the request has been completed before the reset took effect, we don't
need to mark it up as being a victim. Touching fence->error after the
fence has been signaled is detected by dma_fence_set_error() and
triggers a BUG:

[  231.743133] kernel BUG at ./include/linux/dma-fence.h:434!
[  231.743156] invalid opcode: 0000 [#1] SMP KASAN
[  231.743172] Modules linked in: i915 drm_kms_helper drm iptable_nat nf_nat_ipv4 nf_nat x86_pkg_temp_thermal iosf_mbi i2c_algo_bit cfbfillrect syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea fb font fbdev [last unloaded: drm]
[  231.743221] CPU: 2 PID: 20 Comm: kworker/2:0 Tainted: G     U          4.13.0-rc1+ #52
[  231.743236] Hardware name: Hewlett-Packard HP EliteBook 8460p/161C, BIOS 68SCF Ver. F.01 03/11/2011
[  231.743363] Workqueue: events_long i915_hangcheck_elapsed [i915]
[  231.743382] task: ffff8801f42e9780 task.stack: ffff8801f42f8000
[  231.743489] RIP: 0010:i915_gem_reset_engine+0x45a/0x460 [i915]
[  231.743505] RSP: 0018:ffff8801f42ff770 EFLAGS: 00010202
[  231.743521] RAX: 0000000000000007 RBX: ffff8801bf6b1880 RCX: ffffffffa02881a6
[  231.743537] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff8801bf6b18c8
[  231.743551] RBP: ffff8801f42ff7c8 R08: 0000000000000001 R09: 0000000000000000
[  231.743566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801edb02d00
[  231.743581] R13: ffff8801e19d4200 R14: 000000000000001d R15: ffff8801ce2a4000
[  231.743599] FS:  0000000000000000(0000) GS:ffff8801f5a80000(0000) knlGS:0000000000000000
[  231.743614] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  231.743629] CR2: 00007f0ebd1add10 CR3: 0000000002621000 CR4: 00000000000406e0
[  231.743643] Call Trace:
[  231.743752]  i915_gem_reset+0x6c/0x150 [i915]
[  231.743853]  i915_reset+0x175/0x210 [i915]
[  231.743958]  i915_reset_device+0x33b/0x350 [i915]
[  231.744061]  ? valleyview_pipestat_irq_handler+0xe0/0xe0 [i915]
[  231.744081]  ? trace_hardirqs_off_caller+0x70/0x110
[  231.744102]  ? _raw_spin_unlock_irqrestore+0x46/0x50
[  231.744120]  ? find_held_lock+0x119/0x150
[  231.744138]  ? mark_lock+0x6d/0x850
[  231.744241]  ? gen8_gt_irq_ack+0x1f0/0x1f0 [i915]
[  231.744262]  ? work_on_cpu_safe+0x60/0x60
[  231.744284]  ? rcu_read_lock_sched_held+0x57/0xa0
[  231.744400]  ? gen6_read32+0x2ba/0x320 [i915]
[  231.744506]  i915_handle_error+0x382/0x5f0 [i915]
[  231.744611]  ? gen6_rps_reset_ei+0x20/0x20 [i915]
[  231.744630]  ? vsnprintf+0x128/0x8e0
[  231.744649]  ? pointer+0x6b0/0x6b0
[  231.744667]  ? debug_check_no_locks_freed+0x1a0/0x1a0
[  231.744688]  ? scnprintf+0x92/0xe0
[  231.744706]  ? snprintf+0xb0/0xb0
[  231.744820]  hangcheck_declare_hang+0x15a/0x1a0 [i915]
[  231.744932]  ? engine_stuck+0x440/0x440 [i915]
[  231.744951]  ? rcu_read_lock_sched_held+0x57/0xa0
[  231.745062]  ? gen6_read32+0x2ba/0x320 [i915]
[  231.745173]  ? gen6_read16+0x320/0x320 [i915]
[  231.745284]  ? intel_engine_get_active_head+0x91/0x170 [i915]
[  231.745401]  i915_hangcheck_elapsed+0x3d8/0x400 [i915]
[  231.745424]  process_one_work+0x3e8/0xac0
[  231.745444]  ? pwq_dec_nr_in_flight+0x110/0x110
[  231.745464]  ? do_raw_spin_lock+0x8e/0x120
[  231.745484]  worker_thread+0x8d/0x720
[  231.745506]  kthread+0x19e/0x1f0
[  231.745524]  ? process_one_work+0xac0/0xac0
[  231.745541]  ? kthread_create_on_node+0xa0/0xa0
[  231.745560]  ret_from_fork+0x27/0x40
[  231.745581] Code: 8b 7d c8 e8 49 0d 02 e1 49 8b 7f 38 48 8b 75 b8 48 83 c7 10 e8 b8 89 be e1 e9 95 fc ff ff 4c 89 e7 e8 4b b9 ff ff e9 30 ff ff ff <0f> 0b 0f 1f 40 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fe
[  231.745767] RIP: i915_gem_reset_engine+0x45a/0x460 [i915] RSP: ffff8801f42ff770

At first glance this looks to be related to commit c64992e035d7
("drm/i915: Look for active requests earlier in the reset path"), but it
could easily happen before as well. On the other hand, we no longer
logged victims due to the active_request being dropped earlier.

Reported-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Fixes: c64992e035d7 ("drm/i915: Look for active requests earlier in the reset path")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b1520052a5e4..3ad1352dc871 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2914,11 +2914,9 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 	if (engine->irq_seqno_barrier)
 		engine->irq_seqno_barrier(engine);
 
-	if (engine_stalled(engine)) {
-		request = i915_gem_find_active_request(engine);
-		if (request && request->fence.error == -EIO)
-			request = ERR_PTR(-EIO); /* Previous reset failed! */
-	}
+	request = i915_gem_find_active_request(engine);
+	if (request && request->fence.error == -EIO)
+		request = ERR_PTR(-EIO); /* Previous reset failed! */
 
 	return request;
 }
@@ -2987,12 +2985,10 @@ static void engine_skip_context(struct drm_i915_gem_request *request)
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 }
 
-/* Returns true if the request was guilty of hang */
-static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
+/* Returns the request if it was guilty of the hang */
+static struct drm_i915_gem_request *
+i915_gem_reset_request(struct drm_i915_gem_request *request)
 {
-	/* Read once and return the resolution */
-	const bool guilty = !i915_gem_request_completed(request);
-
 	/* The guilty request will get skipped on a hung engine.
 	 *
 	 * Users of client default contexts do not rely on logical
@@ -3014,27 +3010,32 @@ static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
 	 * subsequent hangs.
 	 */
 
-	if (guilty) {
+	if (engine_stalled(request->engine)) {
 		i915_gem_context_mark_guilty(request->ctx);
 		skip_request(request);
+
+		/* If this context is now banned, skip all pending requests. */
+		if (i915_gem_context_is_banned(request->ctx))
+			engine_skip_context(request);
 	} else {
 		i915_gem_context_mark_innocent(request->ctx);
-		dma_fence_set_error(&request->fence, -EAGAIN);
+		if (!i915_gem_request_completed(request))
+			dma_fence_set_error(&request->fence, -EAGAIN);
+		request = NULL;
 	}
 
-	return guilty;
+	return request;
 }
 
 void i915_gem_reset_engine(struct intel_engine_cs *engine,
 			   struct drm_i915_gem_request *request)
 {
-	if (request && i915_gem_reset_request(request)) {
+	if (request)
+		request = i915_gem_reset_request(request);
+
+	if (request) {
 		DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
 				 engine->name, request->global_seqno);
-
-		/* If this context is now banned, skip all pending requests. */
-		if (i915_gem_context_is_banned(request->ctx))
-			engine_skip_context(request);
 	}
 
 	/* Setup the CS to resume from the breadcrumb of the hung request */
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2017-07-21  9:33 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-07-20 13:24 [PATCH] drm/i915: Don't touch fence->error when resetting an innocent request Chris Wilson
2017-07-20 13:39 ` Chris Wilson
2017-07-20 13:48 ` [PATCH v2] " Chris Wilson
2017-07-20 20:41   ` Daniel Vetter
2017-07-21  9:33   ` Joonas Lahtinen
2017-07-20 15:56 ` ✓ Fi.CI.BAT: success for drm/i915: Don't touch fence->error when resetting an innocent request (rev2) Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.