intel-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/6] drm/i915: Split up hangcheck phases
@ 2016-11-16 15:20 Mika Kuoppala
  2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
                   ` (6 more replies)
  0 siblings, 7 replies; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw)
  To: intel-gfx

In order to simplify hangcheck state keeping, split hangcheck
per engine loop in three phases: state load, action, state save.

Add few more hangcheck actions to separate between seqno, head
and subunit movements. This helps to gather all the hangcheck
actions under a single switch umbrella.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gpu_error.c   |   8 +-
 drivers/gpu/drm/i915/intel_hangcheck.c  | 241 ++++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   4 +-
 3 files changed, 146 insertions(+), 107 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5d620bd..f02f581 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
 		return "idle";
 	case HANGCHECK_WAIT:
 		return "wait";
-	case HANGCHECK_ACTIVE:
-		return "active";
+	case HANGCHECK_ACTIVE_SEQNO:
+		return "active seqno";
+	case HANGCHECK_ACTIVE_HEAD:
+		return "active head";
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		return "active subunits";
 	case HANGCHECK_KICK:
 		return "kick";
 	case HANGCHECK_HUNG:
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 53df5b1..3d2e81c 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
 		memset(&engine->hangcheck.instdone, 0,
 		       sizeof(engine->hangcheck.instdone));
 
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_HEAD;
 	}
 
 	if (!subunits_stuck(engine))
-		return HANGCHECK_ACTIVE;
+		return HANGCHECK_ACTIVE_SUBUNITS;
 
 	return HANGCHECK_HUNG;
 }
@@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	return HANGCHECK_HUNG;
 }
 
+static void hangcheck_load_sample(struct intel_engine_cs *engine,
+				  struct intel_engine_hangcheck *hc)
+{
+	/* We don't strictly need an irq-barrier here, as we are not
+	 * serving an interrupt request, be paranoid in case the
+	 * barrier has side-effects (such as preventing a broken
+	 * cacheline snoop) and so be sure that we can see the seqno
+	 * advance. If the seqno should stick, due to a stale
+	 * cacheline, we would erroneously declare the GPU hung.
+	 */
+	if (engine->irq_seqno_barrier)
+		engine->irq_seqno_barrier(engine);
+
+	hc->acthd = intel_engine_get_active_head(engine);
+	hc->seqno = intel_engine_get_seqno(engine);
+	hc->score = engine->hangcheck.score;
+}
+
+static void hangcheck_store_sample(struct intel_engine_cs *engine,
+				   const struct intel_engine_hangcheck *hc)
+{
+	engine->hangcheck.acthd = hc->acthd;
+	engine->hangcheck.seqno = hc->seqno;
+	engine->hangcheck.score = hc->score;
+	engine->hangcheck.action = hc->action;
+}
+
+static enum intel_engine_hangcheck_action
+hangcheck_get_action(struct intel_engine_cs *engine,
+		     const struct intel_engine_hangcheck *hc)
+{
+	if (engine->hangcheck.seqno != hc->seqno)
+		return HANGCHECK_ACTIVE_SEQNO;
+
+	if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
+		return HANGCHECK_IDLE;
+
+	return engine_stuck(engine, hc->acthd);
+}
+
+static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
+					struct intel_engine_hangcheck *hc)
+{
+	hc->action = hangcheck_get_action(engine, hc);
+
+	switch (hc->action) {
+	case HANGCHECK_IDLE:
+	case HANGCHECK_WAIT:
+		break;
+
+	case HANGCHECK_ACTIVE_HEAD:
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		/* We always increment the hangcheck score
+		 * if the engine is busy and still processing
+		 * the same request, so that no single request
+		 * can run indefinitely (such as a chain of
+		 * batches). The only time we do not increment
+		 * the hangcheck score on this ring, if this
+		 * engine is in a legitimate wait for another
+		 * engine. In that case the waiting engine is a
+		 * victim and we want to be sure we catch the
+		 * right culprit. Then every time we do kick
+		 * the ring, add a small increment to the
+		 * score so that we can catch a batch that is
+		 * being repeatedly kicked and so responsible
+		 * for stalling the machine.
+		 */
+		hc->score += 1;
+		break;
+
+	case HANGCHECK_KICK:
+		hc->score += 5;
+		break;
+
+	case HANGCHECK_HUNG:
+		hc->score += 20;
+		break;
+
+	case HANGCHECK_ACTIVE_SEQNO:
+		/* Gradually reduce the count so that we catch DoS
+		 * attempts across multiple batches.
+		 */
+		if (hc->score > 0)
+			hc->score -= 15;
+		if (hc->score < 0)
+			hc->score = 0;
+
+		/* Clear head and subunit states on seqno movement */
+		hc->acthd = 0;
+
+		memset(&engine->hangcheck.instdone, 0,
+		       sizeof(engine->hangcheck.instdone));
+		break;
+
+	default:
+		MISSING_CASE(hc->action);
+	}
+}
+
+static void hangcheck_declare_hang(struct drm_i915_private *i915,
+				   unsigned int hung,
+				   unsigned int stuck)
+{
+	struct intel_engine_cs *engine;
+	char msg[80];
+	unsigned int tmp;
+	int len;
+
+	/* If some rings hung but others were still busy, only
+	 * blame the hanging rings in the synopsis.
+	 */
+	if (stuck != hung)
+		hung &= ~stuck;
+	len = scnprintf(msg, sizeof(msg),
+			"%s on ", stuck == hung ? "No progress" : "Hang");
+	for_each_engine_masked(engine, i915, hung, tmp)
+		len += scnprintf(msg + len, sizeof(msg) - len,
+				 "%s, ", engine->name);
+	msg[len-2] = '\0';
+
+	return i915_handle_error(i915, hung, msg);
+}
+
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	enum intel_engine_id id;
 	unsigned int hung = 0, stuck = 0;
 	int busy_count = 0;
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define ACTIVE_DECAY 15
 
 	if (!i915.enable_hangcheck)
 		return;
@@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
 
 	for_each_engine(engine, dev_priv, id) {
-		bool busy = intel_engine_has_waiter(engine);
-		u64 acthd;
-		u32 seqno;
-		u32 submit;
+		struct intel_engine_hangcheck cur_state, *hc = &cur_state;
+		const bool busy = intel_engine_has_waiter(engine);
 
 		semaphore_clear_deadlocks(dev_priv);
 
-		/* We don't strictly need an irq-barrier here, as we are not
-		 * serving an interrupt request, be paranoid in case the
-		 * barrier has side-effects (such as preventing a broken
-		 * cacheline snoop) and so be sure that we can see the seqno
-		 * advance. If the seqno should stick, due to a stale
-		 * cacheline, we would erroneously declare the GPU hung.
-		 */
-		if (engine->irq_seqno_barrier)
-			engine->irq_seqno_barrier(engine);
-
-		acthd = intel_engine_get_active_head(engine);
-		seqno = intel_engine_get_seqno(engine);
-		submit = intel_engine_last_submit(engine);
-
-		if (engine->hangcheck.seqno == seqno) {
-			if (i915_seqno_passed(seqno, submit)) {
-				engine->hangcheck.action = HANGCHECK_IDLE;
-			} else {
-				/* We always increment the hangcheck score
-				 * if the engine is busy and still processing
-				 * the same request, so that no single request
-				 * can run indefinitely (such as a chain of
-				 * batches). The only time we do not increment
-				 * the hangcheck score on this ring, if this
-				 * engine is in a legitimate wait for another
-				 * engine. In that case the waiting engine is a
-				 * victim and we want to be sure we catch the
-				 * right culprit. Then every time we do kick
-				 * the ring, add a small increment to the
-				 * score so that we can catch a batch that is
-				 * being repeatedly kicked and so responsible
-				 * for stalling the machine.
-				 */
-				engine->hangcheck.action =
-					engine_stuck(engine, acthd);
-
-				switch (engine->hangcheck.action) {
-				case HANGCHECK_IDLE:
-				case HANGCHECK_WAIT:
-					break;
-				case HANGCHECK_ACTIVE:
-					engine->hangcheck.score += BUSY;
-					break;
-				case HANGCHECK_KICK:
-					engine->hangcheck.score += KICK;
-					break;
-				case HANGCHECK_HUNG:
-					engine->hangcheck.score += HUNG;
-					break;
-				}
-			}
-
-			if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-				hung |= intel_engine_flag(engine);
-				if (engine->hangcheck.action != HANGCHECK_HUNG)
-					stuck |= intel_engine_flag(engine);
-			}
-		} else {
-			engine->hangcheck.action = HANGCHECK_ACTIVE;
-
-			/* Gradually reduce the count so that we catch DoS
-			 * attempts across multiple batches.
-			 */
-			if (engine->hangcheck.score > 0)
-				engine->hangcheck.score -= ACTIVE_DECAY;
-			if (engine->hangcheck.score < 0)
-				engine->hangcheck.score = 0;
-
-			/* Clear head and subunit states on seqno movement */
-			acthd = 0;
-
-			memset(&engine->hangcheck.instdone, 0,
-			       sizeof(engine->hangcheck.instdone));
+		hangcheck_load_sample(engine, hc);
+		hangcheck_accumulate_sample(engine, hc);
+		hangcheck_store_sample(engine, hc);
+
+		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+			hung |= intel_engine_flag(engine);
+			if (hc->action != HANGCHECK_HUNG)
+				stuck |= intel_engine_flag(engine);
 		}
 
-		engine->hangcheck.seqno = seqno;
-		engine->hangcheck.acthd = acthd;
 		busy_count += busy;
 	}
 
-	if (hung) {
-		char msg[80];
-		unsigned int tmp;
-		int len;
-
-		/* If some rings hung but others were still busy, only
-		 * blame the hanging rings in the synopsis.
-		 */
-		if (stuck != hung)
-			hung &= ~stuck;
-		len = scnprintf(msg, sizeof(msg),
-				"%s on ", stuck == hung ? "No progress" : "Hang");
-		for_each_engine_masked(engine, dev_priv, hung, tmp)
-			len += scnprintf(msg + len, sizeof(msg) - len,
-					 "%s, ", engine->name);
-		msg[len-2] = '\0';
-
-		return i915_handle_error(dev_priv, hung, msg);
-	}
+	if (hung)
+		hangcheck_declare_hang(dev_priv, hung, stuck);
 
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3466b4e..3152b2b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -67,7 +67,9 @@ struct intel_hw_status_page {
 enum intel_engine_hangcheck_action {
 	HANGCHECK_IDLE = 0,
 	HANGCHECK_WAIT,
-	HANGCHECK_ACTIVE,
+	HANGCHECK_ACTIVE_SEQNO,
+	HANGCHECK_ACTIVE_HEAD,
+	HANGCHECK_ACTIVE_SUBUNITS,
 	HANGCHECK_KICK,
 	HANGCHECK_HUNG,
 };
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period
  2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
@ 2016-11-16 15:20 ` Mika Kuoppala
  2016-11-16 17:05   ` Chris Wilson
  2016-11-18 13:09   ` Mika Kuoppala
  2016-11-16 15:20 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw)
  To: intel-gfx

Hangcheck state accumulation has gained more steps
along the years, like head movement and more recently the
subunit inactivity check. As the subunit sampling is only
done if the previous state check showed inactivity, we
have added more stages (and time) to reach a hang verdict.

Asymmetric engine states led to different actual weight of
'one hangcheck unit' and it was demonstrated in some
hangs that due to difference in stages, simpler engines
were accused falsely of a hang as their scoring was much
more quicker to accumulate above the hang treshold.

To completely decouple the hangcheck guilty score
from the hangcheck period, convert hangcheck score to a
rough period of inactivity measurement. As these are
tracked as jiffies, they are meaningful also across
reset boundaries. This makes finding a guilty engine
more accurate across multi engine activity scenarios,
especially across asymmetric engines.

We lose the ability to detect cross batch malicious attempts
to hinder the progress. Plan is to move this functionality
to be part of context banning which is more natural fit,
later in the series.

v2: use time_before macros (Chris)
    reinstate the pardoning of moving engine after hc (Chris)
v3: avoid global state for per engine stall detection (Chris)
v4: take timeline last retirement into account (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c      |  17 +++--
 drivers/gpu/drm/i915/i915_drv.h          |   6 +-
 drivers/gpu/drm/i915/i915_gem.c          |  11 ++-
 drivers/gpu/drm/i915/i915_gem_timeline.c |  12 +++-
 drivers/gpu/drm/i915/i915_gem_timeline.h |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.c    |  46 ++++---------
 drivers/gpu/drm/i915/intel_hangcheck.c   | 115 ++++++++++++++++---------------
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  40 ++++++++---
 8 files changed, 139 insertions(+), 109 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1cc971c..5ef5c1a 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1351,10 +1351,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
 			   engine->hangcheck.seqno, seqno[id],
 			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
+		seq_printf(m, "\twaiters? %s, fake irq active? %s, stall? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)));
+					  &dev_priv->gpu_error.missed_irq_rings)),
+			   yesno(engine->hangcheck.stall));
+
 		spin_lock_irq(&b->lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
 			struct intel_wait *w = container_of(rb, typeof(*w), node);
@@ -1367,8 +1369,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
-		seq_printf(m, "\tscore = %d\n", engine->hangcheck.score);
-		seq_printf(m, "\taction = %d\n", engine->hangcheck.action);
+		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
+			   hangcheck_action_to_str(engine->hangcheck.action),
+			   engine->hangcheck.action,
+			   jiffies_to_msecs(jiffies -
+					    engine->hangcheck.action_timestamp));
 
 		if (engine->id == RCS) {
 			seq_puts(m, "\tinstdone read =\n");
@@ -3163,11 +3168,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 		u64 addr;
 
 		seq_printf(m, "%s\n", engine->name);
-		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [score %d]\n",
+		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
 			   intel_engine_get_seqno(engine),
 			   intel_engine_last_submit(engine),
 			   engine->hangcheck.seqno,
-			   engine->hangcheck.score);
+			   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
 
 		rcu_read_lock();
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5192206..4562a39 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -795,7 +795,8 @@ struct drm_i915_error_state {
 		/* Software tracked state */
 		bool waiting;
 		int num_waiters;
-		int hangcheck_score;
+		unsigned long hangcheck_timestamp;
+		bool hangcheck_stall;
 		enum intel_engine_hangcheck_action hangcheck_action;
 		struct i915_address_space *vm;
 		int num_requests;
@@ -1441,6 +1442,9 @@ struct i915_error_state_file_priv {
 #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */
 #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */
 
+#define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* No observed seqno/head/subunit progress */
+#define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Engine progress without head progress */
+
 struct i915_gpu_error {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3fb5e66..ae2a219 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2703,9 +2703,16 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 	if (!request)
 		return;
 
-	ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
-	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine))
+	ring_hung = engine->hangcheck.stall;
+	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) {
+		if (ring_hung)
+			DRM_ERROR("%s pardoned due to progress after hangcheck %x vs %x\n",
+				  engine->name,
+				  engine->hangcheck.seqno,
+				  intel_engine_get_seqno(engine));
+
 		ring_hung = false;
+	}
 
 	i915_set_reset_status(request->ctx, ring_hung);
 	if (!ring_hung)
diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c
index bf8a471..348b0f2 100644
--- a/drivers/gpu/drm/i915/i915_gem_timeline.c
+++ b/drivers/gpu/drm/i915/i915_gem_timeline.c
@@ -24,6 +24,15 @@
 
 #include "i915_drv.h"
 
+static void i915_gem_timeline_retire(struct i915_gem_active *active,
+				     struct drm_i915_gem_request *request)
+{
+	struct intel_timeline *tl =
+		container_of(active, typeof(*tl), last_request);
+
+	tl->last_retire_timestamp = jiffies;
+}
+
 static int __i915_gem_timeline_init(struct drm_i915_private *i915,
 				    struct i915_gem_timeline *timeline,
 				    const char *name,
@@ -54,7 +63,8 @@ static int __i915_gem_timeline_init(struct drm_i915_private *i915,
 #else
 		spin_lock_init(&tl->lock);
 #endif
-		init_request_active(&tl->last_request, NULL);
+		init_request_active(&tl->last_request,
+				    i915_gem_timeline_retire);
 		INIT_LIST_HEAD(&tl->requests);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.h b/drivers/gpu/drm/i915/i915_gem_timeline.h
index 98d99a6..d4eea88 100644
--- a/drivers/gpu/drm/i915/i915_gem_timeline.h
+++ b/drivers/gpu/drm/i915/i915_gem_timeline.h
@@ -34,6 +34,7 @@ struct i915_gem_timeline;
 struct intel_timeline {
 	u64 fence_context;
 	u32 last_submitted_seqno;
+	unsigned long last_retire_timestamp;
 
 	spinlock_t lock;
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index f02f581..3eee6c2 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -316,28 +316,6 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	}
 }
 
-static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
-{
-	switch (a) {
-	case HANGCHECK_IDLE:
-		return "idle";
-	case HANGCHECK_WAIT:
-		return "wait";
-	case HANGCHECK_ACTIVE_SEQNO:
-		return "active seqno";
-	case HANGCHECK_ACTIVE_HEAD:
-		return "active head";
-	case HANGCHECK_ACTIVE_SUBUNITS:
-		return "active subunits";
-	case HANGCHECK_KICK:
-		return "kick";
-	case HANGCHECK_HUNG:
-		return "hung";
-	}
-
-	return "unknown";
-}
-
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
 				 struct drm_i915_error_engine *ee)
 {
@@ -445,9 +423,13 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck: %s [%d]\n",
-		   hangcheck_action_to_str(ee->hangcheck_action),
-		   ee->hangcheck_score);
+	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stall));
+	err_printf(m, "  hangcheck action: %s\n",
+		   hangcheck_action_to_str(ee->hangcheck_action));
+	err_printf(m, "  hangcheck action timestamp: %lu, %u ms ago\n",
+		   ee->hangcheck_timestamp,
+		   jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
+
 	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
 	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
 }
@@ -537,7 +519,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	struct pci_dev *pdev = dev_priv->drm.pdev;
 	struct drm_i915_error_state *error = error_priv->error;
 	struct drm_i915_error_object *obj;
-	int max_hangcheck_score;
 	int i, j;
 
 	if (!error) {
@@ -554,13 +535,9 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Uptime: %ld s %ld us\n",
 		   error->uptime.tv_sec, error->uptime.tv_usec);
 	err_print_capabilities(m, &error->device_info);
-	max_hangcheck_score = 0;
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score > max_hangcheck_score)
-			max_hangcheck_score = error->engine[i].hangcheck_score;
-	}
+
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score == max_hangcheck_score &&
+		if (error->engine[i].hangcheck_stall &&
 		    error->engine[i].pid != -1) {
 			err_printf(m, "Active process (on ring %s): %s [%d]\n",
 				   engine_str(i),
@@ -946,7 +923,7 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
 	 * strictly a client bug. Use instdone to differentiate those some.
 	 */
 	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		if (error->engine[i].hangcheck_action == HANGCHECK_HUNG) {
+		if (error->engine[i].hangcheck_stall) {
 			if (engine_id)
 				*engine_id = i;
 
@@ -1164,8 +1141,9 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
 		ee->hws = I915_READ(mmio);
 	}
 
-	ee->hangcheck_score = engine->hangcheck.score;
+	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->hangcheck_action = engine->hangcheck.action;
+	ee->hangcheck_stall = engine->hangcheck.stall;
 
 	if (USES_PPGTT(dev_priv)) {
 		int i;
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 3d2e81c..71ee837 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,13 +236,13 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
 		memset(&engine->hangcheck.instdone, 0,
 		       sizeof(engine->hangcheck.instdone));
 
-		return HANGCHECK_ACTIVE_HEAD;
+		return ENGINE_ACTIVE_HEAD;
 	}
 
 	if (!subunits_stuck(engine))
-		return HANGCHECK_ACTIVE_SUBUNITS;
+		return ENGINE_ACTIVE_SUBUNITS;
 
-	return HANGCHECK_HUNG;
+	return ENGINE_DEAD;
 }
 
 static enum intel_engine_hangcheck_action
@@ -253,11 +253,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	u32 tmp;
 
 	ha = head_stuck(engine, acthd);
-	if (ha != HANGCHECK_HUNG)
+	if (ha != ENGINE_DEAD)
 		return ha;
 
 	if (IS_GEN2(dev_priv))
-		return HANGCHECK_HUNG;
+		return ENGINE_DEAD;
 
 	/* Is the chip hanging on a WAIT_FOR_EVENT?
 	 * If so we can simply poke the RB_WAIT bit
@@ -270,25 +270,25 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 				  "Kicking stuck wait on %s",
 				  engine->name);
 		I915_WRITE_CTL(engine, tmp);
-		return HANGCHECK_KICK;
+		return ENGINE_WAIT_KICK;
 	}
 
 	if (INTEL_GEN(dev_priv) >= 6 && tmp & RING_WAIT_SEMAPHORE) {
 		switch (semaphore_passed(engine)) {
 		default:
-			return HANGCHECK_HUNG;
+			return ENGINE_DEAD;
 		case 1:
 			i915_handle_error(dev_priv, 0,
 					  "Kicking stuck semaphore on %s",
 					  engine->name);
 			I915_WRITE_CTL(engine, tmp);
-			return HANGCHECK_KICK;
+			return ENGINE_WAIT_KICK;
 		case 0:
-			return HANGCHECK_WAIT;
+			return ENGINE_WAIT;
 		}
 	}
 
-	return HANGCHECK_HUNG;
+	return ENGINE_DEAD;
 }
 
 static void hangcheck_load_sample(struct intel_engine_cs *engine,
@@ -306,7 +306,6 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
 
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_seqno(engine);
-	hc->score = engine->hangcheck.score;
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
@@ -314,8 +313,8 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
 {
 	engine->hangcheck.acthd = hc->acthd;
 	engine->hangcheck.seqno = hc->seqno;
-	engine->hangcheck.score = hc->score;
 	engine->hangcheck.action = hc->action;
+	engine->hangcheck.stall = hc->stall;
 }
 
 static enum intel_engine_hangcheck_action
@@ -323,10 +322,10 @@ hangcheck_get_action(struct intel_engine_cs *engine,
 		     const struct intel_engine_hangcheck *hc)
 {
 	if (engine->hangcheck.seqno != hc->seqno)
-		return HANGCHECK_ACTIVE_SEQNO;
+		return ENGINE_ACTIVE_SEQNO;
 
 	if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
-		return HANGCHECK_IDLE;
+		return ENGINE_IDLE;
 
 	return engine_stuck(engine, hc->acthd);
 }
@@ -334,60 +333,64 @@ hangcheck_get_action(struct intel_engine_cs *engine,
 static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
 					struct intel_engine_hangcheck *hc)
 {
+	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
+	unsigned long last_retire;
+
 	hc->action = hangcheck_get_action(engine, hc);
 
-	switch (hc->action) {
-	case HANGCHECK_IDLE:
-	case HANGCHECK_WAIT:
-		break;
+	/* We always increment the progress
+	 * if the engine is busy and still processing
+	 * the same request, so that no single request
+	 * can run indefinitely (such as a chain of
+	 * batches). The only time we do not increment
+	 * the hangcheck score on this ring, if this
+	 * engine is in a legitimate wait for another
+	 * engine. In that case the waiting engine is a
+	 * victim and we want to be sure we catch the
+	 * right culprit. Then every time we do kick
+	 * the ring, make it as a progress as the seqno
+	 * advancement might ensure and if not, it
+	 * will catch the hanging engine.
+	 */
 
-	case HANGCHECK_ACTIVE_HEAD:
-	case HANGCHECK_ACTIVE_SUBUNITS:
-		/* We always increment the hangcheck score
-		 * if the engine is busy and still processing
-		 * the same request, so that no single request
-		 * can run indefinitely (such as a chain of
-		 * batches). The only time we do not increment
-		 * the hangcheck score on this ring, if this
-		 * engine is in a legitimate wait for another
-		 * engine. In that case the waiting engine is a
-		 * victim and we want to be sure we catch the
-		 * right culprit. Then every time we do kick
-		 * the ring, add a small increment to the
-		 * score so that we can catch a batch that is
-		 * being repeatedly kicked and so responsible
-		 * for stalling the machine.
-		 */
-		hc->score += 1;
-		break;
+	switch (hc->action) {
+	case ENGINE_IDLE:
+	case ENGINE_ACTIVE_SEQNO:
+		/* Clear head and subunit states on seqno movement */
+		hc->acthd = 0;
 
-	case HANGCHECK_KICK:
-		hc->score += 5;
-		break;
+		memset(&engine->hangcheck.instdone, 0,
+		       sizeof(engine->hangcheck.instdone));
 
-	case HANGCHECK_HUNG:
-		hc->score += 20;
+		/* Intentional fall through */
+	case ENGINE_WAIT_KICK:
+	case ENGINE_WAIT:
+		engine->hangcheck.action_timestamp = jiffies;
 		break;
 
-	case HANGCHECK_ACTIVE_SEQNO:
-		/* Gradually reduce the count so that we catch DoS
-		 * attempts across multiple batches.
+	case ENGINE_ACTIVE_HEAD:
+	case ENGINE_ACTIVE_SUBUNITS:
+		/* Seqno stuck with still active engine gets leeway,
+		 * in hopes that it is just a long shader.
 		 */
-		if (hc->score > 0)
-			hc->score -= 15;
-		if (hc->score < 0)
-			hc->score = 0;
-
-		/* Clear head and subunit states on seqno movement */
-		hc->acthd = 0;
+		timeout = I915_SEQNO_DEAD_TIMEOUT;
+		break;
 
-		memset(&engine->hangcheck.instdone, 0,
-		       sizeof(engine->hangcheck.instdone));
+	case ENGINE_DEAD:
 		break;
 
 	default:
 		MISSING_CASE(hc->action);
 	}
+
+	last_retire = READ_ONCE(engine->timeline->last_retire_timestamp);
+
+	/* If there is retire on the timeline, take that into account */
+	if (time_after(last_retire, engine->hangcheck.action_timestamp))
+		engine->hangcheck.action_timestamp = last_retire;
+
+	hc->stall = time_after(jiffies,
+			       engine->hangcheck.action_timestamp + timeout);
 }
 
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
@@ -454,9 +457,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		hangcheck_accumulate_sample(engine, hc);
 		hangcheck_store_sample(engine, hc);
 
-		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+		if (engine->hangcheck.stall) {
 			hung |= intel_engine_flag(engine);
-			if (hc->action != HANGCHECK_HUNG)
+			if (hc->action != ENGINE_DEAD)
 				stuck |= intel_engine_flag(engine);
 		}
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3152b2b..d2d870b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -65,16 +65,37 @@ struct intel_hw_status_page {
 	 GEN8_SEMAPHORE_OFFSET(from, (__ring)->id))
 
 enum intel_engine_hangcheck_action {
-	HANGCHECK_IDLE = 0,
-	HANGCHECK_WAIT,
-	HANGCHECK_ACTIVE_SEQNO,
-	HANGCHECK_ACTIVE_HEAD,
-	HANGCHECK_ACTIVE_SUBUNITS,
-	HANGCHECK_KICK,
-	HANGCHECK_HUNG,
+	ENGINE_IDLE = 0,
+	ENGINE_WAIT,
+	ENGINE_ACTIVE_SEQNO,
+	ENGINE_ACTIVE_HEAD,
+	ENGINE_ACTIVE_SUBUNITS,
+	ENGINE_WAIT_KICK,
+	ENGINE_DEAD,
 };
 
-#define HANGCHECK_SCORE_RING_HUNG 31
+static inline const char *
+hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
+{
+	switch (a) {
+	case ENGINE_IDLE:
+		return "idle";
+	case ENGINE_WAIT:
+		return "wait";
+	case ENGINE_ACTIVE_SEQNO:
+		return "active seqno";
+	case ENGINE_ACTIVE_HEAD:
+		return "active head";
+	case ENGINE_ACTIVE_SUBUNITS:
+		return "active subunits";
+	case ENGINE_WAIT_KICK:
+		return "wait kick";
+	case ENGINE_DEAD:
+		return "dead";
+	}
+
+	return "unknown";
+}
 
 #define I915_MAX_SLICES	3
 #define I915_MAX_SUBSLICES 3
@@ -106,10 +127,11 @@ struct intel_instdone {
 struct intel_engine_hangcheck {
 	u64 acthd;
 	u32 seqno;
-	int score;
 	enum intel_engine_hangcheck_action action;
+	unsigned long action_timestamp;
 	int deadlock;
 	struct intel_instdone instdone;
+	bool stall;
 };
 
 struct intel_ring {
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 3/6] drm/i915: Use request retirement as context progress
  2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
  2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
@ 2016-11-16 15:20 ` Mika Kuoppala
  2016-11-16 17:08   ` Chris Wilson
  2016-11-16 15:20 ` [PATCH 4/6] drm/i915: Add bannable context parameter Mika Kuoppala
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw)
  To: intel-gfx

As hangcheck score was removed, the active decay of score
was removed also. This removed feature for hangcheck to detect
if the gpu client was accidentally or maliciously causing intermittent
hangs. Reinstate the scoring as a per context property, so that if
one context starts to act unfavourably, ban it.

v2: ban_period_secs as a gate to score check (Chris)
v3: decay in proper spot. scores as tunables (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  5 ++++
 drivers/gpu/drm/i915/i915_gem.c         | 44 ++++++++++++++++++++++-----------
 drivers/gpu/drm/i915/i915_gem_request.c |  4 +++
 3 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4562a39..9f24957 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -914,6 +914,11 @@ struct i915_ctx_hang_stats {
 
 	/* This context is banned to submit more work */
 	bool banned;
+
+#define CONTEXT_SCORE_GUILTY		10
+#define CONTEXT_SCORE_BAN_THRESHOLD	40
+	/* Accumulated score of hangs caused by this context */
+	int ban_score;
 };
 
 /* This must match up with the value previously used for execbuf2.rsvd1. */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ae2a219..5948f09 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2620,33 +2620,45 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 
 static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 {
+	const struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
 	unsigned long elapsed;
 
-	if (ctx->hang_stats.banned)
+	if (hs->banned)
 		return true;
 
-	elapsed = get_seconds() - ctx->hang_stats.guilty_ts;
-	if (ctx->hang_stats.ban_period_seconds &&
-	    elapsed <= ctx->hang_stats.ban_period_seconds) {
+	if (!hs->ban_period_seconds)
+		return false;
+
+	elapsed = get_seconds() - hs->guilty_ts;
+	if (elapsed <= hs->ban_period_seconds) {
 		DRM_DEBUG("context hanging too fast, banning!\n");
 		return true;
 	}
 
+	if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) {
+		DRM_DEBUG("context hanging too often, banning!\n");
+		return true;
+	}
+
 	return false;
 }
 
-static void i915_set_reset_status(struct i915_gem_context *ctx,
-				  const bool guilty)
+static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 {
 	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
 
-	if (guilty) {
-		hs->banned = i915_context_is_banned(ctx);
-		hs->batch_active++;
-		hs->guilty_ts = get_seconds();
-	} else {
-		hs->batch_pending++;
-	}
+	hs->ban_score += CONTEXT_SCORE_GUILTY;
+
+	hs->banned = i915_context_is_banned(ctx);
+	hs->batch_active++;
+	hs->guilty_ts = get_seconds();
+}
+
+static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
+{
+	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
+
+	hs->batch_pending++;
 }
 
 struct drm_i915_gem_request *
@@ -2714,7 +2726,11 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 		ring_hung = false;
 	}
 
-	i915_set_reset_status(request->ctx, ring_hung);
+	if (ring_hung)
+		i915_gem_context_mark_guilty(request->ctx);
+	else
+		i915_gem_context_mark_innocent(request->ctx);
+
 	if (!ring_hung)
 		return;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index b9b5253..b31d18e 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -255,6 +255,10 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 					       request->engine);
 	}
 
+	/* Retirement decays the ban score as it is a sign of ctx progress */
+	if (request->ctx->hang_stats.ban_score > 0)
+		request->ctx->hang_stats.ban_score--;
+
 	i915_gem_context_put(request->ctx);
 
 	dma_fence_signal(&request->fence);
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 4/6] drm/i915: Add bannable context parameter
  2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
  2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
  2016-11-16 15:20 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala
@ 2016-11-16 15:20 ` Mika Kuoppala
  2016-11-16 15:20 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw)
  To: intel-gfx

Now when driver has per context scoring of 'hanging badness'
and also subsequent hangs during short windows are allowed,
if there is progress made in between, it does not make sense
to expose a ban timing window as a context parameter anymore.

Let the scoring be the sole indicator for ban policy and substitute
ban period context parameter as a boolean to get/set context
bannable property.

v2: allow non root to opt into being banned (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         | 13 +++----------
 drivers/gpu/drm/i915/i915_gem.c         | 10 +---------
 drivers/gpu/drm/i915/i915_gem_context.c | 23 ++++++++++++++---------
 drivers/gpu/drm/i915/i915_gpu_error.c   |  5 +++--
 include/uapi/drm/i915_drm.h             |  1 +
 5 files changed, 22 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9f24957..74f421a1 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -845,6 +845,7 @@ struct drm_i915_error_state {
 			long jiffies;
 			pid_t pid;
 			u32 context;
+			int ban_score;
 			u32 seqno;
 			u32 head;
 			u32 tail;
@@ -904,16 +905,10 @@ struct i915_ctx_hang_stats {
 	/* This context had batch active when hang was declared */
 	unsigned batch_active;
 
-	/* Time when this context was last blamed for a GPU reset */
-	unsigned long guilty_ts;
-
-	/* If the contexts causes a second GPU hang within this time,
-	 * it is permanently banned from submitting any more work.
-	 */
-	unsigned long ban_period_seconds;
+	bool bannable:1;
 
 	/* This context is banned to submit more work */
-	bool banned;
+	bool banned:1;
 
 #define CONTEXT_SCORE_GUILTY		10
 #define CONTEXT_SCORE_BAN_THRESHOLD	40
@@ -1454,8 +1449,6 @@ struct i915_gpu_error {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
 #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
-	/* Hang gpu twice in this window and your context gets banned */
-#define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
 
 	struct delayed_work hangcheck_work;
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5948f09..c48e0d2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2621,20 +2621,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 {
 	const struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
-	unsigned long elapsed;
 
 	if (hs->banned)
 		return true;
 
-	if (!hs->ban_period_seconds)
+	if (!hs->bannable)
 		return false;
 
-	elapsed = get_seconds() - hs->guilty_ts;
-	if (elapsed <= hs->ban_period_seconds) {
-		DRM_DEBUG("context hanging too fast, banning!\n");
-		return true;
-	}
-
 	if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) {
 		DRM_DEBUG("context hanging too often, banning!\n");
 		return true;
@@ -2651,7 +2644,6 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 
 	hs->banned = i915_context_is_banned(ctx);
 	hs->batch_active++;
-	hs->guilty_ts = get_seconds();
 }
 
 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 1f94b8d..bc1b232 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev,
 	 * is no remap info, it will be a NOP. */
 	ctx->remap_slice = ALL_L3_SLICES(dev_priv);
 
-	ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD;
+	ctx->hang_stats.bannable = true;
 	ctx->ring_size = 4 * PAGE_SIZE;
 	ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) <<
 			     GEN8_CTX_ADDRESSING_MODE_SHIFT;
@@ -1085,7 +1085,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	args->size = 0;
 	switch (args->param) {
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
-		args->value = ctx->hang_stats.ban_period_seconds;
+		ret = -EINVAL;
 		break;
 	case I915_CONTEXT_PARAM_NO_ZEROMAP:
 		args->value = ctx->flags & CONTEXT_NO_ZEROMAP;
@@ -1101,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
 		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
 		break;
+	case I915_CONTEXT_PARAM_BANNABLE:
+		args->value = ctx->hang_stats.bannable;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1130,13 +1133,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 
 	switch (args->param) {
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
-		if (args->size)
-			ret = -EINVAL;
-		else if (args->value < ctx->hang_stats.ban_period_seconds &&
-			 !capable(CAP_SYS_ADMIN))
-			ret = -EPERM;
-		else
-			ctx->hang_stats.ban_period_seconds = args->value;
+		ret = -EINVAL;
 		break;
 	case I915_CONTEXT_PARAM_NO_ZEROMAP:
 		if (args->size) {
@@ -1156,6 +1153,14 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
 		}
 		break;
+	case I915_CONTEXT_PARAM_BANNABLE:
+		if (args->size)
+			ret = -EINVAL;
+		else if (!capable(CAP_SYS_ADMIN) && !args->value)
+			ret = -EPERM;
+		else
+			ctx->hang_stats.bannable = args->value;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 3eee6c2..d52453e 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -352,8 +352,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
 	if (!erq->seqno)
 		return;
 
-	err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
-		   prefix, erq->pid,
+	err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
+		   prefix, erq->pid, erq->ban_score,
 		   erq->context, erq->seqno,
 		   jiffies_to_msecs(jiffies - erq->jiffies),
 		   erq->head, erq->tail);
@@ -1171,6 +1171,7 @@ static void record_request(struct drm_i915_gem_request *request,
 			   struct drm_i915_error_request *erq)
 {
 	erq->context = request->ctx->hw_id;
+	erq->ban_score = request->ctx->hang_stats.ban_score;
 	erq->seqno = request->global_seqno;
 	erq->jiffies = request->emitted_jiffies;
 	erq->head = request->head;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1c12a35..12003f0 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1224,6 +1224,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
+#define I915_CONTEXT_PARAM_BANNABLE	0x5
 	__u64 value;
 };
 
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 5/6] drm/i915: Add per client max context ban limit
  2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
                   ` (2 preceding siblings ...)
  2016-11-16 15:20 ` [PATCH 4/6] drm/i915: Add bannable context parameter Mika Kuoppala
@ 2016-11-16 15:20 ` Mika Kuoppala
  2016-11-16 17:12   ` Chris Wilson
  2016-11-18 13:10   ` Mika Kuoppala
  2016-11-16 15:20 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw)
  To: intel-gfx

If we have a bad client submitting unfavourably across different
contexts, creating new ones, the per context scoring of badness
doesn't remove the root cause, the offending client.
To counter, keep track of per client context bans. Deny access if
client is responsible for more than 3 context bans in
it's lifetime.

v2: move ban check to context create ioctl (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  4 ++++
 drivers/gpu/drm/i915/i915_gem.c         | 14 ++++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c   | 10 ++++++----
 4 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 74f421a1..b385b9d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -411,6 +411,9 @@ struct drm_i915_file_private {
 	} rps;
 
 	unsigned int bsd_engine;
+
+#define I915_MAX_CLIENT_CONTEXT_BANS 3
+	int context_bans;
 };
 
 /* Used by dp and fdi links */
@@ -867,6 +870,7 @@ struct drm_i915_error_state {
 
 		pid_t pid;
 		char comm[TASK_COMM_LEN];
+		int context_bans;
 	} engine[I915_NUM_ENGINES];
 
 	struct drm_i915_error_buffer {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c48e0d2..a24fd99 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2644,6 +2644,20 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 
 	hs->banned = i915_context_is_banned(ctx);
 	hs->batch_active++;
+
+	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
+			 ctx->name, hs->ban_score, yesno(hs->banned));
+
+	if (!ctx->file_priv)
+		return;
+
+	if (hs->banned) {
+		ctx->file_priv->context_bans++;
+
+		DRM_DEBUG_DRIVER("client %s has has %d context banned\n",
+				 ctx->name,
+				 ctx->file_priv->context_bans);
+	}
 }
 
 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index bc1b232..4f88f03 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1003,6 +1003,11 @@ static bool contexts_enabled(struct drm_device *dev)
 	return i915.enable_execlists || to_i915(dev)->hw_context_size;
 }
 
+static bool client_is_banned(struct drm_i915_file_private *file_priv)
+{
+	return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
+}
+
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file)
 {
@@ -1017,6 +1022,14 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 	if (args->pad != 0)
 		return -EINVAL;
 
+	if (client_is_banned(file_priv)) {
+		DRM_DEBUG("client %s[%d] banned from creating ctx\n",
+			  current->comm,
+			  pid_nr(get_task_pid(current, PIDTYPE_PID)));
+
+		return -EIO;
+	}
+
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index d52453e..5990b97 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -539,10 +539,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		if (error->engine[i].hangcheck_stall &&
 		    error->engine[i].pid != -1) {
-			err_printf(m, "Active process (on ring %s): %s [%d]\n",
+			err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
 				   engine_str(i),
 				   error->engine[i].comm,
-				   error->engine[i].pid);
+				   error->engine[i].pid,
+				   error->engine[i].context_bans);
 		}
 	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
@@ -633,9 +634,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		if (obj) {
 			err_puts(m, dev_priv->engine[i]->name);
 			if (ee->pid != -1)
-				err_printf(m, " (submitted by %s [%d])",
+				err_printf(m, " (submitted by %s [%d], bans %d)",
 					   ee->comm,
-					   ee->pid);
+					   ee->pid,
+					   ee->context_bans);
 			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
 				   upper_32_bits(obj->gtt_offset),
 				   lower_32_bits(obj->gtt_offset));
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct
  2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
                   ` (3 preceding siblings ...)
  2016-11-16 15:20 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala
@ 2016-11-16 15:20 ` Mika Kuoppala
  2016-11-16 17:13   ` Chris Wilson
  2016-11-16 16:16 ` ✗ Fi.CI.BAT: warning for series starting with [1/6] drm/i915: Split up hangcheck phases Patchwork
  2016-11-18 14:15 ` ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases (rev3) Patchwork
  6 siblings, 1 reply; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw)
  To: intel-gfx

Bannable property, banned status, guilty and active counts are
properties of i915_gem_context. Make them so.

v2: rebase

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            | 31 ++++++++++--------------------
 drivers/gpu/drm/i915/i915_gem.c            | 25 ++++++++++--------------
 drivers/gpu/drm/i915/i915_gem_context.c    | 12 +++++-------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  4 +---
 drivers/gpu/drm/i915/i915_gem_request.c    |  4 ++--
 drivers/gpu/drm/i915/i915_gpu_error.c      |  2 +-
 6 files changed, 29 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b385b9d..8bf95e9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -902,25 +902,6 @@ enum i915_cache_level {
 	I915_CACHE_WT, /* hsw:gt3e WriteThrough for scanouts */
 };
 
-struct i915_ctx_hang_stats {
-	/* This context had batch pending when hang was declared */
-	unsigned batch_pending;
-
-	/* This context had batch active when hang was declared */
-	unsigned batch_active;
-
-	bool bannable:1;
-
-	/* This context is banned to submit more work */
-	bool banned:1;
-
-#define CONTEXT_SCORE_GUILTY		10
-#define CONTEXT_SCORE_BAN_THRESHOLD	40
-	/* Accumulated score of hangs caused by this context */
-	int ban_score;
-};
-
-/* This must match up with the value previously used for execbuf2.rsvd1. */
 #define DEFAULT_CONTEXT_HANDLE 0
 
 /**
@@ -950,8 +931,6 @@ struct i915_gem_context {
 	struct pid *pid;
 	const char *name;
 
-	struct i915_ctx_hang_stats hang_stats;
-
 	unsigned long flags;
 #define CONTEXT_NO_ZEROMAP		BIT(0)
 #define CONTEXT_NO_ERROR_CAPTURE	BIT(1)
@@ -980,6 +959,16 @@ struct i915_gem_context {
 
 	u8 remap_slice;
 	bool closed:1;
+	bool bannable:1;
+	bool banned:1;
+
+	unsigned int guilty_count; /* guilty of a hang */
+	unsigned int active_count; /* active during hang */
+
+#define CONTEXT_SCORE_GUILTY		10
+#define CONTEXT_SCORE_BAN_THRESHOLD	40
+	/* Accumulated score of hangs caused by this context */
+	int ban_score;
 };
 
 enum fb_op_origin {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a24fd99..b3324a5 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2620,15 +2620,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 
 static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 {
-	const struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
-
-	if (hs->banned)
+	if (ctx->banned)
 		return true;
 
-	if (!hs->bannable)
+	if (!ctx->bannable)
 		return false;
 
-	if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) {
+	if (ctx->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) {
 		DRM_DEBUG("context hanging too often, banning!\n");
 		return true;
 	}
@@ -2638,20 +2636,19 @@ static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 
 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 {
-	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
-
-	hs->ban_score += CONTEXT_SCORE_GUILTY;
+	ctx->ban_score += CONTEXT_SCORE_GUILTY;
 
-	hs->banned = i915_context_is_banned(ctx);
-	hs->batch_active++;
+	ctx->banned = i915_context_is_banned(ctx);
+	ctx->guilty_count++;
 
 	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
-			 ctx->name, hs->ban_score, yesno(hs->banned));
+			 ctx->name, ctx->ban_score,
+			 yesno(ctx->banned));
 
 	if (!ctx->file_priv)
 		return;
 
-	if (hs->banned) {
+	if (ctx->banned) {
 		ctx->file_priv->context_bans++;
 
 		DRM_DEBUG_DRIVER("client %s has has %d context banned\n",
@@ -2662,9 +2659,7 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 
 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
 {
-	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
-
-	hs->batch_pending++;
+	ctx->active_count++;
 }
 
 struct drm_i915_gem_request *
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 4f88f03..1249b2c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev,
 	 * is no remap info, it will be a NOP. */
 	ctx->remap_slice = ALL_L3_SLICES(dev_priv);
 
-	ctx->hang_stats.bannable = true;
+	ctx->bannable = true;
 	ctx->ring_size = 4 * PAGE_SIZE;
 	ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) <<
 			     GEN8_CTX_ADDRESSING_MODE_SHIFT;
@@ -1115,7 +1115,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
 		break;
 	case I915_CONTEXT_PARAM_BANNABLE:
-		args->value = ctx->hang_stats.bannable;
+		args->value = ctx->bannable;
 		break;
 	default:
 		ret = -EINVAL;
@@ -1172,7 +1172,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 		else if (!capable(CAP_SYS_ADMIN) && !args->value)
 			ret = -EPERM;
 		else
-			ctx->hang_stats.bannable = args->value;
+			ctx->bannable = args->value;
 		break;
 	default:
 		ret = -EINVAL;
@@ -1188,7 +1188,6 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct drm_i915_reset_stats *args = data;
-	struct i915_ctx_hang_stats *hs;
 	struct i915_gem_context *ctx;
 	int ret;
 
@@ -1207,15 +1206,14 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
 		mutex_unlock(&dev->struct_mutex);
 		return PTR_ERR(ctx);
 	}
-	hs = &ctx->hang_stats;
 
 	if (capable(CAP_SYS_ADMIN))
 		args->reset_count = i915_reset_count(&dev_priv->gpu_error);
 	else
 		args->reset_count = 0;
 
-	args->batch_active = hs->batch_active;
-	args->batch_pending = hs->batch_pending;
+	args->batch_active = ctx->guilty_count;
+	args->batch_pending = ctx->active_count;
 
 	mutex_unlock(&dev->struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index e804cb2..c0c24f1 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1232,14 +1232,12 @@ i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
 			  struct intel_engine_cs *engine, const u32 ctx_id)
 {
 	struct i915_gem_context *ctx;
-	struct i915_ctx_hang_stats *hs;
 
 	ctx = i915_gem_context_lookup(file->driver_priv, ctx_id);
 	if (IS_ERR(ctx))
 		return ctx;
 
-	hs = &ctx->hang_stats;
-	if (hs->banned) {
+	if (ctx->banned) {
 		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
 		return ERR_PTR(-EIO);
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index b31d18e..b7cd017 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -256,8 +256,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	}
 
 	/* Retirement decays the ban score as it is a sign of ctx progress */
-	if (request->ctx->hang_stats.ban_score > 0)
-		request->ctx->hang_stats.ban_score--;
+	if (request->ctx->ban_score > 0)
+		request->ctx->ban_score--;
 
 	i915_gem_context_put(request->ctx);
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5990b97..3e7489a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1173,7 +1173,7 @@ static void record_request(struct drm_i915_gem_request *request,
 			   struct drm_i915_error_request *erq)
 {
 	erq->context = request->ctx->hw_id;
-	erq->ban_score = request->ctx->hang_stats.ban_score;
+	erq->ban_score = request->ctx->ban_score;
 	erq->seqno = request->global_seqno;
 	erq->jiffies = request->emitted_jiffies;
 	erq->head = request->head;
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* ✗ Fi.CI.BAT: warning for series starting with [1/6] drm/i915: Split up hangcheck phases
  2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
                   ` (4 preceding siblings ...)
  2016-11-16 15:20 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala
@ 2016-11-16 16:16 ` Patchwork
  2016-11-18 14:15 ` ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases (rev3) Patchwork
  6 siblings, 0 replies; 15+ messages in thread
From: Patchwork @ 2016-11-16 16:16 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/6] drm/i915: Split up hangcheck phases
URL   : https://patchwork.freedesktop.org/series/15423/
State : warning

== Summary ==

Series 15423v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/15423/revisions/1/mbox/

Test drv_module_reload_basic:
                pass       -> DMESG-WARN (fi-skl-6770hq)

fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15 
fi-bsw-n3050     total:244  pass:204  dwarn:0   dfail:0   fail:0   skip:40 
fi-bxt-t5700     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53 
fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-kbl-7200u     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
fi-skl-6700hq    total:244  pass:223  dwarn:0   dfail:0   fail:0   skip:21 
fi-skl-6700k     total:244  pass:222  dwarn:1   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:244  pass:229  dwarn:1   dfail:0   fail:0   skip:14 
fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33 

6294f67611ebe69006c0e85c372efadcac8e9d66 drm-intel-nightly: 2016y-11m-16d-09h-57m-25s UTC integration manifest
7bb86db drm/i915: Wipe hang stats as an embedded struct
73fd54a drm/i915: Add per client max context ban limit
85bae87 drm/i915: Add bannable context parameter
2ae4b63 drm/i915: Use request retirement as context progress
80ee7bc drm/i915: Decouple hang detection from hangcheck period
11ed92d drm/i915: Split up hangcheck phases

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3024/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period
  2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
@ 2016-11-16 17:05   ` Chris Wilson
  2016-11-18 13:09   ` Mika Kuoppala
  1 sibling, 0 replies; 15+ messages in thread
From: Chris Wilson @ 2016-11-16 17:05 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Wed, Nov 16, 2016 at 05:20:30PM +0200, Mika Kuoppala wrote:
> -	ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
> -	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine))
> +	ring_hung = engine->hangcheck.stall;
> +	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) {
> +		if (ring_hung)
> +			DRM_ERROR("%s pardoned due to progress after hangcheck %x vs %x\n",
> +				  engine->name,
> +				  engine->hangcheck.seqno,
> +				  intel_engine_get_seqno(engine));
> +

Is this worth alarming the user over? We recover gracefully either way.

DRM_DEBUG_DRIVER("pardoned, was guilty? %s\n", yesno(ring_hung));

>  		ring_hung = false;
> +	}
>  
>  	i915_set_reset_status(request->ctx, ring_hung);
>  	if (!ring_hung)
> diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c
> index bf8a471..348b0f2 100644
> --- a/drivers/gpu/drm/i915/i915_gem_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_gem_timeline.c
> @@ -24,6 +24,15 @@
>  
>  #include "i915_drv.h"
>  
> +static void i915_gem_timeline_retire(struct i915_gem_active *active,
> +				     struct drm_i915_gem_request *request)
> +{
> +	struct intel_timeline *tl =
> +		container_of(active, typeof(*tl), last_request);
> +
> +	tl->last_retire_timestamp = jiffies;

Make this a separate patch. last_retired_timestamp to match
last_submitted_seqno. Maybe worth adding last_submitted_timestamp as
well for debug purposes.

I guess we are missing a debugfs/i915_gem_timelines

Some grumbling, but nothing stands out in this patch.

So with the couple of points above,
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/6] drm/i915: Use request retirement as context progress
  2016-11-16 15:20 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala
@ 2016-11-16 17:08   ` Chris Wilson
  0 siblings, 0 replies; 15+ messages in thread
From: Chris Wilson @ 2016-11-16 17:08 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Wed, Nov 16, 2016 at 05:20:31PM +0200, Mika Kuoppala wrote:
> As hangcheck score was removed, the active decay of score
> was removed also. This removed feature for hangcheck to detect
> if the gpu client was accidentally or maliciously causing intermittent
> hangs. Reinstate the scoring as a per context property, so that if
> one context starts to act unfavourably, ban it.
> 
> v2: ban_period_secs as a gate to score check (Chris)
> v3: decay in proper spot. scores as tunables (Chris)
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 5/6] drm/i915: Add per client max context ban limit
  2016-11-16 15:20 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala
@ 2016-11-16 17:12   ` Chris Wilson
  2016-11-18 13:10   ` Mika Kuoppala
  1 sibling, 0 replies; 15+ messages in thread
From: Chris Wilson @ 2016-11-16 17:12 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Wed, Nov 16, 2016 at 05:20:33PM +0200, Mika Kuoppala wrote:
> If we have a bad client submitting unfavourably across different
> contexts, creating new ones, the per context scoring of badness
> doesn't remove the root cause, the offending client.
> To counter, keep track of per client context bans. Deny access if
> client is responsible for more than 3 context bans in
> it's lifetime.
> 
> v2: move ban check to context create ioctl (Chris)
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  4 ++++
>  drivers/gpu/drm/i915/i915_gem.c         | 14 ++++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++
>  drivers/gpu/drm/i915/i915_gpu_error.c   | 10 ++++++----
>  4 files changed, 37 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 74f421a1..b385b9d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -411,6 +411,9 @@ struct drm_i915_file_private {
>  	} rps;
>  
>  	unsigned int bsd_engine;
> +
> +#define I915_MAX_CLIENT_CONTEXT_BANS 3

I'd just like some commentary here giving some rationale for 3.

With that,
Reviewed-by: Chris Wilson <chris@chris-wilso.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct
  2016-11-16 15:20 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala
@ 2016-11-16 17:13   ` Chris Wilson
  2016-11-21 12:55     ` Mika Kuoppala
  0 siblings, 1 reply; 15+ messages in thread
From: Chris Wilson @ 2016-11-16 17:13 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Wed, Nov 16, 2016 at 05:20:34PM +0200, Mika Kuoppala wrote:
> Bannable property, banned status, guilty and active counts are
> properties of i915_gem_context. Make them so.
> 
> v2: rebase
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>

Been hesistating since the substruct might have helped, but after a few
reads, I'm happier with the conversion.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period
  2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
  2016-11-16 17:05   ` Chris Wilson
@ 2016-11-18 13:09   ` Mika Kuoppala
  1 sibling, 0 replies; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-18 13:09 UTC (permalink / raw)
  To: intel-gfx

Hangcheck state accumulation has gained more steps
along the years, like head movement and more recently the
subunit inactivity check. As the subunit sampling is only
done if the previous state check showed inactivity, we
have added more stages (and time) to reach a hang verdict.

Asymmetric engine states led to different actual weight of
'one hangcheck unit' and it was demonstrated in some
hangs that due to difference in stages, simpler engines
were accused falsely of a hang as their scoring was much
more quicker to accumulate above the hang treshold.

To completely decouple the hangcheck guilty score
from the hangcheck period, convert hangcheck score to a
rough period of inactivity measurement. As these are
tracked as jiffies, they are meaningful also across
reset boundaries. This makes finding a guilty engine
more accurate across multi engine activity scenarios,
especially across asymmetric engines.

We lose the ability to detect cross batch malicious attempts
to hinder the progress. Plan is to move this functionality
to be part of context banning which is more natural fit,
later in the series.

v2: use time_before macros (Chris)
    reinstate the pardoning of moving engine after hc (Chris)
v3: avoid global state for per engine stall detection (Chris)
v4: take timeline last retirement into account (Chris)
v5: do debug print on pardoning, split out retirement timestamp (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |  17 +++--
 drivers/gpu/drm/i915/i915_drv.h         |   6 +-
 drivers/gpu/drm/i915/i915_gem.c         |   8 ++-
 drivers/gpu/drm/i915/i915_gpu_error.c   |  46 ++++----------
 drivers/gpu/drm/i915/intel_hangcheck.c  | 108 +++++++++++++++-----------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  40 +++++++++---
 6 files changed, 117 insertions(+), 108 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index b7f42c4..d67079f 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1351,10 +1351,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
 			   engine->hangcheck.seqno, seqno[id],
 			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
+		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)));
+					  &dev_priv->gpu_error.missed_irq_rings)),
+			   yesno(engine->hangcheck.stalled));
+
 		spin_lock_irq(&b->lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
 			struct intel_wait *w = container_of(rb, typeof(*w), node);
@@ -1367,8 +1369,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
-		seq_printf(m, "\tscore = %d\n", engine->hangcheck.score);
-		seq_printf(m, "\taction = %d\n", engine->hangcheck.action);
+		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
+			   hangcheck_action_to_str(engine->hangcheck.action),
+			   engine->hangcheck.action,
+			   jiffies_to_msecs(jiffies -
+					    engine->hangcheck.action_timestamp));
 
 		if (engine->id == RCS) {
 			seq_puts(m, "\tinstdone read =\n");
@@ -3162,11 +3167,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 		u64 addr;
 
 		seq_printf(m, "%s\n", engine->name);
-		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [score %d]\n",
+		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
 			   intel_engine_get_seqno(engine),
 			   intel_engine_last_submit(engine),
 			   engine->hangcheck.seqno,
-			   engine->hangcheck.score);
+			   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
 
 		rcu_read_lock();
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index be67aee..7b0e5c7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -800,7 +800,8 @@ struct drm_i915_error_state {
 		/* Software tracked state */
 		bool waiting;
 		int num_waiters;
-		int hangcheck_score;
+		unsigned long hangcheck_timestamp;
+		bool hangcheck_stalled;
 		enum intel_engine_hangcheck_action hangcheck_action;
 		struct i915_address_space *vm;
 		int num_requests;
@@ -1446,6 +1447,9 @@ struct i915_error_state_file_priv {
 #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */
 #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */
 
+#define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* Seqno, head and subunits dead */
+#define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Seqno dead with active head */
+
 struct i915_gpu_error {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d5b7723..2ab57d2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2703,9 +2703,13 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 	if (!request)
 		return;
 
-	ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
-	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine))
+	ring_hung = engine->hangcheck.stalled;
+	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) {
+		DRM_DEBUG_DRIVER("%s pardoned, was guilty? %s\n",
+				 engine->name,
+				 yesno(ring_hung));
 		ring_hung = false;
+	}
 
 	i915_set_reset_status(request->ctx, ring_hung);
 	if (!ring_hung)
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 4bcf1a0..d5a4ec9 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -316,28 +316,6 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	}
 }
 
-static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
-{
-	switch (a) {
-	case HANGCHECK_IDLE:
-		return "idle";
-	case HANGCHECK_WAIT:
-		return "wait";
-	case HANGCHECK_ACTIVE_SEQNO:
-		return "active seqno";
-	case HANGCHECK_ACTIVE_HEAD:
-		return "active head";
-	case HANGCHECK_ACTIVE_SUBUNITS:
-		return "active subunits";
-	case HANGCHECK_KICK:
-		return "kick";
-	case HANGCHECK_HUNG:
-		return "hung";
-	}
-
-	return "unknown";
-}
-
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
 				 struct drm_i915_error_engine *ee)
 {
@@ -445,9 +423,13 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck: %s [%d]\n",
-		   hangcheck_action_to_str(ee->hangcheck_action),
-		   ee->hangcheck_score);
+	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
+	err_printf(m, "  hangcheck action: %s\n",
+		   hangcheck_action_to_str(ee->hangcheck_action));
+	err_printf(m, "  hangcheck action timestamp: %lu, %u ms ago\n",
+		   ee->hangcheck_timestamp,
+		   jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
+
 	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
 	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
 }
@@ -536,7 +518,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	struct pci_dev *pdev = dev_priv->drm.pdev;
 	struct drm_i915_error_state *error = error_priv->error;
 	struct drm_i915_error_object *obj;
-	int max_hangcheck_score;
 	int i, j;
 
 	if (!error) {
@@ -553,13 +534,9 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Uptime: %ld s %ld us\n",
 		   error->uptime.tv_sec, error->uptime.tv_usec);
 	err_print_capabilities(m, &error->device_info);
-	max_hangcheck_score = 0;
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score > max_hangcheck_score)
-			max_hangcheck_score = error->engine[i].hangcheck_score;
-	}
+
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score == max_hangcheck_score &&
+		if (error->engine[i].hangcheck_stalled &&
 		    error->engine[i].pid != -1) {
 			err_printf(m, "Active process (on ring %s): %s [%d]\n",
 				   engine_str(i),
@@ -945,7 +922,7 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
 	 * strictly a client bug. Use instdone to differentiate those some.
 	 */
 	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		if (error->engine[i].hangcheck_action == HANGCHECK_HUNG) {
+		if (error->engine[i].hangcheck_stalled) {
 			if (engine_id)
 				*engine_id = i;
 
@@ -1163,8 +1140,9 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
 		ee->hws = I915_READ(mmio);
 	}
 
-	ee->hangcheck_score = engine->hangcheck.score;
+	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->hangcheck_action = engine->hangcheck.action;
+	ee->hangcheck_stalled = engine->hangcheck.stalled;
 
 	if (USES_PPGTT(dev_priv)) {
 		int i;
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 3d2e81c..c03db02 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,13 +236,13 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
 		memset(&engine->hangcheck.instdone, 0,
 		       sizeof(engine->hangcheck.instdone));
 
-		return HANGCHECK_ACTIVE_HEAD;
+		return ENGINE_ACTIVE_HEAD;
 	}
 
 	if (!subunits_stuck(engine))
-		return HANGCHECK_ACTIVE_SUBUNITS;
+		return ENGINE_ACTIVE_SUBUNITS;
 
-	return HANGCHECK_HUNG;
+	return ENGINE_DEAD;
 }
 
 static enum intel_engine_hangcheck_action
@@ -253,11 +253,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	u32 tmp;
 
 	ha = head_stuck(engine, acthd);
-	if (ha != HANGCHECK_HUNG)
+	if (ha != ENGINE_DEAD)
 		return ha;
 
 	if (IS_GEN2(dev_priv))
-		return HANGCHECK_HUNG;
+		return ENGINE_DEAD;
 
 	/* Is the chip hanging on a WAIT_FOR_EVENT?
 	 * If so we can simply poke the RB_WAIT bit
@@ -270,25 +270,25 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 				  "Kicking stuck wait on %s",
 				  engine->name);
 		I915_WRITE_CTL(engine, tmp);
-		return HANGCHECK_KICK;
+		return ENGINE_WAIT_KICK;
 	}
 
 	if (INTEL_GEN(dev_priv) >= 6 && tmp & RING_WAIT_SEMAPHORE) {
 		switch (semaphore_passed(engine)) {
 		default:
-			return HANGCHECK_HUNG;
+			return ENGINE_DEAD;
 		case 1:
 			i915_handle_error(dev_priv, 0,
 					  "Kicking stuck semaphore on %s",
 					  engine->name);
 			I915_WRITE_CTL(engine, tmp);
-			return HANGCHECK_KICK;
+			return ENGINE_WAIT_KICK;
 		case 0:
-			return HANGCHECK_WAIT;
+			return ENGINE_WAIT;
 		}
 	}
 
-	return HANGCHECK_HUNG;
+	return ENGINE_DEAD;
 }
 
 static void hangcheck_load_sample(struct intel_engine_cs *engine,
@@ -306,7 +306,6 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
 
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_seqno(engine);
-	hc->score = engine->hangcheck.score;
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
@@ -314,8 +313,8 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
 {
 	engine->hangcheck.acthd = hc->acthd;
 	engine->hangcheck.seqno = hc->seqno;
-	engine->hangcheck.score = hc->score;
 	engine->hangcheck.action = hc->action;
+	engine->hangcheck.stalled = hc->stalled;
 }
 
 static enum intel_engine_hangcheck_action
@@ -323,10 +322,10 @@ hangcheck_get_action(struct intel_engine_cs *engine,
 		     const struct intel_engine_hangcheck *hc)
 {
 	if (engine->hangcheck.seqno != hc->seqno)
-		return HANGCHECK_ACTIVE_SEQNO;
+		return ENGINE_ACTIVE_SEQNO;
 
 	if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
-		return HANGCHECK_IDLE;
+		return ENGINE_IDLE;
 
 	return engine_stuck(engine, hc->acthd);
 }
@@ -334,60 +333,57 @@ hangcheck_get_action(struct intel_engine_cs *engine,
 static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
 					struct intel_engine_hangcheck *hc)
 {
+	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
+
 	hc->action = hangcheck_get_action(engine, hc);
 
-	switch (hc->action) {
-	case HANGCHECK_IDLE:
-	case HANGCHECK_WAIT:
-		break;
+	/* We always increment the progress
+	 * if the engine is busy and still processing
+	 * the same request, so that no single request
+	 * can run indefinitely (such as a chain of
+	 * batches). The only time we do not increment
+	 * the hangcheck score on this ring, if this
+	 * engine is in a legitimate wait for another
+	 * engine. In that case the waiting engine is a
+	 * victim and we want to be sure we catch the
+	 * right culprit. Then every time we do kick
+	 * the ring, make it as a progress as the seqno
+	 * advancement might ensure and if not, it
+	 * will catch the hanging engine.
+	 */
 
-	case HANGCHECK_ACTIVE_HEAD:
-	case HANGCHECK_ACTIVE_SUBUNITS:
-		/* We always increment the hangcheck score
-		 * if the engine is busy and still processing
-		 * the same request, so that no single request
-		 * can run indefinitely (such as a chain of
-		 * batches). The only time we do not increment
-		 * the hangcheck score on this ring, if this
-		 * engine is in a legitimate wait for another
-		 * engine. In that case the waiting engine is a
-		 * victim and we want to be sure we catch the
-		 * right culprit. Then every time we do kick
-		 * the ring, add a small increment to the
-		 * score so that we can catch a batch that is
-		 * being repeatedly kicked and so responsible
-		 * for stalling the machine.
-		 */
-		hc->score += 1;
-		break;
+	switch (hc->action) {
+	case ENGINE_IDLE:
+	case ENGINE_ACTIVE_SEQNO:
+		/* Clear head and subunit states on seqno movement */
+		hc->acthd = 0;
 
-	case HANGCHECK_KICK:
-		hc->score += 5;
-		break;
+		memset(&engine->hangcheck.instdone, 0,
+		       sizeof(engine->hangcheck.instdone));
 
-	case HANGCHECK_HUNG:
-		hc->score += 20;
+		/* Intentional fall through */
+	case ENGINE_WAIT_KICK:
+	case ENGINE_WAIT:
+		engine->hangcheck.action_timestamp = jiffies;
 		break;
 
-	case HANGCHECK_ACTIVE_SEQNO:
-		/* Gradually reduce the count so that we catch DoS
-		 * attempts across multiple batches.
+	case ENGINE_ACTIVE_HEAD:
+	case ENGINE_ACTIVE_SUBUNITS:
+		/* Seqno stuck with still active engine gets leeway,
+		 * in hopes that it is just a long shader.
 		 */
-		if (hc->score > 0)
-			hc->score -= 15;
-		if (hc->score < 0)
-			hc->score = 0;
-
-		/* Clear head and subunit states on seqno movement */
-		hc->acthd = 0;
+		timeout = I915_SEQNO_DEAD_TIMEOUT;
+		break;
 
-		memset(&engine->hangcheck.instdone, 0,
-		       sizeof(engine->hangcheck.instdone));
+	case ENGINE_DEAD:
 		break;
 
 	default:
 		MISSING_CASE(hc->action);
 	}
+
+	hc->stalled = time_after(jiffies,
+				 engine->hangcheck.action_timestamp + timeout);
 }
 
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
@@ -454,9 +450,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		hangcheck_accumulate_sample(engine, hc);
 		hangcheck_store_sample(engine, hc);
 
-		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+		if (engine->hangcheck.stalled) {
 			hung |= intel_engine_flag(engine);
-			if (hc->action != HANGCHECK_HUNG)
+			if (hc->action != ENGINE_DEAD)
 				stuck |= intel_engine_flag(engine);
 		}
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3152b2b..3f43ade 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -65,16 +65,37 @@ struct intel_hw_status_page {
 	 GEN8_SEMAPHORE_OFFSET(from, (__ring)->id))
 
 enum intel_engine_hangcheck_action {
-	HANGCHECK_IDLE = 0,
-	HANGCHECK_WAIT,
-	HANGCHECK_ACTIVE_SEQNO,
-	HANGCHECK_ACTIVE_HEAD,
-	HANGCHECK_ACTIVE_SUBUNITS,
-	HANGCHECK_KICK,
-	HANGCHECK_HUNG,
+	ENGINE_IDLE = 0,
+	ENGINE_WAIT,
+	ENGINE_ACTIVE_SEQNO,
+	ENGINE_ACTIVE_HEAD,
+	ENGINE_ACTIVE_SUBUNITS,
+	ENGINE_WAIT_KICK,
+	ENGINE_DEAD,
 };
 
-#define HANGCHECK_SCORE_RING_HUNG 31
+static inline const char *
+hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
+{
+	switch (a) {
+	case ENGINE_IDLE:
+		return "idle";
+	case ENGINE_WAIT:
+		return "wait";
+	case ENGINE_ACTIVE_SEQNO:
+		return "active seqno";
+	case ENGINE_ACTIVE_HEAD:
+		return "active head";
+	case ENGINE_ACTIVE_SUBUNITS:
+		return "active subunits";
+	case ENGINE_WAIT_KICK:
+		return "wait kick";
+	case ENGINE_DEAD:
+		return "dead";
+	}
+
+	return "unknown";
+}
 
 #define I915_MAX_SLICES	3
 #define I915_MAX_SUBSLICES 3
@@ -106,10 +127,11 @@ struct intel_instdone {
 struct intel_engine_hangcheck {
 	u64 acthd;
 	u32 seqno;
-	int score;
 	enum intel_engine_hangcheck_action action;
+	unsigned long action_timestamp;
 	int deadlock;
 	struct intel_instdone instdone;
+	bool stalled;
 };
 
 struct intel_ring {
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 5/6] drm/i915: Add per client max context ban limit
  2016-11-16 15:20 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala
  2016-11-16 17:12   ` Chris Wilson
@ 2016-11-18 13:10   ` Mika Kuoppala
  1 sibling, 0 replies; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-18 13:10 UTC (permalink / raw)
  To: intel-gfx

If we have a bad client submitting unfavourably across different
contexts, creating new ones, the per context scoring of badness
doesn't remove the root cause, the offending client.
To counter, keep track of per client context bans. Deny access if
client is responsible for more than 3 context bans in
it's lifetime.

v2: move ban check to context create ioctl (Chris)
v3: add commentary about hangs needed to reach client ban (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         | 10 ++++++++++
 drivers/gpu/drm/i915/i915_gem.c         | 14 ++++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c   | 10 ++++++----
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 837abdf..304e4c9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -416,6 +416,15 @@ struct drm_i915_file_private {
 	} rps;
 
 	unsigned int bsd_engine;
+
+/* Client can have a maximum of 3 contexts banned before
+ * it is denied of creating new contexts. As one context
+ * ban needs 4 consecutive hangs, and more if there is
+ * progress in between, this is a last resort stop gap measure
+ * to limit the badly behaving clients access to gpu.
+ */
+#define I915_MAX_CLIENT_CONTEXT_BANS 3
+	int context_bans;
 };
 
 /* Used by dp and fdi links */
@@ -872,6 +881,7 @@ struct drm_i915_error_state {
 
 		pid_t pid;
 		char comm[TASK_COMM_LEN];
+		int context_bans;
 	} engine[I915_NUM_ENGINES];
 
 	struct drm_i915_error_buffer {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 734b270..dd015bc 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2644,6 +2644,20 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 
 	hs->banned = i915_context_is_banned(ctx);
 	hs->batch_active++;
+
+	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
+			 ctx->name, hs->ban_score, yesno(hs->banned));
+
+	if (!ctx->file_priv)
+		return;
+
+	if (hs->banned) {
+		ctx->file_priv->context_bans++;
+
+		DRM_DEBUG_DRIVER("client %s has has %d context banned\n",
+				 ctx->name,
+				 ctx->file_priv->context_bans);
+	}
 }
 
 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index bc1b232..4f88f03 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1003,6 +1003,11 @@ static bool contexts_enabled(struct drm_device *dev)
 	return i915.enable_execlists || to_i915(dev)->hw_context_size;
 }
 
+static bool client_is_banned(struct drm_i915_file_private *file_priv)
+{
+	return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
+}
+
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file)
 {
@@ -1017,6 +1022,14 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 	if (args->pad != 0)
 		return -EINVAL;
 
+	if (client_is_banned(file_priv)) {
+		DRM_DEBUG("client %s[%d] banned from creating ctx\n",
+			  current->comm,
+			  pid_nr(get_task_pid(current, PIDTYPE_PID)));
+
+		return -EIO;
+	}
+
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index fa988a0..af4f0ef 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -538,10 +538,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		if (error->engine[i].hangcheck_stalled &&
 		    error->engine[i].pid != -1) {
-			err_printf(m, "Active process (on ring %s): %s [%d]\n",
+			err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
 				   engine_str(i),
 				   error->engine[i].comm,
-				   error->engine[i].pid);
+				   error->engine[i].pid,
+				   error->engine[i].context_bans);
 		}
 	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
@@ -632,9 +633,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		if (obj) {
 			err_puts(m, dev_priv->engine[i]->name);
 			if (ee->pid != -1)
-				err_printf(m, " (submitted by %s [%d])",
+				err_printf(m, " (submitted by %s [%d], bans %d)",
 					   ee->comm,
-					   ee->pid);
+					   ee->pid,
+					   ee->context_bans);
 			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
 				   upper_32_bits(obj->gtt_offset),
 				   lower_32_bits(obj->gtt_offset));
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases (rev3)
  2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
                   ` (5 preceding siblings ...)
  2016-11-16 16:16 ` ✗ Fi.CI.BAT: warning for series starting with [1/6] drm/i915: Split up hangcheck phases Patchwork
@ 2016-11-18 14:15 ` Patchwork
  6 siblings, 0 replies; 15+ messages in thread
From: Patchwork @ 2016-11-18 14:15 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/6] drm/i915: Split up hangcheck phases (rev3)
URL   : https://patchwork.freedesktop.org/series/15423/
State : success

== Summary ==

Series 15423v3 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/15423/revisions/3/mbox/


fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15 
fi-bsw-n3050     total:244  pass:204  dwarn:0   dfail:0   fail:0   skip:40 
fi-bxt-t5700     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53 
fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-kbl-7200u     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
fi-skl-6700hq    total:244  pass:223  dwarn:0   dfail:0   fail:0   skip:21 
fi-skl-6700k     total:244  pass:222  dwarn:1   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14 
fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33 

05104adb7446f310fad4e36f22d4c84ffafa31fc drm-intel-nightly: 2016y-11m-18d-13h-10m-16s UTC integration manifest
8ef5298 drm/i915: Wipe hang stats as an embedded struct
dfc655e drm/i915: Add per client max context ban limit
c0920d3 drm/i915: Add bannable context parameter
3dc541d drm/i915: Use request retirement as context progress
3f27d62 drm/i915: Decouple hang detection from hangcheck period
4a7b29a drm/i915: Split up hangcheck phases

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3054/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct
  2016-11-16 17:13   ` Chris Wilson
@ 2016-11-21 12:55     ` Mika Kuoppala
  0 siblings, 0 replies; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-21 12:55 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Wed, Nov 16, 2016 at 05:20:34PM +0200, Mika Kuoppala wrote:
>> Bannable property, banned status, guilty and active counts are
>> properties of i915_gem_context. Make them so.
>> 
>> v2: rebase
>> 
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
>
> Been hesistating since the substruct might have helped, but after a few
> reads, I'm happier with the conversion.
>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Whole series pushed. Thanks for review.
-Mika


> -Chris
>
> -- 
> Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2016-11-21 12:56 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala
2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
2016-11-16 17:05   ` Chris Wilson
2016-11-18 13:09   ` Mika Kuoppala
2016-11-16 15:20 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala
2016-11-16 17:08   ` Chris Wilson
2016-11-16 15:20 ` [PATCH 4/6] drm/i915: Add bannable context parameter Mika Kuoppala
2016-11-16 15:20 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala
2016-11-16 17:12   ` Chris Wilson
2016-11-18 13:10   ` Mika Kuoppala
2016-11-16 15:20 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala
2016-11-16 17:13   ` Chris Wilson
2016-11-21 12:55     ` Mika Kuoppala
2016-11-16 16:16 ` ✗ Fi.CI.BAT: warning for series starting with [1/6] drm/i915: Split up hangcheck phases Patchwork
2016-11-18 14:15 ` ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases (rev3) Patchwork

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).