* [PATCH 1/6] drm/i915: Split up hangcheck phases
@ 2016-11-16 15:20 Mika Kuoppala
2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala
` (6 more replies)
0 siblings, 7 replies; 15+ messages in thread
From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw)
To: intel-gfx
In order to simplify hangcheck state keeping, split the per-engine
hangcheck loop into three phases: state load, action, state save.
Add a few more hangcheck actions to distinguish between seqno, head
and subunit movement. This gathers all of the hangcheck scoring
decisions under a single switch statement.
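In pseudocode, the per-engine loop in i915_hangcheck_elapsed() then
reduces to the following shape (a simplified sketch of the hunk below;
the hung/stuck bookkeeping is omitted):

	for_each_engine(engine, dev_priv, id) {
		struct intel_engine_hangcheck hc;

		hangcheck_load_sample(engine, &hc);        /* read acthd/seqno/score */
		hangcheck_accumulate_sample(engine, &hc);  /* classify action, adjust score */
		hangcheck_store_sample(engine, &hc);       /* publish to engine->hangcheck */
	}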
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_gpu_error.c | 8 +-
drivers/gpu/drm/i915/intel_hangcheck.c | 241 ++++++++++++++++++--------------
drivers/gpu/drm/i915/intel_ringbuffer.h | 4 +-
3 files changed, 146 insertions(+), 107 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5d620bd..f02f581 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
return "idle";
case HANGCHECK_WAIT:
return "wait";
- case HANGCHECK_ACTIVE:
- return "active";
+ case HANGCHECK_ACTIVE_SEQNO:
+ return "active seqno";
+ case HANGCHECK_ACTIVE_HEAD:
+ return "active head";
+ case HANGCHECK_ACTIVE_SUBUNITS:
+ return "active subunits";
case HANGCHECK_KICK:
return "kick";
case HANGCHECK_HUNG:
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 53df5b1..3d2e81c 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
memset(&engine->hangcheck.instdone, 0,
sizeof(engine->hangcheck.instdone));
- return HANGCHECK_ACTIVE;
+ return HANGCHECK_ACTIVE_HEAD;
}
if (!subunits_stuck(engine))
- return HANGCHECK_ACTIVE;
+ return HANGCHECK_ACTIVE_SUBUNITS;
return HANGCHECK_HUNG;
}
@@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
return HANGCHECK_HUNG;
}
+static void hangcheck_load_sample(struct intel_engine_cs *engine,
+ struct intel_engine_hangcheck *hc)
+{
+ /* We don't strictly need an irq-barrier here, as we are not
+ * serving an interrupt request, be paranoid in case the
+ * barrier has side-effects (such as preventing a broken
+ * cacheline snoop) and so be sure that we can see the seqno
+ * advance. If the seqno should stick, due to a stale
+ * cacheline, we would erroneously declare the GPU hung.
+ */
+ if (engine->irq_seqno_barrier)
+ engine->irq_seqno_barrier(engine);
+
+ hc->acthd = intel_engine_get_active_head(engine);
+ hc->seqno = intel_engine_get_seqno(engine);
+ hc->score = engine->hangcheck.score;
+}
+
+static void hangcheck_store_sample(struct intel_engine_cs *engine,
+ const struct intel_engine_hangcheck *hc)
+{
+ engine->hangcheck.acthd = hc->acthd;
+ engine->hangcheck.seqno = hc->seqno;
+ engine->hangcheck.score = hc->score;
+ engine->hangcheck.action = hc->action;
+}
+
+static enum intel_engine_hangcheck_action
+hangcheck_get_action(struct intel_engine_cs *engine,
+ const struct intel_engine_hangcheck *hc)
+{
+ if (engine->hangcheck.seqno != hc->seqno)
+ return HANGCHECK_ACTIVE_SEQNO;
+
+ if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
+ return HANGCHECK_IDLE;
+
+ return engine_stuck(engine, hc->acthd);
+}
+
+static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
+ struct intel_engine_hangcheck *hc)
+{
+ hc->action = hangcheck_get_action(engine, hc);
+
+ switch (hc->action) {
+ case HANGCHECK_IDLE:
+ case HANGCHECK_WAIT:
+ break;
+
+ case HANGCHECK_ACTIVE_HEAD:
+ case HANGCHECK_ACTIVE_SUBUNITS:
+ /* We always increment the hangcheck score
+ * if the engine is busy and still processing
+ * the same request, so that no single request
+ * can run indefinitely (such as a chain of
+ * batches). The only time we do not increment
+ * the hangcheck score on this ring, if this
+ * engine is in a legitimate wait for another
+ * engine. In that case the waiting engine is a
+ * victim and we want to be sure we catch the
+ * right culprit. Then every time we do kick
+ * the ring, add a small increment to the
+ * score so that we can catch a batch that is
+ * being repeatedly kicked and so responsible
+ * for stalling the machine.
+ */
+ hc->score += 1;
+ break;
+
+ case HANGCHECK_KICK:
+ hc->score += 5;
+ break;
+
+ case HANGCHECK_HUNG:
+ hc->score += 20;
+ break;
+
+ case HANGCHECK_ACTIVE_SEQNO:
+ /* Gradually reduce the count so that we catch DoS
+ * attempts across multiple batches.
+ */
+ if (hc->score > 0)
+ hc->score -= 15;
+ if (hc->score < 0)
+ hc->score = 0;
+
+ /* Clear head and subunit states on seqno movement */
+ hc->acthd = 0;
+
+ memset(&engine->hangcheck.instdone, 0,
+ sizeof(engine->hangcheck.instdone));
+ break;
+
+ default:
+ MISSING_CASE(hc->action);
+ }
+}
+
+static void hangcheck_declare_hang(struct drm_i915_private *i915,
+ unsigned int hung,
+ unsigned int stuck)
+{
+ struct intel_engine_cs *engine;
+ char msg[80];
+ unsigned int tmp;
+ int len;
+
+ /* If some rings hung but others were still busy, only
+ * blame the hanging rings in the synopsis.
+ */
+ if (stuck != hung)
+ hung &= ~stuck;
+ len = scnprintf(msg, sizeof(msg),
+ "%s on ", stuck == hung ? "No progress" : "Hang");
+ for_each_engine_masked(engine, i915, hung, tmp)
+ len += scnprintf(msg + len, sizeof(msg) - len,
+ "%s, ", engine->name);
+ msg[len-2] = '\0';
+
+ return i915_handle_error(i915, hung, msg);
+}
+
/*
* This is called when the chip hasn't reported back with completed
* batchbuffers in a long time. We keep track per ring seqno progress and
@@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
enum intel_engine_id id;
unsigned int hung = 0, stuck = 0;
int busy_count = 0;
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define ACTIVE_DECAY 15
if (!i915.enable_hangcheck)
return;
@@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
for_each_engine(engine, dev_priv, id) {
- bool busy = intel_engine_has_waiter(engine);
- u64 acthd;
- u32 seqno;
- u32 submit;
+ struct intel_engine_hangcheck cur_state, *hc = &cur_state;
+ const bool busy = intel_engine_has_waiter(engine);
semaphore_clear_deadlocks(dev_priv);
- /* We don't strictly need an irq-barrier here, as we are not
- * serving an interrupt request, be paranoid in case the
- * barrier has side-effects (such as preventing a broken
- * cacheline snoop) and so be sure that we can see the seqno
- * advance. If the seqno should stick, due to a stale
- * cacheline, we would erroneously declare the GPU hung.
- */
- if (engine->irq_seqno_barrier)
- engine->irq_seqno_barrier(engine);
-
- acthd = intel_engine_get_active_head(engine);
- seqno = intel_engine_get_seqno(engine);
- submit = intel_engine_last_submit(engine);
-
- if (engine->hangcheck.seqno == seqno) {
- if (i915_seqno_passed(seqno, submit)) {
- engine->hangcheck.action = HANGCHECK_IDLE;
- } else {
- /* We always increment the hangcheck score
- * if the engine is busy and still processing
- * the same request, so that no single request
- * can run indefinitely (such as a chain of
- * batches). The only time we do not increment
- * the hangcheck score on this ring, if this
- * engine is in a legitimate wait for another
- * engine. In that case the waiting engine is a
- * victim and we want to be sure we catch the
- * right culprit. Then every time we do kick
- * the ring, add a small increment to the
- * score so that we can catch a batch that is
- * being repeatedly kicked and so responsible
- * for stalling the machine.
- */
- engine->hangcheck.action =
- engine_stuck(engine, acthd);
-
- switch (engine->hangcheck.action) {
- case HANGCHECK_IDLE:
- case HANGCHECK_WAIT:
- break;
- case HANGCHECK_ACTIVE:
- engine->hangcheck.score += BUSY;
- break;
- case HANGCHECK_KICK:
- engine->hangcheck.score += KICK;
- break;
- case HANGCHECK_HUNG:
- engine->hangcheck.score += HUNG;
- break;
- }
- }
-
- if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
- hung |= intel_engine_flag(engine);
- if (engine->hangcheck.action != HANGCHECK_HUNG)
- stuck |= intel_engine_flag(engine);
- }
- } else {
- engine->hangcheck.action = HANGCHECK_ACTIVE;
-
- /* Gradually reduce the count so that we catch DoS
- * attempts across multiple batches.
- */
- if (engine->hangcheck.score > 0)
- engine->hangcheck.score -= ACTIVE_DECAY;
- if (engine->hangcheck.score < 0)
- engine->hangcheck.score = 0;
-
- /* Clear head and subunit states on seqno movement */
- acthd = 0;
-
- memset(&engine->hangcheck.instdone, 0,
- sizeof(engine->hangcheck.instdone));
+ hangcheck_load_sample(engine, hc);
+ hangcheck_accumulate_sample(engine, hc);
+ hangcheck_store_sample(engine, hc);
+
+ if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+ hung |= intel_engine_flag(engine);
+ if (hc->action != HANGCHECK_HUNG)
+ stuck |= intel_engine_flag(engine);
}
- engine->hangcheck.seqno = seqno;
- engine->hangcheck.acthd = acthd;
busy_count += busy;
}
- if (hung) {
- char msg[80];
- unsigned int tmp;
- int len;
-
- /* If some rings hung but others were still busy, only
- * blame the hanging rings in the synopsis.
- */
- if (stuck != hung)
- hung &= ~stuck;
- len = scnprintf(msg, sizeof(msg),
- "%s on ", stuck == hung ? "No progress" : "Hang");
- for_each_engine_masked(engine, dev_priv, hung, tmp)
- len += scnprintf(msg + len, sizeof(msg) - len,
- "%s, ", engine->name);
- msg[len-2] = '\0';
-
- return i915_handle_error(dev_priv, hung, msg);
- }
+ if (hung)
+ hangcheck_declare_hang(dev_priv, hung, stuck);
/* Reset timer in case GPU hangs without another request being added */
if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3466b4e..3152b2b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -67,7 +67,9 @@ struct intel_hw_status_page {
enum intel_engine_hangcheck_action {
HANGCHECK_IDLE = 0,
HANGCHECK_WAIT,
- HANGCHECK_ACTIVE,
+ HANGCHECK_ACTIVE_SEQNO,
+ HANGCHECK_ACTIVE_HEAD,
+ HANGCHECK_ACTIVE_SUBUNITS,
HANGCHECK_KICK,
HANGCHECK_HUNG,
};
--
2.7.4
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period 2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala @ 2016-11-16 15:20 ` Mika Kuoppala 2016-11-16 17:05 ` Chris Wilson 2016-11-18 13:09 ` Mika Kuoppala 2016-11-16 15:20 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala ` (5 subsequent siblings) 6 siblings, 2 replies; 15+ messages in thread From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw) To: intel-gfx Hangcheck state accumulation has gained more steps along the years, like head movement and more recently the subunit inactivity check. As the subunit sampling is only done if the previous state check showed inactivity, we have added more stages (and time) to reach a hang verdict. Asymmetric engine states led to different actual weight of 'one hangcheck unit' and it was demonstrated in some hangs that due to difference in stages, simpler engines were accused falsely of a hang as their scoring was much more quicker to accumulate above the hang treshold. To completely decouple the hangcheck guilty score from the hangcheck period, convert hangcheck score to a rough period of inactivity measurement. As these are tracked as jiffies, they are meaningful also across reset boundaries. This makes finding a guilty engine more accurate across multi engine activity scenarios, especially across asymmetric engines. We lose the ability to detect cross batch malicious attempts to hinder the progress. Plan is to move this functionality to be part of context banning which is more natural fit, later in the series. v2: use time_before macros (Chris) reinstate the pardoning of moving engine after hc (Chris) v3: avoid global state for per engine stall detection (Chris) v4: take timeline last retirement into account (Chris) Cc: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_debugfs.c | 17 +++-- drivers/gpu/drm/i915/i915_drv.h | 6 +- drivers/gpu/drm/i915/i915_gem.c | 11 ++- drivers/gpu/drm/i915/i915_gem_timeline.c | 12 +++- drivers/gpu/drm/i915/i915_gem_timeline.h | 1 + drivers/gpu/drm/i915/i915_gpu_error.c | 46 ++++--------- drivers/gpu/drm/i915/intel_hangcheck.c | 115 ++++++++++++++++--------------- drivers/gpu/drm/i915/intel_ringbuffer.h | 40 ++++++++--- 8 files changed, 139 insertions(+), 109 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 1cc971c..5ef5c1a 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -1351,10 +1351,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) seq_printf(m, "\tseqno = %x [current %x, last %x]\n", engine->hangcheck.seqno, seqno[id], intel_engine_last_submit(engine)); - seq_printf(m, "\twaiters? %s, fake irq active? %s\n", + seq_printf(m, "\twaiters? %s, fake irq active? %s, stall? 
%s\n", yesno(intel_engine_has_waiter(engine)), yesno(test_bit(engine->id, - &dev_priv->gpu_error.missed_irq_rings))); + &dev_priv->gpu_error.missed_irq_rings)), + yesno(engine->hangcheck.stall)); + spin_lock_irq(&b->lock); for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) { struct intel_wait *w = container_of(rb, typeof(*w), node); @@ -1367,8 +1369,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n", (long long)engine->hangcheck.acthd, (long long)acthd[id]); - seq_printf(m, "\tscore = %d\n", engine->hangcheck.score); - seq_printf(m, "\taction = %d\n", engine->hangcheck.action); + seq_printf(m, "\taction = %s(%d) %d ms ago\n", + hangcheck_action_to_str(engine->hangcheck.action), + engine->hangcheck.action, + jiffies_to_msecs(jiffies - + engine->hangcheck.action_timestamp)); if (engine->id == RCS) { seq_puts(m, "\tinstdone read =\n"); @@ -3163,11 +3168,11 @@ static int i915_engine_info(struct seq_file *m, void *unused) u64 addr; seq_printf(m, "%s\n", engine->name); - seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [score %d]\n", + seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n", intel_engine_get_seqno(engine), intel_engine_last_submit(engine), engine->hangcheck.seqno, - engine->hangcheck.score); + jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp)); rcu_read_lock(); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 5192206..4562a39 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -795,7 +795,8 @@ struct drm_i915_error_state { /* Software tracked state */ bool waiting; int num_waiters; - int hangcheck_score; + unsigned long hangcheck_timestamp; + bool hangcheck_stall; enum intel_engine_hangcheck_action hangcheck_action; struct i915_address_space *vm; int num_requests; @@ -1441,6 +1442,9 @@ struct i915_error_state_file_priv { #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */ #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */ +#define I915_ENGINE_DEAD_TIMEOUT (4 * HZ) /* No observed seqno/head/subunit progress */ +#define I915_SEQNO_DEAD_TIMEOUT (12 * HZ) /* Engine progress without head progress */ + struct i915_gpu_error { /* For hangcheck timer */ #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */ diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 3fb5e66..ae2a219 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2703,9 +2703,16 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine) if (!request) return; - ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG; - if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) + ring_hung = engine->hangcheck.stall; + if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) { + if (ring_hung) + DRM_ERROR("%s pardoned due to progress after hangcheck %x vs %x\n", + engine->name, + engine->hangcheck.seqno, + intel_engine_get_seqno(engine)); + ring_hung = false; + } i915_set_reset_status(request->ctx, ring_hung); if (!ring_hung) diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c index bf8a471..348b0f2 100644 --- a/drivers/gpu/drm/i915/i915_gem_timeline.c +++ b/drivers/gpu/drm/i915/i915_gem_timeline.c @@ -24,6 +24,15 @@ #include "i915_drv.h" +static void i915_gem_timeline_retire(struct i915_gem_active *active, + struct drm_i915_gem_request *request) +{ + struct intel_timeline *tl = + container_of(active, typeof(*tl), 
last_request); + + tl->last_retire_timestamp = jiffies; +} + static int __i915_gem_timeline_init(struct drm_i915_private *i915, struct i915_gem_timeline *timeline, const char *name, @@ -54,7 +63,8 @@ static int __i915_gem_timeline_init(struct drm_i915_private *i915, #else spin_lock_init(&tl->lock); #endif - init_request_active(&tl->last_request, NULL); + init_request_active(&tl->last_request, + i915_gem_timeline_retire); INIT_LIST_HEAD(&tl->requests); } diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.h b/drivers/gpu/drm/i915/i915_gem_timeline.h index 98d99a6..d4eea88 100644 --- a/drivers/gpu/drm/i915/i915_gem_timeline.h +++ b/drivers/gpu/drm/i915/i915_gem_timeline.h @@ -34,6 +34,7 @@ struct i915_gem_timeline; struct intel_timeline { u64 fence_context; u32 last_submitted_seqno; + unsigned long last_retire_timestamp; spinlock_t lock; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index f02f581..3eee6c2 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -316,28 +316,6 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m, } } -static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a) -{ - switch (a) { - case HANGCHECK_IDLE: - return "idle"; - case HANGCHECK_WAIT: - return "wait"; - case HANGCHECK_ACTIVE_SEQNO: - return "active seqno"; - case HANGCHECK_ACTIVE_HEAD: - return "active head"; - case HANGCHECK_ACTIVE_SUBUNITS: - return "active subunits"; - case HANGCHECK_KICK: - return "kick"; - case HANGCHECK_HUNG: - return "hung"; - } - - return "unknown"; -} - static void error_print_instdone(struct drm_i915_error_state_buf *m, struct drm_i915_error_engine *ee) { @@ -445,9 +423,13 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, err_printf(m, " waiting: %s\n", yesno(ee->waiting)); err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); - err_printf(m, " hangcheck: %s [%d]\n", - hangcheck_action_to_str(ee->hangcheck_action), - ee->hangcheck_score); + err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stall)); + err_printf(m, " hangcheck action: %s\n", + hangcheck_action_to_str(ee->hangcheck_action)); + err_printf(m, " hangcheck action timestamp: %lu, %u ms ago\n", + ee->hangcheck_timestamp, + jiffies_to_msecs(jiffies - ee->hangcheck_timestamp)); + error_print_request(m, " ELSP[0]: ", &ee->execlist[0]); error_print_request(m, " ELSP[1]: ", &ee->execlist[1]); } @@ -537,7 +519,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, struct pci_dev *pdev = dev_priv->drm.pdev; struct drm_i915_error_state *error = error_priv->error; struct drm_i915_error_object *obj; - int max_hangcheck_score; int i, j; if (!error) { @@ -554,13 +535,9 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, err_printf(m, "Uptime: %ld s %ld us\n", error->uptime.tv_sec, error->uptime.tv_usec); err_print_capabilities(m, &error->device_info); - max_hangcheck_score = 0; - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (error->engine[i].hangcheck_score > max_hangcheck_score) - max_hangcheck_score = error->engine[i].hangcheck_score; - } + for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (error->engine[i].hangcheck_score == max_hangcheck_score && + if (error->engine[i].hangcheck_stall && error->engine[i].pid != -1) { err_printf(m, "Active process (on ring %s): %s [%d]\n", engine_str(i), @@ -946,7 +923,7 @@ static uint32_t i915_error_generate_code(struct drm_i915_private 
*dev_priv, * strictly a client bug. Use instdone to differentiate those some. */ for (i = 0; i < I915_NUM_ENGINES; i++) { - if (error->engine[i].hangcheck_action == HANGCHECK_HUNG) { + if (error->engine[i].hangcheck_stall) { if (engine_id) *engine_id = i; @@ -1164,8 +1141,9 @@ static void error_record_engine_registers(struct drm_i915_error_state *error, ee->hws = I915_READ(mmio); } - ee->hangcheck_score = engine->hangcheck.score; + ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; ee->hangcheck_action = engine->hangcheck.action; + ee->hangcheck_stall = engine->hangcheck.stall; if (USES_PPGTT(dev_priv)) { int i; diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index 3d2e81c..71ee837 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -236,13 +236,13 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd) memset(&engine->hangcheck.instdone, 0, sizeof(engine->hangcheck.instdone)); - return HANGCHECK_ACTIVE_HEAD; + return ENGINE_ACTIVE_HEAD; } if (!subunits_stuck(engine)) - return HANGCHECK_ACTIVE_SUBUNITS; + return ENGINE_ACTIVE_SUBUNITS; - return HANGCHECK_HUNG; + return ENGINE_DEAD; } static enum intel_engine_hangcheck_action @@ -253,11 +253,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) u32 tmp; ha = head_stuck(engine, acthd); - if (ha != HANGCHECK_HUNG) + if (ha != ENGINE_DEAD) return ha; if (IS_GEN2(dev_priv)) - return HANGCHECK_HUNG; + return ENGINE_DEAD; /* Is the chip hanging on a WAIT_FOR_EVENT? * If so we can simply poke the RB_WAIT bit @@ -270,25 +270,25 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) "Kicking stuck wait on %s", engine->name); I915_WRITE_CTL(engine, tmp); - return HANGCHECK_KICK; + return ENGINE_WAIT_KICK; } if (INTEL_GEN(dev_priv) >= 6 && tmp & RING_WAIT_SEMAPHORE) { switch (semaphore_passed(engine)) { default: - return HANGCHECK_HUNG; + return ENGINE_DEAD; case 1: i915_handle_error(dev_priv, 0, "Kicking stuck semaphore on %s", engine->name); I915_WRITE_CTL(engine, tmp); - return HANGCHECK_KICK; + return ENGINE_WAIT_KICK; case 0: - return HANGCHECK_WAIT; + return ENGINE_WAIT; } } - return HANGCHECK_HUNG; + return ENGINE_DEAD; } static void hangcheck_load_sample(struct intel_engine_cs *engine, @@ -306,7 +306,6 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine, hc->acthd = intel_engine_get_active_head(engine); hc->seqno = intel_engine_get_seqno(engine); - hc->score = engine->hangcheck.score; } static void hangcheck_store_sample(struct intel_engine_cs *engine, @@ -314,8 +313,8 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine, { engine->hangcheck.acthd = hc->acthd; engine->hangcheck.seqno = hc->seqno; - engine->hangcheck.score = hc->score; engine->hangcheck.action = hc->action; + engine->hangcheck.stall = hc->stall; } static enum intel_engine_hangcheck_action @@ -323,10 +322,10 @@ hangcheck_get_action(struct intel_engine_cs *engine, const struct intel_engine_hangcheck *hc) { if (engine->hangcheck.seqno != hc->seqno) - return HANGCHECK_ACTIVE_SEQNO; + return ENGINE_ACTIVE_SEQNO; if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine))) - return HANGCHECK_IDLE; + return ENGINE_IDLE; return engine_stuck(engine, hc->acthd); } @@ -334,60 +333,64 @@ hangcheck_get_action(struct intel_engine_cs *engine, static void hangcheck_accumulate_sample(struct intel_engine_cs *engine, struct intel_engine_hangcheck *hc) { + unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT; + unsigned long last_retire; + 
hc->action = hangcheck_get_action(engine, hc); - switch (hc->action) { - case HANGCHECK_IDLE: - case HANGCHECK_WAIT: - break; + /* We always increment the progress + * if the engine is busy and still processing + * the same request, so that no single request + * can run indefinitely (such as a chain of + * batches). The only time we do not increment + * the hangcheck score on this ring, if this + * engine is in a legitimate wait for another + * engine. In that case the waiting engine is a + * victim and we want to be sure we catch the + * right culprit. Then every time we do kick + * the ring, make it as a progress as the seqno + * advancement might ensure and if not, it + * will catch the hanging engine. + */ - case HANGCHECK_ACTIVE_HEAD: - case HANGCHECK_ACTIVE_SUBUNITS: - /* We always increment the hangcheck score - * if the engine is busy and still processing - * the same request, so that no single request - * can run indefinitely (such as a chain of - * batches). The only time we do not increment - * the hangcheck score on this ring, if this - * engine is in a legitimate wait for another - * engine. In that case the waiting engine is a - * victim and we want to be sure we catch the - * right culprit. Then every time we do kick - * the ring, add a small increment to the - * score so that we can catch a batch that is - * being repeatedly kicked and so responsible - * for stalling the machine. - */ - hc->score += 1; - break; + switch (hc->action) { + case ENGINE_IDLE: + case ENGINE_ACTIVE_SEQNO: + /* Clear head and subunit states on seqno movement */ + hc->acthd = 0; - case HANGCHECK_KICK: - hc->score += 5; - break; + memset(&engine->hangcheck.instdone, 0, + sizeof(engine->hangcheck.instdone)); - case HANGCHECK_HUNG: - hc->score += 20; + /* Intentional fall through */ + case ENGINE_WAIT_KICK: + case ENGINE_WAIT: + engine->hangcheck.action_timestamp = jiffies; break; - case HANGCHECK_ACTIVE_SEQNO: - /* Gradually reduce the count so that we catch DoS - * attempts across multiple batches. + case ENGINE_ACTIVE_HEAD: + case ENGINE_ACTIVE_SUBUNITS: + /* Seqno stuck with still active engine gets leeway, + * in hopes that it is just a long shader. 
*/ - if (hc->score > 0) - hc->score -= 15; - if (hc->score < 0) - hc->score = 0; - - /* Clear head and subunit states on seqno movement */ - hc->acthd = 0; + timeout = I915_SEQNO_DEAD_TIMEOUT; + break; - memset(&engine->hangcheck.instdone, 0, - sizeof(engine->hangcheck.instdone)); + case ENGINE_DEAD: break; default: MISSING_CASE(hc->action); } + + last_retire = READ_ONCE(engine->timeline->last_retire_timestamp); + + /* If there is retire on the timeline, take that into account */ + if (time_after(last_retire, engine->hangcheck.action_timestamp)) + engine->hangcheck.action_timestamp = last_retire; + + hc->stall = time_after(jiffies, + engine->hangcheck.action_timestamp + timeout); } static void hangcheck_declare_hang(struct drm_i915_private *i915, @@ -454,9 +457,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work) hangcheck_accumulate_sample(engine, hc); hangcheck_store_sample(engine, hc); - if (hc->score >= HANGCHECK_SCORE_RING_HUNG) { + if (engine->hangcheck.stall) { hung |= intel_engine_flag(engine); - if (hc->action != HANGCHECK_HUNG) + if (hc->action != ENGINE_DEAD) stuck |= intel_engine_flag(engine); } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 3152b2b..d2d870b 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -65,16 +65,37 @@ struct intel_hw_status_page { GEN8_SEMAPHORE_OFFSET(from, (__ring)->id)) enum intel_engine_hangcheck_action { - HANGCHECK_IDLE = 0, - HANGCHECK_WAIT, - HANGCHECK_ACTIVE_SEQNO, - HANGCHECK_ACTIVE_HEAD, - HANGCHECK_ACTIVE_SUBUNITS, - HANGCHECK_KICK, - HANGCHECK_HUNG, + ENGINE_IDLE = 0, + ENGINE_WAIT, + ENGINE_ACTIVE_SEQNO, + ENGINE_ACTIVE_HEAD, + ENGINE_ACTIVE_SUBUNITS, + ENGINE_WAIT_KICK, + ENGINE_DEAD, }; -#define HANGCHECK_SCORE_RING_HUNG 31 +static inline const char * +hangcheck_action_to_str(const enum intel_engine_hangcheck_action a) +{ + switch (a) { + case ENGINE_IDLE: + return "idle"; + case ENGINE_WAIT: + return "wait"; + case ENGINE_ACTIVE_SEQNO: + return "active seqno"; + case ENGINE_ACTIVE_HEAD: + return "active head"; + case ENGINE_ACTIVE_SUBUNITS: + return "active subunits"; + case ENGINE_WAIT_KICK: + return "wait kick"; + case ENGINE_DEAD: + return "dead"; + } + + return "unknown"; +} #define I915_MAX_SLICES 3 #define I915_MAX_SUBSLICES 3 @@ -106,10 +127,11 @@ struct intel_instdone { struct intel_engine_hangcheck { u64 acthd; u32 seqno; - int score; enum intel_engine_hangcheck_action action; + unsigned long action_timestamp; int deadlock; struct intel_instdone instdone; + bool stall; }; struct intel_ring { -- 2.7.4 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period 2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala @ 2016-11-16 17:05 ` Chris Wilson 2016-11-18 13:09 ` Mika Kuoppala 1 sibling, 0 replies; 15+ messages in thread From: Chris Wilson @ 2016-11-16 17:05 UTC (permalink / raw) To: Mika Kuoppala; +Cc: intel-gfx On Wed, Nov 16, 2016 at 05:20:30PM +0200, Mika Kuoppala wrote: > - ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG; > - if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) > + ring_hung = engine->hangcheck.stall; > + if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) { > + if (ring_hung) > + DRM_ERROR("%s pardoned due to progress after hangcheck %x vs %x\n", > + engine->name, > + engine->hangcheck.seqno, > + intel_engine_get_seqno(engine)); > + Is this worth alarming the user over? We recover gracefully either way. DRM_DEBUG_DRIVER("pardoned, was guilty? %s\n", yesno(ring_hung)); > ring_hung = false; > + } > > i915_set_reset_status(request->ctx, ring_hung); > if (!ring_hung) > diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c > index bf8a471..348b0f2 100644 > --- a/drivers/gpu/drm/i915/i915_gem_timeline.c > +++ b/drivers/gpu/drm/i915/i915_gem_timeline.c > @@ -24,6 +24,15 @@ > > #include "i915_drv.h" > > +static void i915_gem_timeline_retire(struct i915_gem_active *active, > + struct drm_i915_gem_request *request) > +{ > + struct intel_timeline *tl = > + container_of(active, typeof(*tl), last_request); > + > + tl->last_retire_timestamp = jiffies; Make this a separate patch. last_retired_timestamp to match last_submitted_seqno. Maybe worth adding last_submitted_timestamp as well for debug purposes. I guess we are missing a debugfs/i915_gem_timelines Some grumbling, but nothing stands out in this patch. So with the couple of points above, Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> -Chris -- Chris Wilson, Intel Open Source Technology Centre _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period 2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala 2016-11-16 17:05 ` Chris Wilson @ 2016-11-18 13:09 ` Mika Kuoppala 1 sibling, 0 replies; 15+ messages in thread From: Mika Kuoppala @ 2016-11-18 13:09 UTC (permalink / raw) To: intel-gfx Hangcheck state accumulation has gained more steps along the years, like head movement and more recently the subunit inactivity check. As the subunit sampling is only done if the previous state check showed inactivity, we have added more stages (and time) to reach a hang verdict. Asymmetric engine states led to different actual weight of 'one hangcheck unit' and it was demonstrated in some hangs that due to difference in stages, simpler engines were accused falsely of a hang as their scoring was much more quicker to accumulate above the hang treshold. To completely decouple the hangcheck guilty score from the hangcheck period, convert hangcheck score to a rough period of inactivity measurement. As these are tracked as jiffies, they are meaningful also across reset boundaries. This makes finding a guilty engine more accurate across multi engine activity scenarios, especially across asymmetric engines. We lose the ability to detect cross batch malicious attempts to hinder the progress. Plan is to move this functionality to be part of context banning which is more natural fit, later in the series. v2: use time_before macros (Chris) reinstate the pardoning of moving engine after hc (Chris) v3: avoid global state for per engine stall detection (Chris) v4: take timeline last retirement into account (Chris) v5: do debug print on pardoning, split out retirement timestamp (Chris) Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_debugfs.c | 17 +++-- drivers/gpu/drm/i915/i915_drv.h | 6 +- drivers/gpu/drm/i915/i915_gem.c | 8 ++- drivers/gpu/drm/i915/i915_gpu_error.c | 46 ++++---------- drivers/gpu/drm/i915/intel_hangcheck.c | 108 +++++++++++++++----------------- drivers/gpu/drm/i915/intel_ringbuffer.h | 40 +++++++++--- 6 files changed, 117 insertions(+), 108 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index b7f42c4..d67079f 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -1351,10 +1351,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) seq_printf(m, "\tseqno = %x [current %x, last %x]\n", engine->hangcheck.seqno, seqno[id], intel_engine_last_submit(engine)); - seq_printf(m, "\twaiters? %s, fake irq active? %s\n", + seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? 
%s\n", yesno(intel_engine_has_waiter(engine)), yesno(test_bit(engine->id, - &dev_priv->gpu_error.missed_irq_rings))); + &dev_priv->gpu_error.missed_irq_rings)), + yesno(engine->hangcheck.stalled)); + spin_lock_irq(&b->lock); for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) { struct intel_wait *w = container_of(rb, typeof(*w), node); @@ -1367,8 +1369,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n", (long long)engine->hangcheck.acthd, (long long)acthd[id]); - seq_printf(m, "\tscore = %d\n", engine->hangcheck.score); - seq_printf(m, "\taction = %d\n", engine->hangcheck.action); + seq_printf(m, "\taction = %s(%d) %d ms ago\n", + hangcheck_action_to_str(engine->hangcheck.action), + engine->hangcheck.action, + jiffies_to_msecs(jiffies - + engine->hangcheck.action_timestamp)); if (engine->id == RCS) { seq_puts(m, "\tinstdone read =\n"); @@ -3162,11 +3167,11 @@ static int i915_engine_info(struct seq_file *m, void *unused) u64 addr; seq_printf(m, "%s\n", engine->name); - seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [score %d]\n", + seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n", intel_engine_get_seqno(engine), intel_engine_last_submit(engine), engine->hangcheck.seqno, - engine->hangcheck.score); + jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp)); rcu_read_lock(); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index be67aee..7b0e5c7 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -800,7 +800,8 @@ struct drm_i915_error_state { /* Software tracked state */ bool waiting; int num_waiters; - int hangcheck_score; + unsigned long hangcheck_timestamp; + bool hangcheck_stalled; enum intel_engine_hangcheck_action hangcheck_action; struct i915_address_space *vm; int num_requests; @@ -1446,6 +1447,9 @@ struct i915_error_state_file_priv { #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */ #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */ +#define I915_ENGINE_DEAD_TIMEOUT (4 * HZ) /* Seqno, head and subunits dead */ +#define I915_SEQNO_DEAD_TIMEOUT (12 * HZ) /* Seqno dead with active head */ + struct i915_gpu_error { /* For hangcheck timer */ #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */ diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d5b7723..2ab57d2 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2703,9 +2703,13 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine) if (!request) return; - ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG; - if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) + ring_hung = engine->hangcheck.stalled; + if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) { + DRM_DEBUG_DRIVER("%s pardoned, was guilty? 
%s\n", + engine->name, + yesno(ring_hung)); ring_hung = false; + } i915_set_reset_status(request->ctx, ring_hung); if (!ring_hung) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 4bcf1a0..d5a4ec9 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -316,28 +316,6 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m, } } -static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a) -{ - switch (a) { - case HANGCHECK_IDLE: - return "idle"; - case HANGCHECK_WAIT: - return "wait"; - case HANGCHECK_ACTIVE_SEQNO: - return "active seqno"; - case HANGCHECK_ACTIVE_HEAD: - return "active head"; - case HANGCHECK_ACTIVE_SUBUNITS: - return "active subunits"; - case HANGCHECK_KICK: - return "kick"; - case HANGCHECK_HUNG: - return "hung"; - } - - return "unknown"; -} - static void error_print_instdone(struct drm_i915_error_state_buf *m, struct drm_i915_error_engine *ee) { @@ -445,9 +423,13 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, err_printf(m, " waiting: %s\n", yesno(ee->waiting)); err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); - err_printf(m, " hangcheck: %s [%d]\n", - hangcheck_action_to_str(ee->hangcheck_action), - ee->hangcheck_score); + err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled)); + err_printf(m, " hangcheck action: %s\n", + hangcheck_action_to_str(ee->hangcheck_action)); + err_printf(m, " hangcheck action timestamp: %lu, %u ms ago\n", + ee->hangcheck_timestamp, + jiffies_to_msecs(jiffies - ee->hangcheck_timestamp)); + error_print_request(m, " ELSP[0]: ", &ee->execlist[0]); error_print_request(m, " ELSP[1]: ", &ee->execlist[1]); } @@ -536,7 +518,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, struct pci_dev *pdev = dev_priv->drm.pdev; struct drm_i915_error_state *error = error_priv->error; struct drm_i915_error_object *obj; - int max_hangcheck_score; int i, j; if (!error) { @@ -553,13 +534,9 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, err_printf(m, "Uptime: %ld s %ld us\n", error->uptime.tv_sec, error->uptime.tv_usec); err_print_capabilities(m, &error->device_info); - max_hangcheck_score = 0; - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (error->engine[i].hangcheck_score > max_hangcheck_score) - max_hangcheck_score = error->engine[i].hangcheck_score; - } + for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (error->engine[i].hangcheck_score == max_hangcheck_score && + if (error->engine[i].hangcheck_stalled && error->engine[i].pid != -1) { err_printf(m, "Active process (on ring %s): %s [%d]\n", engine_str(i), @@ -945,7 +922,7 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv, * strictly a client bug. Use instdone to differentiate those some. 
*/ for (i = 0; i < I915_NUM_ENGINES; i++) { - if (error->engine[i].hangcheck_action == HANGCHECK_HUNG) { + if (error->engine[i].hangcheck_stalled) { if (engine_id) *engine_id = i; @@ -1163,8 +1140,9 @@ static void error_record_engine_registers(struct drm_i915_error_state *error, ee->hws = I915_READ(mmio); } - ee->hangcheck_score = engine->hangcheck.score; + ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; ee->hangcheck_action = engine->hangcheck.action; + ee->hangcheck_stalled = engine->hangcheck.stalled; if (USES_PPGTT(dev_priv)) { int i; diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index 3d2e81c..c03db02 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -236,13 +236,13 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd) memset(&engine->hangcheck.instdone, 0, sizeof(engine->hangcheck.instdone)); - return HANGCHECK_ACTIVE_HEAD; + return ENGINE_ACTIVE_HEAD; } if (!subunits_stuck(engine)) - return HANGCHECK_ACTIVE_SUBUNITS; + return ENGINE_ACTIVE_SUBUNITS; - return HANGCHECK_HUNG; + return ENGINE_DEAD; } static enum intel_engine_hangcheck_action @@ -253,11 +253,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) u32 tmp; ha = head_stuck(engine, acthd); - if (ha != HANGCHECK_HUNG) + if (ha != ENGINE_DEAD) return ha; if (IS_GEN2(dev_priv)) - return HANGCHECK_HUNG; + return ENGINE_DEAD; /* Is the chip hanging on a WAIT_FOR_EVENT? * If so we can simply poke the RB_WAIT bit @@ -270,25 +270,25 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) "Kicking stuck wait on %s", engine->name); I915_WRITE_CTL(engine, tmp); - return HANGCHECK_KICK; + return ENGINE_WAIT_KICK; } if (INTEL_GEN(dev_priv) >= 6 && tmp & RING_WAIT_SEMAPHORE) { switch (semaphore_passed(engine)) { default: - return HANGCHECK_HUNG; + return ENGINE_DEAD; case 1: i915_handle_error(dev_priv, 0, "Kicking stuck semaphore on %s", engine->name); I915_WRITE_CTL(engine, tmp); - return HANGCHECK_KICK; + return ENGINE_WAIT_KICK; case 0: - return HANGCHECK_WAIT; + return ENGINE_WAIT; } } - return HANGCHECK_HUNG; + return ENGINE_DEAD; } static void hangcheck_load_sample(struct intel_engine_cs *engine, @@ -306,7 +306,6 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine, hc->acthd = intel_engine_get_active_head(engine); hc->seqno = intel_engine_get_seqno(engine); - hc->score = engine->hangcheck.score; } static void hangcheck_store_sample(struct intel_engine_cs *engine, @@ -314,8 +313,8 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine, { engine->hangcheck.acthd = hc->acthd; engine->hangcheck.seqno = hc->seqno; - engine->hangcheck.score = hc->score; engine->hangcheck.action = hc->action; + engine->hangcheck.stalled = hc->stalled; } static enum intel_engine_hangcheck_action @@ -323,10 +322,10 @@ hangcheck_get_action(struct intel_engine_cs *engine, const struct intel_engine_hangcheck *hc) { if (engine->hangcheck.seqno != hc->seqno) - return HANGCHECK_ACTIVE_SEQNO; + return ENGINE_ACTIVE_SEQNO; if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine))) - return HANGCHECK_IDLE; + return ENGINE_IDLE; return engine_stuck(engine, hc->acthd); } @@ -334,60 +333,57 @@ hangcheck_get_action(struct intel_engine_cs *engine, static void hangcheck_accumulate_sample(struct intel_engine_cs *engine, struct intel_engine_hangcheck *hc) { + unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT; + hc->action = hangcheck_get_action(engine, hc); - switch (hc->action) { - case HANGCHECK_IDLE: - case 
HANGCHECK_WAIT: - break; + /* We always increment the progress + * if the engine is busy and still processing + * the same request, so that no single request + * can run indefinitely (such as a chain of + * batches). The only time we do not increment + * the hangcheck score on this ring, if this + * engine is in a legitimate wait for another + * engine. In that case the waiting engine is a + * victim and we want to be sure we catch the + * right culprit. Then every time we do kick + * the ring, make it as a progress as the seqno + * advancement might ensure and if not, it + * will catch the hanging engine. + */ - case HANGCHECK_ACTIVE_HEAD: - case HANGCHECK_ACTIVE_SUBUNITS: - /* We always increment the hangcheck score - * if the engine is busy and still processing - * the same request, so that no single request - * can run indefinitely (such as a chain of - * batches). The only time we do not increment - * the hangcheck score on this ring, if this - * engine is in a legitimate wait for another - * engine. In that case the waiting engine is a - * victim and we want to be sure we catch the - * right culprit. Then every time we do kick - * the ring, add a small increment to the - * score so that we can catch a batch that is - * being repeatedly kicked and so responsible - * for stalling the machine. - */ - hc->score += 1; - break; + switch (hc->action) { + case ENGINE_IDLE: + case ENGINE_ACTIVE_SEQNO: + /* Clear head and subunit states on seqno movement */ + hc->acthd = 0; - case HANGCHECK_KICK: - hc->score += 5; - break; + memset(&engine->hangcheck.instdone, 0, + sizeof(engine->hangcheck.instdone)); - case HANGCHECK_HUNG: - hc->score += 20; + /* Intentional fall through */ + case ENGINE_WAIT_KICK: + case ENGINE_WAIT: + engine->hangcheck.action_timestamp = jiffies; break; - case HANGCHECK_ACTIVE_SEQNO: - /* Gradually reduce the count so that we catch DoS - * attempts across multiple batches. + case ENGINE_ACTIVE_HEAD: + case ENGINE_ACTIVE_SUBUNITS: + /* Seqno stuck with still active engine gets leeway, + * in hopes that it is just a long shader. 
*/ - if (hc->score > 0) - hc->score -= 15; - if (hc->score < 0) - hc->score = 0; - - /* Clear head and subunit states on seqno movement */ - hc->acthd = 0; + timeout = I915_SEQNO_DEAD_TIMEOUT; + break; - memset(&engine->hangcheck.instdone, 0, - sizeof(engine->hangcheck.instdone)); + case ENGINE_DEAD: break; default: MISSING_CASE(hc->action); } + + hc->stalled = time_after(jiffies, + engine->hangcheck.action_timestamp + timeout); } static void hangcheck_declare_hang(struct drm_i915_private *i915, @@ -454,9 +450,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work) hangcheck_accumulate_sample(engine, hc); hangcheck_store_sample(engine, hc); - if (hc->score >= HANGCHECK_SCORE_RING_HUNG) { + if (engine->hangcheck.stalled) { hung |= intel_engine_flag(engine); - if (hc->action != HANGCHECK_HUNG) + if (hc->action != ENGINE_DEAD) stuck |= intel_engine_flag(engine); } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 3152b2b..3f43ade 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -65,16 +65,37 @@ struct intel_hw_status_page { GEN8_SEMAPHORE_OFFSET(from, (__ring)->id)) enum intel_engine_hangcheck_action { - HANGCHECK_IDLE = 0, - HANGCHECK_WAIT, - HANGCHECK_ACTIVE_SEQNO, - HANGCHECK_ACTIVE_HEAD, - HANGCHECK_ACTIVE_SUBUNITS, - HANGCHECK_KICK, - HANGCHECK_HUNG, + ENGINE_IDLE = 0, + ENGINE_WAIT, + ENGINE_ACTIVE_SEQNO, + ENGINE_ACTIVE_HEAD, + ENGINE_ACTIVE_SUBUNITS, + ENGINE_WAIT_KICK, + ENGINE_DEAD, }; -#define HANGCHECK_SCORE_RING_HUNG 31 +static inline const char * +hangcheck_action_to_str(const enum intel_engine_hangcheck_action a) +{ + switch (a) { + case ENGINE_IDLE: + return "idle"; + case ENGINE_WAIT: + return "wait"; + case ENGINE_ACTIVE_SEQNO: + return "active seqno"; + case ENGINE_ACTIVE_HEAD: + return "active head"; + case ENGINE_ACTIVE_SUBUNITS: + return "active subunits"; + case ENGINE_WAIT_KICK: + return "wait kick"; + case ENGINE_DEAD: + return "dead"; + } + + return "unknown"; +} #define I915_MAX_SLICES 3 #define I915_MAX_SUBSLICES 3 @@ -106,10 +127,11 @@ struct intel_instdone { struct intel_engine_hangcheck { u64 acthd; u32 seqno; - int score; enum intel_engine_hangcheck_action action; + unsigned long action_timestamp; int deadlock; struct intel_instdone instdone; + bool stalled; }; struct intel_ring { -- 2.7.4 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply related [flat|nested] 15+ messages in thread
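The heart of the change above: the integer hangcheck score is gone, and
an engine is instead flagged as stalled once its last-progress timestamp
(in jiffies) is older than a fixed timeout. A condensed excerpt of the
logic added to hangcheck_accumulate_sample(), with the switch over
hc->action collapsed into an if for brevity:

	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;	/* 4 * HZ */

	if (hc->action == ENGINE_ACTIVE_HEAD ||
	    hc->action == ENGINE_ACTIVE_SUBUNITS)
		timeout = I915_SEQNO_DEAD_TIMEOUT;	/* 12 * HZ: seqno stuck, engine still active */

	hc->stalled = time_after(jiffies,
				 engine->hangcheck.action_timestamp + timeout);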
* [PATCH 3/6] drm/i915: Use request retirement as context progress 2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala 2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala @ 2016-11-16 15:20 ` Mika Kuoppala 2016-11-16 17:08 ` Chris Wilson 2016-11-16 15:20 ` [PATCH 4/6] drm/i915: Add bannable context parameter Mika Kuoppala ` (4 subsequent siblings) 6 siblings, 1 reply; 15+ messages in thread From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw) To: intel-gfx As hangcheck score was removed, the active decay of score was removed also. This removed feature for hangcheck to detect if the gpu client was accidentally or maliciously causing intermittent hangs. Reinstate the scoring as a per context property, so that if one context starts to act unfavourably, ban it. v2: ban_period_secs as a gate to score check (Chris) v3: decay in proper spot. scores as tunables (Chris) Cc: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 5 ++++ drivers/gpu/drm/i915/i915_gem.c | 44 ++++++++++++++++++++++----------- drivers/gpu/drm/i915/i915_gem_request.c | 4 +++ 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 4562a39..9f24957 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -914,6 +914,11 @@ struct i915_ctx_hang_stats { /* This context is banned to submit more work */ bool banned; + +#define CONTEXT_SCORE_GUILTY 10 +#define CONTEXT_SCORE_BAN_THRESHOLD 40 + /* Accumulated score of hangs caused by this context */ + int ban_score; }; /* This must match up with the value previously used for execbuf2.rsvd1. 
*/ diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index ae2a219..5948f09 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2620,33 +2620,45 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj, static bool i915_context_is_banned(const struct i915_gem_context *ctx) { + const struct i915_ctx_hang_stats *hs = &ctx->hang_stats; unsigned long elapsed; - if (ctx->hang_stats.banned) + if (hs->banned) return true; - elapsed = get_seconds() - ctx->hang_stats.guilty_ts; - if (ctx->hang_stats.ban_period_seconds && - elapsed <= ctx->hang_stats.ban_period_seconds) { + if (!hs->ban_period_seconds) + return false; + + elapsed = get_seconds() - hs->guilty_ts; + if (elapsed <= hs->ban_period_seconds) { DRM_DEBUG("context hanging too fast, banning!\n"); return true; } + if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) { + DRM_DEBUG("context hanging too often, banning!\n"); + return true; + } + return false; } -static void i915_set_reset_status(struct i915_gem_context *ctx, - const bool guilty) +static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) { struct i915_ctx_hang_stats *hs = &ctx->hang_stats; - if (guilty) { - hs->banned = i915_context_is_banned(ctx); - hs->batch_active++; - hs->guilty_ts = get_seconds(); - } else { - hs->batch_pending++; - } + hs->ban_score += CONTEXT_SCORE_GUILTY; + + hs->banned = i915_context_is_banned(ctx); + hs->batch_active++; + hs->guilty_ts = get_seconds(); +} + +static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx) +{ + struct i915_ctx_hang_stats *hs = &ctx->hang_stats; + + hs->batch_pending++; } struct drm_i915_gem_request * @@ -2714,7 +2726,11 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine) ring_hung = false; } - i915_set_reset_status(request->ctx, ring_hung); + if (ring_hung) + i915_gem_context_mark_guilty(request->ctx); + else + i915_gem_context_mark_innocent(request->ctx); + if (!ring_hung) return; diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index b9b5253..b31d18e 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -255,6 +255,10 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request) request->engine); } + /* Retirement decays the ban score as it is a sign of ctx progress */ + if (request->ctx->hang_stats.ban_score > 0) + request->ctx->hang_stats.ban_score--; + i915_gem_context_put(request->ctx); dma_fence_signal(&request->fence); -- 2.7.4 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply related [flat|nested] 15+ messages in thread
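The per-context scoring introduced above is short enough to restate in
one place (values and fields taken verbatim from the hunks in this
message): a guilty verdict adds CONTEXT_SCORE_GUILTY (10) to the
context's ban_score, the context is banned once the score reaches
CONTEXT_SCORE_BAN_THRESHOLD (40), and every retired request decays the
score by one, so a context that keeps making progress is pardoned over
time.

	/* on a hang, in i915_gem_context_mark_guilty() */
	hs->ban_score += CONTEXT_SCORE_GUILTY;
	hs->banned = i915_context_is_banned(ctx);	/* true once ban_score >= 40 */

	/* on request retirement, in i915_gem_request_retire() */
	if (request->ctx->hang_stats.ban_score > 0)
		request->ctx->hang_stats.ban_score--;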
* Re: [PATCH 3/6] drm/i915: Use request retirement as context progress 2016-11-16 15:20 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala @ 2016-11-16 17:08 ` Chris Wilson 0 siblings, 0 replies; 15+ messages in thread From: Chris Wilson @ 2016-11-16 17:08 UTC (permalink / raw) To: Mika Kuoppala; +Cc: intel-gfx On Wed, Nov 16, 2016 at 05:20:31PM +0200, Mika Kuoppala wrote: > As hangcheck score was removed, the active decay of score > was removed also. This removed feature for hangcheck to detect > if the gpu client was accidentally or maliciously causing intermittent > hangs. Reinstate the scoring as a per context property, so that if > one context starts to act unfavourably, ban it. > > v2: ban_period_secs as a gate to score check (Chris) > v3: decay in proper spot. scores as tunables (Chris) > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> -Chris -- Chris Wilson, Intel Open Source Technology Centre _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 4/6] drm/i915: Add bannable context parameter 2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala 2016-11-16 15:20 ` [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period Mika Kuoppala 2016-11-16 15:20 ` [PATCH 3/6] drm/i915: Use request retirement as context progress Mika Kuoppala @ 2016-11-16 15:20 ` Mika Kuoppala 2016-11-16 15:20 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala ` (3 subsequent siblings) 6 siblings, 0 replies; 15+ messages in thread From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw) To: intel-gfx Now when driver has per context scoring of 'hanging badness' and also subsequent hangs during short windows are allowed, if there is progress made in between, it does not make sense to expose a ban timing window as a context parameter anymore. Let the scoring be the sole indicator for ban policy and substitute ban period context parameter as a boolean to get/set context bannable property. v2: allow non root to opt into being banned (Chris) Cc: Chris Wilson <chris@chris-wilson.co.uk> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 13 +++---------- drivers/gpu/drm/i915/i915_gem.c | 10 +--------- drivers/gpu/drm/i915/i915_gem_context.c | 23 ++++++++++++++--------- drivers/gpu/drm/i915/i915_gpu_error.c | 5 +++-- include/uapi/drm/i915_drm.h | 1 + 5 files changed, 22 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9f24957..74f421a1 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -845,6 +845,7 @@ struct drm_i915_error_state { long jiffies; pid_t pid; u32 context; + int ban_score; u32 seqno; u32 head; u32 tail; @@ -904,16 +905,10 @@ struct i915_ctx_hang_stats { /* This context had batch active when hang was declared */ unsigned batch_active; - /* Time when this context was last blamed for a GPU reset */ - unsigned long guilty_ts; - - /* If the contexts causes a second GPU hang within this time, - * it is permanently banned from submitting any more work. 
- */ - unsigned long ban_period_seconds; + bool bannable:1; /* This context is banned to submit more work */ - bool banned; + bool banned:1; #define CONTEXT_SCORE_GUILTY 10 #define CONTEXT_SCORE_BAN_THRESHOLD 40 @@ -1454,8 +1449,6 @@ struct i915_gpu_error { /* For hangcheck timer */ #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */ #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD) - /* Hang gpu twice in this window and your context gets banned */ -#define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000) struct delayed_work hangcheck_work; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 5948f09..c48e0d2 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2621,20 +2621,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj, static bool i915_context_is_banned(const struct i915_gem_context *ctx) { const struct i915_ctx_hang_stats *hs = &ctx->hang_stats; - unsigned long elapsed; if (hs->banned) return true; - if (!hs->ban_period_seconds) + if (!hs->bannable) return false; - elapsed = get_seconds() - hs->guilty_ts; - if (elapsed <= hs->ban_period_seconds) { - DRM_DEBUG("context hanging too fast, banning!\n"); - return true; - } - if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) { DRM_DEBUG("context hanging too often, banning!\n"); return true; @@ -2651,7 +2644,6 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) hs->banned = i915_context_is_banned(ctx); hs->batch_active++; - hs->guilty_ts = get_seconds(); } static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 1f94b8d..bc1b232 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev, * is no remap info, it will be a NOP. 
*/ ctx->remap_slice = ALL_L3_SLICES(dev_priv); - ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD; + ctx->hang_stats.bannable = true; ctx->ring_size = 4 * PAGE_SIZE; ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) << GEN8_CTX_ADDRESSING_MODE_SHIFT; @@ -1085,7 +1085,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, args->size = 0; switch (args->param) { case I915_CONTEXT_PARAM_BAN_PERIOD: - args->value = ctx->hang_stats.ban_period_seconds; + ret = -EINVAL; break; case I915_CONTEXT_PARAM_NO_ZEROMAP: args->value = ctx->flags & CONTEXT_NO_ZEROMAP; @@ -1101,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE: args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE); break; + case I915_CONTEXT_PARAM_BANNABLE: + args->value = ctx->hang_stats.bannable; + break; default: ret = -EINVAL; break; @@ -1130,13 +1133,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, switch (args->param) { case I915_CONTEXT_PARAM_BAN_PERIOD: - if (args->size) - ret = -EINVAL; - else if (args->value < ctx->hang_stats.ban_period_seconds && - !capable(CAP_SYS_ADMIN)) - ret = -EPERM; - else - ctx->hang_stats.ban_period_seconds = args->value; + ret = -EINVAL; break; case I915_CONTEXT_PARAM_NO_ZEROMAP: if (args->size) { @@ -1156,6 +1153,14 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE; } break; + case I915_CONTEXT_PARAM_BANNABLE: + if (args->size) + ret = -EINVAL; + else if (!capable(CAP_SYS_ADMIN) && !args->value) + ret = -EPERM; + else + ctx->hang_stats.bannable = args->value; + break; default: ret = -EINVAL; break; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 3eee6c2..d52453e 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -352,8 +352,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m, if (!erq->seqno) return; - err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n", - prefix, erq->pid, + err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n", + prefix, erq->pid, erq->ban_score, erq->context, erq->seqno, jiffies_to_msecs(jiffies - erq->jiffies), erq->head, erq->tail); @@ -1171,6 +1171,7 @@ static void record_request(struct drm_i915_gem_request *request, struct drm_i915_error_request *erq) { erq->context = request->ctx->hw_id; + erq->ban_score = request->ctx->hang_stats.ban_score; erq->seqno = request->global_seqno; erq->jiffies = request->emitted_jiffies; erq->head = request->head; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 1c12a35..12003f0 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1224,6 +1224,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2 #define I915_CONTEXT_PARAM_GTT_SIZE 0x3 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4 +#define I915_CONTEXT_PARAM_BANNABLE 0x5 __u64 value; }; -- 2.7.4 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply related [flat|nested] 15+ messages in thread
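
Usage illustration, not part of the patch: with I915_CONTEXT_PARAM_BANNABLE in place, a client flips the property through the existing context setparam path. The sketch below assumes an already-open DRM fd and a context id obtained from context create; the DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM name and the header location come from the pre-existing i915 uapi rather than from this series, so treat them as assumptions.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <drm/i915_drm.h>

    /* Returns 0 on success, -1 with errno set otherwise.  Clearing the
     * property (bannable == 0) needs CAP_SYS_ADMIN and fails with EPERM
     * for ordinary users; setting it back to 1 is allowed for everyone.
     */
    static int context_set_bannable(int drm_fd, __u32 ctx_id, int bannable)
    {
            struct drm_i915_gem_context_param p;

            memset(&p, 0, sizeof(p));
            p.ctx_id = ctx_id;
            p.size = 0;     /* setparam rejects a non-zero size with -EINVAL */
            p.param = I915_CONTEXT_PARAM_BANNABLE;
            p.value = bannable;

            return ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
    }

Reading the property back goes through the matching getparam ioctl with the same param value, which simply returns the context's current bannable state.
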
* [PATCH 5/6] drm/i915: Add per client max context ban limit 2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala ` (2 preceding siblings ...) 2016-11-16 15:20 ` [PATCH 4/6] drm/i915: Add bannable context parameter Mika Kuoppala @ 2016-11-16 15:20 ` Mika Kuoppala 2016-11-16 17:12 ` Chris Wilson 2016-11-18 13:10 ` Mika Kuoppala 2016-11-16 15:20 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala ` (2 subsequent siblings) 6 siblings, 2 replies; 15+ messages in thread From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw) To: intel-gfx If we have a bad client submitting unfavourably across different contexts, creating new ones, the per context scoring of badness doesn't remove the root cause, the offending client. To counter, keep track of per client context bans. Deny access if client is responsible for more than 3 context bans in it's lifetime. v2: move ban check to context create ioctl (Chris) Cc: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 4 ++++ drivers/gpu/drm/i915/i915_gem.c | 14 ++++++++++++++ drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++ drivers/gpu/drm/i915/i915_gpu_error.c | 10 ++++++---- 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 74f421a1..b385b9d 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -411,6 +411,9 @@ struct drm_i915_file_private { } rps; unsigned int bsd_engine; + +#define I915_MAX_CLIENT_CONTEXT_BANS 3 + int context_bans; }; /* Used by dp and fdi links */ @@ -867,6 +870,7 @@ struct drm_i915_error_state { pid_t pid; char comm[TASK_COMM_LEN]; + int context_bans; } engine[I915_NUM_ENGINES]; struct drm_i915_error_buffer { diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c48e0d2..a24fd99 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2644,6 +2644,20 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) hs->banned = i915_context_is_banned(ctx); hs->batch_active++; + + DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? 
%s\n", + ctx->name, hs->ban_score, yesno(hs->banned)); + + if (!ctx->file_priv) + return; + + if (hs->banned) { + ctx->file_priv->context_bans++; + + DRM_DEBUG_DRIVER("client %s has has %d context banned\n", + ctx->name, + ctx->file_priv->context_bans); + } } static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index bc1b232..4f88f03 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -1003,6 +1003,11 @@ static bool contexts_enabled(struct drm_device *dev) return i915.enable_execlists || to_i915(dev)->hw_context_size; } +static bool client_is_banned(struct drm_i915_file_private *file_priv) +{ + return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS; +} + int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { @@ -1017,6 +1022,14 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, if (args->pad != 0) return -EINVAL; + if (client_is_banned(file_priv)) { + DRM_DEBUG("client %s[%d] banned from creating ctx\n", + current->comm, + pid_nr(get_task_pid(current, PIDTYPE_PID))); + + return -EIO; + } + ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index d52453e..5990b97 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -539,10 +539,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, for (i = 0; i < ARRAY_SIZE(error->engine); i++) { if (error->engine[i].hangcheck_stall && error->engine[i].pid != -1) { - err_printf(m, "Active process (on ring %s): %s [%d]\n", + err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n", engine_str(i), error->engine[i].comm, - error->engine[i].pid); + error->engine[i].pid, + error->engine[i].context_bans); } } err_printf(m, "Reset count: %u\n", error->reset_count); @@ -633,9 +634,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, if (obj) { err_puts(m, dev_priv->engine[i]->name); if (ee->pid != -1) - err_printf(m, " (submitted by %s [%d])", + err_printf(m, " (submitted by %s [%d], bans %d)", ee->comm, - ee->pid); + ee->pid, + ee->context_bans); err_printf(m, " --- gtt_offset = 0x%08x %08x\n", upper_32_bits(obj->gtt_offset), lower_32_bits(obj->gtt_offset)); -- 2.7.4 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH 5/6] drm/i915: Add per client max context ban limit 2016-11-16 15:20 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala @ 2016-11-16 17:12 ` Chris Wilson 2016-11-18 13:10 ` Mika Kuoppala 1 sibling, 0 replies; 15+ messages in thread From: Chris Wilson @ 2016-11-16 17:12 UTC (permalink / raw) To: Mika Kuoppala; +Cc: intel-gfx On Wed, Nov 16, 2016 at 05:20:33PM +0200, Mika Kuoppala wrote: > If we have a bad client submitting unfavourably across different > contexts, creating new ones, the per context scoring of badness > doesn't remove the root cause, the offending client. > To counter, keep track of per client context bans. Deny access if > client is responsible for more than 3 context bans in > it's lifetime. > > v2: move ban check to context create ioctl (Chris) > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> > --- > drivers/gpu/drm/i915/i915_drv.h | 4 ++++ > drivers/gpu/drm/i915/i915_gem.c | 14 ++++++++++++++ > drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++ > drivers/gpu/drm/i915/i915_gpu_error.c | 10 ++++++---- > 4 files changed, 37 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 74f421a1..b385b9d 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -411,6 +411,9 @@ struct drm_i915_file_private { > } rps; > > unsigned int bsd_engine; > + > +#define I915_MAX_CLIENT_CONTEXT_BANS 3 I'd just like some commentary here giving some rationale for 3. With that, Reviewed-by: Chris Wilson <chris@chris-wilso.co.uk> -Chris -- Chris Wilson, Intel Open Source Technology Centre _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 5/6] drm/i915: Add per client max context ban limit 2016-11-16 15:20 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala 2016-11-16 17:12 ` Chris Wilson @ 2016-11-18 13:10 ` Mika Kuoppala 1 sibling, 0 replies; 15+ messages in thread From: Mika Kuoppala @ 2016-11-18 13:10 UTC (permalink / raw) To: intel-gfx If we have a bad client submitting unfavourably across different contexts, creating new ones, the per context scoring of badness doesn't remove the root cause, the offending client. To counter, keep track of per client context bans. Deny access if client is responsible for more than 3 context bans in it's lifetime. v2: move ban check to context create ioctl (Chris) v3: add commentary about hangs needed to reach client ban (Chris) Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 10 ++++++++++ drivers/gpu/drm/i915/i915_gem.c | 14 ++++++++++++++ drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++ drivers/gpu/drm/i915/i915_gpu_error.c | 10 ++++++---- 4 files changed, 43 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 837abdf..304e4c9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -416,6 +416,15 @@ struct drm_i915_file_private { } rps; unsigned int bsd_engine; + +/* Client can have a maximum of 3 contexts banned before + * it is denied of creating new contexts. As one context + * ban needs 4 consecutive hangs, and more if there is + * progress in between, this is a last resort stop gap measure + * to limit the badly behaving clients access to gpu. + */ +#define I915_MAX_CLIENT_CONTEXT_BANS 3 + int context_bans; }; /* Used by dp and fdi links */ @@ -872,6 +881,7 @@ struct drm_i915_error_state { pid_t pid; char comm[TASK_COMM_LEN]; + int context_bans; } engine[I915_NUM_ENGINES]; struct drm_i915_error_buffer { diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 734b270..dd015bc 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2644,6 +2644,20 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) hs->banned = i915_context_is_banned(ctx); hs->batch_active++; + + DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? 
%s\n", + ctx->name, hs->ban_score, yesno(hs->banned)); + + if (!ctx->file_priv) + return; + + if (hs->banned) { + ctx->file_priv->context_bans++; + + DRM_DEBUG_DRIVER("client %s has has %d context banned\n", + ctx->name, + ctx->file_priv->context_bans); + } } static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index bc1b232..4f88f03 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -1003,6 +1003,11 @@ static bool contexts_enabled(struct drm_device *dev) return i915.enable_execlists || to_i915(dev)->hw_context_size; } +static bool client_is_banned(struct drm_i915_file_private *file_priv) +{ + return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS; +} + int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { @@ -1017,6 +1022,14 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, if (args->pad != 0) return -EINVAL; + if (client_is_banned(file_priv)) { + DRM_DEBUG("client %s[%d] banned from creating ctx\n", + current->comm, + pid_nr(get_task_pid(current, PIDTYPE_PID))); + + return -EIO; + } + ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index fa988a0..af4f0ef 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -538,10 +538,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, for (i = 0; i < ARRAY_SIZE(error->engine); i++) { if (error->engine[i].hangcheck_stalled && error->engine[i].pid != -1) { - err_printf(m, "Active process (on ring %s): %s [%d]\n", + err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n", engine_str(i), error->engine[i].comm, - error->engine[i].pid); + error->engine[i].pid, + error->engine[i].context_bans); } } err_printf(m, "Reset count: %u\n", error->reset_count); @@ -632,9 +633,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, if (obj) { err_puts(m, dev_priv->engine[i]->name); if (ee->pid != -1) - err_printf(m, " (submitted by %s [%d])", + err_printf(m, " (submitted by %s [%d], bans %d)", ee->comm, - ee->pid); + ee->pid, + ee->context_bans); err_printf(m, " --- gtt_offset = 0x%08x %08x\n", upper_32_bits(obj->gtt_offset), lower_32_bits(obj->gtt_offset)); -- 2.7.4 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply related [flat|nested] 15+ messages in thread
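
To make the arithmetic in the new comment concrete, here is a small self-contained model of the scoring. It is an illustration only, not driver code; it compresses the logic spread across i915_gem_context_mark_guilty(), request retirement and client_is_banned(), and the toy_* names are invented for the example.

    #include <stdbool.h>

    #define CONTEXT_SCORE_GUILTY            10
    #define CONTEXT_SCORE_BAN_THRESHOLD     40
    #define I915_MAX_CLIENT_CONTEXT_BANS    3

    struct toy_context { int ban_score; bool banned; };
    struct toy_client  { int context_bans; };

    /* a hang blamed on this context bumps its score; crossing the
     * threshold bans the context and counts against its client
     */
    static void toy_mark_guilty(struct toy_client *client, struct toy_context *ctx)
    {
            ctx->ban_score += CONTEXT_SCORE_GUILTY;

            if (!ctx->banned && ctx->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) {
                    ctx->banned = true;
                    client->context_bans++;
            }
    }

    /* retirement is a sign of progress, so it decays the score */
    static void toy_retire_request(struct toy_context *ctx)
    {
            if (ctx->ban_score > 0)
                    ctx->ban_score--;
    }

    static bool toy_client_may_create_context(const struct toy_client *client)
    {
            return client->context_bans <= I915_MAX_CLIENT_CONTEXT_BANS;
    }

With these numbers, a context that hangs four times without retiring anything in between reaches the threshold and is banned; a client needs more than three such banned contexts before context creation starts failing with -EIO.
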
* [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct 2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala ` (3 preceding siblings ...) 2016-11-16 15:20 ` [PATCH 5/6] drm/i915: Add per client max context ban limit Mika Kuoppala @ 2016-11-16 15:20 ` Mika Kuoppala 2016-11-16 17:13 ` Chris Wilson 2016-11-16 16:16 ` ✗ Fi.CI.BAT: warning for series starting with [1/6] drm/i915: Split up hangcheck phases Patchwork 2016-11-18 14:15 ` ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases (rev3) Patchwork 6 siblings, 1 reply; 15+ messages in thread From: Mika Kuoppala @ 2016-11-16 15:20 UTC (permalink / raw) To: intel-gfx Bannable property, banned status, guilty and active counts are properties of i915_gem_context. Make them so. v2: rebase Cc: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_drv.h | 31 ++++++++++-------------------- drivers/gpu/drm/i915/i915_gem.c | 25 ++++++++++-------------- drivers/gpu/drm/i915/i915_gem_context.c | 12 +++++------- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 +--- drivers/gpu/drm/i915/i915_gem_request.c | 4 ++-- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- 6 files changed, 29 insertions(+), 49 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index b385b9d..8bf95e9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -902,25 +902,6 @@ enum i915_cache_level { I915_CACHE_WT, /* hsw:gt3e WriteThrough for scanouts */ }; -struct i915_ctx_hang_stats { - /* This context had batch pending when hang was declared */ - unsigned batch_pending; - - /* This context had batch active when hang was declared */ - unsigned batch_active; - - bool bannable:1; - - /* This context is banned to submit more work */ - bool banned:1; - -#define CONTEXT_SCORE_GUILTY 10 -#define CONTEXT_SCORE_BAN_THRESHOLD 40 - /* Accumulated score of hangs caused by this context */ - int ban_score; -}; - -/* This must match up with the value previously used for execbuf2.rsvd1. 
*/ #define DEFAULT_CONTEXT_HANDLE 0 /** @@ -950,8 +931,6 @@ struct i915_gem_context { struct pid *pid; const char *name; - struct i915_ctx_hang_stats hang_stats; - unsigned long flags; #define CONTEXT_NO_ZEROMAP BIT(0) #define CONTEXT_NO_ERROR_CAPTURE BIT(1) @@ -980,6 +959,16 @@ struct i915_gem_context { u8 remap_slice; bool closed:1; + bool bannable:1; + bool banned:1; + + unsigned int guilty_count; /* guilty of a hang */ + unsigned int active_count; /* active during hang */ + +#define CONTEXT_SCORE_GUILTY 10 +#define CONTEXT_SCORE_BAN_THRESHOLD 40 + /* Accumulated score of hangs caused by this context */ + int ban_score; }; enum fb_op_origin { diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index a24fd99..b3324a5 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2620,15 +2620,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj, static bool i915_context_is_banned(const struct i915_gem_context *ctx) { - const struct i915_ctx_hang_stats *hs = &ctx->hang_stats; - - if (hs->banned) + if (ctx->banned) return true; - if (!hs->bannable) + if (!ctx->bannable) return false; - if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) { + if (ctx->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) { DRM_DEBUG("context hanging too often, banning!\n"); return true; } @@ -2638,20 +2636,19 @@ static bool i915_context_is_banned(const struct i915_gem_context *ctx) static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) { - struct i915_ctx_hang_stats *hs = &ctx->hang_stats; - - hs->ban_score += CONTEXT_SCORE_GUILTY; + ctx->ban_score += CONTEXT_SCORE_GUILTY; - hs->banned = i915_context_is_banned(ctx); - hs->batch_active++; + ctx->banned = i915_context_is_banned(ctx); + ctx->guilty_count++; DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n", - ctx->name, hs->ban_score, yesno(hs->banned)); + ctx->name, ctx->ban_score, + yesno(ctx->banned)); if (!ctx->file_priv) return; - if (hs->banned) { + if (ctx->banned) { ctx->file_priv->context_bans++; DRM_DEBUG_DRIVER("client %s has has %d context banned\n", @@ -2662,9 +2659,7 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx) { - struct i915_ctx_hang_stats *hs = &ctx->hang_stats; - - hs->batch_pending++; + ctx->active_count++; } struct drm_i915_gem_request * diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 4f88f03..1249b2c 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev, * is no remap info, it will be a NOP. 
*/ ctx->remap_slice = ALL_L3_SLICES(dev_priv); - ctx->hang_stats.bannable = true; + ctx->bannable = true; ctx->ring_size = 4 * PAGE_SIZE; ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) << GEN8_CTX_ADDRESSING_MODE_SHIFT; @@ -1115,7 +1115,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE); break; case I915_CONTEXT_PARAM_BANNABLE: - args->value = ctx->hang_stats.bannable; + args->value = ctx->bannable; break; default: ret = -EINVAL; @@ -1172,7 +1172,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, else if (!capable(CAP_SYS_ADMIN) && !args->value) ret = -EPERM; else - ctx->hang_stats.bannable = args->value; + ctx->bannable = args->value; break; default: ret = -EINVAL; @@ -1188,7 +1188,6 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, { struct drm_i915_private *dev_priv = to_i915(dev); struct drm_i915_reset_stats *args = data; - struct i915_ctx_hang_stats *hs; struct i915_gem_context *ctx; int ret; @@ -1207,15 +1206,14 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, mutex_unlock(&dev->struct_mutex); return PTR_ERR(ctx); } - hs = &ctx->hang_stats; if (capable(CAP_SYS_ADMIN)) args->reset_count = i915_reset_count(&dev_priv->gpu_error); else args->reset_count = 0; - args->batch_active = hs->batch_active; - args->batch_pending = hs->batch_pending; + args->batch_active = ctx->guilty_count; + args->batch_pending = ctx->active_count; mutex_unlock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index e804cb2..c0c24f1 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1232,14 +1232,12 @@ i915_gem_validate_context(struct drm_device *dev, struct drm_file *file, struct intel_engine_cs *engine, const u32 ctx_id) { struct i915_gem_context *ctx; - struct i915_ctx_hang_stats *hs; ctx = i915_gem_context_lookup(file->driver_priv, ctx_id); if (IS_ERR(ctx)) return ctx; - hs = &ctx->hang_stats; - if (hs->banned) { + if (ctx->banned) { DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id); return ERR_PTR(-EIO); } diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index b31d18e..b7cd017 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -256,8 +256,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request) } /* Retirement decays the ban score as it is a sign of ctx progress */ - if (request->ctx->hang_stats.ban_score > 0) - request->ctx->hang_stats.ban_score--; + if (request->ctx->ban_score > 0) + request->ctx->ban_score--; i915_gem_context_put(request->ctx); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 5990b97..3e7489a 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1173,7 +1173,7 @@ static void record_request(struct drm_i915_gem_request *request, struct drm_i915_error_request *erq) { erq->context = request->ctx->hw_id; - erq->ban_score = request->ctx->hang_stats.ban_score; + erq->ban_score = request->ctx->ban_score; erq->seqno = request->global_seqno; erq->jiffies = request->emitted_jiffies; erq->head = request->head; -- 2.7.4 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply 
related [flat|nested] 15+ messages in thread
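
For completeness, a userspace-side sketch (not part of the patch) of where the renamed counters remain visible: the reset stats ioctl keeps reporting them as batch_active/batch_pending, now backed by ctx->guilty_count and ctx->active_count. The DRM_IOCTL_I915_GET_RESET_STATS name and the header path are assumed from the existing uapi.

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <drm/i915_drm.h>

    static void print_ctx_hang_stats(int drm_fd, __u32 ctx_id)
    {
            struct drm_i915_reset_stats rs;

            memset(&rs, 0, sizeof(rs));
            rs.ctx_id = ctx_id;

            if (ioctl(drm_fd, DRM_IOCTL_I915_GET_RESET_STATS, &rs) == 0)
                    printf("ctx %u: guilty of %u hangs, active during %u hangs\n",
                           ctx_id, rs.batch_active, rs.batch_pending);
    }
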
* Re: [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct 2016-11-16 15:20 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala @ 2016-11-16 17:13 ` Chris Wilson 2016-11-21 12:55 ` Mika Kuoppala 0 siblings, 1 reply; 15+ messages in thread From: Chris Wilson @ 2016-11-16 17:13 UTC (permalink / raw) To: Mika Kuoppala; +Cc: intel-gfx On Wed, Nov 16, 2016 at 05:20:34PM +0200, Mika Kuoppala wrote: > Bannable property, banned status, guilty and active counts are > properties of i915_gem_context. Make them so. > > v2: rebase > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> Been hesitating since the substruct might have helped, but after a few reads, I'm happier with the conversion. Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> -Chris -- Chris Wilson, Intel Open Source Technology Centre _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply [flat|nested] 15+ messages in thread
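
For readers following the substruct question, the two layouts being weighed look roughly as below. The field names are lifted from the diffs in this thread, the surrounding members are elided, and the struct names are made up for illustration.

    #include <stdbool.h>

    /* earlier revisions: hang bookkeeping kept in a nested struct */
    struct ctx_with_substruct {
            struct i915_ctx_hang_stats {
                    unsigned int batch_pending;     /* batch pending when hang declared */
                    unsigned int batch_active;      /* batch active when hang declared */
                    bool bannable:1;
                    bool banned:1;
                    int ban_score;
            } hang_stats;
    };

    /* this patch: the same state flattened into i915_gem_context */
    struct ctx_flattened {
            bool bannable:1;
            bool banned:1;
            unsigned int guilty_count;              /* guilty of a hang */
            unsigned int active_count;              /* active during hang */
            int ban_score;
    };
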
* Re: [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct 2016-11-16 17:13 ` Chris Wilson @ 2016-11-21 12:55 ` Mika Kuoppala 0 siblings, 0 replies; 15+ messages in thread From: Mika Kuoppala @ 2016-11-21 12:55 UTC (permalink / raw) To: Chris Wilson; +Cc: intel-gfx Chris Wilson <chris@chris-wilson.co.uk> writes: > On Wed, Nov 16, 2016 at 05:20:34PM +0200, Mika Kuoppala wrote: >> Bannable property, banned status, guilty and active counts are >> properties of i915_gem_context. Make them so. >> >> v2: rebase >> >> Cc: Chris Wilson <chris@chris-wilson.co.uk> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> > > Been hesistating since the substruct might have helped, but after a few > reads, I'm happier with the conversion. > > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Whole series pushed. Thanks for review. -Mika > -Chris > > -- > Chris Wilson, Intel Open Source Technology Centre _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply [flat|nested] 15+ messages in thread
* ✗ Fi.CI.BAT: warning for series starting with [1/6] drm/i915: Split up hangcheck phases 2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala ` (4 preceding siblings ...) 2016-11-16 15:20 ` [PATCH 6/6] drm/i915: Wipe hang stats as an embedded struct Mika Kuoppala @ 2016-11-16 16:16 ` Patchwork 2016-11-18 14:15 ` ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases (rev3) Patchwork 6 siblings, 0 replies; 15+ messages in thread From: Patchwork @ 2016-11-16 16:16 UTC (permalink / raw) To: Mika Kuoppala; +Cc: intel-gfx == Series Details == Series: series starting with [1/6] drm/i915: Split up hangcheck phases URL : https://patchwork.freedesktop.org/series/15423/ State : warning == Summary == Series 15423v1 Series without cover letter https://patchwork.freedesktop.org/api/1.0/series/15423/revisions/1/mbox/ Test drv_module_reload_basic: pass -> DMESG-WARN (fi-skl-6770hq) fi-bdw-5557u total:244 pass:229 dwarn:0 dfail:0 fail:0 skip:15 fi-bsw-n3050 total:244 pass:204 dwarn:0 dfail:0 fail:0 skip:40 fi-bxt-t5700 total:244 pass:216 dwarn:0 dfail:0 fail:0 skip:28 fi-byt-j1900 total:244 pass:216 dwarn:0 dfail:0 fail:0 skip:28 fi-byt-n2820 total:244 pass:212 dwarn:0 dfail:0 fail:0 skip:32 fi-hsw-4770 total:244 pass:224 dwarn:0 dfail:0 fail:0 skip:20 fi-hsw-4770r total:244 pass:224 dwarn:0 dfail:0 fail:0 skip:20 fi-ilk-650 total:244 pass:191 dwarn:0 dfail:0 fail:0 skip:53 fi-ivb-3520m total:244 pass:222 dwarn:0 dfail:0 fail:0 skip:22 fi-ivb-3770 total:244 pass:222 dwarn:0 dfail:0 fail:0 skip:22 fi-kbl-7200u total:244 pass:222 dwarn:0 dfail:0 fail:0 skip:22 fi-skl-6260u total:244 pass:230 dwarn:0 dfail:0 fail:0 skip:14 fi-skl-6700hq total:244 pass:223 dwarn:0 dfail:0 fail:0 skip:21 fi-skl-6700k total:244 pass:222 dwarn:1 dfail:0 fail:0 skip:21 fi-skl-6770hq total:244 pass:229 dwarn:1 dfail:0 fail:0 skip:14 fi-snb-2520m total:244 pass:212 dwarn:0 dfail:0 fail:0 skip:32 fi-snb-2600 total:244 pass:211 dwarn:0 dfail:0 fail:0 skip:33 6294f67611ebe69006c0e85c372efadcac8e9d66 drm-intel-nightly: 2016y-11m-16d-09h-57m-25s UTC integration manifest 7bb86db drm/i915: Wipe hang stats as an embedded struct 73fd54a drm/i915: Add per client max context ban limit 85bae87 drm/i915: Add bannable context parameter 2ae4b63 drm/i915: Use request retirement as context progress 80ee7bc drm/i915: Decouple hang detection from hangcheck period 11ed92d drm/i915: Split up hangcheck phases == Logs == For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3024/ _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply [flat|nested] 15+ messages in thread
* ✓ Fi.CI.BAT: success for series starting with [1/6] drm/i915: Split up hangcheck phases (rev3) 2016-11-16 15:20 [PATCH 1/6] drm/i915: Split up hangcheck phases Mika Kuoppala ` (5 preceding siblings ...) 2016-11-16 16:16 ` ✗ Fi.CI.BAT: warning for series starting with [1/6] drm/i915: Split up hangcheck phases Patchwork @ 2016-11-18 14:15 ` Patchwork 6 siblings, 0 replies; 15+ messages in thread From: Patchwork @ 2016-11-18 14:15 UTC (permalink / raw) To: Mika Kuoppala; +Cc: intel-gfx == Series Details == Series: series starting with [1/6] drm/i915: Split up hangcheck phases (rev3) URL : https://patchwork.freedesktop.org/series/15423/ State : success == Summary == Series 15423v3 Series without cover letter https://patchwork.freedesktop.org/api/1.0/series/15423/revisions/3/mbox/ fi-bdw-5557u total:244 pass:229 dwarn:0 dfail:0 fail:0 skip:15 fi-bsw-n3050 total:244 pass:204 dwarn:0 dfail:0 fail:0 skip:40 fi-bxt-t5700 total:244 pass:216 dwarn:0 dfail:0 fail:0 skip:28 fi-byt-j1900 total:244 pass:216 dwarn:0 dfail:0 fail:0 skip:28 fi-byt-n2820 total:244 pass:212 dwarn:0 dfail:0 fail:0 skip:32 fi-hsw-4770 total:244 pass:224 dwarn:0 dfail:0 fail:0 skip:20 fi-hsw-4770r total:244 pass:224 dwarn:0 dfail:0 fail:0 skip:20 fi-ilk-650 total:244 pass:191 dwarn:0 dfail:0 fail:0 skip:53 fi-ivb-3520m total:244 pass:222 dwarn:0 dfail:0 fail:0 skip:22 fi-ivb-3770 total:244 pass:222 dwarn:0 dfail:0 fail:0 skip:22 fi-kbl-7200u total:244 pass:222 dwarn:0 dfail:0 fail:0 skip:22 fi-skl-6260u total:244 pass:230 dwarn:0 dfail:0 fail:0 skip:14 fi-skl-6700hq total:244 pass:223 dwarn:0 dfail:0 fail:0 skip:21 fi-skl-6700k total:244 pass:222 dwarn:1 dfail:0 fail:0 skip:21 fi-skl-6770hq total:244 pass:230 dwarn:0 dfail:0 fail:0 skip:14 fi-snb-2520m total:244 pass:212 dwarn:0 dfail:0 fail:0 skip:32 fi-snb-2600 total:244 pass:211 dwarn:0 dfail:0 fail:0 skip:33 05104adb7446f310fad4e36f22d4c84ffafa31fc drm-intel-nightly: 2016y-11m-18d-13h-10m-16s UTC integration manifest 8ef5298 drm/i915: Wipe hang stats as an embedded struct dfc655e drm/i915: Add per client max context ban limit c0920d3 drm/i915: Add bannable context parameter 3dc541d drm/i915: Use request retirement as context progress 3f27d62 drm/i915: Decouple hang detection from hangcheck period 4a7b29a drm/i915: Split up hangcheck phases == Logs == For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3054/ _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply [flat|nested] 15+ messages in thread