public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches
@ 2014-02-10 14:30 Mika Kuoppala
  2014-02-10 14:30 ` [PATCH 2/3] drm/i915: Record pid/comm of hanging task Mika Kuoppala
                   ` (3 more replies)
  0 siblings, 4 replies; 19+ messages in thread
From: Mika Kuoppala @ 2014-02-10 14:30 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky, miku

From: Chris Wilson <chris@chris-wilson.co.uk>

In the past, it was possible to have multiple batches per request due to
a stray signal or ENOMEM. As a result we had to scan each active object
(filtered by those having the COMMAND domain) for the one that contained
the ACTHD pointer. This was then made more complicated by the
introduction of ppgtt, whereby ACTHD then pointed into the address space
of the context and so also needed to be taken into account.

This is a fairly robust approach (though the implementation is a little
fragile and depends upon the per-generation setup, registers and
parameters). However, due to the requirements for hangstats, we needed a
robust method for associating batches with a particular request and
having that we can rely upon it for finding the associated batch object
for error capture.

If the batch buffer tracking is not robust enough, that should become
apparent quite quickly through an erroneous error capture. That should
also help to make sure that the runtime reporting to userspace is
robust. It also means that we then report the oldest incomplete batch on
each ring, which can be useful for determining the state of userspace at
the time of a hang.

v2: Use i915_gem_find_active_request (Mika)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v2)
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h       |    3 ++
 drivers/gpu/drm/i915/i915_gem.c       |   13 +++++--
 drivers/gpu/drm/i915/i915_gpu_error.c |   66 ++++-----------------------------
 3 files changed, 20 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b1e91c3..eb260bc 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2147,6 +2147,9 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
 	}
 }
 
+struct drm_i915_gem_request *
+i915_gem_find_active_request(struct intel_ring_buffer *ring);
+
 bool i915_gem_retire_requests(struct drm_device *dev);
 void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
 int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b0a244a..20e55ef 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2298,11 +2298,16 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
 	kfree(request);
 }
 
-static struct drm_i915_gem_request *
-i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
+struct drm_i915_gem_request *
+i915_gem_find_active_request(struct intel_ring_buffer *ring)
 {
 	struct drm_i915_gem_request *request;
-	const u32 completed_seqno = ring->get_seqno(ring, false);
+	u32 completed_seqno;
+
+	if (WARN_ON(!ring->get_seqno))
+		return NULL;
+
+	completed_seqno = ring->get_seqno(ring, false);
 
 	list_for_each_entry(request, &ring->request_list, list) {
 		if (i915_seqno_passed(completed_seqno, request->seqno))
@@ -2320,7 +2325,7 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
 	struct drm_i915_gem_request *request;
 	bool ring_hung;
 
-	request = i915_gem_find_first_non_complete(ring);
+	request = i915_gem_find_active_request(ring);
 
 	if (request == NULL)
 		return;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index dc47bb9..5bd075a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -713,46 +713,14 @@ static void i915_gem_record_fences(struct drm_device *dev,
 	}
 }
 
-/* This assumes all batchbuffers are executed from the PPGTT. It might have to
- * change in the future. */
-static bool is_active_vm(struct i915_address_space *vm,
-			 struct intel_ring_buffer *ring)
-{
-	struct drm_device *dev = vm->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct i915_hw_ppgtt *ppgtt;
-
-	if (INTEL_INFO(dev)->gen < 7)
-		return i915_is_ggtt(vm);
-
-	/* FIXME: This ignores that the global gtt vm is also on this list. */
-	ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
-
-	if (INTEL_INFO(dev)->gen >= 8) {
-		u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
-		pdp0 |=  I915_READ(GEN8_RING_PDP_LDW(ring, 0));
-		return pdp0 == ppgtt->pd_dma_addr[0];
-	} else {
-		u32 pp_db;
-		pp_db = I915_READ(RING_PP_DIR_BASE(ring));
-		return (pp_db >> 10) == ppgtt->pd_offset;
-	}
-}
-
 static struct drm_i915_error_object *
 i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 			     struct intel_ring_buffer *ring)
 {
-	struct i915_address_space *vm;
-	struct i915_vma *vma;
-	struct drm_i915_gem_object *obj;
-	bool found_active = false;
-	u32 seqno;
-
-	if (!ring->get_seqno)
-		return NULL;
+	struct drm_i915_gem_request *request;
 
 	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
+		struct drm_i915_gem_object *obj;
 		u32 acthd = I915_READ(ACTHD);
 
 		if (WARN_ON(ring->id != RCS))
@@ -765,32 +733,14 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 			return i915_error_ggtt_object_create(dev_priv, obj);
 	}
 
-	seqno = ring->get_seqno(ring, false);
-	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
-		if (!is_active_vm(vm, ring))
-			continue;
-
-		found_active = true;
-
-		list_for_each_entry(vma, &vm->active_list, mm_list) {
-			obj = vma->obj;
-			if (obj->ring != ring)
-				continue;
-
-			if (i915_seqno_passed(seqno, obj->last_read_seqno))
-				continue;
-
-			if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
-				continue;
-
-			/* We need to copy these to an anonymous buffer as the simplest
-			 * method to avoid being overwritten by userspace.
-			 */
-			return i915_error_object_create(dev_priv, obj, vm);
-		}
+	request = i915_gem_find_active_request(ring);
+	if (request) {
+		/* We need to copy these to an anonymous buffer as the simplest
+		 * method to avoid being overwritten by userspace.
+		 */
+		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
 	}
 
-	WARN_ON(!found_active);
 	return NULL;
 }
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 2/3] drm/i915: Record pid/comm of hanging task
  2014-02-10 14:30 [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Mika Kuoppala
@ 2014-02-10 14:30 ` Mika Kuoppala
  2014-02-12  2:07   ` Ben Widawsky
  2014-02-10 14:30 ` [PATCH 3/3] drm/i915: Record error state capture reason Mika Kuoppala
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 19+ messages in thread
From: Mika Kuoppala @ 2014-02-10 14:30 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky, miku

From: Chris Wilson <chris@chris-wilson.co.uk>

After finding the guilty batch and request, we can use it to find the
process that submitted the batch and then add the culprit into the error
state.

This is a slightly different approach from Ben's in that instead of
adding the extra information into the struct i915_hw_context, we use the
information already captured in struct drm_file which is then referenced
from the request.

v2: Also capture the workaround buffer for gen2, so that we can compare
    its contents against the intended batch for the active request.

v3: Rebase (Mika)

Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v2)
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v3)
Cc: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h       |    6 +-
 drivers/gpu/drm/i915/i915_gem.c       |    1 +
 drivers/gpu/drm/i915/i915_gpu_error.c |  121 ++++++++++++++++++---------------
 3 files changed, 74 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index eb260bc..2a61a29 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -356,7 +356,7 @@ struct drm_i915_error_state {
 			int page_count;
 			u32 gtt_offset;
 			u32 *pages[0];
-		} *ringbuffer, *batchbuffer, *ctx, *hws_page;
+		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
 		struct drm_i915_error_request {
 			long jiffies;
@@ -371,6 +371,9 @@ struct drm_i915_error_state {
 				u32 pp_dir_base;
 			};
 		} vm_info;
+
+		pid_t pid;
+		char comm[TASK_COMM_LEN];
 	} ring[I915_NUM_RINGS];
 	struct drm_i915_error_buffer {
 		u32 size;
@@ -1786,6 +1789,7 @@ struct drm_i915_gem_request {
 
 struct drm_i915_file_private {
 	struct drm_i915_private *dev_priv;
+	struct drm_file *file;
 
 	struct {
 		spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 20e55ef..efbd1dc 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4883,6 +4883,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file)
 
 	file->driver_priv = file_priv;
 	file_priv->dev_priv = dev->dev_private;
+	file_priv->file = file;
 
 	spin_lock_init(&file_priv->mm.lock);
 	INIT_LIST_HEAD(&file_priv->mm.request_list);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5bd075a..a90971a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -301,13 +301,28 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
 	va_end(args);
 }
 
+static void print_error_obj(struct drm_i915_error_state_buf *m,
+			    struct drm_i915_error_object *obj)
+{
+	int page, offset, elt;
+
+	for (page = offset = 0; page < obj->page_count; page++) {
+		for (elt = 0; elt < PAGE_SIZE/4; elt++) {
+			err_printf(m, "%08x :  %08x\n", offset,
+				   obj->pages[page][elt]);
+			offset += 4;
+		}
+	}
+}
+
 int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			    const struct i915_error_state_file_priv *error_priv)
 {
 	struct drm_device *dev = error_priv->dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error = error_priv->error;
-	int i, j, page, offset, elt;
+	int i, j, offset, elt;
+	int max_hangcheck_score;
 
 	if (!error) {
 		err_printf(m, "no error state collected\n");
@@ -317,6 +332,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
 		   error->time.tv_usec);
 	err_printf(m, "Kernel: " UTS_RELEASE "\n");
+	max_hangcheck_score = 0;
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+		if (error->ring[i].hangcheck_score > max_hangcheck_score)
+			max_hangcheck_score = error->ring[i].hangcheck_score;
+	}
+	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+		if (error->ring[i].hangcheck_score == max_hangcheck_score &&
+		    error->ring[i].pid != -1) {
+			err_printf(m, "Active process (on ring %s): %s [%d]\n",
+				   ring_str(i),
+				   error->ring[i].comm,
+				   error->ring[i].pid);
+		}
+	}
 	err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
 	err_printf(m, "EIR: 0x%08x\n", error->eir);
 	err_printf(m, "IER: 0x%08x\n", error->ier);
@@ -360,17 +389,19 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		struct drm_i915_error_object *obj;
 
 		if ((obj = error->ring[i].batchbuffer)) {
-			err_printf(m, "%s --- gtt_offset = 0x%08x\n",
-				   dev_priv->ring[i].name,
+			err_puts(m, dev_priv->ring[i].name);
+			if (error->ring[i].pid != -1)
+				err_printf(m, " (submitted by %s [%d])",
+					   error->ring[i].comm, error->ring[i].pid);
+			err_printf(m, " --- gtt_offset = 0x%08x\n",
 				   obj->gtt_offset);
-			offset = 0;
-			for (page = 0; page < obj->page_count; page++) {
-				for (elt = 0; elt < PAGE_SIZE/4; elt++) {
-					err_printf(m, "%08x :  %08x\n", offset,
-						   obj->pages[page][elt]);
-					offset += 4;
-				}
-			}
+			print_error_obj(m, obj);
+		}
+
+		if ((obj = error->ring[i].wa_batchbuffer)) {
+			err_printf(m, "%s (w/a) --- gtt_offset = 0x%08x\n",
+				   dev_priv->ring[i].name, obj->gtt_offset);
+			print_error_obj(m, obj);
 		}
 
 		if (error->ring[i].num_requests) {
@@ -389,15 +420,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			err_printf(m, "%s --- ringbuffer = 0x%08x\n",
 				   dev_priv->ring[i].name,
 				   obj->gtt_offset);
-			offset = 0;
-			for (page = 0; page < obj->page_count; page++) {
-				for (elt = 0; elt < PAGE_SIZE/4; elt++) {
-					err_printf(m, "%08x :  %08x\n",
-						   offset,
-						   obj->pages[page][elt]);
-					offset += 4;
-				}
-			}
+			print_error_obj(m, obj);
 		}
 
 		if ((obj = error->ring[i].hws_page)) {
@@ -713,37 +736,6 @@ static void i915_gem_record_fences(struct drm_device *dev,
 	}
 }
 
-static struct drm_i915_error_object *
-i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
-			     struct intel_ring_buffer *ring)
-{
-	struct drm_i915_gem_request *request;
-
-	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
-		struct drm_i915_gem_object *obj;
-		u32 acthd = I915_READ(ACTHD);
-
-		if (WARN_ON(ring->id != RCS))
-			return NULL;
-
-		obj = ring->scratch.obj;
-		if (obj != NULL &&
-		    acthd >= i915_gem_obj_ggtt_offset(obj) &&
-		    acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
-			return i915_error_ggtt_object_create(dev_priv, obj);
-	}
-
-	request = i915_gem_find_active_request(ring);
-	if (request) {
-		/* We need to copy these to an anonymous buffer as the simplest
-		 * method to avoid being overwritten by userspace.
-		 */
-		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
-	}
-
-	return NULL;
-}
-
 static void i915_record_ring_state(struct drm_device *dev,
 				   struct intel_ring_buffer *ring,
 				   struct drm_i915_error_ring *ering)
@@ -892,8 +884,31 @@ static void i915_gem_record_rings(struct drm_device *dev,
 
 		i915_record_ring_state(dev, ring, &error->ring[i]);
 
-		error->ring[i].batchbuffer =
-			i915_error_first_batchbuffer(dev_priv, ring);
+		error->ring[i].pid = -1;
+		request = i915_gem_find_active_request(ring);
+		if (request) {
+			/* We need to copy these to an anonymous buffer as the simplest
+			 * method to avoid being overwritten by userspace.
+			 */
+			error->ring[i].batchbuffer =
+				i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+
+			if (HAS_BROKEN_CS_TLB(dev_priv->dev) && ring->scratch.obj)
+				error->ring[i].wa_batchbuffer =
+					i915_error_ggtt_object_create(dev_priv, ring->scratch.obj);
+
+			if (request->file_priv) {
+				struct task_struct *task;
+
+				rcu_read_lock();
+				task = pid_task(request->file_priv->file->pid, PIDTYPE_PID);
+				if (task) {
+					strcpy(error->ring[i].comm, task->comm);
+					error->ring[i].pid = task->pid;
+				}
+				rcu_read_unlock();
+			}
+		}
 
 		error->ring[i].ringbuffer =
 			i915_error_ggtt_object_create(dev_priv, ring->obj);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 3/3] drm/i915: Record error state capture reason
  2014-02-10 14:30 [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Mika Kuoppala
  2014-02-10 14:30 ` [PATCH 2/3] drm/i915: Record pid/comm of hanging task Mika Kuoppala
@ 2014-02-10 14:30 ` Mika Kuoppala
  2014-02-12  2:32   ` Ben Widawsky
  2014-02-12  1:57 ` [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Ben Widawsky
  2014-02-12 17:47 ` Chris Wilson
  3 siblings, 1 reply; 19+ messages in thread
From: Mika Kuoppala @ 2014-02-10 14:30 UTC (permalink / raw)
  To: intel-gfx; +Cc: miku

We capture error state not only when the GPU hangs but
also on other situations as in interrupt errors and
in situations where we can kick things forward without GPU reset.
There will be log entry on most of these cases. But as error state
capture might be only thing we have, if dmesg was not captured. Or as
in GEN4 case, interrupt error can trigger error state capture without log
entry, the exact reason why capture was made is hard to decipher.

To avoid confusion why the error state was captured, print reason
along with error code into log and also store it into the error state.

References: https://bugs.freedesktop.org/show_bug.cgi?id=74193
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c   |    5 +-
 drivers/gpu/drm/i915/i915_drv.h       |    9 +++-
 drivers/gpu/drm/i915/i915_gpu_error.c |   84 +++++++++++++++++++++++----------
 drivers/gpu/drm/i915/i915_irq.c       |   45 +++++++++++-------
 4 files changed, 98 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 2dc05c3..175e524 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3006,9 +3006,8 @@ i915_wedged_set(void *data, u64 val)
 {
 	struct drm_device *dev = data;
 
-	DRM_INFO("Manually setting wedged to %llu\n", val);
-	i915_handle_error(dev, val);
-
+	i915_handle_error(dev, val,
+			  "Manually setting wedged to %llu", val);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2a61a29..ad41c71 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -299,6 +299,8 @@ struct drm_i915_error_state {
 	struct kref ref;
 	struct timeval time;
 
+	char error_msg[128];
+
 	/* Generic register state */
 	u32 eir;
 	u32 pgtbl_er;
@@ -1994,7 +1996,9 @@ extern void intel_console_resume(struct work_struct *work);
 
 /* i915_irq.c */
 void i915_queue_hangcheck(struct drm_device *dev);
-void i915_handle_error(struct drm_device *dev, bool wedged);
+__printf(3, 4)
+void i915_handle_error(struct drm_device *dev, bool wedged,
+		       const char *fmt, ...);
 
 void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
 							int new_delay);
@@ -2464,7 +2468,8 @@ static inline void i915_error_state_buf_release(
 {
 	kfree(eb->buf);
 }
-void i915_capture_error_state(struct drm_device *dev);
+void i915_capture_error_state(struct drm_device *dev, bool wedge,
+			      const char *error_msg);
 void i915_error_state_get(struct drm_device *dev,
 			  struct i915_error_state_file_priv *error_priv);
 void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index a90971a..dce569b 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -329,6 +329,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		goto out;
 	}
 
+	err_printf(m, "%s\n", error->error_msg);
 	err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
 		   error->time.tv_usec);
 	err_printf(m, "Kernel: " UTS_RELEASE "\n");
@@ -686,7 +687,8 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
  * It's only a small step better than a random number in its current form.
  */
 static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
-					 struct drm_i915_error_state *error)
+					 struct drm_i915_error_state *error,
+					 int *ring_id)
 {
 	uint32_t error_code = 0;
 	int i;
@@ -696,9 +698,14 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
 	 * synchronization commands which almost always appear in the case
 	 * strictly a client bug. Use instdone to differentiate those some.
 	 */
-	for (i = 0; i < I915_NUM_RINGS; i++)
-		if (error->ring[i].hangcheck_action == HANGCHECK_HUNG)
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		if (error->ring[i].hangcheck_action == HANGCHECK_HUNG) {
+			if (ring_id)
+				*ring_id = i;
+
 			return error->ring[i].ipehr ^ error->ring[i].instdone;
+		}
+	}
 
 	return error_code;
 }
@@ -1075,6 +1082,39 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
 	i915_get_extra_instdone(dev, error->extra_instdone);
 }
 
+static void i915_error_generate_msg(struct drm_device *dev,
+				    struct drm_i915_error_state *error,
+				    bool wedge,
+				    const char *error_msg)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 ecode;
+	int ring_id = -1, len;
+
+	ecode = i915_error_generate_code(dev_priv, error, &ring_id);
+
+	len = scnprintf(error->error_msg, sizeof(error->error_msg),
+			"GPU HANG: ecode %d:0x%08x", ring_id, ecode);
+
+	if (ring_id != -1 && error->ring[ring_id].pid != -1)
+			len += scnprintf(error->error_msg + len,
+					 sizeof(error->error_msg) - len,
+					 ", in %s [%d]",
+					 error->ring[ring_id].comm,
+					 error->ring[ring_id].pid);
+
+	if (error_msg)
+		len += scnprintf(error->error_msg + len,
+				 sizeof(error->error_msg) - len,
+				 ", reason: %s",
+				 error_msg);
+
+	if (wedge)
+		len += scnprintf(error->error_msg + len,
+				 sizeof(error->error_msg) - len,
+				 ", resetting GPU");
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -1084,19 +1124,13 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
  * out a structure which becomes available in debugfs for user level tools
  * to pick up.
  */
-void i915_capture_error_state(struct drm_device *dev)
+void i915_capture_error_state(struct drm_device *dev, bool wedged,
+			      const char *error_msg)
 {
 	static bool warned;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error;
 	unsigned long flags;
-	uint32_t ecode;
-
-	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
-	error = dev_priv->gpu_error.first_error;
-	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
-	if (error)
-		return;
 
 	/* Account for pipe specific data like PIPE*STAT */
 	error = kzalloc(sizeof(*error), GFP_ATOMIC);
@@ -1105,30 +1139,21 @@ void i915_capture_error_state(struct drm_device *dev)
 		return;
 	}
 
-	DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
-		 dev->primary->index);
 	kref_init(&error->ref);
 
 	i915_capture_reg_state(dev_priv, error);
 	i915_gem_capture_buffers(dev_priv, error);
 	i915_gem_record_fences(dev, error);
 	i915_gem_record_rings(dev, error);
-	ecode = i915_error_generate_code(dev_priv, error);
-
-	if (!warned) {
-		DRM_INFO("GPU HANG [%x]\n", ecode);
-		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
-		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
-		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
-		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
-		warned = true;
-	}
 
 	do_gettimeofday(&error->time);
 
 	error->overlay = intel_overlay_capture_error_state(dev);
 	error->display = intel_display_capture_error_state(dev);
 
+	i915_error_generate_msg(dev, error, wedged, error_msg);
+	DRM_INFO("%s\n", error->error_msg);
+
 	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
 	if (dev_priv->gpu_error.first_error == NULL) {
 		dev_priv->gpu_error.first_error = error;
@@ -1136,8 +1161,19 @@ void i915_capture_error_state(struct drm_device *dev)
 	}
 	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
 
-	if (error)
+	if (error) {
 		i915_error_state_free(&error->ref);
+		return;
+	}
+
+	if (!warned) {
+		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
+		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
+		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
+		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
+		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", dev->primary->index);
+		warned = true;
+	}
 }
 
 void i915_error_state_get(struct drm_device *dev,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index d4defd8..36bba16 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1223,8 +1223,7 @@ static void snb_gt_irq_handler(struct drm_device *dev,
 	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
 		      GT_BSD_CS_ERROR_INTERRUPT |
 		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
-		DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
-		i915_handle_error(dev, false);
+		i915_handle_error(dev, false, "GT error interrupt 0x%08x", gt_iir);
 	}
 
 	if (gt_iir & GT_PARITY_ERROR(dev))
@@ -1471,8 +1470,9 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
 			notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
 
 		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
-			DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
-			i915_handle_error(dev_priv->dev, false);
+			i915_handle_error(dev_priv->dev, false,
+					  "VEBOX CS error interrupt 0x%08x",
+					  pm_iir);
 		}
 	}
 }
@@ -2174,11 +2174,18 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
  * so userspace knows something bad happened (should trigger collection
  * of a ring dump etc.).
  */
-void i915_handle_error(struct drm_device *dev, bool wedged)
+void i915_handle_error(struct drm_device *dev, bool wedged,
+		       const char *fmt, ...)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	va_list args;
+	char error_msg[80];
 
-	i915_capture_error_state(dev);
+	va_start(args, fmt);
+	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
+	va_end(args);
+
+	i915_capture_error_state(dev, wedged, error_msg);
 	i915_report_and_clear_eir(dev);
 
 	if (wedged) {
@@ -2481,9 +2488,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 	 */
 	tmp = I915_READ_CTL(ring);
 	if (tmp & RING_WAIT) {
-		DRM_ERROR("Kicking stuck wait on %s\n",
-			  ring->name);
-		i915_handle_error(dev, false);
+		i915_handle_error(dev, false,
+				  "Kicking stuck wait on %s",
+				  ring->name);
 		I915_WRITE_CTL(ring, tmp);
 		return HANGCHECK_KICK;
 	}
@@ -2493,9 +2500,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 		default:
 			return HANGCHECK_HUNG;
 		case 1:
-			DRM_ERROR("Kicking stuck semaphore on %s\n",
-				  ring->name);
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, false,
+					  "Kicking stuck semaphore on %s",
+					  ring->name);
 			I915_WRITE_CTL(ring, tmp);
 			return HANGCHECK_KICK;
 		case 0:
@@ -2617,7 +2624,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
 	}
 
 	if (rings_hung)
-		return i915_handle_error(dev, true);
+		return i915_handle_error(dev, true, "Ring hung");
 
 	if (busy_count)
 		/* Reset timer case chip hangs without another request
@@ -3234,7 +3241,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
 		 */
 		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, false,
+					  "Command parser error, iir 0x%08x",
+					  iir);
 
 		for_each_pipe(pipe) {
 			int reg = PIPESTAT(pipe);
@@ -3416,7 +3425,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
 		 */
 		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, false,
+					  "Command parser error, iir 0x%08x",
+					  iir);
 
 		for_each_pipe(pipe) {
 			int reg = PIPESTAT(pipe);
@@ -3653,7 +3664,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
 		 */
 		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, false,
+					  "Command parser error, iir 0x%08x",
+					  iir);
 
 		for_each_pipe(pipe) {
 			int reg = PIPESTAT(pipe);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches
  2014-02-10 14:30 [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Mika Kuoppala
  2014-02-10 14:30 ` [PATCH 2/3] drm/i915: Record pid/comm of hanging task Mika Kuoppala
  2014-02-10 14:30 ` [PATCH 3/3] drm/i915: Record error state capture reason Mika Kuoppala
@ 2014-02-12  1:57 ` Ben Widawsky
  2014-02-12  8:25   ` Chris Wilson
  2014-02-12 17:47 ` Chris Wilson
  3 siblings, 1 reply; 19+ messages in thread
From: Ben Widawsky @ 2014-02-12  1:57 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku, Ben Widawsky

On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> In the past, it was possible to have multiple batches per request due to
> a stray signal or ENOMEM. As a result we had to scan each active object
> (filtered by those having the COMMAND domain) for the one that contained
> the ACTHD pointer. This was then made more complicated by the
> introduction of ppgtt, whereby ACTHD then pointed into the address space
> of the context and so also needed to be taken into account.
> 
> This is a fairly robust approach (though the implementation is a little
> fragile and depends upon the per-generation setup, registers and
> parameters). However, due to the requirements for hangstats, we needed a
> robust method for associating batches with a particular request and
> having that we can rely upon it for finding the associated batch object
> for error capture.
> 
> If the batch buffer tracking is not robust enough, that should become
> apparent quite quickly through an erroneous error capture. That should
> also help to make sure that the runtime reporting to userspace is
> robust. It also means that we then report the oldest incomplete batch on
> each ring, which can be useful for determining the state of userspace at
> the time of a hang.
> 
> v2: Use i915_gem_find_active_request (Mika)
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v2)
> Cc: Ben Widawsky <benjamin.widawsky@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |    3 ++
>  drivers/gpu/drm/i915/i915_gem.c       |   13 +++++--
>  drivers/gpu/drm/i915/i915_gpu_error.c |   66 ++++-----------------------------
>  3 files changed, 20 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b1e91c3..eb260bc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2147,6 +2147,9 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
>  	}
>  }
>  
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring);
> +
>  bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
>  int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b0a244a..20e55ef 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2298,11 +2298,16 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
>  	kfree(request);
>  }
>  
> -static struct drm_i915_gem_request *
> -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring)
>  {
>  	struct drm_i915_gem_request *request;
> -	const u32 completed_seqno = ring->get_seqno(ring, false);
> +	u32 completed_seqno;
> +
> +	if (WARN_ON(!ring->get_seqno))
> +		return NULL;

ring->get_seqno(ring, false) ?

This looks like it's currently wrong below as well.

> +
> +	completed_seqno = ring->get_seqno(ring, false);
>  
>  	list_for_each_entry(request, &ring->request_list, list) {
>  		if (i915_seqno_passed(completed_seqno, request->seqno))
> @@ -2320,7 +2325,7 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  	struct drm_i915_gem_request *request;
>  	bool ring_hung;
>  
> -	request = i915_gem_find_first_non_complete(ring);
> +	request = i915_gem_find_active_request(ring);
>  
>  	if (request == NULL)
>  		return;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index dc47bb9..5bd075a 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -713,46 +713,14 @@ static void i915_gem_record_fences(struct drm_device *dev,
>  	}
>  }
>  
> -/* This assumes all batchbuffers are executed from the PPGTT. It might have to
> - * change in the future. */
> -static bool is_active_vm(struct i915_address_space *vm,
> -			 struct intel_ring_buffer *ring)
> -{
> -	struct drm_device *dev = vm->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	struct i915_hw_ppgtt *ppgtt;
> -
> -	if (INTEL_INFO(dev)->gen < 7)
> -		return i915_is_ggtt(vm);
> -
> -	/* FIXME: This ignores that the global gtt vm is also on this list. */
> -	ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
> -
> -	if (INTEL_INFO(dev)->gen >= 8) {
> -		u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
> -		pdp0 |=  I915_READ(GEN8_RING_PDP_LDW(ring, 0));
> -		return pdp0 == ppgtt->pd_dma_addr[0];
> -	} else {
> -		u32 pp_db;
> -		pp_db = I915_READ(RING_PP_DIR_BASE(ring));
> -		return (pp_db >> 10) == ppgtt->pd_offset;
> -	}
> -}
> -
>  static struct drm_i915_error_object *
>  i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			     struct intel_ring_buffer *ring)
>  {
> -	struct i915_address_space *vm;
> -	struct i915_vma *vma;
> -	struct drm_i915_gem_object *obj;
> -	bool found_active = false;
> -	u32 seqno;
> -
> -	if (!ring->get_seqno)
> -		return NULL;
> +	struct drm_i915_gem_request *request;
>  
>  	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
> +		struct drm_i915_gem_object *obj;
>  		u32 acthd = I915_READ(ACTHD);
>  
>  		if (WARN_ON(ring->id != RCS))
> @@ -765,32 +733,14 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>  			return i915_error_ggtt_object_create(dev_priv, obj);
>  	}
>  
> -	seqno = ring->get_seqno(ring, false);
> -	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> -		if (!is_active_vm(vm, ring))
> -			continue;
> -
> -		found_active = true;
> -
> -		list_for_each_entry(vma, &vm->active_list, mm_list) {
> -			obj = vma->obj;
> -			if (obj->ring != ring)
> -				continue;
> -
> -			if (i915_seqno_passed(seqno, obj->last_read_seqno))
> -				continue;
> -
> -			if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
> -				continue;
> -
> -			/* We need to copy these to an anonymous buffer as the simplest
> -			 * method to avoid being overwritten by userspace.
> -			 */
> -			return i915_error_object_create(dev_priv, obj, vm);
> -		}
> +	request = i915_gem_find_active_request(ring);
> +	if (request) {
> +		/* We need to copy these to an anonymous buffer as the simplest
> +		 * method to avoid being overwritten by userspace.
> +		 */
> +		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);

Wrap this one. It's too long even for my undiscerning eye.

>  	}
>  
> -	WARN_ON(!found_active);
>  	return NULL;
>  }
>  


I still would have preferred to keep both methods for a while until we
felt really comfortable with this... the commit message's statement that
getting bad error state is inevitably a good thing is total bunk, I
think.

However, I'll act as a Daniel proxy here who seems in favor of this
path. The code is definitely simpler, I am just a chicken.

With the above fixed:
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/i915: Record pid/comm of hanging task
  2014-02-10 14:30 ` [PATCH 2/3] drm/i915: Record pid/comm of hanging task Mika Kuoppala
@ 2014-02-12  2:07   ` Ben Widawsky
  2014-02-12  8:15     ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Ben Widawsky @ 2014-02-12  2:07 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Mon, Feb 10, 2014 at 04:30:50PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> After finding the guilty batch and request, we can use it to find the
> process that submitted the batch and then add the culprit into the error
> state.
> 
> This is a slightly different approach from Ben's in that instead of
> adding the extra information into the struct i915_hw_context, we use the
> information already captured in struct drm_file which is then referenced
> from the request.
> 
> v2: Also capture the workaround buffer for gen2, so that we can compare
>     its contents against the intended batch for the active request.
> 
> v3: Rebase (Mika)
> 
> Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v2)
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v3)
> Cc: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |    6 +-
>  drivers/gpu/drm/i915/i915_gem.c       |    1 +
>  drivers/gpu/drm/i915/i915_gpu_error.c |  121 ++++++++++++++++++---------------
>  3 files changed, 74 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index eb260bc..2a61a29 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -356,7 +356,7 @@ struct drm_i915_error_state {
>  			int page_count;
>  			u32 gtt_offset;
>  			u32 *pages[0];
> -		} *ringbuffer, *batchbuffer, *ctx, *hws_page;
> +		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
>  
>  		struct drm_i915_error_request {
>  			long jiffies;
> @@ -371,6 +371,9 @@ struct drm_i915_error_state {
>  				u32 pp_dir_base;
>  			};
>  		} vm_info;
> +
> +		pid_t pid;
> +		char comm[TASK_COMM_LEN];
>  	} ring[I915_NUM_RINGS];
>  	struct drm_i915_error_buffer {
>  		u32 size;
> @@ -1786,6 +1789,7 @@ struct drm_i915_gem_request {
>  
>  struct drm_i915_file_private {
>  	struct drm_i915_private *dev_priv;
> +	struct drm_file *file;
>  
>  	struct {
>  		spinlock_t lock;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 20e55ef..efbd1dc 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4883,6 +4883,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file)
>  
>  	file->driver_priv = file_priv;
>  	file_priv->dev_priv = dev->dev_private;
> +	file_priv->file = file;
>  
>  	spin_lock_init(&file_priv->mm.lock);
>  	INIT_LIST_HEAD(&file_priv->mm.request_list);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 5bd075a..a90971a 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -301,13 +301,28 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
>  	va_end(args);
>  }
>  
> +static void print_error_obj(struct drm_i915_error_state_buf *m,
> +			    struct drm_i915_error_object *obj)
> +{
> +	int page, offset, elt;
> +
> +	for (page = offset = 0; page < obj->page_count; page++) {
> +		for (elt = 0; elt < PAGE_SIZE/4; elt++) {
> +			err_printf(m, "%08x :  %08x\n", offset,
> +				   obj->pages[page][elt]);
> +			offset += 4;
> +		}
> +	}
> +}
> +
>  int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  			    const struct i915_error_state_file_priv *error_priv)
>  {
>  	struct drm_device *dev = error_priv->dev;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
>  	struct drm_i915_error_state *error = error_priv->error;
> -	int i, j, page, offset, elt;
> +	int i, j, offset, elt;
> +	int max_hangcheck_score;
>  
>  	if (!error) {
>  		err_printf(m, "no error state collected\n");
> @@ -317,6 +332,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
>  		   error->time.tv_usec);
>  	err_printf(m, "Kernel: " UTS_RELEASE "\n");
> +	max_hangcheck_score = 0;
> +	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
> +		if (error->ring[i].hangcheck_score > max_hangcheck_score)
> +			max_hangcheck_score = error->ring[i].hangcheck_score;
> +	}
> +	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
> +		if (error->ring[i].hangcheck_score == max_hangcheck_score &&
> +		    error->ring[i].pid != -1) {
> +			err_printf(m, "Active process (on ring %s): %s [%d]\n",
> +				   ring_str(i),
> +				   error->ring[i].comm,
> +				   error->ring[i].pid);
> +		}
> +	}
>  	err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
>  	err_printf(m, "EIR: 0x%08x\n", error->eir);
>  	err_printf(m, "IER: 0x%08x\n", error->ier);
> @@ -360,17 +389,19 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  		struct drm_i915_error_object *obj;
>  
>  		if ((obj = error->ring[i].batchbuffer)) {
> -			err_printf(m, "%s --- gtt_offset = 0x%08x\n",
> -				   dev_priv->ring[i].name,
> +			err_puts(m, dev_priv->ring[i].name);
> +			if (error->ring[i].pid != -1)
> +				err_printf(m, " (submitted by %s [%d])",
> +					   error->ring[i].comm, error->ring[i].pid);
> +			err_printf(m, " --- gtt_offset = 0x%08x\n",
>  				   obj->gtt_offset);
> -			offset = 0;
> -			for (page = 0; page < obj->page_count; page++) {
> -				for (elt = 0; elt < PAGE_SIZE/4; elt++) {
> -					err_printf(m, "%08x :  %08x\n", offset,
> -						   obj->pages[page][elt]);
> -					offset += 4;
> -				}
> -			}
> +			print_error_obj(m, obj);
> +		}
> +
> +		if ((obj = error->ring[i].wa_batchbuffer)) {
> +			err_printf(m, "%s (w/a) --- gtt_offset = 0x%08x\n",
> +				   dev_priv->ring[i].name, obj->gtt_offset);
> +			print_error_obj(m, obj);
>  		}
>  
>  		if (error->ring[i].num_requests) {
> @@ -389,15 +420,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  			err_printf(m, "%s --- ringbuffer = 0x%08x\n",
>  				   dev_priv->ring[i].name,
>  				   obj->gtt_offset);
> -			offset = 0;
> -			for (page = 0; page < obj->page_count; page++) {
> -				for (elt = 0; elt < PAGE_SIZE/4; elt++) {
> -					err_printf(m, "%08x :  %08x\n",
> -						   offset,
> -						   obj->pages[page][elt]);
> -					offset += 4;
> -				}
> -			}
> +			print_error_obj(m, obj);
>  		}
>  
>  		if ((obj = error->ring[i].hws_page)) {
> @@ -713,37 +736,6 @@ static void i915_gem_record_fences(struct drm_device *dev,
>  	}
>  }
>  
> -static struct drm_i915_error_object *
> -i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
> -			     struct intel_ring_buffer *ring)
> -{
> -	struct drm_i915_gem_request *request;
> -
> -	if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
> -		struct drm_i915_gem_object *obj;
> -		u32 acthd = I915_READ(ACTHD);
> -
> -		if (WARN_ON(ring->id != RCS))
> -			return NULL;
> -
> -		obj = ring->scratch.obj;
> -		if (obj != NULL &&
> -		    acthd >= i915_gem_obj_ggtt_offset(obj) &&
> -		    acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
> -			return i915_error_ggtt_object_create(dev_priv, obj);
> -	}
> -
> -	request = i915_gem_find_active_request(ring);
> -	if (request) {
> -		/* We need to copy these to an anonymous buffer as the simplest
> -		 * method to avoid being overwritten by userspace.
> -		 */
> -		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
> -	}
> -
> -	return NULL;
> -}
> -
>  static void i915_record_ring_state(struct drm_device *dev,
>  				   struct intel_ring_buffer *ring,
>  				   struct drm_i915_error_ring *ering)
> @@ -892,8 +884,31 @@ static void i915_gem_record_rings(struct drm_device *dev,
>  
>  		i915_record_ring_state(dev, ring, &error->ring[i]);
>  
> -		error->ring[i].batchbuffer =
> -			i915_error_first_batchbuffer(dev_priv, ring);
> +		error->ring[i].pid = -1;
> +		request = i915_gem_find_active_request(ring);
> +		if (request) {
> +			/* We need to copy these to an anonymous buffer as the simplest
> +			 * method to avoid being overwritten by userspace.
> +			 */
> +			error->ring[i].batchbuffer =
> +				i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
> +
> +			if (HAS_BROKEN_CS_TLB(dev_priv->dev) && ring->scratch.obj)
> +				error->ring[i].wa_batchbuffer =
> +					i915_error_ggtt_object_create(dev_priv, ring->scratch.obj);
> +
> +			if (request->file_priv) {
> +				struct task_struct *task;
> +
> +				rcu_read_lock();
> +				task = pid_task(request->file_priv->file->pid, PIDTYPE_PID);
> +				if (task) {
> +					strcpy(error->ring[i].comm, task->comm);
> +					error->ring[i].pid = task->pid;
> +				}
> +				rcu_read_unlock();
> +			}

I still like my solution which does the strcpy on context creation. Your
solution is deferred in the usual case since the hangcheck is
asynchronous. The process could be long dead and gone.

Also, I think my solution is simpler, and with full PPGTT should get the
same information.

> +		}
>  
>  		error->ring[i].ringbuffer =
>  			i915_error_ggtt_object_create(dev_priv, ring->obj);

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/3] drm/i915: Record error state capture reason
  2014-02-10 14:30 ` [PATCH 3/3] drm/i915: Record error state capture reason Mika Kuoppala
@ 2014-02-12  2:32   ` Ben Widawsky
  2014-02-12  8:13     ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Ben Widawsky @ 2014-02-12  2:32 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku

On Mon, Feb 10, 2014 at 04:30:51PM +0200, Mika Kuoppala wrote:
> We capture error state not only when the GPU hangs but
> also on other situations as in interrupt errors and
> in situations where we can kick things forward without GPU reset.
> There will be log entry on most of these cases. But as error state
> capture might be only thing we have, if dmesg was not captured. Or as
> in GEN4 case, interrupt error can trigger error state capture without log
> entry, the exact reason why capture was made is hard to decipher.
> 
> To avoid confusion why the error state was captured, print reason
> along with error code into log and also store it into the error state.
> 

I could have done with a better commit subject and message. We already
capture error state, so "Record error state capture" is a bit weird.
Also, I think the commit message glosses over the main point of why
these cases are special as opposed to hangcheck.

> References: https://bugs.freedesktop.org/show_bug.cgi?id=74193
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c   |    5 +-
>  drivers/gpu/drm/i915/i915_drv.h       |    9 +++-
>  drivers/gpu/drm/i915/i915_gpu_error.c |   84 +++++++++++++++++++++++----------
>  drivers/gpu/drm/i915/i915_irq.c       |   45 +++++++++++-------
>  4 files changed, 98 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 2dc05c3..175e524 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3006,9 +3006,8 @@ i915_wedged_set(void *data, u64 val)
>  {
>  	struct drm_device *dev = data;
>  
> -	DRM_INFO("Manually setting wedged to %llu\n", val);
> -	i915_handle_error(dev, val);
> -
> +	i915_handle_error(dev, val,
> +			  "Manually setting wedged to %llu", val);
>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2a61a29..ad41c71 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -299,6 +299,8 @@ struct drm_i915_error_state {
>  	struct kref ref;
>  	struct timeval time;
>  
> +	char error_msg[128];
> +
>  	/* Generic register state */
>  	u32 eir;
>  	u32 pgtbl_er;
> @@ -1994,7 +1996,9 @@ extern void intel_console_resume(struct work_struct *work);
>  
>  /* i915_irq.c */
>  void i915_queue_hangcheck(struct drm_device *dev);
> -void i915_handle_error(struct drm_device *dev, bool wedged);
> +__printf(3, 4)
> +void i915_handle_error(struct drm_device *dev, bool wedged,
> +		       const char *fmt, ...);
>  
>  void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
>  							int new_delay);
> @@ -2464,7 +2468,8 @@ static inline void i915_error_state_buf_release(
>  {
>  	kfree(eb->buf);
>  }
> -void i915_capture_error_state(struct drm_device *dev);
> +void i915_capture_error_state(struct drm_device *dev, bool wedge,
> +			      const char *error_msg);
>  void i915_error_state_get(struct drm_device *dev,
>  			  struct i915_error_state_file_priv *error_priv);
>  void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a90971a..dce569b 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -329,6 +329,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  		goto out;
>  	}
>  
> +	err_printf(m, "%s\n", error->error_msg);
>  	err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
>  		   error->time.tv_usec);
>  	err_printf(m, "Kernel: " UTS_RELEASE "\n");
> @@ -686,7 +687,8 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
>   * It's only a small step better than a random number in its current form.
>   */
>  static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
> -					 struct drm_i915_error_state *error)
> +					 struct drm_i915_error_state *error,
> +					 int *ring_id)
>  {
>  	uint32_t error_code = 0;
>  	int i;
> @@ -696,9 +698,14 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
>  	 * synchronization commands which almost always appear in the case
>  	 * strictly a client bug. Use instdone to differentiate those some.
>  	 */
> -	for (i = 0; i < I915_NUM_RINGS; i++)
> -		if (error->ring[i].hangcheck_action == HANGCHECK_HUNG)
> +	for (i = 0; i < I915_NUM_RINGS; i++) {
> +		if (error->ring[i].hangcheck_action == HANGCHECK_HUNG) {
> +			if (ring_id)
> +				*ring_id = i;
> +
>  			return error->ring[i].ipehr ^ error->ring[i].instdone;
> +		}
> +	}
>  
>  	return error_code;
>  }
> @@ -1075,6 +1082,39 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
>  	i915_get_extra_instdone(dev, error->extra_instdone);
>  }
>  
> +static void i915_error_generate_msg(struct drm_device *dev,
> +				    struct drm_i915_error_state *error,
> +				    bool wedge,
> +				    const char *error_msg)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	u32 ecode;
> +	int ring_id = -1, len;
> +
> +	ecode = i915_error_generate_code(dev_priv, error, &ring_id);
> +
> +	len = scnprintf(error->error_msg, sizeof(error->error_msg),
> +			"GPU HANG: ecode %d:0x%08x", ring_id, ecode);
> +
> +	if (ring_id != -1 && error->ring[ring_id].pid != -1)
> +			len += scnprintf(error->error_msg + len,
> +					 sizeof(error->error_msg) - len,
> +					 ", in %s [%d]",
> +					 error->ring[ring_id].comm,
> +					 error->ring[ring_id].pid);
> +
> +	if (error_msg)

I'd drop this and just always have an error_msg. But whatever you want
is fine.

> +		len += scnprintf(error->error_msg + len,
> +				 sizeof(error->error_msg) - len,
> +				 ", reason: %s",
> +				 error_msg);
> +
> +	if (wedge)
> +		len += scnprintf(error->error_msg + len,
> +				 sizeof(error->error_msg) - len,
> +				 ", resetting GPU");

I also don't think the fact that we're resetting the GPU belongs here
(for instance, if it's disabled via modparam). On the other hand, I'd be
in favor of a taint on GPU reset for all future error states.

> +}
> +
>  /**
>   * i915_capture_error_state - capture an error record for later analysis
>   * @dev: drm device
> @@ -1084,19 +1124,13 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
>   * out a structure which becomes available in debugfs for user level tools
>   * to pick up.
>   */
> -void i915_capture_error_state(struct drm_device *dev)
> +void i915_capture_error_state(struct drm_device *dev, bool wedged,
> +			      const char *error_msg)
>  {
>  	static bool warned;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct drm_i915_error_state *error;
>  	unsigned long flags;
> -	uint32_t ecode;
> -
> -	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> -	error = dev_priv->gpu_error.first_error;
> -	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
> -	if (error)
> -		return;

This seems like it should be a separate patch, or else I'm not clear why
you've introduced this change in this series. It doesn't seem to be
related to what the rest of the patch does.

>  
>  	/* Account for pipe specific data like PIPE*STAT */
>  	error = kzalloc(sizeof(*error), GFP_ATOMIC);
> @@ -1105,30 +1139,21 @@ void i915_capture_error_state(struct drm_device *dev)
>  		return;
>  	}
>  
> -	DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
> -		 dev->primary->index);
>  	kref_init(&error->ref);
>  
>  	i915_capture_reg_state(dev_priv, error);
>  	i915_gem_capture_buffers(dev_priv, error);
>  	i915_gem_record_fences(dev, error);
>  	i915_gem_record_rings(dev, error);
> -	ecode = i915_error_generate_code(dev_priv, error);
> -
> -	if (!warned) {
> -		DRM_INFO("GPU HANG [%x]\n", ecode);
> -		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
> -		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
> -		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
> -		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
> -		warned = true;
> -	}
>  
>  	do_gettimeofday(&error->time);
>  
>  	error->overlay = intel_overlay_capture_error_state(dev);
>  	error->display = intel_display_capture_error_state(dev);
>  
> +	i915_error_generate_msg(dev, error, wedged, error_msg);
> +	DRM_INFO("%s\n", error->error_msg);
> +
>  	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
>  	if (dev_priv->gpu_error.first_error == NULL) {
>  		dev_priv->gpu_error.first_error = error;
> @@ -1136,8 +1161,19 @@ void i915_capture_error_state(struct drm_device *dev)
>  	}
>  	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
>  
> -	if (error)
> +	if (error) {
>  		i915_error_state_free(&error->ref);
> +		return;
> +	}

Same comment as above.

> +
> +	if (!warned) {
> +		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
> +		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
> +		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
> +		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
> +		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", dev->primary->index);
> +		warned = true;
> +	}
>  }
>  
>  void i915_error_state_get(struct drm_device *dev,
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index d4defd8..36bba16 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1223,8 +1223,7 @@ static void snb_gt_irq_handler(struct drm_device *dev,
>  	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
>  		      GT_BSD_CS_ERROR_INTERRUPT |
>  		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
> -		DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
> -		i915_handle_error(dev, false);
> +		i915_handle_error(dev, false, "GT error interrupt 0x%08x", gt_iir);
>  	}
>  
>  	if (gt_iir & GT_PARITY_ERROR(dev))
> @@ -1471,8 +1470,9 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
>  			notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
>  
>  		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
> -			DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
> -			i915_handle_error(dev_priv->dev, false);
> +			i915_handle_error(dev_priv->dev, false,
> +					  "VEBOX CS error interrupt 0x%08x",
> +					  pm_iir);
>  		}
>  	}
>  }
> @@ -2174,11 +2174,18 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
>   * so userspace knows something bad happened (should trigger collection
>   * of a ring dump etc.).
>   */
> -void i915_handle_error(struct drm_device *dev, bool wedged)
> +void i915_handle_error(struct drm_device *dev, bool wedged,
> +		       const char *fmt, ...)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> +	va_list args;
> +	char error_msg[80];
>  
> -	i915_capture_error_state(dev);
> +	va_start(args, fmt);
> +	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> +	va_end(args);
> +
> +	i915_capture_error_state(dev, wedged, error_msg);

As I said above regarding always having an error_msg, I'd be in favor of
moving most of this to capture error state, and just passing along the
string - but whatever you want is fine with me.

>  	i915_report_and_clear_eir(dev);
>  
>  	if (wedged) {
> @@ -2481,9 +2488,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
>  	 */
>  	tmp = I915_READ_CTL(ring);
>  	if (tmp & RING_WAIT) {
> -		DRM_ERROR("Kicking stuck wait on %s\n",
> -			  ring->name);
> -		i915_handle_error(dev, false);
> +		i915_handle_error(dev, false,
> +				  "Kicking stuck wait on %s",
> +				  ring->name);
>  		I915_WRITE_CTL(ring, tmp);
>  		return HANGCHECK_KICK;
>  	}
> @@ -2493,9 +2500,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
>  		default:
>  			return HANGCHECK_HUNG;
>  		case 1:
> -			DRM_ERROR("Kicking stuck semaphore on %s\n",
> -				  ring->name);
> -			i915_handle_error(dev, false);
> +			i915_handle_error(dev, false,
> +					  "Kicking stuck semaphore on %s",
> +					  ring->name);
>  			I915_WRITE_CTL(ring, tmp);
>  			return HANGCHECK_KICK;
>  		case 0:
> @@ -2617,7 +2624,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  	}
>  
>  	if (rings_hung)
> -		return i915_handle_error(dev, true);
> +		return i915_handle_error(dev, true, "Ring hung");
>  
>  	if (busy_count)
>  		/* Reset timer case chip hangs without another request
> @@ -3234,7 +3241,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
>  		 */
>  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
>  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -			i915_handle_error(dev, false);
> +			i915_handle_error(dev, false,
> +					  "Command parser error, iir 0x%08x",
> +					  iir);
>  
>  		for_each_pipe(pipe) {
>  			int reg = PIPESTAT(pipe);
> @@ -3416,7 +3425,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
>  		 */
>  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
>  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -			i915_handle_error(dev, false);
> +			i915_handle_error(dev, false,
> +					  "Command parser error, iir 0x%08x",
> +					  iir);
>  
>  		for_each_pipe(pipe) {
>  			int reg = PIPESTAT(pipe);
> @@ -3653,7 +3664,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
>  		 */
>  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
>  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -			i915_handle_error(dev, false);
> +			i915_handle_error(dev, false,
> +					  "Command parser error, iir 0x%08x",
> +					  iir);
>  

Since you're not directly using any of the DRM_ printers, we lose the
file/line. Would you care to add __FILE__/__LINE? I think it would be
helpful - not sure what others thing.

>  		for_each_pipe(pipe) {
>  			int reg = PIPESTAT(pipe);
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/3] drm/i915: Record error state capture reason
  2014-02-12  2:32   ` Ben Widawsky
@ 2014-02-12  8:13     ` Chris Wilson
  2014-02-12 19:05       ` Ben Widawsky
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2014-02-12  8:13 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx, miku

On Tue, Feb 11, 2014 at 06:32:24PM -0800, Ben Widawsky wrote:
> > @@ -3234,7 +3241,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
> >  		 */
> >  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
> >  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> > -			i915_handle_error(dev, false);
> > +			i915_handle_error(dev, false,
> > +					  "Command parser error, iir 0x%08x",
> > +					  iir);
> >  
> >  		for_each_pipe(pipe) {
> >  			int reg = PIPESTAT(pipe);
> > @@ -3416,7 +3425,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
> >  		 */
> >  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
> >  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> > -			i915_handle_error(dev, false);
> > +			i915_handle_error(dev, false,
> > +					  "Command parser error, iir 0x%08x",
> > +					  iir);
> >  
> >  		for_each_pipe(pipe) {
> >  			int reg = PIPESTAT(pipe);
> > @@ -3653,7 +3664,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
> >  		 */
> >  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
> >  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> > -			i915_handle_error(dev, false);
> > +			i915_handle_error(dev, false,
> > +					  "Command parser error, iir 0x%08x",
> > +					  iir);
> >  
> 
> Since you're not directly using any of the DRM_ printers, we lose the
> file/line. Would you care to add __FILE__/__LINE? I think it would be
> helpful - not sure what others thing.

In a user-facing message, I don't think so. In the error state we have
the GPU generation and reason, so we know which irq handler fired in
that case. I don't see what extra information func:line gives us since
the messenger (the irq handler) is pretty uninteresting wrt the hang.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/i915: Record pid/comm of hanging task
  2014-02-12  2:07   ` Ben Widawsky
@ 2014-02-12  8:15     ` Chris Wilson
  2014-02-12 18:55       ` Ben Widawsky
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2014-02-12  8:15 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx, miku

On Tue, Feb 11, 2014 at 06:07:09PM -0800, Ben Widawsky wrote:
> I still like my solution which does the strcpy on context creation. Your
> solution is deferred in the usual case since the hangcheck is
> asynchronous. The process could be long dead and gone.

I disliked your solution since there was never a need to copy the string
upon process creation, you could just have kept a reference to the
pid...

> Also, I think my solution is simpler, and with full PPGTT should get the
> same information.

I disagree that your solution was simpler and just introduced redundant
storage.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches
  2014-02-12  1:57 ` [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Ben Widawsky
@ 2014-02-12  8:25   ` Chris Wilson
  2014-02-12 19:02     ` Ben Widawsky
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2014-02-12  8:25 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx, miku, Ben Widawsky

On Tue, Feb 11, 2014 at 05:57:19PM -0800, Ben Widawsky wrote:
> On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> > -static struct drm_i915_gem_request *
> > -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> > +struct drm_i915_gem_request *
> > +i915_gem_find_active_request(struct intel_ring_buffer *ring)
> >  {
> >  	struct drm_i915_gem_request *request;
> > -	const u32 completed_seqno = ring->get_seqno(ring, false);
> > +	u32 completed_seqno;
> > +
> > +	if (WARN_ON(!ring->get_seqno))
> > +		return NULL;
> 
> ring->get_seqno(ring, false) ?

?

This was originally used in the error capture code to detect
uninitialised rings.

> This looks like it's currently wrong below as well.

?
 
> I still would have preferred to keep both methods for a while until we
> felt really comfortable with this... the commit message's statement that
> getting bad error state is inevitably a good thing is total bunk, I
> think.

I strongly disagree. One of the critical factors you first scan for when
reading error states is whether it is self-consistent. If the error
state is inconsistent, we also know that some of our critical
infrastructure is wrong - which we can't know if the error capture code
was different. And in the cases where the error state is inconsistent,
the scanning method would also be snafu (because ACTHD is no longer
within the expected bounds).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches
  2014-02-10 14:30 [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Mika Kuoppala
                   ` (2 preceding siblings ...)
  2014-02-12  1:57 ` [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Ben Widawsky
@ 2014-02-12 17:47 ` Chris Wilson
  3 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2014-02-12 17:47 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx, miku, Ben Widawsky

On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> +	request = i915_gem_find_active_request(ring);
> +	if (request) {
> +		/* We need to copy these to an anonymous buffer as the simplest
> +		 * method to avoid being overwritten by userspace.
> +		 */
> +		return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);

This needs to protect against ctx == NULL, e.g.
  request->ctx ? request->ctx->vm : &dev_priv->gtt.base)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/i915: Record pid/comm of hanging task
  2014-02-12  8:15     ` Chris Wilson
@ 2014-02-12 18:55       ` Ben Widawsky
  2014-02-12 19:18         ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Ben Widawsky @ 2014-02-12 18:55 UTC (permalink / raw)
  To: Chris Wilson, Mika Kuoppala, intel-gfx, miku

On Wed, Feb 12, 2014 at 08:15:58AM +0000, Chris Wilson wrote:
> On Tue, Feb 11, 2014 at 06:07:09PM -0800, Ben Widawsky wrote:
> > I still like my solution which does the strcpy on context creation. Your
> > solution is deferred in the usual case since the hangcheck is
> > asynchronous. The process could be long dead and gone.
> 
> I disliked your solution since there was never a need to copy the string
> upon process creation, you could just have kept a reference to the
> pid...
> 
> > Also, I think my solution is simpler, and with full PPGTT should get the
> > same information.
> 
> I disagree that your solution was simpler and just introduced redundant
> storage.
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre

Are you opposed to doing anything at context creation? pid reference
works for me too, or hold on to it with the request - I don't care.

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches
  2014-02-12  8:25   ` Chris Wilson
@ 2014-02-12 19:02     ` Ben Widawsky
  2014-02-12 19:15       ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Ben Widawsky @ 2014-02-12 19:02 UTC (permalink / raw)
  To: Chris Wilson, Mika Kuoppala, intel-gfx, miku, Ben Widawsky

On Wed, Feb 12, 2014 at 08:25:58AM +0000, Chris Wilson wrote:
> On Tue, Feb 11, 2014 at 05:57:19PM -0800, Ben Widawsky wrote:
> > On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> > > -static struct drm_i915_gem_request *
> > > -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> > > +struct drm_i915_gem_request *
> > > +i915_gem_find_active_request(struct intel_ring_buffer *ring)
> > >  {
> > >  	struct drm_i915_gem_request *request;
> > > -	const u32 completed_seqno = ring->get_seqno(ring, false);
> > > +	u32 completed_seqno;
> > > +
> > > +	if (WARN_ON(!ring->get_seqno))
> > > +		return NULL;
> > 
> > ring->get_seqno(ring, false) ?
> 
> ?
> 
> This was originally used in the error capture code to detect
> uninitialised rings.

We have a new flag for that now, don't we? Also, I don't think we can
actually have an uninitialized ring at this point.

> 
> > This looks like it's currently wrong below as well.
> 
> ?
>  
> > I still would have preferred to keep both methods for a while until we
> > felt really comfortable with this... the commit message's statement that
> > getting bad error state is inevitably a good thing is total bunk, I
> > think.
> 
> I strongly disagree. One of the critical factors you first scan for when
> reading error states is whether it is self-consistent. If the error
> state is inconsistent, we also know that some of our critical
> infrastructure is wrong - which we can't know if the error capture code
> was different. And in the cases where the error state is inconsistent,
> the scanning method would also be snafu (because ACTHD is no longer
> within the expected bounds).
> -Chris
> 

The ACTHD case is one of the reasons I can't argue for keeping the old
method. (Though I think capturing all buffers with COMMAND domain gets
you to the same end). Since you read way more error states than I, I'm
happy to favor your opinion. As I said [but you cut] I am just a
chicken.

You already got my r-b, stop arguing.

> -- 
> Chris Wilson, Intel Open Source Technology Centre

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/3] drm/i915: Record error state capture reason
  2014-02-12  8:13     ` Chris Wilson
@ 2014-02-12 19:05       ` Ben Widawsky
  0 siblings, 0 replies; 19+ messages in thread
From: Ben Widawsky @ 2014-02-12 19:05 UTC (permalink / raw)
  To: Chris Wilson, Mika Kuoppala, intel-gfx, miku

On Wed, Feb 12, 2014 at 08:13:50AM +0000, Chris Wilson wrote:
> On Tue, Feb 11, 2014 at 06:32:24PM -0800, Ben Widawsky wrote:
> > > @@ -3234,7 +3241,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
> > >  		 */
> > >  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
> > >  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> > > -			i915_handle_error(dev, false);
> > > +			i915_handle_error(dev, false,
> > > +					  "Command parser error, iir 0x%08x",
> > > +					  iir);
> > >  
> > >  		for_each_pipe(pipe) {
> > >  			int reg = PIPESTAT(pipe);
> > > @@ -3416,7 +3425,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
> > >  		 */
> > >  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
> > >  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> > > -			i915_handle_error(dev, false);
> > > +			i915_handle_error(dev, false,
> > > +					  "Command parser error, iir 0x%08x",
> > > +					  iir);
> > >  
> > >  		for_each_pipe(pipe) {
> > >  			int reg = PIPESTAT(pipe);
> > > @@ -3653,7 +3664,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
> > >  		 */
> > >  		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
> > >  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> > > -			i915_handle_error(dev, false);
> > > +			i915_handle_error(dev, false,
> > > +					  "Command parser error, iir 0x%08x",
> > > +					  iir);
> > >  
> > 
> > Since you're not directly using any of the DRM_ printers, we lose the
> > file/line. Would you care to add __FILE__/__LINE? I think it would be
> > helpful - not sure what others thing.
> 
> In a user-facing message, I don't think so. In the error state we have
> the GPU generation and reason, so we know which irq handler fired in
> that case. I don't see what extra information func:line gives us since
> the messenger (the irq handler) is pretty uninteresting wrt the hang.
> -Chris
> 

My gripe was with the same string for all errors - but I *just* noticed
that they are for different GENs, and not different interrupts. So
ignore me.

> -- 
> Chris Wilson, Intel Open Source Technology Centre

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches
  2014-02-12 19:02     ` Ben Widawsky
@ 2014-02-12 19:15       ` Chris Wilson
  0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2014-02-12 19:15 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx, miku, Ben Widawsky

On Wed, Feb 12, 2014 at 11:02:34AM -0800, Ben Widawsky wrote:
> On Wed, Feb 12, 2014 at 08:25:58AM +0000, Chris Wilson wrote:
> > On Tue, Feb 11, 2014 at 05:57:19PM -0800, Ben Widawsky wrote:
> > > On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> > > > -static struct drm_i915_gem_request *
> > > > -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> > > > +struct drm_i915_gem_request *
> > > > +i915_gem_find_active_request(struct intel_ring_buffer *ring)
> > > >  {
> > > >  	struct drm_i915_gem_request *request;
> > > > -	const u32 completed_seqno = ring->get_seqno(ring, false);
> > > > +	u32 completed_seqno;
> > > > +
> > > > +	if (WARN_ON(!ring->get_seqno))
> > > > +		return NULL;
> > > 
> > > ring->get_seqno(ring, false) ?
> > 
> > ?
> > 
> > This was originally used in the error capture code to detect
> > uninitialised rings.
> 
> We have a new flag for that now, don't we? Also, I don't think we can
> actually have an uninitialized ring at this point.

Considering that I have a vebox being reported on my SNB atm, something
is a little fishy with the detection of valid rings for error capture.
Yes, by the point we call i915_gem_find_active_request(), we should have
ring->get_seqno and so could just drop the WARN_ON()
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/i915: Record pid/comm of hanging task
  2014-02-12 18:55       ` Ben Widawsky
@ 2014-02-12 19:18         ` Chris Wilson
  2014-02-12 19:32           ` Ben Widawsky
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2014-02-12 19:18 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx, miku

On Wed, Feb 12, 2014 at 10:55:07AM -0800, Ben Widawsky wrote:
> On Wed, Feb 12, 2014 at 08:15:58AM +0000, Chris Wilson wrote:
> Are you opposed to doing anything at context creation? pid reference
> works for me too, or hold on to it with the request - I don't care.

I'm just don't feel that the issue is severe enough for yet another pid
reference floating around inside drm. I could well be wrong though, but
I would like to be sure we have tried our best to use the available
information we have already.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/i915: Record pid/comm of hanging task
  2014-02-12 19:18         ` Chris Wilson
@ 2014-02-12 19:32           ` Ben Widawsky
  2014-03-05 13:08             ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Ben Widawsky @ 2014-02-12 19:32 UTC (permalink / raw)
  To: Chris Wilson, Mika Kuoppala, intel-gfx, miku

On Wed, Feb 12, 2014 at 07:18:19PM +0000, Chris Wilson wrote:
> On Wed, Feb 12, 2014 at 10:55:07AM -0800, Ben Widawsky wrote:
> > On Wed, Feb 12, 2014 at 08:15:58AM +0000, Chris Wilson wrote:
> > Are you opposed to doing anything at context creation? pid reference
> > works for me too, or hold on to it with the request - I don't care.
> 
> I'm just don't feel that the issue is severe enough for yet another pid
> reference floating around inside drm. I could well be wrong though, but
> I would like to be sure we have tried our best to use the available
> information we have already.
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre

Well, the ref wouldn't be held if you just copied it at context create,
and it avoids having to stick in *file to actually do the proper look up
(not that that in itself is so horrible).

For the sake of making progress, I disagree with Chris in tbut think it's
better than what is currently there, so:
Acked-by: Ben Widawsky <ben@bwidawsk.net>

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/i915: Record pid/comm of hanging task
  2014-02-12 19:32           ` Ben Widawsky
@ 2014-03-05 13:08             ` Chris Wilson
  2014-03-05 13:34               ` Daniel Vetter
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2014-03-05 13:08 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Ben Widawsky, miku, intel-gfx

On Wed, Feb 12, 2014 at 11:32:00AM -0800, Ben Widawsky wrote:
> On Wed, Feb 12, 2014 at 07:18:19PM +0000, Chris Wilson wrote:
> > On Wed, Feb 12, 2014 at 10:55:07AM -0800, Ben Widawsky wrote:
> > > On Wed, Feb 12, 2014 at 08:15:58AM +0000, Chris Wilson wrote:
> > > Are you opposed to doing anything at context creation? pid reference
> > > works for me too, or hold on to it with the request - I don't care.
> > 
> > I'm just don't feel that the issue is severe enough for yet another pid
> > reference floating around inside drm. I could well be wrong though, but
> > I would like to be sure we have tried our best to use the available
> > information we have already.
> > -Chris
> > 
> > -- 
> > Chris Wilson, Intel Open Source Technology Centre
> 
> Well, the ref wouldn't be held if you just copied it at context create,
> and it avoids having to stick in *file to actually do the proper look up
> (not that that in itself is so horrible).
> 
> For the sake of making progress, I disagree with Chris in tbut think it's
> better than what is currently there, so:
> Acked-by: Ben Widawsky <ben@bwidawsk.net>

Daniel?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/i915: Record pid/comm of hanging task
  2014-03-05 13:08             ` Chris Wilson
@ 2014-03-05 13:34               ` Daniel Vetter
  2014-03-05 13:50                 ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Vetter @ 2014-03-05 13:34 UTC (permalink / raw)
  To: Chris Wilson, Daniel Vetter, Ben Widawsky, Mika Kuoppala,
	intel-gfx, miku

On Wed, Mar 05, 2014 at 01:08:15PM +0000, Chris Wilson wrote:
> On Wed, Feb 12, 2014 at 11:32:00AM -0800, Ben Widawsky wrote:
> > On Wed, Feb 12, 2014 at 07:18:19PM +0000, Chris Wilson wrote:
> > > On Wed, Feb 12, 2014 at 10:55:07AM -0800, Ben Widawsky wrote:
> > > > On Wed, Feb 12, 2014 at 08:15:58AM +0000, Chris Wilson wrote:
> > > > Are you opposed to doing anything at context creation? pid reference
> > > > works for me too, or hold on to it with the request - I don't care.
> > > 
> > > I'm just don't feel that the issue is severe enough for yet another pid
> > > reference floating around inside drm. I could well be wrong though, but
> > > I would like to be sure we have tried our best to use the available
> > > information we have already.
> > > -Chris
> > > 
> > > -- 
> > > Chris Wilson, Intel Open Source Technology Centre
> > 
> > Well, the ref wouldn't be held if you just copied it at context create,
> > and it avoids having to stick in *file to actually do the proper look up
> > (not that that in itself is so horrible).
> > 
> > For the sake of making progress, I disagree with Chris in tbut think it's
> > better than what is currently there, so:
> > Acked-by: Ben Widawsky <ben@bwidawsk.net>
> 
> Daniel?

I've thought I pick this all up when Mika resends the revised versions for
1&3. Or is everything ready already (it didn't quite look like that ...)?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] drm/i915: Record pid/comm of hanging task
  2014-03-05 13:34               ` Daniel Vetter
@ 2014-03-05 13:50                 ` Chris Wilson
  0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2014-03-05 13:50 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Daniel Vetter, Ben Widawsky, miku, intel-gfx

On Wed, Mar 05, 2014 at 02:34:48PM +0100, Daniel Vetter wrote:
> On Wed, Mar 05, 2014 at 01:08:15PM +0000, Chris Wilson wrote:
> > On Wed, Feb 12, 2014 at 11:32:00AM -0800, Ben Widawsky wrote:
> > > On Wed, Feb 12, 2014 at 07:18:19PM +0000, Chris Wilson wrote:
> > > > On Wed, Feb 12, 2014 at 10:55:07AM -0800, Ben Widawsky wrote:
> > > > > On Wed, Feb 12, 2014 at 08:15:58AM +0000, Chris Wilson wrote:
> > > > > Are you opposed to doing anything at context creation? pid reference
> > > > > works for me too, or hold on to it with the request - I don't care.
> > > > 
> > > > I'm just don't feel that the issue is severe enough for yet another pid
> > > > reference floating around inside drm. I could well be wrong though, but
> > > > I would like to be sure we have tried our best to use the available
> > > > information we have already.
> > > > -Chris
> > > > 
> > > > -- 
> > > > Chris Wilson, Intel Open Source Technology Centre
> > > 
> > > Well, the ref wouldn't be held if you just copied it at context create,
> > > and it avoids having to stick in *file to actually do the proper look up
> > > (not that that in itself is so horrible).
> > > 
> > > For the sake of making progress, I disagree with Chris in tbut think it's
> > > better than what is currently there, so:
> > > Acked-by: Ben Widawsky <ben@bwidawsk.net>
> > 
> > Daniel?
> 
> I've thought I pick this all up when Mika resends the revised versions for
> 1&3. Or is everything ready already (it didn't quite look like that ...)?

Oh, I just forgot where this was taken up. It just made it to the top of
my wishlist with a couple of bizarre error-states containing gibberish
for which I want to find the culprit.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2014-03-05 13:50 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-02-10 14:30 [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Mika Kuoppala
2014-02-10 14:30 ` [PATCH 2/3] drm/i915: Record pid/comm of hanging task Mika Kuoppala
2014-02-12  2:07   ` Ben Widawsky
2014-02-12  8:15     ` Chris Wilson
2014-02-12 18:55       ` Ben Widawsky
2014-02-12 19:18         ` Chris Wilson
2014-02-12 19:32           ` Ben Widawsky
2014-03-05 13:08             ` Chris Wilson
2014-03-05 13:34               ` Daniel Vetter
2014-03-05 13:50                 ` Chris Wilson
2014-02-10 14:30 ` [PATCH 3/3] drm/i915: Record error state capture reason Mika Kuoppala
2014-02-12  2:32   ` Ben Widawsky
2014-02-12  8:13     ` Chris Wilson
2014-02-12 19:05       ` Ben Widawsky
2014-02-12  1:57 ` [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches Ben Widawsky
2014-02-12  8:25   ` Chris Wilson
2014-02-12 19:02     ` Ben Widawsky
2014-02-12 19:15       ` Chris Wilson
2014-02-12 17:47 ` Chris Wilson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox