[PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM

public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed

* [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM
@ 2014-01-25  2:17 Ben Widawsky
  2014-01-25  2:17 ` [PATCH 2/5] drm/i915: Print captured bo for all VM in error state Ben Widawsky
                   ` (4 more replies)
  0 siblings, 5 replies; 26+ messages in thread
From: Ben Widawsky @ 2014-01-25  2:17 UTC (permalink / raw)
  To: Intel GFX

From: Chris Wilson <chris@chris-wilson.co.uk>

This is useful for debugging as we then know that the first entry is
always the global GTT, and all later entries the per-process GTT VM.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_gem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 024e454..946a577 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4634,7 +4634,7 @@ void i915_init_vm(struct drm_i915_private *dev_priv,
 	INIT_LIST_HEAD(&vm->active_list);
 	INIT_LIST_HEAD(&vm->inactive_list);
 	INIT_LIST_HEAD(&vm->global_link);
-	list_add(&vm->global_link, &dev_priv->vm_list);
+	list_add_tail(&vm->global_link, &dev_priv->vm_list);
 }
 
 void
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 2/5] drm/i915: Print captured bo for all VM in error state
  2014-01-25  2:17 [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Ben Widawsky
@ 2014-01-25  2:17 ` Ben Widawsky
  2014-01-25  2:17 ` [PATCH 3/5] drm/i915: Create a USES_PPGTT macro Ben Widawsky
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Ben Widawsky @ 2014-01-25  2:17 UTC (permalink / raw)
  To: Intel GFX

From: Chris Wilson <chris@chris-wilson.co.uk>

The current error state harks back to the era of just a single VM. For
full-ppgtt, we capture every bo on every VM. It behoves us to then print
every bo for every VM, which we currently fail to do and so miss vital
information in the error state.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h       |  4 +++
 drivers/gpu/drm/i915/i915_gpu_error.c | 66 ++++++++++++++++++++++++++---------
 2 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 20f0e78..e851a82 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -329,6 +329,7 @@ struct drm_i915_error_state {
 	u32 faddr[I915_NUM_RINGS];
 	u64 fence[I915_MAX_NUM_FENCES];
 	struct timeval time;
+
 	struct drm_i915_error_ring {
 		struct drm_i915_error_object {
 			int page_count;
@@ -342,6 +343,7 @@ struct drm_i915_error_state {
 		} *requests;
 		int num_requests;
 	} ring[I915_NUM_RINGS];
+
 	struct drm_i915_error_buffer {
 		u32 size;
 		u32 name;
@@ -358,6 +360,8 @@ struct drm_i915_error_state {
 		u32 cache_level:3;
 	} **active_bo, **pinned_bo;
 	u32 *active_bo_count, *pinned_bo_count;
+	u32 vm_count;
+
 	struct intel_overlay_error_state *overlay;
 	struct intel_display_error_state *display;
 	int hangcheck_score[I915_NUM_RINGS];
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ae8cf61..f598829 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -188,10 +188,10 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 				struct drm_i915_error_buffer *err,
 				int count)
 {
-	err_printf(m, "%s [%d]:\n", name, count);
+	err_printf(m, "  %s [%d]:\n", name, count);
 
 	while (count--) {
-		err_printf(m, "  %08x %8u %02x %02x %x %x",
+		err_printf(m, "    %08x %8u %02x %02x %x %x",
 			   err->gtt_offset,
 			   err->size,
 			   err->read_domains,
@@ -331,15 +331,17 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	for_each_ring(ring, dev_priv, i)
 		i915_ring_error_state(m, dev, error, i);
 
-	if (error->active_bo)
+	for (i = 0; i < error->vm_count; i++) {
+		err_printf(m, "vm[%d]\n", i);
+
 		print_error_buffers(m, "Active",
-				    error->active_bo[0],
-				    error->active_bo_count[0]);
+				    error->active_bo[i],
+				    error->active_bo_count[i]);
 
-	if (error->pinned_bo)
 		print_error_buffers(m, "Pinned",
-				    error->pinned_bo[0],
-				    error->pinned_bo_count[0]);
+				    error->pinned_bo[i],
+				    error->pinned_bo_count[i]);
+	}
 
 	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
 		struct drm_i915_error_object *obj;
@@ -604,13 +606,23 @@ static u32 capture_active_bo(struct drm_i915_error_buffer *err,
 }
 
 static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
-			     int count, struct list_head *head)
+			     int count, struct list_head *head,
+			     struct i915_address_space *vm)
 {
 	struct drm_i915_gem_object *obj;
 	int i = 0;
 
 	list_for_each_entry(obj, head, global_list) {
-		if (!i915_gem_obj_is_pinned(obj))
+		struct i915_vma *vma;
+		bool bound = false;
+
+		list_for_each_entry(vma, &obj->vma_list, vma_link)
+			if (vma->vm == vm && vma->pin_count > 0) {
+				bound = true;
+				break;
+			}
+
+		if (!bound)
 			continue;
 
 		capture_bo(err++, obj);
@@ -874,9 +886,14 @@ static void i915_gem_capture_vm(struct drm_i915_private *dev_priv,
 	list_for_each_entry(vma, &vm->active_list, mm_list)
 		i++;
 	error->active_bo_count[ndx] = i;
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list)
-		if (i915_gem_obj_is_pinned(obj))
-			i++;
+
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) {
+		list_for_each_entry(vma, &obj->vma_list, vma_link)
+			if (vma->vm == vm && vma->pin_count > 0) {
+				i++;
+				break;
+			}
+	}
 	error->pinned_bo_count[ndx] = i - error->active_bo_count[ndx];
 
 	if (i) {
@@ -895,7 +912,7 @@ static void i915_gem_capture_vm(struct drm_i915_private *dev_priv,
 		error->pinned_bo_count[ndx] =
 			capture_pinned_bo(pinned_bo,
 					  error->pinned_bo_count[ndx],
-					  &dev_priv->mm.bound_list);
+					  &dev_priv->mm.bound_list, vm);
 	error->active_bo[ndx] = active_bo;
 	error->pinned_bo[ndx] = pinned_bo;
 }
@@ -916,8 +933,25 @@ static void i915_gem_capture_buffers(struct drm_i915_private *dev_priv,
 	error->pinned_bo_count = kcalloc(cnt, sizeof(*error->pinned_bo_count),
 					 GFP_ATOMIC);
 
-	list_for_each_entry(vm, &dev_priv->vm_list, global_link)
-		i915_gem_capture_vm(dev_priv, error, vm, i++);
+	if (error->active_bo == NULL ||
+	    error->pinned_bo == NULL ||
+	    error->active_bo_count == NULL ||
+	    error->pinned_bo_count == NULL) {
+		kfree(error->active_bo);
+		kfree(error->active_bo_count);
+		kfree(error->pinned_bo);
+		kfree(error->pinned_bo_count);
+
+		error->active_bo = NULL;
+		error->active_bo_count = NULL;
+		error->pinned_bo = NULL;
+		error->pinned_bo_count = NULL;
+	} else {
+		list_for_each_entry(vm, &dev_priv->vm_list, global_link)
+			i915_gem_capture_vm(dev_priv, error, vm, i++);
+
+		error->vm_count = cnt;
+	}
 }
 
 /**
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 3/5] drm/i915: Create a USES_PPGTT macro
  2014-01-25  2:17 [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Ben Widawsky
  2014-01-25  2:17 ` [PATCH 2/5] drm/i915: Print captured bo for all VM in error state Ben Widawsky
@ 2014-01-25  2:17 ` Ben Widawsky
  2014-01-25 20:41   ` Daniel Vetter
  2014-01-25  2:17 ` [PATCH 4/5] drm/i915: Capture PPGTT info on error capture Ben Widawsky
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2014-01-25  2:17 UTC (permalink / raw)
  To: Intel GFX; +Cc: Ben Widawsky, Ben Widawsky

There are cases where we want to know if there is a full, or aliased
ppgtt. Having to always to the || is annoying. This shorthand will keep
the code a bit cleaner/easier to read.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e851a82..6f68515 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1844,6 +1844,7 @@ struct drm_i915_file_private {
 #define HAS_PPGTT(dev)		(INTEL_INFO(dev)->gen >= 7 && !IS_VALLEYVIEW(dev) && !IS_BROADWELL(dev))
 #define USES_ALIASING_PPGTT(dev) intel_enable_ppgtt(dev, false)
 #define USES_FULL_PPGTT(dev)	intel_enable_ppgtt(dev, true)
+#define USES_PPGTT(dev)		(USES_ALIASING_PPGTT(dev) || USES_FULL_PPGTT(dev))
 
 #define HAS_OVERLAY(dev)		(INTEL_INFO(dev)->has_overlay)
 #define OVERLAY_NEEDS_PHYSICAL(dev)	(INTEL_INFO(dev)->overlay_needs_physical)
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 4/5] drm/i915: Capture PPGTT info on error capture
  2014-01-25  2:17 [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Ben Widawsky
  2014-01-25  2:17 ` [PATCH 2/5] drm/i915: Print captured bo for all VM in error state Ben Widawsky
  2014-01-25  2:17 ` [PATCH 3/5] drm/i915: Create a USES_PPGTT macro Ben Widawsky
@ 2014-01-25  2:17 ` Ben Widawsky
  2014-01-26 11:42   ` Chris Wilson
  2014-01-25  2:17 ` [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW Ben Widawsky
  2014-01-25  8:32 ` [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Kenneth Graunke
  4 siblings, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2014-01-25  2:17 UTC (permalink / raw)
  To: Intel GFX; +Cc: Ben Widawsky, Ben Widawsky

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h       |  7 ++++++
 drivers/gpu/drm/i915/i915_gpu_error.c | 41 +++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6f68515..5105fd4 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -359,6 +359,13 @@ struct drm_i915_error_state {
 		s32 ring:4;
 		u32 cache_level:3;
 	} **active_bo, **pinned_bo;
+	struct drm_i915_vm_info {
+		u32 gfx_mode;
+		union {
+			u64 pdp[4];
+			u32 pp_dir_base;
+		};
+	} vm_info[I915_NUM_RINGS];
 	u32 *active_bo_count, *pinned_bo_count;
 	u32 vm_count;
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index f598829..76bb010 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -276,6 +276,22 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  hangcheck: %s [%d]\n",
 		   hangcheck_action_to_str(error->hangcheck_action[ring]),
 		   error->hangcheck_score[ring]);
+
+	if (USES_PPGTT(dev)) {
+		err_printf(m, "  GFX_MODE: 0x%08x\n",
+			   error->vm_info[ring].gfx_mode);
+
+		if (INTEL_INFO(dev)->gen >= 8) {
+			int i;
+			for (i = 0; i < 4; i++)
+				err_printf(m, "  PDP%d: 0x%016llx\n", i,
+					   error->vm_info[ring].pdp[i]);
+		} else {
+			err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
+				   error->vm_info[ring].pp_dir_base);
+		}
+	}
+
 }
 
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
@@ -799,6 +815,31 @@ static void i915_record_ring_state(struct drm_device *dev,
 
 	error->hangcheck_score[ring->id] = ring->hangcheck.score;
 	error->hangcheck_action[ring->id] = ring->hangcheck.action;
+
+	if (USES_PPGTT(dev)) {
+		int i;
+
+		error->vm_info[ring->id].gfx_mode =
+			I915_READ(RING_MODE_GEN7(ring));
+
+		switch (INTEL_INFO(dev)->gen) {
+		case 8:
+			for (i = 0; i < 4; i++) {
+				error->vm_info[ring->id].pdp[i] =
+					I915_READ(GEN8_RING_PDP_UDW(ring, i));
+				error->vm_info[ring->id].pdp[i] <<= 32;
+				error->vm_info[ring->id].pdp[i] |=
+					I915_READ(GEN8_RING_PDP_LDW(ring, i));
+			}
+			break;
+		case 7:
+			error->vm_info[ring->id].pp_dir_base = RING_PP_DIR_BASE(ring);
+			break;
+		case 6:
+			error->vm_info[ring->id].pp_dir_base = RING_PP_DIR_BASE_READ(ring);
+			break;
+		}
+	}
 }
 
 
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-25  2:17 [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Ben Widawsky
                   ` (2 preceding siblings ...)
  2014-01-25  2:17 ` [PATCH 4/5] drm/i915: Capture PPGTT info on error capture Ben Widawsky
@ 2014-01-25  2:17 ` Ben Widawsky
  2014-01-26 11:47   ` Chris Wilson
  2014-01-25  8:32 ` [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Kenneth Graunke
  4 siblings, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2014-01-25  2:17 UTC (permalink / raw)
  To: Intel GFX; +Cc: Ben Widawsky, Ben Widawsky

The previous check during error capture of whether or not the current VM
should be scanned used, gen < 7. That was more or less trying to
determine if there was a full PPGTT. At the time, this was sort of what
I meant to do because I was more interested in working backwards from
hardware state. However, this is incorrect because it will not include
platforms that are greater than gen7, and not having PPGTT.  Example
would be BYT which is gen7 but doesn't have PPGTT, BDW, or any platform
greater than gen7 with the PPGTT module parameter invoked.

I am /assuming/ BYT was broken, I have not actually checked.

While here, clean up the file a bit to avoid duplicate reads (now that
the PPGTT info is in the error state).

I think Mika/Chris may have been looking at this too.

Broken by:
commit 685987c6915222730f45141a89f1cd87fb092e9a
Author: Ben Widawsky <benjamin.widawsky@intel.com>
Date:   Fri Dec 6 14:10:54 2013 -0800

    drm/i915: Identify active VM for batchbuffer capture

Reported-by: Kenneth Graunke <kenneth.w.graunke@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 76bb010..6a859f1 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -684,32 +684,32 @@ static void i915_gem_record_fences(struct drm_device *dev,
 
 /* This assumes all batchbuffers are executed from the PPGTT. It might have to
  * change in the future. */
-static bool is_active_vm(struct i915_address_space *vm,
+static bool is_active_vm(struct drm_i915_error_state *error,
+			 struct i915_address_space *vm,
 			 struct intel_ring_buffer *ring)
 {
 	struct drm_device *dev = vm->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct i915_hw_ppgtt *ppgtt;
 
-	if (INTEL_INFO(dev)->gen < 7)
+	if (!HAS_PPGTT(dev))
 		return i915_is_ggtt(vm);
 
-	/* FIXME: This ignores that the global gtt vm is also on this list. */
+	if (i915_is_ggtt(vm) || !USES_PPGTT(dev))
+		return error->head[ring->id] &&
+			(error->acthd[ring->id] ==
+			(error->head[ring->id] & HEAD_ADDR));
+
 	ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
 
-	if (INTEL_INFO(dev)->gen >= 8) {
-		u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
-		pdp0 |=  I915_READ(GEN8_RING_PDP_LDW(ring, 0));
-		return pdp0 == ppgtt->pd_dma_addr[0];
-	} else {
-		u32 pp_db;
-		pp_db = I915_READ(RING_PP_DIR_BASE(ring));
-		return (pp_db >> 10) == ppgtt->pd_offset;
-	}
+	if (INTEL_INFO(dev)->gen >= 8)
+		return error->vm_info[ring->id].pdp[0] == ppgtt->pd_dma_addr[0];
+	else
+		return error->vm_info[ring->id].pp_dir_base == ppgtt->pd_offset;
 }
 
 static struct drm_i915_error_object *
 i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
+			     struct drm_i915_error_state *error,
 			     struct intel_ring_buffer *ring)
 {
 	struct i915_address_space *vm;
@@ -735,7 +735,7 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
 
 	seqno = ring->get_seqno(ring, false);
 	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
-		if (!is_active_vm(vm, ring))
+		if (!is_active_vm(error, vm, ring))
 			continue;
 
 		found_active = true;
@@ -877,7 +877,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
 		i915_record_ring_state(dev, error, ring);
 
 		error->ring[i].batchbuffer =
-			i915_error_first_batchbuffer(dev_priv, ring);
+			i915_error_first_batchbuffer(dev_priv, error, ring);
 
 		error->ring[i].ringbuffer =
 			i915_error_ggtt_object_create(dev_priv, ring->obj);
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM
  2014-01-25  2:17 [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Ben Widawsky
                   ` (3 preceding siblings ...)
  2014-01-25  2:17 ` [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW Ben Widawsky
@ 2014-01-25  8:32 ` Kenneth Graunke
  2014-01-25 20:48   ` Daniel Vetter
  4 siblings, 1 reply; 26+ messages in thread
From: Kenneth Graunke @ 2014-01-25  8:32 UTC (permalink / raw)
  To: Ben Widawsky, Intel GFX


[-- Attachment #1.1: Type: text/plain, Size: 1261 bytes --]

On 01/24/2014 06:17 PM, Ben Widawsky wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> This is useful for debugging as we then know that the first entry is
> always the global GTT, and all later entries the per-process GTT VM.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_gem.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 024e454..946a577 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4634,7 +4634,7 @@ void i915_init_vm(struct drm_i915_private *dev_priv,
>  	INIT_LIST_HEAD(&vm->active_list);
>  	INIT_LIST_HEAD(&vm->inactive_list);
>  	INIT_LIST_HEAD(&vm->global_link);
> -	list_add(&vm->global_link, &dev_priv->vm_list);
> +	list_add_tail(&vm->global_link, &dev_priv->vm_list);
>  }
>  
>  void

These five patches are:
Tested-by: Kenneth Graunke <kenneth@whitecape.org>

On Broadwell with drm-intel-nightly, I don't get the batchbuffer as part
of my error state, which is a regression.  With these patches, it's back
again.

Thanks for investigating this, Ben.



[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/5] drm/i915: Create a USES_PPGTT macro
  2014-01-25  2:17 ` [PATCH 3/5] drm/i915: Create a USES_PPGTT macro Ben Widawsky
@ 2014-01-25 20:41   ` Daniel Vetter
  2014-01-26  5:45     ` Ben Widawsky
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2014-01-25 20:41 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

On Fri, Jan 24, 2014 at 06:17:43PM -0800, Ben Widawsky wrote:
> There are cases where we want to know if there is a full, or aliased
> ppgtt. Having to always to the || is annoying. This shorthand will keep
> the code a bit cleaner/easier to read.
> 
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_drv.h | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e851a82..6f68515 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1844,6 +1844,7 @@ struct drm_i915_file_private {
>  #define HAS_PPGTT(dev)		(INTEL_INFO(dev)->gen >= 7 && !IS_VALLEYVIEW(dev) && !IS_BROADWELL(dev))
>  #define USES_ALIASING_PPGTT(dev) intel_enable_ppgtt(dev, false)
>  #define USES_FULL_PPGTT(dev)	intel_enable_ppgtt(dev, true)
> +#define USES_PPGTT(dev)		(USES_ALIASING_PPGTT(dev) || USES_FULL_PPGTT(dev))

Just do an s/ALIASING/HW/ and we get to the same point with overall
clearer code.
-Daniel

>  
>  #define HAS_OVERLAY(dev)		(INTEL_INFO(dev)->has_overlay)
>  #define OVERLAY_NEEDS_PHYSICAL(dev)	(INTEL_INFO(dev)->overlay_needs_physical)
> -- 
> 1.8.5.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM
  2014-01-25  8:32 ` [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Kenneth Graunke
@ 2014-01-25 20:48   ` Daniel Vetter
  2014-01-25 21:31     ` Kenneth Graunke
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2014-01-25 20:48 UTC (permalink / raw)
  To: Kenneth Graunke; +Cc: Intel GFX, Ben Widawsky

On Sat, Jan 25, 2014 at 12:32:33AM -0800, Kenneth Graunke wrote:
> On 01/24/2014 06:17 PM, Ben Widawsky wrote:
> > From: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > This is useful for debugging as we then know that the first entry is
> > always the global GTT, and all later entries the per-process GTT VM.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 024e454..946a577 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -4634,7 +4634,7 @@ void i915_init_vm(struct drm_i915_private *dev_priv,
> >  	INIT_LIST_HEAD(&vm->active_list);
> >  	INIT_LIST_HEAD(&vm->inactive_list);
> >  	INIT_LIST_HEAD(&vm->global_link);
> > -	list_add(&vm->global_link, &dev_priv->vm_list);
> > +	list_add_tail(&vm->global_link, &dev_priv->vm_list);
> >  }
> >  
> >  void
> 
> These five patches are:
> Tested-by: Kenneth Graunke <kenneth@whitecape.org>
> 
> On Broadwell with drm-intel-nightly, I don't get the batchbuffer as part
> of my error state, which is a regression.  With these patches, it's back
> again.
> 
> Thanks for investigating this, Ben.

I want to merge Mika's fixes for the reset stats ioctls, mostly since his
patches are older and we also have neat testcases for them all. Then we
can reconsider the issue here and how to best get at the right batch
buffers. If it breaks development just boot with ppgtt=1 for now, for
userspace nothing really changes compared to full ppgtt.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM
  2014-01-25 20:48   ` Daniel Vetter
@ 2014-01-25 21:31     ` Kenneth Graunke
  2014-01-26  5:09       ` Ben Widawsky
  0 siblings, 1 reply; 26+ messages in thread
From: Kenneth Graunke @ 2014-01-25 21:31 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel GFX, Ben Widawsky


[-- Attachment #1.1: Type: text/plain, Size: 2080 bytes --]

On 01/25/2014 12:48 PM, Daniel Vetter wrote:
> On Sat, Jan 25, 2014 at 12:32:33AM -0800, Kenneth Graunke wrote:
>> On 01/24/2014 06:17 PM, Ben Widawsky wrote:
>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>
>>> This is useful for debugging as we then know that the first entry is
>>> always the global GTT, and all later entries the per-process GTT VM.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
>>> ---
>>>  drivers/gpu/drm/i915/i915_gem.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index 024e454..946a577 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -4634,7 +4634,7 @@ void i915_init_vm(struct drm_i915_private *dev_priv,
>>>  	INIT_LIST_HEAD(&vm->active_list);
>>>  	INIT_LIST_HEAD(&vm->inactive_list);
>>>  	INIT_LIST_HEAD(&vm->global_link);
>>> -	list_add(&vm->global_link, &dev_priv->vm_list);
>>> +	list_add_tail(&vm->global_link, &dev_priv->vm_list);
>>>  }
>>>  
>>>  void
>>
>> These five patches are:
>> Tested-by: Kenneth Graunke <kenneth@whitecape.org>
>>
>> On Broadwell with drm-intel-nightly, I don't get the batchbuffer as part
>> of my error state, which is a regression.  With these patches, it's back
>> again.
>>
>> Thanks for investigating this, Ben.
> 
> I want to merge Mika's fixes for the reset stats ioctls, mostly since his
> patches are older and we also have neat testcases for them all. Then we
> can reconsider the issue here and how to best get at the right batch
> buffers. If it breaks development just boot with ppgtt=1 for now, for
> userspace nothing really changes compared to full ppgtt.
> -Daniel
> 

Do you mean i915.i915_enable_ppgtt=1?  I still get no batchbuffer in the
error state either with or without that.

But for now, Ben gave me a branch that fixes the issue, so I'll just
keep using that until you get this sorted in next/nightly.

--Ken


[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM
  2014-01-25 21:31     ` Kenneth Graunke
@ 2014-01-26  5:09       ` Ben Widawsky
  2014-01-26  9:26         ` Daniel Vetter
  0 siblings, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2014-01-26  5:09 UTC (permalink / raw)
  To: Kenneth Graunke; +Cc: Intel GFX

On Sat, Jan 25, 2014 at 01:31:29PM -0800, Kenneth Graunke wrote:
> On 01/25/2014 12:48 PM, Daniel Vetter wrote:
> > On Sat, Jan 25, 2014 at 12:32:33AM -0800, Kenneth Graunke wrote:
> >> On 01/24/2014 06:17 PM, Ben Widawsky wrote:
> >>> From: Chris Wilson <chris@chris-wilson.co.uk>
> >>>
> >>> This is useful for debugging as we then know that the first entry is
> >>> always the global GTT, and all later entries the per-process GTT VM.
> >>>
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
> >>> ---
> >>>  drivers/gpu/drm/i915/i915_gem.c | 2 +-
> >>>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >>> index 024e454..946a577 100644
> >>> --- a/drivers/gpu/drm/i915/i915_gem.c
> >>> +++ b/drivers/gpu/drm/i915/i915_gem.c
> >>> @@ -4634,7 +4634,7 @@ void i915_init_vm(struct drm_i915_private *dev_priv,
> >>>  	INIT_LIST_HEAD(&vm->active_list);
> >>>  	INIT_LIST_HEAD(&vm->inactive_list);
> >>>  	INIT_LIST_HEAD(&vm->global_link);
> >>> -	list_add(&vm->global_link, &dev_priv->vm_list);
> >>> +	list_add_tail(&vm->global_link, &dev_priv->vm_list);
> >>>  }
> >>>  
> >>>  void
> >>
> >> These five patches are:
> >> Tested-by: Kenneth Graunke <kenneth@whitecape.org>
> >>
> >> On Broadwell with drm-intel-nightly, I don't get the batchbuffer as part
> >> of my error state, which is a regression.  With these patches, it's back
> >> again.
> >>
> >> Thanks for investigating this, Ben.
> > 
> > I want to merge Mika's fixes for the reset stats ioctls, mostly since his
> > patches are older and we also have neat testcases for them all. Then we
> > can reconsider the issue here and how to best get at the right batch
> > buffers. If it breaks development just boot with ppgtt=1 for now, for
> > userspace nothing really changes compared to full ppgtt.
> > -Daniel
> > 
> 
> Do you mean i915.i915_enable_ppgtt=1?  I still get no batchbuffer in the
> error state either with or without that.
> 
> But for now, Ben gave me a branch that fixes the issue, so I'll just
> keep using that until you get this sorted in next/nightly.
> 
> --Ken
> 

Daniel, the issue is exactly with aliasing (any platform >= gen7 with
aliasing to be precise). I did ask Ken to try some of Mika's patches
before pursuing this, but since then I've asked Mika to send me exactly
what I need to review. I am not sure they are exactly what I asked Ken
to test.

In any case, I will try to review Mika's patches by Monday, however a
quick glance lead me to believe they are unrelated to this.

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/5] drm/i915: Create a USES_PPGTT macro
  2014-01-25 20:41   ` Daniel Vetter
@ 2014-01-26  5:45     ` Ben Widawsky
  2014-01-26  9:24       ` Daniel Vetter
  0 siblings, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2014-01-26  5:45 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel GFX, Ben Widawsky

On Sat, Jan 25, 2014 at 09:41:22PM +0100, Daniel Vetter wrote:
> On Fri, Jan 24, 2014 at 06:17:43PM -0800, Ben Widawsky wrote:
> > There are cases where we want to know if there is a full, or aliased
> > ppgtt. Having to always to the || is annoying. This shorthand will keep
> > the code a bit cleaner/easier to read.
> > 
> > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index e851a82..6f68515 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1844,6 +1844,7 @@ struct drm_i915_file_private {
> >  #define HAS_PPGTT(dev)		(INTEL_INFO(dev)->gen >= 7 && !IS_VALLEYVIEW(dev) && !IS_BROADWELL(dev))
> >  #define USES_ALIASING_PPGTT(dev) intel_enable_ppgtt(dev, false)
> >  #define USES_FULL_PPGTT(dev)	intel_enable_ppgtt(dev, true)
> > +#define USES_PPGTT(dev)		(USES_ALIASING_PPGTT(dev) || USES_FULL_PPGTT(dev))
> 
> Just do an s/ALIASING/HW/ and we get to the same point with overall
> clearer code.
> -Daniel

I had plans to make USES_ALIASING actually mean it's using aliasing, and
NOT full PPGTT. However, at this point, they are indeed logically
equivalent. Let me go over the code again and see if we actually want an
USES_ALIASING_PPGTT(), and if not, I'll do it your way. First, let's
figure out if the series will get merged at all.

> 
> >  
> >  #define HAS_OVERLAY(dev)		(INTEL_INFO(dev)->has_overlay)
> >  #define OVERLAY_NEEDS_PHYSICAL(dev)	(INTEL_INFO(dev)->overlay_needs_physical)
> > -- 
> > 1.8.5.3
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/5] drm/i915: Create a USES_PPGTT macro
  2014-01-26  5:45     ` Ben Widawsky
@ 2014-01-26  9:24       ` Daniel Vetter
  0 siblings, 0 replies; 26+ messages in thread
From: Daniel Vetter @ 2014-01-26  9:24 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

On Sun, Jan 26, 2014 at 6:45 AM, Ben Widawsky
<benjamin.widawsky@intel.com> wrote:
>> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> > index e851a82..6f68515 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> > @@ -1844,6 +1844,7 @@ struct drm_i915_file_private {
>> >  #define HAS_PPGTT(dev)             (INTEL_INFO(dev)->gen >= 7 && !IS_VALLEYVIEW(dev) && !IS_BROADWELL(dev))
>> >  #define USES_ALIASING_PPGTT(dev) intel_enable_ppgtt(dev, false)
>> >  #define USES_FULL_PPGTT(dev)       intel_enable_ppgtt(dev, true)
>> > +#define USES_PPGTT(dev)            (USES_ALIASING_PPGTT(dev) || USES_FULL_PPGTT(dev))
>>
>> Just do an s/ALIASING/HW/ and we get to the same point with overall
>> clearer code.
>> -Daniel
>
> I had plans to make USES_ALIASING actually mean it's using aliasing, and
> NOT full PPGTT. However, at this point, they are indeed logically
> equivalent. Let me go over the code again and see if we actually want an
> USES_ALIASING_PPGTT(), and if not, I'll do it your way. First, let's
> figure out if the series will get merged at all.

Well I've had a giant wtf moment when looking through your ppgtt
patches before the holidays due to this, until I've noticed that
USES_ALIASING_PPGTT also holds for full ppgtt. So I'm voting very much
for a cleanup here, the current code is confusing as hell in some
areas, at least for incompetent me ;-)

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM
  2014-01-26  5:09       ` Ben Widawsky
@ 2014-01-26  9:26         ` Daniel Vetter
  2014-01-26 11:10           ` Daniel Vetter
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2014-01-26  9:26 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX

On Sun, Jan 26, 2014 at 6:09 AM, Ben Widawsky
<benjamin.widawsky@intel.com> wrote:
> Daniel, the issue is exactly with aliasing (any platform >= gen7 with
> aliasing to be precise). I did ask Ken to try some of Mika's patches
> before pursuing this, but since then I've asked Mika to send me exactly
> what I need to review. I am not sure they are exactly what I asked Ken
> to test.

Oh, iirc I've looked at gen6 and there it seemed to worked ok. Could
this just be a ALIASING vs. FULL vs. HW_PPGTT mixup? If so I'd prefer
just the minimal patch to remedy that and then fix the full ppgtt
dumper in a 2nd step.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM
  2014-01-26  9:26         ` Daniel Vetter
@ 2014-01-26 11:10           ` Daniel Vetter
  0 siblings, 0 replies; 26+ messages in thread
From: Daniel Vetter @ 2014-01-26 11:10 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX

On Sun, Jan 26, 2014 at 10:26 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Sun, Jan 26, 2014 at 6:09 AM, Ben Widawsky
> <benjamin.widawsky@intel.com> wrote:
>> Daniel, the issue is exactly with aliasing (any platform >= gen7 with
>> aliasing to be precise). I did ask Ken to try some of Mika's patches
>> before pursuing this, but since then I've asked Mika to send me exactly
>> what I need to review. I am not sure they are exactly what I asked Ken
>> to test.
>
> Oh, iirc I've looked at gen6 and there it seemed to worked ok. Could
> this just be a ALIASING vs. FULL vs. HW_PPGTT mixup? If so I'd prefer
> just the minimal patch to remedy that and then fix the full ppgtt
> dumper in a 2nd step.

And since we seem to have a bit a confusion going on with no-ppgtt,
aliasing ppgtt and full ppgtt a testcase for this would be useful.
Should be fairly simply by submitting a bit of special noise after the
MI_BB_END command for the hanging batches in the hangman testcase.
Then we can just grep for that in the resulting error state. Bonus
points for doing that on each ring and parsing the error state to make
sure the cookie dword is indeed in the batch (and not part of some
other date we dump).
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/5] drm/i915: Capture PPGTT info on error capture
  2014-01-25  2:17 ` [PATCH 4/5] drm/i915: Capture PPGTT info on error capture Ben Widawsky
@ 2014-01-26 11:42   ` Chris Wilson
  2014-01-26 19:06     ` Ben Widawsky
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2014-01-26 11:42 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

On Fri, Jan 24, 2014 at 06:17:44PM -0800, Ben Widawsky wrote:
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  7 ++++++
>  drivers/gpu/drm/i915/i915_gpu_error.c | 41 +++++++++++++++++++++++++++++++++++
>  2 files changed, 48 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 6f68515..5105fd4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -359,6 +359,13 @@ struct drm_i915_error_state {
>  		s32 ring:4;
>  		u32 cache_level:3;
>  	} **active_bo, **pinned_bo;
> +	struct drm_i915_vm_info {
> +		u32 gfx_mode;
> +		union {
> +			u64 pdp[4];
> +			u32 pp_dir_base;
> +		};
> +	} vm_info[I915_NUM_RINGS];

Note for future janitorial work: let's coalesce all the per-ring
information into the ring error struct.

>  	u32 *active_bo_count, *pinned_bo_count;

For instance, I thought active_bo was already being tracked per-ring.
(Pinned bo is global since that exists more or less to make sure that
our registers are pointing into pinned objects.)

Do we also want to capture?
  GAC_ECO_BITS /* gen6,7 */
  GAM_ECOCHK /* gen6,7 */
  GAB_CTL /* gen6 */
  GFX_MODE /* gen6 */

The rest looks good.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-25  2:17 ` [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW Ben Widawsky
@ 2014-01-26 11:47   ` Chris Wilson
  2014-01-26 19:05     ` Ben Widawsky
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2014-01-26 11:47 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

On Fri, Jan 24, 2014 at 06:17:45PM -0800, Ben Widawsky wrote:
> The previous check during error capture of whether or not the current VM
> should be scanned used, gen < 7. That was more or less trying to
> determine if there was a full PPGTT. At the time, this was sort of what
> I meant to do because I was more interested in working backwards from
> hardware state. However, this is incorrect because it will not include
> platforms that are greater than gen7, and not having PPGTT.  Example
> would be BYT which is gen7 but doesn't have PPGTT, BDW, or any platform
> greater than gen7 with the PPGTT module parameter invoked.
> 
> I am /assuming/ BYT was broken, I have not actually checked.
> 
> While here, clean up the file a bit to avoid duplicate reads (now that
> the PPGTT info is in the error state).
> 
> I think Mika/Chris may have been looking at this too.

Sure, we are looking (for identifying the guilty request/batch) by using
the older, simpler mechanism of finding the first incomplete request. I
think that search is now definite since we preallocate the request and no
longer do request collascing if ENOMEM (i.e. there is a 1:1 relationship
between seqno/batch/request).

That should also apply here and be much simpler.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-26 11:47   ` Chris Wilson
@ 2014-01-26 19:05     ` Ben Widawsky
  2014-01-26 19:55       ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2014-01-26 19:05 UTC (permalink / raw)
  To: Chris Wilson, Ben Widawsky, Intel GFX, Mika Kuoppala

On Sun, Jan 26, 2014 at 11:47:40AM +0000, Chris Wilson wrote:
> On Fri, Jan 24, 2014 at 06:17:45PM -0800, Ben Widawsky wrote:
> > The previous check during error capture of whether or not the current VM
> > should be scanned used, gen < 7. That was more or less trying to
> > determine if there was a full PPGTT. At the time, this was sort of what
> > I meant to do because I was more interested in working backwards from
> > hardware state. However, this is incorrect because it will not include
> > platforms that are greater than gen7, and not having PPGTT.  Example
> > would be BYT which is gen7 but doesn't have PPGTT, BDW, or any platform
> > greater than gen7 with the PPGTT module parameter invoked.
> > 
> > I am /assuming/ BYT was broken, I have not actually checked.
> > 
> > While here, clean up the file a bit to avoid duplicate reads (now that
> > the PPGTT info is in the error state).
> > 
> > I think Mika/Chris may have been looking at this too.
> 
> Sure, we are looking (for identifying the guilty request/batch) by using
> the older, simpler mechanism of finding the first incomplete request. I
> think that search is now definite since we preallocate the request and no
> longer do request collascing if ENOMEM (i.e. there is a 1:1 relationship
> between seqno/batch/request).
> 
> That should also apply here and be much simpler.
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre

How does that solve hangs which aren't caused by requests?

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/5] drm/i915: Capture PPGTT info on error capture
  2014-01-26 11:42   ` Chris Wilson
@ 2014-01-26 19:06     ` Ben Widawsky
  2014-01-26 19:51       ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2014-01-26 19:06 UTC (permalink / raw)
  To: Chris Wilson, Ben Widawsky, Intel GFX

On Sun, Jan 26, 2014 at 11:42:22AM +0000, Chris Wilson wrote:
> On Fri, Jan 24, 2014 at 06:17:44PM -0800, Ben Widawsky wrote:
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h       |  7 ++++++
> >  drivers/gpu/drm/i915/i915_gpu_error.c | 41 +++++++++++++++++++++++++++++++++++
> >  2 files changed, 48 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 6f68515..5105fd4 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -359,6 +359,13 @@ struct drm_i915_error_state {
> >  		s32 ring:4;
> >  		u32 cache_level:3;
> >  	} **active_bo, **pinned_bo;
> > +	struct drm_i915_vm_info {
> > +		u32 gfx_mode;
> > +		union {
> > +			u64 pdp[4];
> > +			u32 pp_dir_base;
> > +		};
> > +	} vm_info[I915_NUM_RINGS];
> 
> Note for future janitorial work: let's coalesce all the per-ring
> information into the ring error struct.
> 
> >  	u32 *active_bo_count, *pinned_bo_count;
> 
> For instance, I thought active_bo was already being tracked per-ring.
> (Pinned bo is global since that exists more or less to make sure that
> our registers are pointing into pinned objects.)
> 
> Do we also want to capture?
>   GAC_ECO_BITS /* gen6,7 */
>   GAM_ECOCHK /* gen6,7 */
>   GAB_CTL /* gen6 */
>   GFX_MODE /* gen6 */
> 
> The rest looks good.
> -Chris
> 

I agree. I was pretty unhappy too with how we've done things, but as
this information is immediately useful, I'd really like to not postpone.
Does "The rest looks good." mean reviewed-by?

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 4/5] drm/i915: Capture PPGTT info on error capture
  2014-01-26 19:06     ` Ben Widawsky
@ 2014-01-26 19:51       ` Chris Wilson
  0 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2014-01-26 19:51 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

On Sun, Jan 26, 2014 at 11:06:40AM -0800, Ben Widawsky wrote:
> On Sun, Jan 26, 2014 at 11:42:22AM +0000, Chris Wilson wrote:
> > On Fri, Jan 24, 2014 at 06:17:44PM -0800, Ben Widawsky wrote:
> > > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > > Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> > > ---
> > >  drivers/gpu/drm/i915/i915_drv.h       |  7 ++++++
> > >  drivers/gpu/drm/i915/i915_gpu_error.c | 41 +++++++++++++++++++++++++++++++++++
> > >  2 files changed, 48 insertions(+)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > > index 6f68515..5105fd4 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > @@ -359,6 +359,13 @@ struct drm_i915_error_state {
> > >  		s32 ring:4;
> > >  		u32 cache_level:3;
> > >  	} **active_bo, **pinned_bo;
> > > +	struct drm_i915_vm_info {
> > > +		u32 gfx_mode;
> > > +		union {
> > > +			u64 pdp[4];
> > > +			u32 pp_dir_base;
> > > +		};
> > > +	} vm_info[I915_NUM_RINGS];
> > 
> > Note for future janitorial work: let's coalesce all the per-ring
> > information into the ring error struct.
> > 
> > >  	u32 *active_bo_count, *pinned_bo_count;
> > 
> > For instance, I thought active_bo was already being tracked per-ring.
> > (Pinned bo is global since that exists more or less to make sure that
> > our registers are pointing into pinned objects.)
> > 
> > Do we also want to capture?
> >   GAC_ECO_BITS /* gen6,7 */
> >   GAM_ECOCHK /* gen6,7 */
> >   GAB_CTL /* gen6 */
> >   GFX_MODE /* gen6 */
> > 
> > The rest looks good.
> > -Chris
> > 
> 
> I agree. I was pretty unhappy too with how we've done things, but as
> this information is immediately useful, I'd really like to not postpone.
> Does "The rest looks good." mean reviewed-by?

Yes. If you spend the 2 minutes to add reviewed-by to the changelog, you
can also add the extra registers. ;-)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-26 19:05     ` Ben Widawsky
@ 2014-01-26 19:55       ` Chris Wilson
  2014-01-26 21:47         ` Ben Widawsky
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2014-01-26 19:55 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

On Sun, Jan 26, 2014 at 11:05:40AM -0800, Ben Widawsky wrote:
> On Sun, Jan 26, 2014 at 11:47:40AM +0000, Chris Wilson wrote:
> > On Fri, Jan 24, 2014 at 06:17:45PM -0800, Ben Widawsky wrote:
> > > The previous check during error capture of whether or not the current VM
> > > should be scanned used, gen < 7. That was more or less trying to
> > > determine if there was a full PPGTT. At the time, this was sort of what
> > > I meant to do because I was more interested in working backwards from
> > > hardware state. However, this is incorrect because it will not include
> > > platforms that are greater than gen7, and not having PPGTT.  Example
> > > would be BYT which is gen7 but doesn't have PPGTT, BDW, or any platform
> > > greater than gen7 with the PPGTT module parameter invoked.
> > > 
> > > I am /assuming/ BYT was broken, I have not actually checked.
> > > 
> > > While here, clean up the file a bit to avoid duplicate reads (now that
> > > the PPGTT info is in the error state).
> > > 
> > > I think Mika/Chris may have been looking at this too.
> > 
> > Sure, we are looking (for identifying the guilty request/batch) by using
> > the older, simpler mechanism of finding the first incomplete request. I
> > think that search is now definite since we preallocate the request and no
> > longer do request collascing if ENOMEM (i.e. there is a 1:1 relationship
> > between seqno/batch/request).
> > 
> > That should also apply here and be much simpler.
> 
> How does that solve hangs which aren't caused by requests?

Was that an intentional rhetorical question?

The code you touch here only deals with requests - finding the current
batchbuffer if any.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-26 19:55       ` Chris Wilson
@ 2014-01-26 21:47         ` Ben Widawsky
  2014-01-27 13:45           ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2014-01-26 21:47 UTC (permalink / raw)
  To: Chris Wilson, Ben Widawsky, Intel GFX, Mika Kuoppala

On Sun, Jan 26, 2014 at 07:55:59PM +0000, Chris Wilson wrote:
> On Sun, Jan 26, 2014 at 11:05:40AM -0800, Ben Widawsky wrote:
> > On Sun, Jan 26, 2014 at 11:47:40AM +0000, Chris Wilson wrote:
> > > On Fri, Jan 24, 2014 at 06:17:45PM -0800, Ben Widawsky wrote:
> > > > The previous check during error capture of whether or not the current VM
> > > > should be scanned used, gen < 7. That was more or less trying to
> > > > determine if there was a full PPGTT. At the time, this was sort of what
> > > > I meant to do because I was more interested in working backwards from
> > > > hardware state. However, this is incorrect because it will not include
> > > > platforms that are greater than gen7, and not having PPGTT.  Example
> > > > would be BYT which is gen7 but doesn't have PPGTT, BDW, or any platform
> > > > greater than gen7 with the PPGTT module parameter invoked.
> > > > 
> > > > I am /assuming/ BYT was broken, I have not actually checked.
> > > > 
> > > > While here, clean up the file a bit to avoid duplicate reads (now that
> > > > the PPGTT info is in the error state).
> > > > 
> > > > I think Mika/Chris may have been looking at this too.
> > > 
> > > Sure, we are looking (for identifying the guilty request/batch) by using
> > > the older, simpler mechanism of finding the first incomplete request. I
> > > think that search is now definite since we preallocate the request and no
> > > longer do request collascing if ENOMEM (i.e. there is a 1:1 relationship
> > > between seqno/batch/request).
> > > 
> > > That should also apply here and be much simpler.
> > 
> > How does that solve hangs which aren't caused by requests?
> 
> Was that an intentional rhetorical question?
> 
> The code you touch here only deals with requests - finding the current
> batchbuffer if any.
> -Chris
> 

It wasn't rhetorical. I temporarily ignored that all batches are tied to
a request.

So what's the plan now? Just looking at the callers, we seem to have a
couple of callers that can't easily identify the bad request.

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-26 21:47         ` Ben Widawsky
@ 2014-01-27 13:45           ` Chris Wilson
  2014-01-27 18:24             ` Ben Widawsky
  2014-01-27 20:31             ` Ben Widawsky
  0 siblings, 2 replies; 26+ messages in thread
From: Chris Wilson @ 2014-01-27 13:45 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

On Sun, Jan 26, 2014 at 01:47:29PM -0800, Ben Widawsky wrote:
> On Sun, Jan 26, 2014 at 07:55:59PM +0000, Chris Wilson wrote:
> > On Sun, Jan 26, 2014 at 11:05:40AM -0800, Ben Widawsky wrote:
> > > On Sun, Jan 26, 2014 at 11:47:40AM +0000, Chris Wilson wrote:
> > > > On Fri, Jan 24, 2014 at 06:17:45PM -0800, Ben Widawsky wrote:
> > > > > The previous check during error capture of whether or not the current VM
> > > > > should be scanned used, gen < 7. That was more or less trying to
> > > > > determine if there was a full PPGTT. At the time, this was sort of what
> > > > > I meant to do because I was more interested in working backwards from
> > > > > hardware state. However, this is incorrect because it will not include
> > > > > platforms that are greater than gen7, and not having PPGTT.  Example
> > > > > would be BYT which is gen7 but doesn't have PPGTT, BDW, or any platform
> > > > > greater than gen7 with the PPGTT module parameter invoked.
> > > > > 
> > > > > I am /assuming/ BYT was broken, I have not actually checked.
> > > > > 
> > > > > While here, clean up the file a bit to avoid duplicate reads (now that
> > > > > the PPGTT info is in the error state).
> > > > > 
> > > > > I think Mika/Chris may have been looking at this too.
> > > > 
> > > > Sure, we are looking (for identifying the guilty request/batch) by using
> > > > the older, simpler mechanism of finding the first incomplete request. I
> > > > think that search is now definite since we preallocate the request and no
> > > > longer do request collascing if ENOMEM (i.e. there is a 1:1 relationship
> > > > between seqno/batch/request).
> > > > 
> > > > That should also apply here and be much simpler.
> > > 
> > > How does that solve hangs which aren't caused by requests?
> > 
> > Was that an intentional rhetorical question?
> > 
> > The code you touch here only deals with requests - finding the current
> > batchbuffer if any.
> > -Chris
> > 
> 
> It wasn't rhetorical. I temporarily ignored that all batches are tied to
> a request.
> 
> So what's the plan now? Just looking at the callers, we seem to have a
> couple of callers that can't easily identify the bad request.

I was thinking along the lines of:

@@ -737,31 +709,16 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
        }
 
        seqno = ring->get_seqno(ring, false);
-       list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
-               if (!is_active_vm(vm, ring))
+       list_for_each_entry(request, &ring->request_list, list) {
+               if (i915_seqno_passed(seqno, request->seqno))
                        continue;
 
-               found_active = true;
-
-               list_for_each_entry(vma, &vm->active_list, mm_list) {
-                       obj = vma->obj;
-                       if (obj->ring != ring)
-                               continue;
-
-                       if (i915_seqno_passed(seqno, obj->last_read_seqno))
-                               continue;
-
-                       if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
-                               continue;
-
-                       /* We need to copy these to an anonymous buffer as the simplest
-                        * method to avoid being overwritten by userspace.
-                        */
-                       return i915_error_object_create(dev_priv, obj, vm);
-               }
+               /* We need to copy these to an anonymous buffer as the simplest
+                * method to avoid being overwritten by userspace.
+                */
+               return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
        }
 
-       WARN_ON(!found_active);

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-27 13:45           ` Chris Wilson
@ 2014-01-27 18:24             ` Ben Widawsky
  2014-01-27 20:31             ` Ben Widawsky
  1 sibling, 0 replies; 26+ messages in thread
From: Ben Widawsky @ 2014-01-27 18:24 UTC (permalink / raw)
  To: Chris Wilson, Ben Widawsky, Intel GFX, Mika Kuoppala

On Mon, Jan 27, 2014 at 01:45:22PM +0000, Chris Wilson wrote:
> On Sun, Jan 26, 2014 at 01:47:29PM -0800, Ben Widawsky wrote:
> > On Sun, Jan 26, 2014 at 07:55:59PM +0000, Chris Wilson wrote:
> > > On Sun, Jan 26, 2014 at 11:05:40AM -0800, Ben Widawsky wrote:
> > > > On Sun, Jan 26, 2014 at 11:47:40AM +0000, Chris Wilson wrote:
> > > > > On Fri, Jan 24, 2014 at 06:17:45PM -0800, Ben Widawsky wrote:
> > > > > > The previous check during error capture of whether or not the current VM
> > > > > > should be scanned used, gen < 7. That was more or less trying to
> > > > > > determine if there was a full PPGTT. At the time, this was sort of what
> > > > > > I meant to do because I was more interested in working backwards from
> > > > > > hardware state. However, this is incorrect because it will not include
> > > > > > platforms that are greater than gen7, and not having PPGTT.  Example
> > > > > > would be BYT which is gen7 but doesn't have PPGTT, BDW, or any platform
> > > > > > greater than gen7 with the PPGTT module parameter invoked.
> > > > > > 
> > > > > > I am /assuming/ BYT was broken, I have not actually checked.
> > > > > > 
> > > > > > While here, clean up the file a bit to avoid duplicate reads (now that
> > > > > > the PPGTT info is in the error state).
> > > > > > 
> > > > > > I think Mika/Chris may have been looking at this too.
> > > > > 
> > > > > Sure, we are looking (for identifying the guilty request/batch) by using
> > > > > the older, simpler mechanism of finding the first incomplete request. I
> > > > > think that search is now definite since we preallocate the request and no
> > > > > longer do request collascing if ENOMEM (i.e. there is a 1:1 relationship
> > > > > between seqno/batch/request).
> > > > > 
> > > > > That should also apply here and be much simpler.
> > > > 
> > > > How does that solve hangs which aren't caused by requests?
> > > 
> > > Was that an intentional rhetorical question?
> > > 
> > > The code you touch here only deals with requests - finding the current
> > > batchbuffer if any.
> > > -Chris
> > > 
> > 
> > It wasn't rhetorical. I temporarily ignored that all batches are tied to
> > a request.
> > 
> > So what's the plan now? Just looking at the callers, we seem to have a
> > couple of callers that can't easily identify the bad request.
> 
> I was thinking along the lines of:
> 
> @@ -737,31 +709,16 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>         }
>  
>         seqno = ring->get_seqno(ring, false);
> -       list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> -               if (!is_active_vm(vm, ring))
> +       list_for_each_entry(request, &ring->request_list, list) {
> +               if (i915_seqno_passed(seqno, request->seqno))
>                         continue;
>  
> -               found_active = true;
> -
> -               list_for_each_entry(vma, &vm->active_list, mm_list) {
> -                       obj = vma->obj;
> -                       if (obj->ring != ring)
> -                               continue;
> -
> -                       if (i915_seqno_passed(seqno, obj->last_read_seqno))
> -                               continue;
> -
> -                       if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
> -                               continue;
> -
> -                       /* We need to copy these to an anonymous buffer as the simplest
> -                        * method to avoid being overwritten by userspace.
> -                        */
> -                       return i915_error_object_create(dev_priv, obj, vm);
> -               }
> +               /* We need to copy these to an anonymous buffer as the simplest
> +                * method to avoid being overwritten by userspace.
> +                */
> +               return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
>         }
>  
> -       WARN_ON(!found_active);
> 

So per ring batchbuffers is okay with you (it's fine by me)?

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-27 13:45           ` Chris Wilson
  2014-01-27 18:24             ` Ben Widawsky
@ 2014-01-27 20:31             ` Ben Widawsky
  2014-01-27 21:31               ` Chris Wilson
  1 sibling, 1 reply; 26+ messages in thread
From: Ben Widawsky @ 2014-01-27 20:31 UTC (permalink / raw)
  To: Chris Wilson, Ben Widawsky, Intel GFX, Mika Kuoppala

On Mon, Jan 27, 2014 at 01:45:22PM +0000, Chris Wilson wrote:
> On Sun, Jan 26, 2014 at 01:47:29PM -0800, Ben Widawsky wrote:
> > On Sun, Jan 26, 2014 at 07:55:59PM +0000, Chris Wilson wrote:
> > > On Sun, Jan 26, 2014 at 11:05:40AM -0800, Ben Widawsky wrote:
> > > > On Sun, Jan 26, 2014 at 11:47:40AM +0000, Chris Wilson wrote:
> > > > > On Fri, Jan 24, 2014 at 06:17:45PM -0800, Ben Widawsky wrote:
> > > > > > The previous check during error capture of whether or not the current VM
> > > > > > should be scanned used, gen < 7. That was more or less trying to
> > > > > > determine if there was a full PPGTT. At the time, this was sort of what
> > > > > > I meant to do because I was more interested in working backwards from
> > > > > > hardware state. However, this is incorrect because it will not include
> > > > > > platforms that are greater than gen7, and not having PPGTT.  Example
> > > > > > would be BYT which is gen7 but doesn't have PPGTT, BDW, or any platform
> > > > > > greater than gen7 with the PPGTT module parameter invoked.
> > > > > > 
> > > > > > I am /assuming/ BYT was broken, I have not actually checked.
> > > > > > 
> > > > > > While here, clean up the file a bit to avoid duplicate reads (now that
> > > > > > the PPGTT info is in the error state).
> > > > > > 
> > > > > > I think Mika/Chris may have been looking at this too.
> > > > > 
> > > > > Sure, we are looking (for identifying the guilty request/batch) by using
> > > > > the older, simpler mechanism of finding the first incomplete request. I
> > > > > think that search is now definite since we preallocate the request and no
> > > > > longer do request collascing if ENOMEM (i.e. there is a 1:1 relationship
> > > > > between seqno/batch/request).
> > > > > 
> > > > > That should also apply here and be much simpler.
> > > > 
> > > > How does that solve hangs which aren't caused by requests?
> > > 
> > > Was that an intentional rhetorical question?
> > > 
> > > The code you touch here only deals with requests - finding the current
> > > batchbuffer if any.
> > > -Chris
> > > 
> > 
> > It wasn't rhetorical. I temporarily ignored that all batches are tied to
> > a request.
> > 
> > So what's the plan now? Just looking at the callers, we seem to have a
> > couple of callers that can't easily identify the bad request.
> 
> I was thinking along the lines of:
> 
> @@ -737,31 +709,16 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>         }
>  
>         seqno = ring->get_seqno(ring, false);
> -       list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> -               if (!is_active_vm(vm, ring))
> +       list_for_each_entry(request, &ring->request_list, list) {
> +               if (i915_seqno_passed(seqno, request->seqno))
>                         continue;
>  
> -               found_active = true;
> -
> -               list_for_each_entry(vma, &vm->active_list, mm_list) {
> -                       obj = vma->obj;
> -                       if (obj->ring != ring)
> -                               continue;
> -
> -                       if (i915_seqno_passed(seqno, obj->last_read_seqno))
> -                               continue;
> -
> -                       if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
> -                               continue;
> -
> -                       /* We need to copy these to an anonymous buffer as the simplest
> -                        * method to avoid being overwritten by userspace.
> -                        */
> -                       return i915_error_object_create(dev_priv, obj, vm);
> -               }
> +               /* We need to copy these to an anonymous buffer as the simplest
> +                * method to avoid being overwritten by userspace.
> +                */
> +               return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
>         }
>  
> -       WARN_ON(!found_active);
> 

The other issue is the existing method doesn't rely as much on proper
request handling, ie. this could be more resilient to driver bugs. I
kind of want to keep both...

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-27 20:31             ` Ben Widawsky
@ 2014-01-27 21:31               ` Chris Wilson
  2014-01-27 21:54                 ` Ben Widawsky
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2014-01-27 21:31 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

On Mon, Jan 27, 2014 at 12:31:08PM -0800, Ben Widawsky wrote:
> The other issue is the existing method doesn't rely as much on proper
> request handling, ie. this could be more resilient to driver bugs. I
> kind of want to keep both...

Actually I think it is. Part of the process of reading an error dump is
tying together the registers with what is captured. If they are
inconsistent, we know that the driver/capture is buggy. What happens in
the real world is that the GPU executes something completely different
than the batch buffer anyway...
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW
  2014-01-27 21:31               ` Chris Wilson
@ 2014-01-27 21:54                 ` Ben Widawsky
  0 siblings, 0 replies; 26+ messages in thread
From: Ben Widawsky @ 2014-01-27 21:54 UTC (permalink / raw)
  To: Chris Wilson, Ben Widawsky, Intel GFX, Mika Kuoppala

On Mon, Jan 27, 2014 at 09:31:04PM +0000, Chris Wilson wrote:
> On Mon, Jan 27, 2014 at 12:31:08PM -0800, Ben Widawsky wrote:
> > The other issue is the existing method doesn't rely as much on proper
> > request handling, ie. this could be more resilient to driver bugs. I
> > kind of want to keep both...
> 
> Actually I think it is. Part of the process of reading an error dump is
> tying together the registers with what is captured. If they are
> inconsistent, we know that the driver/capture is buggy. What happens in
> the real world is that the GPU executes something completely different
> than the batch buffer anyway...
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre

Recapping IRC conversation - Chris is sending a patch to fix this
problem with his solution.

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2014-01-27 21:54 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-01-25  2:17 [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Ben Widawsky
2014-01-25  2:17 ` [PATCH 2/5] drm/i915: Print captured bo for all VM in error state Ben Widawsky
2014-01-25  2:17 ` [PATCH 3/5] drm/i915: Create a USES_PPGTT macro Ben Widawsky
2014-01-25 20:41   ` Daniel Vetter
2014-01-26  5:45     ` Ben Widawsky
2014-01-26  9:24       ` Daniel Vetter
2014-01-25  2:17 ` [PATCH 4/5] drm/i915: Capture PPGTT info on error capture Ben Widawsky
2014-01-26 11:42   ` Chris Wilson
2014-01-26 19:06     ` Ben Widawsky
2014-01-26 19:51       ` Chris Wilson
2014-01-25  2:17 ` [PATCH 5/5] drm/i915: Fix error capture on BYT/BDW Ben Widawsky
2014-01-26 11:47   ` Chris Wilson
2014-01-26 19:05     ` Ben Widawsky
2014-01-26 19:55       ` Chris Wilson
2014-01-26 21:47         ` Ben Widawsky
2014-01-27 13:45           ` Chris Wilson
2014-01-27 18:24             ` Ben Widawsky
2014-01-27 20:31             ` Ben Widawsky
2014-01-27 21:31               ` Chris Wilson
2014-01-27 21:54                 ` Ben Widawsky
2014-01-25  8:32 ` [PATCH 1/5] drm/i915: Place the Global GTT VM first in the list of VM Kenneth Graunke
2014-01-25 20:48   ` Daniel Vetter
2014-01-25 21:31     ` Kenneth Graunke
2014-01-26  5:09       ` Ben Widawsky
2014-01-26  9:26         ` Daniel Vetter
2014-01-26 11:10           ` Daniel Vetter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox