* [PATCH 1/1] drm/i915: track first and last processes that touch gem objects
@ 2012-02-03 14:43 Eugeni Dodonov
2012-02-03 14:53 ` Konstantin Belousov
2012-02-03 18:02 ` Chris Wilson
0 siblings, 2 replies; 8+ messages in thread
From: Eugeni Dodonov @ 2012-02-03 14:43 UTC (permalink / raw)
To: intel-gfx; +Cc: Ben Widawsky, Eugeni Dodonov
This will hopefully allow us to find out who was responsible for the GPU death.
We record the first and last process to touch each object, to keep track of
the process which originally created the object and the last process to
touch it.
To simplify post-mortem analysis, we also look up the process names
when gathering the i915_error_state and when peeking at the list of active
gem objects in debugfs. This is not perfect for tracking all the
processes, as they can quit or die before their batchbuffers get executed,
but having to track them during the entire object lifetime would be
excessively memcpy hungry.
CC: Eric Anholt <eric@anholt.net>
CC: Daniel Vetter <daniel@ffwll.ch>
CC: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Eugeni Dodonov <eugeni.dodonov@intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 45 ++++++++++++++++++++++++++++
drivers/gpu/drm/i915/i915_drv.h | 5 +++
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 6 ++++
drivers/gpu/drm/i915/i915_irq.c | 22 +++++++++++++
4 files changed, 78 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a017b98..2eb28d2 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -121,6 +121,7 @@ static const char *cache_level_str(int type)
static void
describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
{
+ int i;
seq_printf(m, "%p: %s%s %8zdKiB %04x %04x %d %d%s%s%s",
&obj->base,
get_pin_flag(obj),
@@ -151,6 +152,28 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
}
if (obj->ring != NULL)
seq_printf(m, " (%s)", obj->ring->name);
+
+ /* Describe 1st and last process to touch the object */
+ for (i=0; i < 2; i++) {
+ struct pid *p;
+ struct task_struct *tsk = NULL;
+
+ /* Skip objects that have no associated pid */
+ if (!obj->pid[i])
+ continue;
+
+ p = find_get_pid(obj->pid[i]);
+ if (p) {
+ tsk = get_pid_task(p, PIDTYPE_PID);
+ put_pid(p);
+ }
+
+ seq_printf(m, " (pid_%s: %5d [%s])",
+ (i==0) ? "first" : "last",
+ obj->pid[i],
+ (tsk) ? tsk->comm : "unknown");
+
+ }
}
static int i915_gem_object_list_info(struct seq_file *m, void *data)
@@ -710,6 +733,7 @@ static void print_error_buffers(struct seq_file *m,
struct drm_i915_error_buffer *err,
int count)
{
+ int i;
seq_printf(m, "%s [%d]:\n", name, count);
while (count--) {
@@ -731,6 +755,27 @@ static void print_error_buffers(struct seq_file *m,
if (err->fence_reg != I915_FENCE_REG_NONE)
seq_printf(m, " (fence: %d)", err->fence_reg);
+ /* Describe 1st and last process to touch the object */
+ for (i=0; i < 2; i++) {
+ struct pid *p;
+ struct task_struct *tsk = NULL;
+
+ /* Skip objects that have no associated pid */
+ if (!err->pid[i])
+ continue;
+
+ p = find_get_pid(err->pid[i]);
+ if (p)
+ tsk = get_pid_task(p, PIDTYPE_PID);
+
+ seq_printf(m, " (pid_%s: %5d [%s])",
+ (i==0) ? "first" : "last",
+ err->pid[i],
+ (tsk) ? tsk->comm : "unknown");
+
+ put_pid(p);
+ }
+
seq_printf(m, "\n");
err++;
}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9689ca3..9711ff0a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -193,6 +193,8 @@ struct drm_i915_error_state {
u32 purgeable:1;
u32 ring:4;
u32 cache_level:2;
+ u32 pid[2];
+ char comm[2][TASK_COMM_LEN];
} *active_bo, *pinned_bo;
u32 active_bo_count, pinned_bo_count;
struct intel_overlay_error_state *overlay;
@@ -891,6 +893,9 @@ struct drm_i915_gem_object {
/** for phy allocated objects */
struct drm_i915_gem_phys_object *phys_obj;
+ /** pid of first and last process to touch the object */
+ uint32_t pid[2];
+
/**
* Number of crtcs where this object is currently the fb, but
* will be page flipped away on the next vblank. When it
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 65e1f00..188893d 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1140,6 +1140,12 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
list_add_tail(&obj->exec_list, &objects);
obj->exec_handle = exec[i].handle;
obj->exec_entry = &exec[i];
+
+ /* Discover pid of caller process */
+ if (!obj->pid[0])
+ obj->pid[0] = file->pid;
+ obj->pid[1] = file->pid;
+
eb_add_object(eb, obj);
}
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5bd4361..4b6fb56 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -789,8 +789,30 @@ static u32 capture_bo_list(struct drm_i915_error_buffer *err,
int i = 0;
list_for_each_entry(obj, head, mm_list) {
+ struct pid *p = NULL;
+ struct task_struct *tsk = NULL;
+ int j;
+
err->size = obj->base.size;
err->name = obj->base.name;
+
+ /* Record processes which touched this object and collect their
+ * names to simplify further analysis.
+ */
+ for (j=0; j < 2; j++) {
+ err->pid[j] = obj->pid[j];
+
+ if (err->pid[j])
+ p = find_get_pid(err->pid[j]);
+ if (p) {
+ tsk = get_pid_task(p, PIDTYPE_PID);
+ put_pid(p);
+ }
+
+ snprintf(err->comm[j], TASK_COMM_LEN,
+ (tsk) ? tsk->comm : "unknown");
+ }
+
err->seqno = obj->last_rendering_seqno;
err->gtt_offset = obj->gtt_offset;
err->read_domains = obj->base.read_domains;
--
1.7.8.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] drm/i915: track first and last processes that touch gem objects
2012-02-03 14:43 [PATCH 1/1] drm/i915: track first and last processes that touch gem objects Eugeni Dodonov
@ 2012-02-03 14:53 ` Konstantin Belousov
2012-02-03 15:31 ` Eugeni Dodonov
2012-02-03 18:02 ` Chris Wilson
1 sibling, 1 reply; 8+ messages in thread
From: Konstantin Belousov @ 2012-02-03 14:53 UTC (permalink / raw)
To: Eugeni Dodonov; +Cc: intel-gfx, Ben Widawsky
[-- Attachment #1.1: Type: text/plain, Size: 874 bytes --]
On Fri, Feb 03, 2012 at 12:43:25PM -0200, Eugeni Dodonov wrote:
> This allows to hopefully find out who was responsible for the GPU death.
> We record the 1st and last process to touch each object, to keep track of
> the process which created the object originally and the last process to
> touch it.
>
> To simplify post-mortem analysis, we also search for the processes names
> when gathering the i915_error_state and when peeking at the list of active
> gem objects in debugfs. This is not perfect for tracking all the
> processes, as they can quit or die before their batchbuffers got executed,
> but having to track them during the entire object lifetime would be
> excessively memcpy hungry.
Maybe this is a stupid question, but it seems you track neither
writers using I915_GEM_PWRITE nor writers which modify the buffer
through mmapped access.
[-- Attachment #1.2: Type: application/pgp-signature, Size: 196 bytes --]
[-- Attachment #2: Type: text/plain, Size: 159 bytes --]
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 1/1] drm/i915: track first and last processes that touch gem objects
2012-02-03 14:53 ` Konstantin Belousov
@ 2012-02-03 15:31 ` Eugeni Dodonov
2012-02-03 15:49 ` Konstantin Belousov
0 siblings, 1 reply; 8+ messages in thread
From: Eugeni Dodonov @ 2012-02-03 15:31 UTC (permalink / raw)
To: intel-gfx; +Cc: Ben Widawsky, Eugeni Dodonov
This will hopefully allow us to find out who was responsible for the GPU death.
We record the first and last process to touch each object, to keep track of
the process which originally created the object and the last process to
touch it.
To simplify post-mortem analysis, we also look up the process names
when gathering the i915_error_state and when peeking at the list of active
gem objects in debugfs. This is not perfect for tracking all the
processes, as they can quit or die before their batchbuffers get executed,
but having to track them during the entire object lifetime would be
excessively memcpy hungry.
v2: also track objects accessed via mmap or pwrite.
CC: Konstantin Belousov <kostikbel@gmail.com>
CC: Eric Anholt <eric@anholt.net>
CC: Daniel Vetter <daniel@ffwll.ch>
CC: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Eugeni Dodonov <eugeni.dodonov@intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 45 ++++++++++++++++++++++++++++
drivers/gpu/drm/i915/i915_drv.h | 5 +++
drivers/gpu/drm/i915/i915_gem.c | 12 +++++++
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 6 ++++
drivers/gpu/drm/i915/i915_irq.c | 22 +++++++++++++
5 files changed, 90 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a017b98..2eb28d2 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -121,6 +121,7 @@ static const char *cache_level_str(int type)
static void
describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
{
+ int i;
seq_printf(m, "%p: %s%s %8zdKiB %04x %04x %d %d%s%s%s",
&obj->base,
get_pin_flag(obj),
@@ -151,6 +152,28 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
}
if (obj->ring != NULL)
seq_printf(m, " (%s)", obj->ring->name);
+
+ /* Describe 1st and last process to touch the object */
+ for (i=0; i < 2; i++) {
+ struct pid *p;
+ struct task_struct *tsk = NULL;
+
+ /* Skip objects that have no associated pid */
+ if (!obj->pid[i])
+ continue;
+
+ p = find_get_pid(obj->pid[i]);
+ if (p) {
+ tsk = get_pid_task(p, PIDTYPE_PID);
+ put_pid(p);
+ }
+
+ seq_printf(m, " (pid_%s: %5d [%s])",
+ (i==0) ? "first" : "last",
+ obj->pid[i],
+ (tsk) ? tsk->comm : "unknown");
+
+ }
}
static int i915_gem_object_list_info(struct seq_file *m, void *data)
@@ -710,6 +733,7 @@ static void print_error_buffers(struct seq_file *m,
struct drm_i915_error_buffer *err,
int count)
{
+ int i;
seq_printf(m, "%s [%d]:\n", name, count);
while (count--) {
@@ -731,6 +755,27 @@ static void print_error_buffers(struct seq_file *m,
if (err->fence_reg != I915_FENCE_REG_NONE)
seq_printf(m, " (fence: %d)", err->fence_reg);
+ /* Describe 1st and last process to touch the object */
+ for (i=0; i < 2; i++) {
+ struct pid *p;
+ struct task_struct *tsk = NULL;
+
+ /* Skip objects that have no associated pid */
+ if (!err->pid[i])
+ continue;
+
+ p = find_get_pid(err->pid[i]);
+ if (p)
+ tsk = get_pid_task(p, PIDTYPE_PID);
+
+ seq_printf(m, " (pid_%s: %5d [%s])",
+ (i==0) ? "first" : "last",
+ err->pid[i],
+ (tsk) ? tsk->comm : "unknown");
+
+ put_pid(p);
+ }
+
seq_printf(m, "\n");
err++;
}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9689ca3..9711ff0a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -193,6 +193,8 @@ struct drm_i915_error_state {
u32 purgeable:1;
u32 ring:4;
u32 cache_level:2;
+ u32 pid[2];
+ char comm[2][TASK_COMM_LEN];
} *active_bo, *pinned_bo;
u32 active_bo_count, pinned_bo_count;
struct intel_overlay_error_state *overlay;
@@ -891,6 +893,9 @@ struct drm_i915_gem_object {
/** for phy allocated objects */
struct drm_i915_gem_phys_object *phys_obj;
+ /** pid of first and last process to touch the object */
+ uint32_t pid[2];
+
/**
* Number of crtcs where this object is currently the fb, but
* will be page flipped away on the next vblank. When it
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e55badb..135b387 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -988,6 +988,11 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
goto out;
}
+ /* Discover pid of caller process */
+ if (!obj->pid[0])
+ obj->pid[0] = file->pid;
+ obj->pid[1] = file->pid;
+
trace_i915_gem_object_pwrite(obj, args->offset, args->size);
/* We can only do the GTT pwrite on untiled buffers, as otherwise
@@ -1144,6 +1149,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
struct drm_i915_private *dev_priv = dev->dev_private;
struct drm_i915_gem_mmap *args = data;
struct drm_gem_object *obj;
+ struct drm_i915_gem_object *i915_obj;
unsigned long addr;
if (!(dev->driver->driver_features & DRIVER_GEM))
@@ -1158,6 +1164,12 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
return -E2BIG;
}
+ /* Discover pid of caller process */
+ i915_obj = to_intel_bo(obj);
+ if (!i915_obj->pid[0])
+ i915_obj->pid[0] = file->pid;
+ i915_obj->pid[1] = file->pid;
+
down_write(&current->mm->mmap_sem);
addr = do_mmap(obj->filp, 0, args->size,
PROT_READ | PROT_WRITE, MAP_SHARED,
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 65e1f00..188893d 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1140,6 +1140,12 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
list_add_tail(&obj->exec_list, &objects);
obj->exec_handle = exec[i].handle;
obj->exec_entry = &exec[i];
+
+ /* Discover pid of caller process */
+ if (!obj->pid[0])
+ obj->pid[0] = file->pid;
+ obj->pid[1] = file->pid;
+
eb_add_object(eb, obj);
}
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5bd4361..4b6fb56 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -789,8 +789,30 @@ static u32 capture_bo_list(struct drm_i915_error_buffer *err,
int i = 0;
list_for_each_entry(obj, head, mm_list) {
+ struct pid *p = NULL;
+ struct task_struct *tsk = NULL;
+ int j;
+
err->size = obj->base.size;
err->name = obj->base.name;
+
+ /* Record processes which touched this object and collect their
+ * names to simplify further analysis.
+ */
+ for (j=0; j < 2; j++) {
+ err->pid[j] = obj->pid[j];
+
+ if (err->pid[j])
+ p = find_get_pid(err->pid[j]);
+ if (p) {
+ tsk = get_pid_task(p, PIDTYPE_PID);
+ put_pid(p);
+ }
+
+ snprintf(err->comm[j], TASK_COMM_LEN,
+ (tsk) ? tsk->comm : "unknown");
+ }
+
err->seqno = obj->last_rendering_seqno;
err->gtt_offset = obj->gtt_offset;
err->read_domains = obj->base.read_domains;
--
1.7.8.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] drm/i915: track first and last processes that touch gem objects
2012-02-03 15:31 ` Eugeni Dodonov
@ 2012-02-03 15:49 ` Konstantin Belousov
0 siblings, 0 replies; 8+ messages in thread
From: Konstantin Belousov @ 2012-02-03 15:49 UTC (permalink / raw)
To: Eugeni Dodonov; +Cc: intel-gfx, Ben Widawsky
[-- Attachment #1.1: Type: text/plain, Size: 951 bytes --]
On Fri, Feb 03, 2012 at 01:31:39PM -0200, Eugeni Dodonov wrote:
> This allows to hopefully find out who was responsible for the GPU death.
> We record the 1st and last process to touch each object, to keep track of
> the process which created the object originally and the last process to
> touch it.
>
> To simplify post-mortem analysis, we also search for the processes names
> when gathering the i915_error_state and when peeking at the list of active
> gem objects in debugfs. This is not perfect for tracking all the
> processes, as they can quit or die before their batchbuffers got executed,
> but having to track them during the entire object lifetime would be
> excessively memcpy hungry.
>
> v2: also track objects accessed via mmap or pwrite.
Only writes through GTT mappings are recorded; I think that writes made while
the object is in the CPU domain are almost impossible for the driver to notice
(at least that is the case on FreeBSD).
[-- Attachment #1.2: Type: application/pgp-signature, Size: 196 bytes --]
[-- Attachment #2: Type: text/plain, Size: 159 bytes --]
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] drm/i915: track first and last processes that touch gem objects
2012-02-03 14:43 [PATCH 1/1] drm/i915: track first and last processes that touch gem objects Eugeni Dodonov
2012-02-03 14:53 ` Konstantin Belousov
@ 2012-02-03 18:02 ` Chris Wilson
2012-02-06 16:15 ` Daniel Vetter
1 sibling, 1 reply; 8+ messages in thread
From: Chris Wilson @ 2012-02-03 18:02 UTC (permalink / raw)
To: intel-gfx; +Cc: Ben Widawsky, Eugeni Dodonov
On Fri, 3 Feb 2012 12:43:25 -0200, Eugeni Dodonov <eugeni.dodonov@intel.com> wrote:
> This allows to hopefully find out who was responsible for the GPU death.
> We record the 1st and last process to touch each object, to keep track of
> the process which created the object originally and the last process to
> touch it.
>
> To simplify post-mortem analysis, we also search for the processes names
> when gathering the i915_error_state and when peeking at the list of active
> gem objects in debugfs. This is not perfect for tracking all the
> processes, as they can quit or die before their batchbuffers got executed,
> but having to track them during the entire object lifetime would be
> excessively memcpy hungry.
I think you've slightly missed the mark here. Tracking who created a buffer
and who last used it is interesting, but you really need to also track
on whose behalf the request (i.e. each batch) is executing.
For the goal of recording creator, you could just use:
obj->creator = current ? current->pid : 0;
in i915_gem_object_init with 0 as the special value for objects created by
the driver outside of process context. And similarly for i915_add_request,
though I'd associate those with the owner of the file_priv. The important
point here is that a buffer may be associated with multiple batches
submitted by one or more clients before a hang is detected, and so unless
the dispatch pid is tracked you do not know who submitted the erroneous
batch. (Even a batch may be submitted more than once by many clients,
given sufficient pathology.) So adding the request queue to the
i915_error_state would also be interesting, especially with the jiffie
and ring->tail.
Also note that there is no direct link between i915_gem_fault() and usage
of the object, the point at which you want to add the obj->last_used_by
tracking to is domain management - which catches the usage of CPU
mappings as well as move-to-active.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] drm/i915: track first and last processes that touch gem objects
2012-02-03 18:02 ` Chris Wilson
@ 2012-02-06 16:15 ` Daniel Vetter
2012-02-06 22:59 ` Eric Anholt
0 siblings, 1 reply; 8+ messages in thread
From: Daniel Vetter @ 2012-02-06 16:15 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx, Ben Widawsky, Eugeni Dodonov
On Fri, Feb 03, 2012 at 06:02:38PM +0000, Chris Wilson wrote:
> On Fri, 3 Feb 2012 12:43:25 -0200, Eugeni Dodonov <eugeni.dodonov@intel.com> wrote:
> > This allows to hopefully find out who was responsible for the GPU death.
> > We record the 1st and last process to touch each object, to keep track of
> > the process which created the object originally and the last process to
> > touch it.
> >
> > To simplify post-mortem analysis, we also search for the processes names
> > when gathering the i915_error_state and when peeking at the list of active
> > gem objects in debugfs. This is not perfect for tracking all the
> > processes, as they can quit or die before their batchbuffers got executed,
> > but having to track them during the entire object lifetime would be
> > excessively memcpy hungry.
>
> I think you've slightly missed here. Tracking who created a buffer is
> interesting and who last used it, but you really need to also track
> on whose behalf the request (i.e. each batch) is executing.
>
> For the goal of recording creator, you could just use:
>
> obj->creator = current ? current->pid : 0;
>
> in i915_gem_object_init with 0 as the special value for objects created by
> the driver outside of process context. And similarly for i915_add_request,
> though I'd associate those with the owner of the file_priv. The important
> point here is that a buffer may be associated with multiple batches
> submitted by one or more clients before a hang is detected, and so unless
> the dispatch pid is tracked you do not know who submitted the erroneous
> batch. (Even a batch may be submitted more than once by many clients,
> given sufficient pathology.) So adding the request queue to the
> i915_error_state would also be interesting, especially with the jiffie
> and ring->tail.
>
> Also note that there is no direct link between i915_gem_fault() and usage
> of the object, the point at which you want to add the obj->last_used_by
> tracking to is domain management - which catches the usage of CPU
> mappings as well as move-to-active.
I'll second Chris here - I think the interesting stuff is to add some kind
of cheap ownership tracking, not who exactly created the buffer. The
latter is imo only really interesting for resource accounting, and that
would require it to be somewhat more solid. And we don't do any resource
accounting atm anyway.
-Daniel
--
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] drm/i915: track first and last processes that touch gem objects
2012-02-06 16:15 ` Daniel Vetter
@ 2012-02-06 22:59 ` Eric Anholt
2012-02-07 8:49 ` Ben Widawsky
0 siblings, 1 reply; 8+ messages in thread
From: Eric Anholt @ 2012-02-06 22:59 UTC (permalink / raw)
To: Daniel Vetter, Chris Wilson; +Cc: intel-gfx, Ben Widawsky, Eugeni Dodonov
[-- Attachment #1.1: Type: text/plain, Size: 2868 bytes --]
On Mon, 6 Feb 2012 17:15:44 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Fri, Feb 03, 2012 at 06:02:38PM +0000, Chris Wilson wrote:
> > On Fri, 3 Feb 2012 12:43:25 -0200, Eugeni Dodonov <eugeni.dodonov@intel.com> wrote:
> > > This allows to hopefully find out who was responsible for the GPU death.
> > > We record the 1st and last process to touch each object, to keep track of
> > > the process which created the object originally and the last process to
> > > touch it.
> > >
> > > To simplify post-mortem analysis, we also search for the processes names
> > > when gathering the i915_error_state and when peeking at the list of active
> > > gem objects in debugfs. This is not perfect for tracking all the
> > > processes, as they can quit or die before their batchbuffers got executed,
> > > but having to track them during the entire object lifetime would be
> > > excessively memcpy hungry.
> >
> > I think you've slightly missed here. Tracking who created a buffer is
> > interesting and who last used it, but you really need to also track
> > on whose behalf the request (i.e. each batch) is executing.
> >
> > For the goal of recording creator, you could just use:
> >
> > obj->creator = current ? current->pid : 0;
> >
> > in i915_gem_object_init with 0 as the special value for objects created by
> > the driver outside of process context. And similarly for i915_add_request,
> > though I'd associate those with the owner of the file_priv. The important
> > point here is that a buffer may be associated with multiple batches
> > submitted by one or more clients before a hang is detected, and so unless
> > the dispatch pid is tracked you do not know who submitted the erroneous
> > batch. (Even a batch may be submitted more than once by many clients,
> > given sufficient pathology.) So adding the request queue to the
> > i915_error_state would also be interesting, especially with the jiffie
> > and ring->tail.
> >
> > Also note that there is no direct link between i915_gem_fault() and usage
> > of the object, the point at which you want to add the obj->last_used_by
> > tracking to is domain management - which catches the usage of CPU
> > mappings as well as move-to-active.
>
> I'll second Chris here - I think the interesting stuff is to add some kind
> of cheap ownership tracking, not who exactly created the buffer. The
> latter is imo only really interesting for resource accounting, and that
> would require it to be somewhat more solid. And we don't do any resource
> accounting atm anyway.
Having the creator associated with the buffer should be nice. I agree
that for hang debugging, making the pid association part of the request
struct makes more sense than tracking it per-object. With those two, I
don't see much use for "last pwriter/executer" with the buffer.
[-- Attachment #1.2: Type: application/pgp-signature, Size: 197 bytes --]
[-- Attachment #2: Type: text/plain, Size: 159 bytes --]
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] drm/i915: track first and last processes that touch gem objects
2012-02-06 22:59 ` Eric Anholt
@ 2012-02-07 8:49 ` Ben Widawsky
0 siblings, 0 replies; 8+ messages in thread
From: Ben Widawsky @ 2012-02-07 8:49 UTC (permalink / raw)
To: Eric Anholt; +Cc: intel-gfx, Eugeni Dodonov
[-- Attachment #1.1: Type: text/plain, Size: 3206 bytes --]
On Mon, Feb 06, 2012 at 11:59:11PM +0100, Eric Anholt wrote:
> On Mon, 6 Feb 2012 17:15:44 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> > On Fri, Feb 03, 2012 at 06:02:38PM +0000, Chris Wilson wrote:
> > > On Fri, 3 Feb 2012 12:43:25 -0200, Eugeni Dodonov <eugeni.dodonov@intel.com> wrote:
> > > > This allows to hopefully find out who was responsible for the GPU death.
> > > > We record the 1st and last process to touch each object, to keep track of
> > > > the process which created the object originally and the last process to
> > > > touch it.
> > > >
> > > > To simplify post-mortem analysis, we also search for the processes names
> > > > when gathering the i915_error_state and when peeking at the list of active
> > > > gem objects in debugfs. This is not perfect for tracking all the
> > > > processes, as they can quit or die before their batchbuffers got executed,
> > > > but having to track them during the entire object lifetime would be
> > > > excessively memcpy hungry.
> > >
> > > I think you've slightly missed here. Tracking who created a buffer is
> > > interesting and who last used it, but you really need to also track
> > > on whose behalf the request (i.e. each batch) is executing.
> > >
> > > For the goal of recording creator, you could just use:
> > >
> > > obj->creator = current ? current->pid : 0;
> > >
> > > in i915_gem_object_init with 0 as the special value for objects created by
> > > the driver outside of process context. And similarly for i915_add_request,
> > > though I'd associate those with the owner of the file_priv. The important
> > > point here is that a buffer may be associated with multiple batches
> > > submitted by one or more clients before a hang is detected, and so unless
> > > the dispatch pid is tracked you do not know who submitted the erroneous
> > > batch. (Even a batch may be submitted more than once by many clients,
> > > given sufficient pathology.) So adding the request queue to the
> > > i915_error_state would also be interesting, especially with the jiffie
> > > and ring->tail.
> > >
> > > Also note that there is no direct link between i915_gem_fault() and usage
> > > of the object, the point at which you want to add the obj->last_used_by
> > > tracking to is domain management - which catches the usage of CPU
> > > mappings as well as move-to-active.
> >
> > I'll second Chris here - I think the interesting stuff is to add some kind
> > of cheap ownership tracking, not who exactly created the buffer. The
> > latter is imo only really interesting for resource accounting, and that
> > would require it to be somewhat more solid. And we don't do any resource
> > accounting atm anyway.
>
> Having the creator associated with the buffer should be nice. I agree
> that for hang debugging, making the pid association part of the request
> struct makes more sense than tracking it per-object. With those two, I
> don't see much use for "last pwriter/executer" with the buffer.
Could I recommend storing the drm_file instead of the PID? That is what I
have, and it is required for forced throttling. You should be able to get to a
pid from the file descriptor.
[-- Attachment #1.2: Type: application/pgp-signature, Size: 490 bytes --]
[-- Attachment #2: Type: text/plain, Size: 159 bytes --]
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2012-02-07 8:50 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-02-03 14:43 [PATCH 1/1] drm/i915: track first and last processes that touch gem objects Eugeni Dodonov
2012-02-03 14:53 ` Konstantin Belousov
2012-02-03 15:31 ` Eugeni Dodonov
2012-02-03 15:49 ` Konstantin Belousov
2012-02-03 18:02 ` Chris Wilson
2012-02-06 16:15 ` Daniel Vetter
2012-02-06 22:59 ` Eric Anholt
2012-02-07 8:49 ` Ben Widawsky
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.