* [PATCH 1/3] drm/i915: refactor debugfs open function
@ 2011-11-02 11:46 Daniel Vetter
2011-11-02 11:46 ` [PATCH 2/3] drm/i915: refactor debugfs create functions Daniel Vetter
2011-11-02 11:46 ` [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs Daniel Vetter
0 siblings, 2 replies; 10+ messages in thread
From: Daniel Vetter @ 2011-11-02 11:46 UTC (permalink / raw)
To: intel-gfx; +Cc: Daniel Vetter
Only forcewake has an open with special semantics, the other r/w
debugfs only assign the file private pointer.
Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
drivers/gpu/drm/i915/i915_debugfs.c | 26 +++++---------------------
1 files changed, 5 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 5ba63ad..fd0e817 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1328,8 +1328,8 @@ static int i915_gen6_forcewake_count_info(struct seq_file *m, void *data)
}
static int
-i915_wedged_open(struct inode *inode,
- struct file *filp)
+i915_debugfs_common_open(struct inode *inode,
+ struct file *filp)
{
filp->private_data = inode->i_private;
return 0;
@@ -1385,20 +1385,12 @@ i915_wedged_write(struct file *filp,
static const struct file_operations i915_wedged_fops = {
.owner = THIS_MODULE,
- .open = i915_wedged_open,
+ .open = i915_debugfs_common_open,
.read = i915_wedged_read,
.write = i915_wedged_write,
.llseek = default_llseek,
};
-static int
-i915_max_freq_open(struct inode *inode,
- struct file *filp)
-{
- filp->private_data = inode->i_private;
- return 0;
-}
-
static ssize_t
i915_max_freq_read(struct file *filp,
char __user *ubuf,
@@ -1455,20 +1447,12 @@ i915_max_freq_write(struct file *filp,
static const struct file_operations i915_max_freq_fops = {
.owner = THIS_MODULE,
- .open = i915_max_freq_open,
+ .open = i915_debugfs_common_open,
.read = i915_max_freq_read,
.write = i915_max_freq_write,
.llseek = default_llseek,
};
-static int
-i915_cache_sharing_open(struct inode *inode,
- struct file *filp)
-{
- filp->private_data = inode->i_private;
- return 0;
-}
-
static ssize_t
i915_cache_sharing_read(struct file *filp,
char __user *ubuf,
@@ -1534,7 +1518,7 @@ i915_cache_sharing_write(struct file *filp,
static const struct file_operations i915_cache_sharing_fops = {
.owner = THIS_MODULE,
- .open = i915_cache_sharing_open,
+ .open = i915_debugfs_common_open,
.read = i915_cache_sharing_read,
.write = i915_cache_sharing_write,
.llseek = default_llseek,
--
1.7.6.4
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 2/3] drm/i915: refactor debugfs create functions
2011-11-02 11:46 [PATCH 1/3] drm/i915: refactor debugfs open function Daniel Vetter
@ 2011-11-02 11:46 ` Daniel Vetter
2011-11-02 11:46 ` [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs Daniel Vetter
1 sibling, 0 replies; 10+ messages in thread
From: Daniel Vetter @ 2011-11-02 11:46 UTC (permalink / raw)
To: intel-gfx; +Cc: Daniel Vetter
All r/w debugfs files are created equal.
Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
drivers/gpu/drm/i915/i915_debugfs.c | 53 ++++++++++------------------------
1 files changed, 16 insertions(+), 37 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index fd0e817..8476441 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1547,21 +1547,6 @@ drm_add_fake_info_node(struct drm_minor *minor,
return 0;
}
-static int i915_wedged_create(struct dentry *root, struct drm_minor *minor)
-{
- struct drm_device *dev = minor->dev;
- struct dentry *ent;
-
- ent = debugfs_create_file("i915_wedged",
- S_IRUGO | S_IWUSR,
- root, dev,
- &i915_wedged_fops);
- if (IS_ERR(ent))
- return PTR_ERR(ent);
-
- return drm_add_fake_info_node(minor, ent, &i915_wedged_fops);
-}
-
static int i915_forcewake_open(struct inode *inode, struct file *file)
{
struct drm_device *dev = inode->i_private;
@@ -1623,34 +1608,22 @@ static int i915_forcewake_create(struct dentry *root, struct drm_minor *minor)
return drm_add_fake_info_node(minor, ent, &i915_forcewake_fops);
}
-static int i915_max_freq_create(struct dentry *root, struct drm_minor *minor)
-{
- struct drm_device *dev = minor->dev;
- struct dentry *ent;
-
- ent = debugfs_create_file("i915_max_freq",
- S_IRUGO | S_IWUSR,
- root, dev,
- &i915_max_freq_fops);
- if (IS_ERR(ent))
- return PTR_ERR(ent);
-
- return drm_add_fake_info_node(minor, ent, &i915_max_freq_fops);
-}
-
-static int i915_cache_sharing_create(struct dentry *root, struct drm_minor *minor)
+static int i915_debugfs_create(struct dentry *root,
+ struct drm_minor *minor,
+ const char *name,
+ const struct file_operations *fops)
{
struct drm_device *dev = minor->dev;
struct dentry *ent;
- ent = debugfs_create_file("i915_cache_sharing",
+ ent = debugfs_create_file(name,
S_IRUGO | S_IWUSR,
root, dev,
- &i915_cache_sharing_fops);
+ fops);
if (IS_ERR(ent))
return PTR_ERR(ent);
- return drm_add_fake_info_node(minor, ent, &i915_cache_sharing_fops);
+ return drm_add_fake_info_node(minor, ent, fops);
}
static struct drm_info_list i915_debugfs_list[] = {
@@ -1699,17 +1672,23 @@ int i915_debugfs_init(struct drm_minor *minor)
{
int ret;
- ret = i915_wedged_create(minor->debugfs_root, minor);
+ ret = i915_debugfs_create(minor->debugfs_root, minor,
+ "i915_wedged",
+ &i915_wedged_fops);
if (ret)
return ret;
ret = i915_forcewake_create(minor->debugfs_root, minor);
if (ret)
return ret;
- ret = i915_max_freq_create(minor->debugfs_root, minor);
+ ret = i915_debugfs_create(minor->debugfs_root, minor,
+ "i915_max_freq",
+ &i915_max_freq_fops);
if (ret)
return ret;
- ret = i915_cache_sharing_create(minor->debugfs_root, minor);
+ ret = i915_debugfs_create(minor->debugfs_root, minor,
+ "i915_cache_sharing",
+ &i915_cache_sharing_fops);
if (ret)
return ret;
--
1.7.6.4
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs
2011-11-02 11:46 [PATCH 1/3] drm/i915: refactor debugfs open function Daniel Vetter
2011-11-02 11:46 ` [PATCH 2/3] drm/i915: refactor debugfs create functions Daniel Vetter
@ 2011-11-02 11:46 ` Daniel Vetter
2011-11-03 15:56 ` Eugeni Dodonov
2011-11-05 22:07 ` Ben Widawsky
1 sibling, 2 replies; 10+ messages in thread
From: Daniel Vetter @ 2011-11-02 11:46 UTC (permalink / raw)
To: intel-gfx; +Cc: Daniel Vetter
GPU reset is a very important piece of our infrastructure.
Unfortunately we only really test it by actually hanging the gpu,
which often has bad side-effects for the entire system. And the gpu
hang handling code is one of the rather complicated pieces of code we
have, consisting of
- hang detection
- error capture
- actual gpu reset
- reset of all the gem bookkeeping
- reinitialition of the entire gpu
This patch adds a debugfs to selectively stopping rings by ceasing to
update the hw tail pointer. This way we can exercise the gpu hang code
under controlled conditions without a dying gpu taking down the entire
systems.
Patch motivated by me forgetting to properly reinitialize ppgtt after
a gpu reset.
Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
drivers/gpu/drm/i915/i915_debugfs.c | 63 +++++++++++++++++++++++++++++++
drivers/gpu/drm/i915/i915_drv.c | 3 +
drivers/gpu/drm/i915/i915_drv.h | 2 +
drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++
4 files changed, 72 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 8476441..9821a3b 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1392,6 +1392,64 @@ static const struct file_operations i915_wedged_fops = {
};
static ssize_t
+i915_ring_stop_read(struct file *filp,
+ char __user *ubuf,
+ size_t max,
+ loff_t *ppos)
+{
+ struct drm_device *dev = filp->private_data;
+ drm_i915_private_t *dev_priv = dev->dev_private;
+ char buf[80];
+ int len;
+
+ len = snprintf(buf, sizeof(buf),
+ "%d\n", dev_priv->stop_rings);
+
+ if (len > sizeof(buf))
+ len = sizeof(buf);
+
+ return simple_read_from_buffer(ubuf, max, ppos, buf, len);
+}
+
+static ssize_t
+i915_ring_stop_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt,
+ loff_t *ppos)
+{
+ struct drm_device *dev = filp->private_data;
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ char buf[20];
+ int val = 0;
+
+ if (cnt > 0) {
+ if (cnt > sizeof(buf) - 1)
+ return -EINVAL;
+
+ if (copy_from_user(buf, ubuf, cnt))
+ return -EFAULT;
+ buf[cnt] = 0;
+
+ val = simple_strtoul(buf, NULL, 0);
+ }
+
+ DRM_DEBUG_DRIVER("Stopping rings %u\n", val);
+
+ mutex_lock(&dev->struct_mutex);
+ dev_priv->stop_rings = val;
+ mutex_unlock(&dev->struct_mutex);
+
+ return cnt;
+}
+
+static const struct file_operations i915_ring_stop_fops = {
+ .owner = THIS_MODULE,
+ .open = i915_debugfs_common_open,
+ .read = i915_ring_stop_read,
+ .write = i915_ring_stop_write,
+ .llseek = default_llseek,
+};
+static ssize_t
i915_max_freq_read(struct file *filp,
char __user *ubuf,
size_t max,
@@ -1691,6 +1749,11 @@ int i915_debugfs_init(struct drm_minor *minor)
&i915_cache_sharing_fops);
if (ret)
return ret;
+ ret = i915_debugfs_create(minor->debugfs_root, minor,
+ "i915_ring_stop",
+ &i915_ring_stop_fops);
+ if (ret)
+ return ret;
return drm_debugfs_create_files(i915_debugfs_list,
I915_DEBUGFS_ENTRIES,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 548e04b..566cc1e 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -611,6 +611,9 @@ int i915_reset(struct drm_device *dev, u8 flags)
if (!mutex_trylock(&dev->struct_mutex))
return -EBUSY;
+ printk("reenabling rings\n");
+ dev_priv->stop_rings = 0;
+
i915_gem_reset(dev);
ret = -ENODEV;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index bd98fb3..503ae8c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -330,6 +330,8 @@ typedef struct drm_i915_private {
uint32_t last_instdone;
uint32_t last_instdone1;
+ unsigned int stop_rings;
+
unsigned long cfb_size;
unsigned int cfb_fb;
enum plane cfb_plane;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 3c30dba..ef7a1ca 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1179,7 +1179,11 @@ int intel_ring_begin(struct intel_ring_buffer *ring,
void intel_ring_advance(struct intel_ring_buffer *ring)
{
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
ring->tail &= ring->size - 1;
+ if (dev_priv->stop_rings & intel_ring_flag(ring))
+ return;
ring->write_tail(ring, ring->tail);
}
--
1.7.6.4
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs
2011-11-02 11:46 ` [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs Daniel Vetter
@ 2011-11-03 15:56 ` Eugeni Dodonov
2011-11-03 16:10 ` Daniel Vetter
2011-11-05 22:07 ` Ben Widawsky
1 sibling, 1 reply; 10+ messages in thread
From: Eugeni Dodonov @ 2011-11-03 15:56 UTC (permalink / raw)
To: Daniel Vetter; +Cc: intel-gfx
[-- Attachment #1.1: Type: text/plain, Size: 1029 bytes --]
On Wed, Nov 2, 2011 at 09:46, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> GPU reset is a very important piece of our infrastructure.
> Unfortunately we only really test it by actually hanging the gpu,
> which often has bad side-effects for the entire system. And the gpu
> hang handling code is one of the rather complicated pieces of code we
> have, consisting of
> - hang detection
> - error capture
> - actual gpu reset
> - reset of all the gem bookkeeping
> - reinitialition of the entire gpu
>
> This patch adds a debugfs to selectively stopping rings by ceasing to
> update the hw tail pointer. This way we can exercise the gpu hang code
> under controlled conditions without a dying gpu taking down the entire
> systems.
>
> Patch motivated by me forgetting to properly reinitialize ppgtt after
> a gpu reset.
>
> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
>
Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>
Could be handy for debugging, I like it.
--
Eugeni Dodonov
<http://eugeni.dodonov.net/>
[-- Attachment #1.2: Type: text/html, Size: 1498 bytes --]
[-- Attachment #2: Type: text/plain, Size: 159 bytes --]
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs
2011-11-03 15:56 ` Eugeni Dodonov
@ 2011-11-03 16:10 ` Daniel Vetter
2011-11-04 16:23 ` Eugeni Dodonov
0 siblings, 1 reply; 10+ messages in thread
From: Daniel Vetter @ 2011-11-03 16:10 UTC (permalink / raw)
To: Eugeni Dodonov; +Cc: Daniel Vetter, intel-gfx
On Thu, Nov 03, 2011 at 01:56:11PM -0200, Eugeni Dodonov wrote:
> Could be handy for debugging, I like it.
Have you tried actually blowing up your gpu with it? That's where the fun
starts ;-) My gpu-hanger branch contains the latest iteration of this
idea:
http://cgit.freedesktop.org/~danvet/drm/log/?h=gpu-hanger
Cheers, Daniel
--
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs
2011-11-03 16:10 ` Daniel Vetter
@ 2011-11-04 16:23 ` Eugeni Dodonov
2011-11-04 16:31 ` Chris Wilson
0 siblings, 1 reply; 10+ messages in thread
From: Eugeni Dodonov @ 2011-11-04 16:23 UTC (permalink / raw)
To: Daniel Vetter; +Cc: Daniel Vetter, intel-gfx
[-- Attachment #1.1: Type: text/plain, Size: 739 bytes --]
On Thu, Nov 3, 2011 at 14:10, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Thu, Nov 03, 2011 at 01:56:11PM -0200, Eugeni Dodonov wrote:
> > Could be handy for debugging, I like it.
>
> Have you tried actually blowing up your gpu with it? That's where the fun
> starts ;-) My gpu-hanger branch contains the latest iteration of this
> idea:
>
> http://cgit.freedesktop.org/~danvet/drm/log/?h=gpu-hanger
>
>
Yep, it works for me. I can disable and enable rings at will by echo [01] >
/sys/kernel/debug/dri/0/i915_ring_stop
I was using it for working further on that i915_error_state heuristics
idea, outside of intel_decode. It is *very* handy to discover specific
patterns in error_state's.
--
Eugeni Dodonov
<http://eugeni.dodonov.net/>
[-- Attachment #1.2: Type: text/html, Size: 1181 bytes --]
[-- Attachment #2: Type: text/plain, Size: 159 bytes --]
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs
2011-11-04 16:23 ` Eugeni Dodonov
@ 2011-11-04 16:31 ` Chris Wilson
2011-11-04 16:42 ` Eugeni Dodonov
0 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2011-11-04 16:31 UTC (permalink / raw)
To: Eugeni Dodonov, Daniel Vetter; +Cc: Daniel Vetter, intel-gfx
On Fri, 4 Nov 2011 14:23:19 -0200, Eugeni Dodonov <eugeni@dodonov.net> wrote:
> Yep, it works for me. I can disable and enable rings at will by echo [01] >
> /sys/kernel/debug/dri/0/i915_ring_stop
>
> I was using it for working further on that i915_error_state heuristics
> idea, outside of intel_decode. It is *very* handy to discover specific
> patterns in error_state's.
intel_gpu_dump or perchance reading the code ;-)
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs
2011-11-04 16:31 ` Chris Wilson
@ 2011-11-04 16:42 ` Eugeni Dodonov
0 siblings, 0 replies; 10+ messages in thread
From: Eugeni Dodonov @ 2011-11-04 16:42 UTC (permalink / raw)
To: Chris Wilson; +Cc: Daniel Vetter, intel-gfx
[-- Attachment #1.1: Type: text/plain, Size: 973 bytes --]
On Fri, Nov 4, 2011 at 14:31, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Fri, 4 Nov 2011 14:23:19 -0200, Eugeni Dodonov <eugeni@dodonov.net>
> wrote:
> > Yep, it works for me. I can disable and enable rings at will by echo
> [01] >
> > /sys/kernel/debug/dri/0/i915_ring_stop
> >
> > I was using it for working further on that i915_error_state heuristics
> > idea, outside of intel_decode. It is *very* handy to discover specific
> > patterns in error_state's.
>
> intel_gpu_dump or perchance reading the code ;-)
>
I mean, I am using it in a combination with the intel_gpu_dump, based on
the original idea which came from
http://comments.gmane.org/gmane.comp.freedesktop.xorg.drivers.intel/6408,
only outside of the real intel_decode routine.
But yes, reading the code is the best way, the overall idea here is to at
least indicate which code to read in the first place (kernel or ddx or mesa
or va or....) :).
--
Eugeni Dodonov
<http://eugeni.dodonov.net/>
[-- Attachment #1.2: Type: text/html, Size: 1474 bytes --]
[-- Attachment #2: Type: text/plain, Size: 159 bytes --]
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs
2011-11-02 11:46 ` [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs Daniel Vetter
2011-11-03 15:56 ` Eugeni Dodonov
@ 2011-11-05 22:07 ` Ben Widawsky
2011-11-05 22:30 ` Daniel Vetter
1 sibling, 1 reply; 10+ messages in thread
From: Ben Widawsky @ 2011-11-05 22:07 UTC (permalink / raw)
To: Daniel Vetter; +Cc: intel-gfx
On Wed, 2 Nov 2011 12:46:36 +0100
Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> GPU reset is a very important piece of our infrastructure.
> Unfortunately we only really test it by actually hanging the gpu,
> which often has bad side-effects for the entire system. And the gpu
> hang handling code is one of the rather complicated pieces of code we
> have, consisting of
> - hang detection
> - error capture
> - actual gpu reset
> - reset of all the gem bookkeeping
> - reinitialition of the entire gpu
>
> This patch adds a debugfs to selectively stopping rings by ceasing to
> update the hw tail pointer. This way we can exercise the gpu hang code
> under controlled conditions without a dying gpu taking down the entire
> systems.
>
> Patch motivated by me forgetting to properly reinitialize ppgtt after
> a gpu reset.
>
> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
I like the concept here very much. One thing which I thought might be
interesting would to allow hanging rings temporarily instead of
automatically hanging until reset on the write of the debugfs entry. I
think if we could start dramatically altering the time it takes to get
commands into the ring would really test our buffer dependency tracking.
The obvious target here to me is hang the media ring for say 33ms, and
see what happens to the system.
Also, maybe I missed something but could you explain how you use this
differently than i915_wedged?
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 63 +++++++++++++++++++++++++++++++
> drivers/gpu/drm/i915/i915_drv.c | 3 +
> drivers/gpu/drm/i915/i915_drv.h | 2 +
> drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++
> 4 files changed, 72 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 8476441..9821a3b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1392,6 +1392,64 @@ static const struct file_operations i915_wedged_fops = {
> };
>
> static ssize_t
> +i915_ring_stop_read(struct file *filp,
> + char __user *ubuf,
> + size_t max,
> + loff_t *ppos)
> +{
> + struct drm_device *dev = filp->private_data;
> + drm_i915_private_t *dev_priv = dev->dev_private;
> + char buf[80];
> + int len;
> +
> + len = snprintf(buf, sizeof(buf),
> + "%d\n", dev_priv->stop_rings);
> +
> + if (len > sizeof(buf))
> + len = sizeof(buf);
> +
> + return simple_read_from_buffer(ubuf, max, ppos, buf, len);
> +}
buf[80] seems a little excessive.
> +
> +static ssize_t
> +i915_ring_stop_write(struct file *filp,
> + const char __user *ubuf,
> + size_t cnt,
> + loff_t *ppos)
> +{
> + struct drm_device *dev = filp->private_data;
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + char buf[20];
> + int val = 0;
> +
> + if (cnt > 0) {
> + if (cnt > sizeof(buf) - 1)
> + return -EINVAL;
> +
> + if (copy_from_user(buf, ubuf, cnt))
> + return -EFAULT;
> + buf[cnt] = 0;
> +
> + val = simple_strtoul(buf, NULL, 0);
> + }
> +
> + DRM_DEBUG_DRIVER("Stopping rings %u\n", val);
> +
> + mutex_lock(&dev->struct_mutex);
> + dev_priv->stop_rings = val;
> + mutex_unlock(&dev->struct_mutex);
> +
> + return cnt;
> +}
> +
I think an atomic takes away the need for struct_mutex, unless you plan
to do more.
> +static const struct file_operations i915_ring_stop_fops = {
> + .owner = THIS_MODULE,
> + .open = i915_debugfs_common_open,
> + .read = i915_ring_stop_read,
> + .write = i915_ring_stop_write,
> + .llseek = default_llseek,
> +};
> +static ssize_t
> i915_max_freq_read(struct file *filp,
> char __user *ubuf,
> size_t max,
> @@ -1691,6 +1749,11 @@ int i915_debugfs_init(struct drm_minor *minor)
> &i915_cache_sharing_fops);
> if (ret)
> return ret;
> + ret = i915_debugfs_create(minor->debugfs_root, minor,
> + "i915_ring_stop",
> + &i915_ring_stop_fops);
> + if (ret)
> + return ret;
>
> return drm_debugfs_create_files(i915_debugfs_list,
> I915_DEBUGFS_ENTRIES,
I think the fact that you're stopping rings is an implementation detail,
and the name doesn't sounds nearly dangerous enough. I'd just call this
i915_hang_gpu, or something like that. Unless of course you plan
something like what I mentioned at the top about being able to restart
the ring.
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 548e04b..566cc1e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -611,6 +611,9 @@ int i915_reset(struct drm_device *dev, u8 flags)
> if (!mutex_trylock(&dev->struct_mutex))
> return -EBUSY;
>
> + printk("reenabling rings\n");
> + dev_priv->stop_rings = 0;
> +
> i915_gem_reset(dev);
>
> ret = -ENODEV;
Probably shouldn't be using printk, just to keep things consistent with
everything else. DRM_DEBUG_DRIVER
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index bd98fb3..503ae8c 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -330,6 +330,8 @@ typedef struct drm_i915_private {
> uint32_t last_instdone;
> uint32_t last_instdone1;
>
> + unsigned int stop_rings;
> +
> unsigned long cfb_size;
> unsigned int cfb_fb;
> enum plane cfb_plane;
bool? Also I think you could consider treating this per ring.
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 3c30dba..ef7a1ca 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1179,7 +1179,11 @@ int intel_ring_begin(struct intel_ring_buffer *ring,
>
> void intel_ring_advance(struct intel_ring_buffer *ring)
> {
> + struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +
> ring->tail &= ring->size - 1;
> + if (dev_priv->stop_rings & intel_ring_flag(ring))
> + return;
> ring->write_tail(ring, ring->tail);
> }
>
Maybe wrap the if with "unlikely?"
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs
2011-11-05 22:07 ` Ben Widawsky
@ 2011-11-05 22:30 ` Daniel Vetter
0 siblings, 0 replies; 10+ messages in thread
From: Daniel Vetter @ 2011-11-05 22:30 UTC (permalink / raw)
To: Ben Widawsky; +Cc: Daniel Vetter, intel-gfx
On Sat, Nov 05, 2011 at 03:07:45PM -0700, Ben Widawsky wrote:
> On Wed, 2 Nov 2011 12:46:36 +0100
> Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
>
> > GPU reset is a very important piece of our infrastructure.
> > Unfortunately we only really test it by actually hanging the gpu,
> > which often has bad side-effects for the entire system. And the gpu
> > hang handling code is one of the rather complicated pieces of code we
> > have, consisting of
> > - hang detection
> > - error capture
> > - actual gpu reset
> > - reset of all the gem bookkeeping
> > - reinitialition of the entire gpu
> >
> > This patch adds a debugfs to selectively stopping rings by ceasing to
> > update the hw tail pointer. This way we can exercise the gpu hang code
> > under controlled conditions without a dying gpu taking down the entire
> > systems.
> >
> > Patch motivated by me forgetting to properly reinitialize ppgtt after
> > a gpu reset.
> >
> > Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
>
> I like the concept here very much. One thing which I thought might be
> interesting would to allow hanging rings temporarily instead of
> automatically hanging until reset on the write of the debugfs entry. I
> think if we could start dramatically altering the time it takes to get
> commands into the ring would really test our buffer dependency tracking.
> The obvious target here to me is hang the media ring for say 33ms, and
> see what happens to the system.
>
> Also, maybe I missed something but could you explain how you use this
> differently than i915_wedged?
I fail at writing decent patch descriptions. The difference wrt
i915_wedged is the essence of the idea ;-) Let's try again:
The idea is to simulate a gpu hang as accurately as possible without
actually hanging the gpu (or touching it much in any way). Because when
the gpu is dead for real it has the tendency to take down the entire
systems. One way to do that is to simply set the wedged state and manually
call the error_handler, like i915_wedged does. This leaves out all the
hangcheck detection code, though. And it runs the error_capture in a
completely different context.
So we need a way to stop the gpu without actually touching it and without
faking anything in our gem code. The idea used in this patch is to simply
cease to update the ringbuffer tail pointer. This way the gpu will simply
stop processing commands, which the hangcheck eventually notices because
the gpu head pointer doesn't advance anymore. Then the entire gpu hang
machinery gets fired off as if the gpu has died for real (but hopefully
with a massively increased chance that we survive the gpu reset).
>
> > ---
> > drivers/gpu/drm/i915/i915_debugfs.c | 63 +++++++++++++++++++++++++++++++
> > drivers/gpu/drm/i915/i915_drv.c | 3 +
> > drivers/gpu/drm/i915/i915_drv.h | 2 +
> > drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++
> > 4 files changed, 72 insertions(+), 0 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index 8476441..9821a3b 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -1392,6 +1392,64 @@ static const struct file_operations i915_wedged_fops = {
> > };
> >
> > static ssize_t
> > +i915_ring_stop_read(struct file *filp,
> > + char __user *ubuf,
> > + size_t max,
> > + loff_t *ppos)
> > +{
> > + struct drm_device *dev = filp->private_data;
> > + drm_i915_private_t *dev_priv = dev->dev_private;
> > + char buf[80];
> > + int len;
> > +
> > + len = snprintf(buf, sizeof(buf),
> > + "%d\n", dev_priv->stop_rings);
> > +
> > + if (len > sizeof(buf))
> > + len = sizeof(buf);
> > +
> > + return simple_read_from_buffer(ubuf, max, ppos, buf, len);
> > +}
>
> buf[80] seems a little excessive.
Oops, copy&paste. I'll shrink it a bit ...
>
> > +
> > +static ssize_t
> > +i915_ring_stop_write(struct file *filp,
> > + const char __user *ubuf,
> > + size_t cnt,
> > + loff_t *ppos)
> > +{
> > + struct drm_device *dev = filp->private_data;
> > + struct drm_i915_private *dev_priv = dev->dev_private;
> > + char buf[20];
> > + int val = 0;
> > +
> > + if (cnt > 0) {
> > + if (cnt > sizeof(buf) - 1)
> > + return -EINVAL;
> > +
> > + if (copy_from_user(buf, ubuf, cnt))
> > + return -EFAULT;
> > + buf[cnt] = 0;
> > +
> > + val = simple_strtoul(buf, NULL, 0);
> > + }
> > +
> > + DRM_DEBUG_DRIVER("Stopping rings %u\n", val);
> > +
> > + mutex_lock(&dev->struct_mutex);
> > + dev_priv->stop_rings = val;
> > + mutex_unlock(&dev->struct_mutex);
> > +
> > + return cnt;
> > +}
> > +
>
> I think an atomic takes away the need for struct_mutex, unless you plan
> to do more.
atomic_t would require atomic ops on the read side. I don't care about
locking on the read side as long as the new value will eventually show up,
so I've opted for the unlocked (and fastest) option for reads.
>
> > +static const struct file_operations i915_ring_stop_fops = {
> > + .owner = THIS_MODULE,
> > + .open = i915_debugfs_common_open,
> > + .read = i915_ring_stop_read,
> > + .write = i915_ring_stop_write,
> > + .llseek = default_llseek,
> > +};
> > +static ssize_t
> > i915_max_freq_read(struct file *filp,
> > char __user *ubuf,
> > size_t max,
> > @@ -1691,6 +1749,11 @@ int i915_debugfs_init(struct drm_minor *minor)
> > &i915_cache_sharing_fops);
> > if (ret)
> > return ret;
> > + ret = i915_debugfs_create(minor->debugfs_root, minor,
> > + "i915_ring_stop",
> > + &i915_ring_stop_fops);
> > + if (ret)
> > + return ret;
> >
> > return drm_debugfs_create_files(i915_debugfs_list,
> > I915_DEBUGFS_ENTRIES,
>
> I think the fact that you're stopping rings is an implementation detail,
> and the name doesn't sounds nearly dangerous enough. I'd just call this
> i915_hang_gpu, or something like that. Unless of course you plan
> something like what I mentioned at the top about being able to restart
> the ring.
Because our current hangcheck code only checks the render ring I've wanted
to stop rings individually (so we could check hangcheck extensions to the
other rings). Adding an extension to restart the rings sounds good, it
would just require to redo the last tail writes that got dropped.
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > index 548e04b..566cc1e 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -611,6 +611,9 @@ int i915_reset(struct drm_device *dev, u8 flags)
> > if (!mutex_trylock(&dev->struct_mutex))
> > return -EBUSY;
> >
> > + printk("reenabling rings\n");
> > + dev_priv->stop_rings = 0;
> > +
> > i915_gem_reset(dev);
> >
> > ret = -ENODEV;
>
> Probably shouldn't be using printk, just to keep things consistent with
> everything else. DRM_DEBUG_DRIVER
Oops, debug leftover, I'll change this.
>
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index bd98fb3..503ae8c 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -330,6 +330,8 @@ typedef struct drm_i915_private {
> > uint32_t last_instdone;
> > uint32_t last_instdone1;
> >
> > + unsigned int stop_rings;
> > +
> > unsigned long cfb_size;
> > unsigned int cfb_fb;
> > enum plane cfb_plane;
>
> bool? Also I think you could consider treating this per ring.
It's a flag array, so already per-ring. See below.
>
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index 3c30dba..ef7a1ca 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -1179,7 +1179,11 @@ int intel_ring_begin(struct intel_ring_buffer *ring,
> >
> > void intel_ring_advance(struct intel_ring_buffer *ring)
> > {
> > + struct drm_i915_private *dev_priv = ring->dev->dev_private;
> > +
> > ring->tail &= ring->size - 1;
> > + if (dev_priv->stop_rings & intel_ring_flag(ring))
> > + return;
> > ring->write_tail(ring, ring->tail);
> > }
> >
>
> Maybe wrap the if with "unlikely?"
Yeah.
--
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2011-11-05 22:29 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-11-02 11:46 [PATCH 1/3] drm/i915: refactor debugfs open function Daniel Vetter
2011-11-02 11:46 ` [PATCH 2/3] drm/i915: refactor debugfs create functions Daniel Vetter
2011-11-02 11:46 ` [PATCH 3/3] [RFC] drm/i915: add interface to simulate gpu hangs Daniel Vetter
2011-11-03 15:56 ` Eugeni Dodonov
2011-11-03 16:10 ` Daniel Vetter
2011-11-04 16:23 ` Eugeni Dodonov
2011-11-04 16:31 ` Chris Wilson
2011-11-04 16:42 ` Eugeni Dodonov
2011-11-05 22:07 ` Ben Widawsky
2011-11-05 22:30 ` Daniel Vetter
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.