* [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
@ 2012-04-22 22:18 Marcin Slusarz
2012-04-23 8:43 ` Martin Peres
2012-04-23 16:46 ` Martin Peres
0 siblings, 2 replies; 9+ messages in thread
From: Marcin Slusarz @ 2012-04-22 22:18 UTC (permalink / raw)
To: nouveau, dri-devel, Ben Skeggs
Overall idea:
Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
handle them at ioctl level, reset the GPU and repeat last ioctl.
GPU reset is done by doing a suspend / resume cycle with a few tweaks:
- CPU-only bo eviction
- ignoring vm flush / fence timeouts
- shortening waits
Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
---
Tested only on nv92.
---
drivers/gpu/drm/nouveau/Makefile | 2 +-
drivers/gpu/drm/nouveau/nouveau_bo.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_channel.c | 5 +-
drivers/gpu/drm/nouveau/nouveau_drv.c | 3 +-
drivers/gpu/drm/nouveau/nouveau_drv.h | 33 ++++++-
drivers/gpu/drm/nouveau/nouveau_fence.c | 7 +-
drivers/gpu/drm/nouveau/nouveau_gem.c | 14 +++-
drivers/gpu/drm/nouveau/nouveau_notifier.c | 3 +
drivers/gpu/drm/nouveau/nouveau_object.c | 6 +
drivers/gpu/drm/nouveau/nouveau_reset.c | 144 ++++++++++++++++++++++++++++
drivers/gpu/drm/nouveau/nouveau_state.c | 5 +
drivers/gpu/drm/nouveau/nv50_graph.c | 11 +-
12 files changed, 221 insertions(+), 14 deletions(-)
create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c
diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
index 03860f5..77d0c33 100644
--- a/drivers/gpu/drm/nouveau/Makefile
+++ b/drivers/gpu/drm/nouveau/Makefile
@@ -9,7 +9,7 @@ nouveau-y := nouveau_drv.o nouveau_state.o nouveau_channel.o nouveau_mem.o \
nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
- nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
+ nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_temp.o \
nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
nv04_timer.o \
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 5b0dc50..7de6cad 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -936,7 +936,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict, bool intr,
}
/* Software copy if the card isn't up and running yet. */
- if (!dev_priv->channel) {
+ if (!dev_priv->channel || nouveau_gpu_reset_in_progress(dev_priv->dev)) {
ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, no_wait_gpu, new_mem);
goto out;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_channel.c b/drivers/gpu/drm/nouveau/nouveau_channel.c
index 846afb0..c0fa5a7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_channel.c
+++ b/drivers/gpu/drm/nouveau/nouveau_channel.c
@@ -420,7 +420,7 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
init->fb_ctxdma_handle,
init->tt_ctxdma_handle);
if (ret)
- return ret;
+ goto out;
init->channel = chan->id;
if (nouveau_vram_pushbuf == 0) {
@@ -450,6 +450,9 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
if (ret == 0)
atomic_inc(&chan->users); /* userspace reference */
nouveau_channel_put(&chan);
+out:
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 090fff6..22c435f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -237,7 +237,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state)
if (!dev_priv->eng[e])
continue;
- ret = dev_priv->eng[e]->fini(dev, e, true);
+ ret = dev_priv->eng[e]->fini(dev, e, !nouveau_gpu_reset_in_progress(dev));
if (ret) {
NV_ERROR(dev, "... engine %d failed: %d\n", e, ret);
goto out_abort;
@@ -483,6 +483,7 @@ static struct drm_driver driver = {
.disable_vblank = nouveau_vblank_disable,
.reclaim_buffers = drm_core_reclaim_buffers,
.ioctls = nouveau_ioctls,
+ .ioctls_need_rwsem = true,
.fops = &nouveau_driver_fops,
.gem_init_object = nouveau_gem_object_new,
.gem_free_object = nouveau_gem_object_del,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index d120baf..01500e1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -708,6 +708,10 @@ struct drm_nouveau_private {
struct drm_device *dev;
bool noaccel;
+ struct mutex reset_lock;
+ atomic_t gpureset_in_progress;
+ unsigned long last_gpu_reset;
+
/* the card type, takes NV_* as values */
enum nouveau_card_type card_type;
/* exact chipset, derived from NV_PMC_BOOT_0 */
@@ -841,6 +845,7 @@ struct drm_nouveau_private {
struct {
struct dentry *channel_root;
+ struct dentry *reset;
} debugfs;
struct nouveau_fbdev *nfbdev;
@@ -1537,6 +1542,20 @@ int nouveau_display_dumb_map_offset(struct drm_file *, struct drm_device *,
uint32_t handle, uint64_t *offset);
int nouveau_display_dumb_destroy(struct drm_file *, struct drm_device *,
uint32_t handle);
+/* nouveau_reset.c */
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+void nouveau_reset_debugfs_fini(struct drm_minor *minor);
+void nouveau_reset_debugfs_init(struct drm_minor *minor);
+#else
+static inline void nouveau_reset_debugfs_fini(struct drm_minor *minor) {}
+static inline void nouveau_reset_debugfs_init(struct drm_minor *minor) {}
+#endif
+int nouveau_reset_device(struct drm_device *dev);
+static inline bool nouveau_gpu_reset_in_progress(struct drm_device *dev)
+{
+	struct drm_nouveau_private *priv = dev->dev_private;
+	return !!atomic_read(&priv->gpureset_in_progress); /* nonzero while a reset runs */
+}
/* nv10_gpio.c */
int nv10_gpio_init(struct drm_device *dev);
@@ -1632,12 +1651,20 @@ static inline void nv_wr08(struct drm_device *dev, unsigned reg, u8 val)
iowrite8(val, dev_priv->mmio + reg);
}
+static inline uint64_t nv_timeout(struct drm_device *dev)
+{
+	/* 50ms while a GPU reset runs, else 2s; literals avoid a 64-bit divide */
+	if (nouveau_gpu_reset_in_progress(dev))
+		return 50000000ULL;
+	return 2000000000ULL;
+}
+
#define nv_wait(dev, reg, mask, val) \
- nouveau_wait_eq(dev, 2000000000ULL, (reg), (mask), (val))
+ nouveau_wait_eq(dev, nv_timeout(dev), (reg), (mask), (val))
#define nv_wait_ne(dev, reg, mask, val) \
- nouveau_wait_ne(dev, 2000000000ULL, (reg), (mask), (val))
+ nouveau_wait_ne(dev, nv_timeout(dev), (reg), (mask), (val))
#define nv_wait_cb(dev, func, data) \
- nouveau_wait_cb(dev, 2000000000ULL, (func), (data))
+ nouveau_wait_cb(dev, nv_timeout(dev), (func), (data))
/* PRAMIN access */
static inline u32 nv_ri32(struct drm_device *dev, unsigned offset)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 59f92e9..8c973ab 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -233,17 +233,22 @@ int
__nouveau_fence_wait(void *sync_obj, void *sync_arg, bool lazy, bool intr)
{
struct nouveau_fence *fence = nouveau_fence(sync_obj);
+ struct drm_device *dev = fence->channel->dev;
unsigned long timeout = fence->emitted_at + 3 * DRM_HZ;
unsigned long sleep_time = NSEC_PER_MSEC / 1000;
ktime_t t;
int ret = 0;
+ if (nouveau_gpu_reset_in_progress(dev))
+ timeout = fence->emitted_at + DRM_HZ / 5;
+
while (1) {
if (__nouveau_fence_signalled(sync_obj, sync_arg))
break;
if (time_after_eq(jiffies, timeout)) {
- ret = -EBUSY;
+ if (!nouveau_gpu_reset_in_progress(dev))
+ ret = -EIO;
break;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index ed52a6f..f9bbcc0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -214,7 +214,7 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
req->info.domain, req->info.tile_mode,
req->info.tile_flags, &nvbo);
if (ret)
- return ret;
+ goto out;
ret = drm_gem_handle_create(file_priv, nvbo->gem, &req->info.handle);
if (ret == 0) {
@@ -225,6 +225,9 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
/* drop reference from allocate - handle holds it now */
drm_gem_object_unreference_unlocked(nvbo->gem);
+out:
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
@@ -804,6 +807,9 @@ out_next:
}
nouveau_channel_put(&chan);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
@@ -839,6 +845,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
spin_unlock(&nvbo->bo.bdev->fence_lock);
drm_gem_object_unreference_unlocked(gem);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
@@ -863,6 +872,9 @@ nouveau_gem_ioctl_info(struct drm_device *dev, void *data,
ret = nouveau_gem_info(file_priv, gem, req);
drm_gem_object_unreference_unlocked(gem);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_notifier.c b/drivers/gpu/drm/nouveau/nouveau_notifier.c
index 2ef883c..e224b1c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_notifier.c
+++ b/drivers/gpu/drm/nouveau/nouveau_notifier.c
@@ -200,5 +200,8 @@ nouveau_ioctl_notifier_alloc(struct drm_device *dev, void *data,
ret = nouveau_notifier_alloc(chan, na->handle, na->size, 0, 0x1000,
&na->offset);
nouveau_channel_put(&chan);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_object.c b/drivers/gpu/drm/nouveau/nouveau_object.c
index cc419fa..ba592b0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_object.c
+++ b/drivers/gpu/drm/nouveau/nouveau_object.c
@@ -973,6 +973,9 @@ int nouveau_ioctl_grobj_alloc(struct drm_device *dev, void *data,
out:
nouveau_channel_put(&chan);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
@@ -992,6 +995,9 @@ int nouveau_ioctl_gpuobj_free(struct drm_device *dev, void *data,
ret = nouveau_ramht_remove(chan, objfree->handle);
nouveau_channel_put(&chan);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c b/drivers/gpu/drm/nouveau/nouveau_reset.c
new file mode 100644
index 0000000..93af3a1
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2012 Marcin Slusarz <marcin.slusarz@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include "drmP.h"
+#include "nouveau_drv.h"
+
+/* Suspend the GPU for a reset. true: suspended, ioctls_rwsem held for write. */
+static bool off(struct drm_device *dev)
+{
+	struct pci_dev *pdev = dev->pdev;
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	pm_message_t pmm = { .event = PM_EVENT_SUSPEND };
+
+	atomic_inc(&dev_priv->gpureset_in_progress);
+	down_write(&dev->ioctls_rwsem);
+	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+	if (nouveau_pci_suspend(pdev, pmm))
+		goto fail;
+	dev->switch_power_state = DRM_SWITCH_POWER_OFF;
+	return true;
+
+fail:
+	dev->switch_power_state = DRM_SWITCH_POWER_ON;
+	atomic_dec(&dev_priv->gpureset_in_progress); /* callers retry: keep count balanced */
+	up_write(&dev->ioctls_rwsem);
+	return false;
+}
+
+/* Counterpart of a successful off(): resume the GPU and release ioctls_rwsem. */
+static void on(struct drm_device *dev)
+{
+	struct drm_nouveau_private *priv = dev->dev_private;
+
+	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+	/* Drop the reset-mode count taken in off() before resuming. */
+	atomic_dec(&priv->gpureset_in_progress);
+	nouveau_pci_resume(dev->pdev);
+	dev->switch_power_state = DRM_SWITCH_POWER_ON;
+	priv->last_gpu_reset = jiffies; /* starts the 10 s rate-limit window */
+	up_write(&dev->ioctls_rwsem);
+}
+
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+/* debugfs "reset" write handler: a leading '1' requests a GPU reset. */
+static ssize_t nouveau_reset_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos)
+{
+	struct drm_device *dev = filp->private_data;
+	char usercmd[2] = { 0 }; /* zeroed: a 0-byte write must not read stack garbage */
+
+	if (cnt > 2)
+		cnt = 2;
+	if (copy_from_user(usercmd, ubuf, cnt))
+		return -EFAULT;
+
+	if (usercmd[0] == '1') {
+		down_read(&dev->ioctls_rwsem); /* nouveau_reset_device() drops/retakes it */
+		nouveau_reset_device(dev);
+		up_read(&dev->ioctls_rwsem);
+	}
+	return cnt;
+}
+
+static const struct file_operations nouveau_reset_fops = { /* ops for the debugfs "reset" file */
+ .owner = THIS_MODULE,
+ .open = simple_open, /* NOTE(review): presumably sets filp->private_data to the dev given at creation - confirm */
+ .write = nouveau_reset_write, /* no .read handler: the file is created with mode 0200 */
+ .llseek = noop_llseek,
+};
+
+/* Remove the debugfs "reset" file, if one was created. */
+void nouveau_reset_debugfs_fini(struct drm_minor *minor)
+{
+	struct drm_nouveau_private *priv = minor->dev->dev_private;
+
+	if (!priv->debugfs.reset)
+		return;
+	debugfs_remove(priv->debugfs.reset);
+	priv->debugfs.reset = NULL;
+}
+
+
+/* Create the write-only debugfs "reset" file; on failure store NULL. */
+void nouveau_reset_debugfs_init(struct drm_minor *minor)
+{
+	struct drm_device *dev = minor->dev;
+	struct drm_nouveau_private *priv = dev->dev_private;
+	struct dentry *entry;
+
+	entry = debugfs_create_file("reset", 0200, minor->debugfs_root, dev,
+				    &nouveau_reset_fops);
+	priv->debugfs.reset = IS_ERR_OR_NULL(entry) ? NULL : entry;
+}
+#endif
+
+/*
+ * Reset the GPU, at most once per 10 s; entered with ioctls_rwsem read-held.
+ * Always returns -EAGAIN, whether or not a reset was actually performed.
+ */
+int nouveau_reset_device(struct drm_device *dev)
+{
+	struct drm_nouveau_private *priv = dev->dev_private;
+	unsigned long t0;
+
+	/* reset_lock already taken: a gpu reset is in progress elsewhere */
+	if (!mutex_trylock(&priv->reset_lock))
+		return -EAGAIN;
+	if (time_after_eq(jiffies, priv->last_gpu_reset + 10 * DRM_HZ)) {
+		/* off() takes ioctls_rwsem for write, so give up the read side */
+		up_read(&dev->ioctls_rwsem);
+		NV_INFO(dev, "GPU lockup detected, resetting...\n");
+		t0 = jiffies;
+		while (!off(dev))
+			;
+		on(dev);
+		NV_INFO(dev, "GPU reset done, took %lu s\n", (jiffies - t0) / DRM_HZ);
+		down_read(&dev->ioctls_rwsem);
+	}
+	mutex_unlock(&priv->reset_lock);
+	return -EAGAIN;
+}
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
index afec760..2e981a8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -697,6 +697,7 @@ nouveau_card_init(struct drm_device *dev)
if (ret)
goto out;
engine = &dev_priv->engine;
+ mutex_init(&dev_priv->reset_lock);
spin_lock_init(&dev_priv->channels.lock);
spin_lock_init(&dev_priv->tile.lock);
spin_lock_init(&dev_priv->context_switch_lock);
@@ -886,6 +887,7 @@ nouveau_card_init(struct drm_device *dev)
nouveau_fbcon_init(dev);
}
+ nouveau_reset_debugfs_init(dev->primary);
return 0;
@@ -943,6 +945,8 @@ static void nouveau_card_takedown(struct drm_device *dev)
struct nouveau_engine *engine = &dev_priv->engine;
int e;
+ nouveau_reset_debugfs_fini(dev->primary);
+
if (dev->mode_config.num_crtc) {
nouveau_fbcon_fini(dev);
nouveau_display_fini(dev);
@@ -1129,6 +1133,7 @@ int nouveau_load(struct drm_device *dev, unsigned long flags)
}
dev->dev_private = dev_priv;
dev_priv->dev = dev;
+ atomic_set(&dev_priv->gpureset_in_progress, 0);
pci_set_master(dev->pdev);
diff --git a/drivers/gpu/drm/nouveau/nv50_graph.c b/drivers/gpu/drm/nouveau/nv50_graph.c
index a61853f..d0a2e50 100644
--- a/drivers/gpu/drm/nouveau/nv50_graph.c
+++ b/drivers/gpu/drm/nouveau/nv50_graph.c
@@ -440,13 +440,14 @@ nv84_graph_tlb_flush(struct drm_device *dev, int engine)
ret = -ERESTARTSYS;
break;
}
- } while (!idle && !(timeout = ptimer->read(dev) - start > 2000000000));
+ } while (!idle && !(timeout = ptimer->read(dev) - start > nv_timeout(dev)));
if (timeout) {
- NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
- "0x%08x 0x%08x 0x%08x 0x%08x\n",
- nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
- nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
+ if (!nouveau_gpu_reset_in_progress(dev))
+ NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
+ "0x%08x 0x%08x 0x%08x 0x%08x\n",
+ nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
+ nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
ret = -EIO;
}
--
1.7.8.5
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
2012-04-22 22:18 [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery Marcin Slusarz
@ 2012-04-23 8:43 ` Martin Peres
2012-04-23 16:32 ` Marcin Slusarz
2012-04-23 16:46 ` Martin Peres
1 sibling, 1 reply; 9+ messages in thread
From: Martin Peres @ 2012-04-23 8:43 UTC (permalink / raw)
To: dri-devel
Le 23/04/2012 00:18, Marcin Slusarz a écrit :
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
>
> GPU reset is done by doing a suspend / resume cycle with a few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
>
> Signed-off-by: Marcin Slusarz<marcin.slusarz@gmail.com>
> ---
> Tested only on nv92.
Hi Marcin,
I'm really busy at the moment but I'm glad to see such patches coming out.
I'll try them out ASAP!
Do you have a recommended way to test your patch set?
Thanks,
Martin
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
2012-04-23 8:43 ` Martin Peres
@ 2012-04-23 16:32 ` Marcin Slusarz
[not found] ` <20120423163257.GA2886-OI9uyE9O0yo@public.gmane.org>
2012-04-25 11:06 ` [Nouveau] " Christoph Bumiller
0 siblings, 2 replies; 9+ messages in thread
From: Marcin Slusarz @ 2012-04-23 16:32 UTC (permalink / raw)
To: Martin Peres; +Cc: nouveau, dri-devel
On Mon, Apr 23, 2012 at 10:43:08AM +0200, Martin Peres wrote:
> Le 23/04/2012 00:18, Marcin Slusarz a écrit :
> > Overall idea:
> > Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> > handle them at ioctl level, reset the GPU and repeat last ioctl.
> >
> > GPU reset is done by doing a suspend / resume cycle with a few tweaks:
> > - CPU-only bo eviction
> > - ignoring vm flush / fence timeouts
> > - shortening waits
> >
> > Signed-off-by: Marcin Slusarz<marcin.slusarz@gmail.com>
> > ---
> > Tested only on nv92.
> Hi Marcin,
>
> I'm really busy at the moment but I'm glad to see such patches coming out.
> I'll try them out ASAP!
>
> Do you have a recommended way to test your patch set?
Just run piglit. Even "quick" tests can cause ~5 lockups (it eventually messes
up DDX channel, but this patchset can't fix this case).
You can run fs-discard-exit-2 test first - for me it causes instant GPU lockup.
Marcin
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
[parent not found: <20120423163257.GA2886-OI9uyE9O0yo@public.gmane.org>]
* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
[not found] ` <20120423163257.GA2886-OI9uyE9O0yo@public.gmane.org>
@ 2012-04-23 16:56 ` Martin Peres
2012-04-24 19:31 ` Marcin Slusarz
0 siblings, 1 reply; 9+ messages in thread
From: Martin Peres @ 2012-04-23 16:56 UTC (permalink / raw)
To: Marcin Slusarz
Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Le 23/04/2012 18:32, Marcin Slusarz a écrit :
>
> Just run piglit. Even "quick" tests can cause ~5 lockups (it eventually messes
> up DDX channel, but this patchset can't fix this case).
> You can run fs-discard-exit-2 test first - for me it causes instant GPU lockup.
>
> Marcin
Great, Thanks.
Did you have a look at
https://bugs.freedesktop.org/show_bug.cgi?id=40886 and
http://xorg.freedesktop.org/wiki/SummerOfCodeIdeas ?
The Ubuntu xorg devs were looking for something like this, but they also
wanted a lockup report. Are you also interested in working on it?
Martin
_______________________________________________
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
2012-04-23 16:56 ` Martin Peres
@ 2012-04-24 19:31 ` Marcin Slusarz
[not found] ` <20120424193150.GC2857-OI9uyE9O0yo@public.gmane.org>
0 siblings, 1 reply; 9+ messages in thread
From: Marcin Slusarz @ 2012-04-24 19:31 UTC (permalink / raw)
To: Martin Peres; +Cc: nouveau, dri-devel
On Mon, Apr 23, 2012 at 06:56:44PM +0200, Martin Peres wrote:
> Le 23/04/2012 18:32, Marcin Slusarz a écrit :
> >
> > Just run piglit. Even "quick" tests can cause ~5 lockups (it eventually messes
> > up DDX channel, but this patchset can't fix this case).
> > You can run fs-discard-exit-2 test first - for me it causes instant GPU lockup.
> >
> > Marcin
> Great, Thanks.
>
> Did you have a look at
> https://bugs.freedesktop.org/show_bug.cgi?id=40886 and
> http://xorg.freedesktop.org/wiki/SummerOfCodeIdeas ?
Yeah, I've seen them some time ago.
> The Ubuntu xorg devs were looking for something like this, but they also
> wanted a lockup report. Are you also interested in working on it?
Yes, once this patchset is applied, I'm going to work on improving
error reporting.
Marcin
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [Nouveau] [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
2012-04-23 16:32 ` Marcin Slusarz
[not found] ` <20120423163257.GA2886-OI9uyE9O0yo@public.gmane.org>
@ 2012-04-25 11:06 ` Christoph Bumiller
1 sibling, 0 replies; 9+ messages in thread
From: Christoph Bumiller @ 2012-04-25 11:06 UTC (permalink / raw)
To: Marcin Slusarz; +Cc: nouveau, dri-devel
On 04/23/2012 06:32 PM, Marcin Slusarz wrote:
> You can run fs-discard-exit-2 test first - for me it causes instant GPU lockup.
>
That's because it's designed (but not supposed) to do that, it also
locks up with the blob, it's a harmless shader infinite loop.
(May be a bug in the MPs or a wrong setting somewhere, but they don't
pop the control flow stack if all but killed pixels of a partially
killed quad remain on the current execution path).
Activated the watchdog on nv50 now, too, if you want to produce the
lockup with current mesa git run with NOUVEAU_SHADER_WATCHDOG=0 (yes
when we use COMPUTE it will have to be deactivated as well).
> Marcin
> _______________________________________________
> Nouveau mailing list
> Nouveau@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/nouveau
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
2012-04-22 22:18 [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery Marcin Slusarz
2012-04-23 8:43 ` Martin Peres
@ 2012-04-23 16:46 ` Martin Peres
2012-04-23 17:33 ` Marcin Slusarz
1 sibling, 1 reply; 9+ messages in thread
From: Martin Peres @ 2012-04-23 16:46 UTC (permalink / raw)
To: dri-devel
Hey,
Just a minor mistake spotted while skimming through the patch.
Le 23/04/2012 00:18, Marcin Slusarz a écrit :
> +static inline uint64_t nv_timeout(struct drm_device *dev)
> +{
> + uint64_t tm = 2000000000ULL;
> + if (nouveau_gpu_reset_in_progress(dev))
> + tm /= 40; /* 50ms */
This will cause a problem on 32 bit kernels. You should use do_div.
Martin
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
2012-04-23 16:46 ` Martin Peres
@ 2012-04-23 17:33 ` Marcin Slusarz
0 siblings, 0 replies; 9+ messages in thread
From: Marcin Slusarz @ 2012-04-23 17:33 UTC (permalink / raw)
To: Martin Peres; +Cc: nouveau, dri-devel
On Mon, Apr 23, 2012 at 06:46:41PM +0200, Martin Peres wrote:
> Hey,
>
> Just a minor mistake spotted while skimming through the patch.
>
> Le 23/04/2012 00:18, Marcin Slusarz a écrit :
> > +static inline uint64_t nv_timeout(struct drm_device *dev)
> > +{
> > + uint64_t tm = 2000000000ULL;
> > + if (nouveau_gpu_reset_in_progress(dev))
> > + tm /= 40; /* 50ms */
> This will cause a problem on 32 bit kernels. You should use do_div.
>
Thanks. I'll fix this later.
Marcin
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2012-04-25 11:06 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-22 22:18 [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery Marcin Slusarz
2012-04-23 8:43 ` Martin Peres
2012-04-23 16:32 ` Marcin Slusarz
[not found] ` <20120423163257.GA2886-OI9uyE9O0yo@public.gmane.org>
2012-04-23 16:56 ` Martin Peres
2012-04-24 19:31 ` Marcin Slusarz
[not found] ` <20120424193150.GC2857-OI9uyE9O0yo@public.gmane.org>
2012-04-25 0:32 ` Ben Skeggs
2012-04-25 11:06 ` [Nouveau] " Christoph Bumiller
2012-04-23 16:46 ` Martin Peres
2012-04-23 17:33 ` Marcin Slusarz
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.