* [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
@ 2012-04-25 21:20 Marcin Slusarz
2012-04-25 21:32 ` Marcin Slusarz
[not found] ` <1335388836-13127-4-git-send-email-marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
0 siblings, 2 replies; 12+ messages in thread
From: Marcin Slusarz @ 2012-04-25 21:20 UTC (permalink / raw)
To: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Ben Skeggs
Cc: Daniel Vetter, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Overall idea:
Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
handle them at the ioctl level, reset the GPU and repeat the last ioctl.
GPU reset is done by doing a suspend / resume cycle with a few tweaks:
- CPU-only bo eviction
- ignoring vm flush / fence timeouts
- shortening waits
Signed-off-by: Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
drivers/gpu/drm/nouveau/Makefile | 2 +-
drivers/gpu/drm/nouveau/nouveau_bo.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_channel.c | 5 +-
drivers/gpu/drm/nouveau/nouveau_drv.c | 56 ++++++++++-
drivers/gpu/drm/nouveau/nouveau_drv.h | 45 ++++++++-
drivers/gpu/drm/nouveau/nouveau_fence.c | 7 +-
drivers/gpu/drm/nouveau/nouveau_gem.c | 14 +++-
drivers/gpu/drm/nouveau/nouveau_notifier.c | 3 +
drivers/gpu/drm/nouveau/nouveau_object.c | 6 +
drivers/gpu/drm/nouveau/nouveau_reset.c | 148 ++++++++++++++++++++++++++++
drivers/gpu/drm/nouveau/nouveau_state.c | 6 +
drivers/gpu/drm/nouveau/nv50_graph.c | 11 +-
12 files changed, 290 insertions(+), 15 deletions(-)
create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c
diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
index 03860f5..77d0c33 100644
--- a/drivers/gpu/drm/nouveau/Makefile
+++ b/drivers/gpu/drm/nouveau/Makefile
@@ -9,7 +9,7 @@ nouveau-y := nouveau_drv.o nouveau_state.o nouveau_channel.o nouveau_mem.o \
nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
- nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
+ nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_temp.o \
nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
nv04_timer.o \
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 5b0dc50..7de6cad 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -936,7 +936,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict, bool intr,
}
/* Software copy if the card isn't up and running yet. */
- if (!dev_priv->channel) {
+ if (!dev_priv->channel || nouveau_gpu_reset_in_progress(dev_priv->dev)) {
ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, no_wait_gpu, new_mem);
goto out;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_channel.c b/drivers/gpu/drm/nouveau/nouveau_channel.c
index 846afb0..c0fa5a7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_channel.c
+++ b/drivers/gpu/drm/nouveau/nouveau_channel.c
@@ -420,7 +420,7 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
init->fb_ctxdma_handle,
init->tt_ctxdma_handle);
if (ret)
- return ret;
+ goto out;
init->channel = chan->id;
if (nouveau_vram_pushbuf == 0) {
@@ -450,6 +450,9 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
if (ret == 0)
atomic_inc(&chan->users); /* userspace reference */
nouveau_channel_put(&chan);
+out:
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 090fff6..261e1f5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -237,7 +237,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state)
if (!dev_priv->eng[e])
continue;
- ret = dev_priv->eng[e]->fini(dev, e, true);
+ ret = dev_priv->eng[e]->fini(dev, e, !nouveau_gpu_reset_in_progress(dev));
if (ret) {
NV_ERROR(dev, "... engine %d failed: %d\n", e, ret);
goto out_abort;
@@ -443,11 +443,63 @@ nouveau_pci_resume(struct pci_dev *pdev)
return 0;
}
+void intr_rwsem_init(struct intr_rwsem *r)
+{
+ init_rwsem(&r->rwsem);
+ mutex_init(&r->mutex);
+}
+
+int intr_rwsem_down_read_interruptible(struct intr_rwsem *r)
+{
+ while (down_read_trylock(&r->rwsem) == 0) {
+ int ret = mutex_lock_interruptible(&r->mutex);
+ if (ret)
+ return ret;
+ mutex_unlock(&r->mutex);
+ }
+ return 0;
+}
+
+void intr_rwsem_up_read(struct intr_rwsem *r)
+{
+ up_read(&r->rwsem);
+}
+
+void intr_rwsem_down_write(struct intr_rwsem *r)
+{
+ mutex_lock(&r->mutex);
+ down_write(&r->rwsem);
+}
+
+void intr_rwsem_up_write(struct intr_rwsem *r)
+{
+ up_write(&r->rwsem);
+ mutex_unlock(&r->mutex);
+}
+
+static long nouveau_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct drm_file *file_priv = filp->private_data;
+ struct drm_device *dev = file_priv->minor->dev;
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+ long ret = intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem);
+ if (ret)
+ return ret;
+
+ ret = drm_ioctl(filp, cmd, arg);
+
+ intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
+
+ return ret;
+}
+
static const struct file_operations nouveau_driver_fops = {
.owner = THIS_MODULE,
.open = drm_open,
.release = drm_release,
- .unlocked_ioctl = drm_ioctl,
+ .unlocked_ioctl = nouveau_ioctl,
.mmap = nouveau_ttm_mmap,
.poll = drm_poll,
.fasync = drm_fasync,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index d120baf..ad146e7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -704,9 +704,25 @@ enum nouveau_card_type {
NV_E0 = 0xe0,
};
+struct intr_rwsem {
+ struct rw_semaphore rwsem;
+ struct mutex mutex;
+};
+
+extern void intr_rwsem_init(struct intr_rwsem *r);
+extern int intr_rwsem_down_read_interruptible(struct intr_rwsem *r);
+extern void intr_rwsem_up_read(struct intr_rwsem *r);
+extern void intr_rwsem_down_write(struct intr_rwsem *r);
+extern void intr_rwsem_up_write(struct intr_rwsem *r);
+
struct drm_nouveau_private {
struct drm_device *dev;
bool noaccel;
+ struct intr_rwsem ioctls_rwsem;
+
+ struct mutex reset_lock;
+ atomic_t gpureset_in_progress;
+ unsigned long last_gpu_reset;
/* the card type, takes NV_* as values */
enum nouveau_card_type card_type;
@@ -841,6 +857,7 @@ struct drm_nouveau_private {
struct {
struct dentry *channel_root;
+ struct dentry *reset;
} debugfs;
struct nouveau_fbdev *nfbdev;
@@ -1537,6 +1554,20 @@ int nouveau_display_dumb_map_offset(struct drm_file *, struct drm_device *,
uint32_t handle, uint64_t *offset);
int nouveau_display_dumb_destroy(struct drm_file *, struct drm_device *,
uint32_t handle);
+/* nouveau_reset.c */
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+void nouveau_reset_debugfs_fini(struct drm_minor *minor);
+void nouveau_reset_debugfs_init(struct drm_minor *minor);
+#else
+static inline void nouveau_reset_debugfs_fini(struct drm_minor *minor) {}
+static inline void nouveau_reset_debugfs_init(struct drm_minor *minor) {}
+#endif
+int nouveau_reset_device(struct drm_device *dev);
+static inline bool nouveau_gpu_reset_in_progress(struct drm_device *dev)
+{
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+ return atomic_read(&dev_priv->gpureset_in_progress) != 0;
+}
/* nv10_gpio.c */
int nv10_gpio_init(struct drm_device *dev);
@@ -1632,12 +1663,20 @@ static inline void nv_wr08(struct drm_device *dev, unsigned reg, u8 val)
iowrite8(val, dev_priv->mmio + reg);
}
+static inline uint64_t nv_timeout(struct drm_device *dev)
+{
+ uint64_t tm = 2000000000ULL;
+ if (nouveau_gpu_reset_in_progress(dev))
+ tm = 50000000; /* 50ms */
+ return tm;
+}
+
#define nv_wait(dev, reg, mask, val) \
- nouveau_wait_eq(dev, 2000000000ULL, (reg), (mask), (val))
+ nouveau_wait_eq(dev, nv_timeout(dev), (reg), (mask), (val))
#define nv_wait_ne(dev, reg, mask, val) \
- nouveau_wait_ne(dev, 2000000000ULL, (reg), (mask), (val))
+ nouveau_wait_ne(dev, nv_timeout(dev), (reg), (mask), (val))
#define nv_wait_cb(dev, func, data) \
- nouveau_wait_cb(dev, 2000000000ULL, (func), (data))
+ nouveau_wait_cb(dev, nv_timeout(dev), (func), (data))
/* PRAMIN access */
static inline u32 nv_ri32(struct drm_device *dev, unsigned offset)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 41ee17d..13d0176 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -233,17 +233,22 @@ int
__nouveau_fence_wait(void *sync_obj, void *sync_arg, bool lazy, bool intr)
{
struct nouveau_fence *fence = nouveau_fence(sync_obj);
+ struct drm_device *dev = fence->channel->dev;
unsigned long timeout = fence->timeout;
unsigned long sleep_time = NSEC_PER_MSEC / 1000;
ktime_t t;
int ret = 0;
+ if (nouveau_gpu_reset_in_progress(dev))
+ timeout = jiffies + DRM_HZ / 5;
+
while (1) {
if (__nouveau_fence_signalled(sync_obj, sync_arg))
break;
if (time_after_eq(jiffies, timeout)) {
- ret = -EBUSY;
+ if (!nouveau_gpu_reset_in_progress(dev))
+ ret = -EIO;
break;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index ed52a6f..f9bbcc0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -214,7 +214,7 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
req->info.domain, req->info.tile_mode,
req->info.tile_flags, &nvbo);
if (ret)
- return ret;
+ goto out;
ret = drm_gem_handle_create(file_priv, nvbo->gem, &req->info.handle);
if (ret == 0) {
@@ -225,6 +225,9 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
/* drop reference from allocate - handle holds it now */
drm_gem_object_unreference_unlocked(nvbo->gem);
+out:
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
@@ -804,6 +807,9 @@ out_next:
}
nouveau_channel_put(&chan);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
@@ -839,6 +845,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
spin_unlock(&nvbo->bo.bdev->fence_lock);
drm_gem_object_unreference_unlocked(gem);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
@@ -863,6 +872,9 @@ nouveau_gem_ioctl_info(struct drm_device *dev, void *data,
ret = nouveau_gem_info(file_priv, gem, req);
drm_gem_object_unreference_unlocked(gem);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_notifier.c b/drivers/gpu/drm/nouveau/nouveau_notifier.c
index 2ef883c..e224b1c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_notifier.c
+++ b/drivers/gpu/drm/nouveau/nouveau_notifier.c
@@ -200,5 +200,8 @@ nouveau_ioctl_notifier_alloc(struct drm_device *dev, void *data,
ret = nouveau_notifier_alloc(chan, na->handle, na->size, 0, 0x1000,
&na->offset);
nouveau_channel_put(&chan);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_object.c b/drivers/gpu/drm/nouveau/nouveau_object.c
index cc419fa..ba592b0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_object.c
+++ b/drivers/gpu/drm/nouveau/nouveau_object.c
@@ -973,6 +973,9 @@ int nouveau_ioctl_grobj_alloc(struct drm_device *dev, void *data,
out:
nouveau_channel_put(&chan);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
@@ -992,6 +995,9 @@ int nouveau_ioctl_gpuobj_free(struct drm_device *dev, void *data,
ret = nouveau_ramht_remove(chan, objfree->handle);
nouveau_channel_put(&chan);
+
+ if (ret == -EIO)
+ ret = nouveau_reset_device(dev);
return ret;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c b/drivers/gpu/drm/nouveau/nouveau_reset.c
new file mode 100644
index 0000000..e893096
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2012 Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include "drmP.h"
+#include "nouveau_drv.h"
+
+static bool off(struct drm_device *dev)
+{
+ struct pci_dev *pdev = dev->pdev;
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+ pm_message_t pmm = { .event = PM_EVENT_SUSPEND };
+ atomic_inc(&dev_priv->gpureset_in_progress);
+ intr_rwsem_down_write(&dev_priv->ioctls_rwsem);
+
+ dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+ if (nouveau_pci_suspend(pdev, pmm))
+ goto fail;
+
+ dev->switch_power_state = DRM_SWITCH_POWER_OFF;
+ return true;
+
+fail:
+ dev->switch_power_state = DRM_SWITCH_POWER_ON;
+ intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
+ return false;
+}
+
+static void on(struct drm_device *dev)
+{
+ struct pci_dev *pdev = dev->pdev;
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+ dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+ atomic_dec(&dev_priv->gpureset_in_progress);
+ nouveau_pci_resume(pdev);
+ dev->switch_power_state = DRM_SWITCH_POWER_ON;
+
+ dev_priv->last_gpu_reset = jiffies;
+ intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
+}
+
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+static ssize_t nouveau_reset_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct drm_device *dev = filp->private_data;
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+ char usercmd[2];
+ if (cnt > 2)
+ cnt = 2;
+
+ if (copy_from_user(usercmd, ubuf, cnt))
+ return -EFAULT;
+
+ if (usercmd[0] == '1') {
+ int ret = intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem);
+ if (ret)
+ return ret;
+ nouveau_reset_device(dev);
+ intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
+ }
+
+ return cnt;
+}
+
+static const struct file_operations nouveau_reset_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .write = nouveau_reset_write,
+ .llseek = noop_llseek,
+};
+
+void nouveau_reset_debugfs_fini(struct drm_minor *minor)
+{
+ struct drm_device *dev = minor->dev;
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+ if (dev_priv->debugfs.reset) {
+ debugfs_remove(dev_priv->debugfs.reset);
+ dev_priv->debugfs.reset = NULL;
+ }
+}
+
+
+void nouveau_reset_debugfs_init(struct drm_minor *minor)
+{
+ struct drm_device *dev = minor->dev;
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+ dev_priv->debugfs.reset = debugfs_create_file("reset", 0200,
+ minor->debugfs_root, dev, &nouveau_reset_fops);
+ if (IS_ERR_OR_NULL(dev_priv->debugfs.reset))
+ dev_priv->debugfs.reset = NULL;
+
+}
+#endif
+
+int nouveau_reset_device(struct drm_device *dev)
+{
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+ if (mutex_trylock(&dev_priv->reset_lock) == 0)
+ /* gpu reset in progress */
+ goto out;
+
+ if (time_after_eq(jiffies, dev_priv->last_gpu_reset + 10 * DRM_HZ)) {
+ unsigned long start, end;
+
+ intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
+ NV_INFO(dev, "GPU lockup detected, resetting...\n");
+ start = jiffies;
+ while (!off(dev))
+ ;
+ on(dev);
+ end = jiffies;
+ NV_INFO(dev, "GPU reset done, took %lu s\n", (end - start) / DRM_HZ);
+ while (intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem))
+ ; /* not possible, we are holding reset_lock */
+ }
+ mutex_unlock(&dev_priv->reset_lock);
+
+out:
+ return -EAGAIN;
+}
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
index afec760..2fac5e5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -697,6 +697,8 @@ nouveau_card_init(struct drm_device *dev)
if (ret)
goto out;
engine = &dev_priv->engine;
+ intr_rwsem_init(&dev_priv->ioctls_rwsem);
+ mutex_init(&dev_priv->reset_lock);
spin_lock_init(&dev_priv->channels.lock);
spin_lock_init(&dev_priv->tile.lock);
spin_lock_init(&dev_priv->context_switch_lock);
@@ -886,6 +888,7 @@ nouveau_card_init(struct drm_device *dev)
nouveau_fbcon_init(dev);
}
+ nouveau_reset_debugfs_init(dev->primary);
return 0;
@@ -943,6 +946,8 @@ static void nouveau_card_takedown(struct drm_device *dev)
struct nouveau_engine *engine = &dev_priv->engine;
int e;
+ nouveau_reset_debugfs_fini(dev->primary);
+
if (dev->mode_config.num_crtc) {
nouveau_fbcon_fini(dev);
nouveau_display_fini(dev);
@@ -1129,6 +1134,7 @@ int nouveau_load(struct drm_device *dev, unsigned long flags)
}
dev->dev_private = dev_priv;
dev_priv->dev = dev;
+ atomic_set(&dev_priv->gpureset_in_progress, 0);
pci_set_master(dev->pdev);
diff --git a/drivers/gpu/drm/nouveau/nv50_graph.c b/drivers/gpu/drm/nouveau/nv50_graph.c
index a61853f..d0a2e50 100644
--- a/drivers/gpu/drm/nouveau/nv50_graph.c
+++ b/drivers/gpu/drm/nouveau/nv50_graph.c
@@ -440,13 +440,14 @@ nv84_graph_tlb_flush(struct drm_device *dev, int engine)
ret = -ERESTARTSYS;
break;
}
- } while (!idle && !(timeout = ptimer->read(dev) - start > 2000000000));
+ } while (!idle && !(timeout = ptimer->read(dev) - start > nv_timeout(dev)));
if (timeout) {
- NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
- "0x%08x 0x%08x 0x%08x 0x%08x\n",
- nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
- nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
+ if (!nouveau_gpu_reset_in_progress(dev))
+ NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
+ "0x%08x 0x%08x 0x%08x 0x%08x\n",
+ nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
+ nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
ret = -EIO;
}
--
1.7.8.5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
2012-04-25 21:20 [PATCH v2 4/4] drm/nouveau: gpu lockup recovery Marcin Slusarz
@ 2012-04-25 21:32 ` Marcin Slusarz
[not found] ` <1335388836-13127-4-git-send-email-marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
1 sibling, 0 replies; 12+ messages in thread
From: Marcin Slusarz @ 2012-04-25 21:32 UTC (permalink / raw)
To: nouveau, Ben Skeggs; +Cc: dri-devel
On Wed, Apr 25, 2012 at 11:20:36PM +0200, Marcin Slusarz wrote:
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
>
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
>
> Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
> ---
What changed from v1:
- moved ioctl locking from drm core to nouveau
- made down_reads interruptible
- fixed build bug on 32-bit systems
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
[not found] ` <1335388836-13127-4-git-send-email-marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2012-04-26 7:32 ` Ben Skeggs
2012-04-28 14:49 ` Marcin Slusarz
2012-04-28 14:56 ` Marcin Slusarz
` (2 subsequent siblings)
3 siblings, 1 reply; 12+ messages in thread
From: Ben Skeggs @ 2012-04-26 7:32 UTC (permalink / raw)
To: Marcin Slusarz
Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Daniel Vetter
On Wed, 2012-04-25 at 23:20 +0200, Marcin Slusarz wrote:
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
>
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
Okay. I've thought about this a bit for a couple of days and think I'll
be able to coherently share my thoughts on this issue now :)
Firstly, while I agree that we need to become more resilient to errors,
I don't think that following in radeon's/intel's footsteps with
something (imo, hackish) like this is necessarily the right choice for
us.
The *vast* majority of "lockups" we have are a result of us badly
mishandling exceptions reported to us by the GPU. There are a couple of
cases where that isn't the cause; however, they're very rare.
A very common example is where people hit DMA_PUSHER errors for whatever
reason, and things eventually go haywire. To handle a DMA_PUSHER error
sanely, you generally have to drop all pending commands for the channel
(set GET=PUT, etc.) and continue on. However, this leaves us with fences
and semaphores unsignalled, causing issues further up the stack, with
perfectly good channels hanging while attempting to sync with the crashed
channel.
The next most common example I can think of is nv4x hardware getting a
LIMIT_COLOR/ZETA exception from PGRAPH, and then a hang. The solution
is simple: learn how to handle the exception, log it, and PGRAPH
survives.
I strongly believe that if we focus our efforts on dealing a lot better
with what the GPU reports to us, we'll find we really don't need such
"lockup recovery".
I am, however, considering pulling the vm flush timeout error
propagation and the break-out-of-waits-on-signals change that builds on
it, as we really do need to become better at having killable processes
if things go wrong :)
Ben.
>
> Signed-off-by: Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> ---
> drivers/gpu/drm/nouveau/Makefile | 2 +-
> drivers/gpu/drm/nouveau/nouveau_bo.c | 2 +-
> drivers/gpu/drm/nouveau/nouveau_channel.c | 5 +-
> drivers/gpu/drm/nouveau/nouveau_drv.c | 56 ++++++++++-
> drivers/gpu/drm/nouveau/nouveau_drv.h | 45 ++++++++-
> drivers/gpu/drm/nouveau/nouveau_fence.c | 7 +-
> drivers/gpu/drm/nouveau/nouveau_gem.c | 14 +++-
> drivers/gpu/drm/nouveau/nouveau_notifier.c | 3 +
> drivers/gpu/drm/nouveau/nouveau_object.c | 6 +
> drivers/gpu/drm/nouveau/nouveau_reset.c | 148 ++++++++++++++++++++++++++++
> drivers/gpu/drm/nouveau/nouveau_state.c | 6 +
> drivers/gpu/drm/nouveau/nv50_graph.c | 11 +-
> 12 files changed, 290 insertions(+), 15 deletions(-)
> create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c
>
> diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
> index 03860f5..77d0c33 100644
> --- a/drivers/gpu/drm/nouveau/Makefile
> +++ b/drivers/gpu/drm/nouveau/Makefile
> @@ -9,7 +9,7 @@ nouveau-y := nouveau_drv.o nouveau_state.o nouveau_channel.o nouveau_mem.o \
> nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
> nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
> nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
> - nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
> + nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
> nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_temp.o \
> nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
> nv04_timer.o \
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index 5b0dc50..7de6cad 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -936,7 +936,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict, bool intr,
> }
>
> /* Software copy if the card isn't up and running yet. */
> - if (!dev_priv->channel) {
> + if (!dev_priv->channel || nouveau_gpu_reset_in_progress(dev_priv->dev)) {
> ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, no_wait_gpu, new_mem);
> goto out;
> }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_channel.c b/drivers/gpu/drm/nouveau/nouveau_channel.c
> index 846afb0..c0fa5a7 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_channel.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_channel.c
> @@ -420,7 +420,7 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
> init->fb_ctxdma_handle,
> init->tt_ctxdma_handle);
> if (ret)
> - return ret;
> + goto out;
> init->channel = chan->id;
>
> if (nouveau_vram_pushbuf == 0) {
> @@ -450,6 +450,9 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
> if (ret == 0)
> atomic_inc(&chan->users); /* userspace reference */
> nouveau_channel_put(&chan);
> +out:
> + if (ret == -EIO)
> + ret = nouveau_reset_device(dev);
> return ret;
> }
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c
> index 090fff6..261e1f5 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drv.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
> @@ -237,7 +237,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state)
> if (!dev_priv->eng[e])
> continue;
>
> - ret = dev_priv->eng[e]->fini(dev, e, true);
> + ret = dev_priv->eng[e]->fini(dev, e, !nouveau_gpu_reset_in_progress(dev));
> if (ret) {
> NV_ERROR(dev, "... engine %d failed: %d\n", e, ret);
> goto out_abort;
> @@ -443,11 +443,63 @@ nouveau_pci_resume(struct pci_dev *pdev)
> return 0;
> }
>
> +void intr_rwsem_init(struct intr_rwsem *r)
> +{
> + init_rwsem(&r->rwsem);
> + mutex_init(&r->mutex);
> +}
> +
> +int intr_rwsem_down_read_interruptible(struct intr_rwsem *r)
> +{
> + while (down_read_trylock(&r->rwsem) == 0) {
> + int ret = mutex_lock_interruptible(&r->mutex);
> + if (ret)
> + return ret;
> + mutex_unlock(&r->mutex);
> + }
> + return 0;
> +}
> +
> +void intr_rwsem_up_read(struct intr_rwsem *r)
> +{
> + up_read(&r->rwsem);
> +}
> +
> +void intr_rwsem_down_write(struct intr_rwsem *r)
> +{
> + mutex_lock(&r->mutex);
> + down_write(&r->rwsem);
> +}
> +
> +void intr_rwsem_up_write(struct intr_rwsem *r)
> +{
> + up_write(&r->rwsem);
> + mutex_unlock(&r->mutex);
> +}
> +
> +static long nouveau_ioctl(struct file *filp,
> + unsigned int cmd, unsigned long arg)
> +{
> + struct drm_file *file_priv = filp->private_data;
> + struct drm_device *dev = file_priv->minor->dev;
> + struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> + long ret = intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem);
> + if (ret)
> + return ret;
> +
> + ret = drm_ioctl(filp, cmd, arg);
> +
> + intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
> +
> + return ret;
> +}
> +
> static const struct file_operations nouveau_driver_fops = {
> .owner = THIS_MODULE,
> .open = drm_open,
> .release = drm_release,
> - .unlocked_ioctl = drm_ioctl,
> + .unlocked_ioctl = nouveau_ioctl,
> .mmap = nouveau_ttm_mmap,
> .poll = drm_poll,
> .fasync = drm_fasync,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
> index d120baf..ad146e7 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drv.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
> @@ -704,9 +704,25 @@ enum nouveau_card_type {
> NV_E0 = 0xe0,
> };
>
> +struct intr_rwsem {
> + struct rw_semaphore rwsem;
> + struct mutex mutex;
> +};
> +
> +extern void intr_rwsem_init(struct intr_rwsem *r);
> +extern int intr_rwsem_down_read_interruptible(struct intr_rwsem *r);
> +extern void intr_rwsem_up_read(struct intr_rwsem *r);
> +extern void intr_rwsem_down_write(struct intr_rwsem *r);
> +extern void intr_rwsem_up_write(struct intr_rwsem *r);
> +
> struct drm_nouveau_private {
> struct drm_device *dev;
> bool noaccel;
> + struct intr_rwsem ioctls_rwsem;
> +
> + struct mutex reset_lock;
> + atomic_t gpureset_in_progress;
> + unsigned long last_gpu_reset;
>
> /* the card type, takes NV_* as values */
> enum nouveau_card_type card_type;
> @@ -841,6 +857,7 @@ struct drm_nouveau_private {
>
> struct {
> struct dentry *channel_root;
> + struct dentry *reset;
> } debugfs;
>
> struct nouveau_fbdev *nfbdev;
> @@ -1537,6 +1554,20 @@ int nouveau_display_dumb_map_offset(struct drm_file *, struct drm_device *,
> uint32_t handle, uint64_t *offset);
> int nouveau_display_dumb_destroy(struct drm_file *, struct drm_device *,
> uint32_t handle);
> +/* nouveau_reset.c */
> +#ifdef CONFIG_DRM_NOUVEAU_DEBUG
> +void nouveau_reset_debugfs_fini(struct drm_minor *minor);
> +void nouveau_reset_debugfs_init(struct drm_minor *minor);
> +#else
> +static inline void nouveau_reset_debugfs_fini(struct drm_minor *minor) {}
> +static inline void nouveau_reset_debugfs_init(struct drm_minor *minor) {}
> +#endif
> +int nouveau_reset_device(struct drm_device *dev);
> +static inline bool nouveau_gpu_reset_in_progress(struct drm_device *dev)
> +{
> + struct drm_nouveau_private *dev_priv = dev->dev_private;
> + return atomic_read(&dev_priv->gpureset_in_progress) != 0;
> +}
>
> /* nv10_gpio.c */
> int nv10_gpio_init(struct drm_device *dev);
> @@ -1632,12 +1663,20 @@ static inline void nv_wr08(struct drm_device *dev, unsigned reg, u8 val)
> iowrite8(val, dev_priv->mmio + reg);
> }
>
> +static inline uint64_t nv_timeout(struct drm_device *dev)
> +{
> + uint64_t tm = 2000000000ULL;
> + if (nouveau_gpu_reset_in_progress(dev))
> + tm = 50000000; /* 50ms */
> + return tm;
> +}
> +
> #define nv_wait(dev, reg, mask, val) \
> - nouveau_wait_eq(dev, 2000000000ULL, (reg), (mask), (val))
> + nouveau_wait_eq(dev, nv_timeout(dev), (reg), (mask), (val))
> #define nv_wait_ne(dev, reg, mask, val) \
> - nouveau_wait_ne(dev, 2000000000ULL, (reg), (mask), (val))
> + nouveau_wait_ne(dev, nv_timeout(dev), (reg), (mask), (val))
> #define nv_wait_cb(dev, func, data) \
> - nouveau_wait_cb(dev, 2000000000ULL, (func), (data))
> + nouveau_wait_cb(dev, nv_timeout(dev), (func), (data))
>
> /* PRAMIN access */
> static inline u32 nv_ri32(struct drm_device *dev, unsigned offset)
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
> index 41ee17d..13d0176 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> @@ -233,17 +233,22 @@ int
> __nouveau_fence_wait(void *sync_obj, void *sync_arg, bool lazy, bool intr)
> {
> struct nouveau_fence *fence = nouveau_fence(sync_obj);
> + struct drm_device *dev = fence->channel->dev;
> unsigned long timeout = fence->timeout;
> unsigned long sleep_time = NSEC_PER_MSEC / 1000;
> ktime_t t;
> int ret = 0;
>
> + if (nouveau_gpu_reset_in_progress(dev))
> + timeout = jiffies + DRM_HZ / 5;
> +
> while (1) {
> if (__nouveau_fence_signalled(sync_obj, sync_arg))
> break;
>
> if (time_after_eq(jiffies, timeout)) {
> - ret = -EBUSY;
> + if (!nouveau_gpu_reset_in_progress(dev))
> + ret = -EIO;
> break;
> }
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index ed52a6f..f9bbcc0 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -214,7 +214,7 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
> req->info.domain, req->info.tile_mode,
> req->info.tile_flags, &nvbo);
> if (ret)
> - return ret;
> + goto out;
>
> ret = drm_gem_handle_create(file_priv, nvbo->gem, &req->info.handle);
> if (ret == 0) {
> @@ -225,6 +225,9 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
>
> /* drop reference from allocate - handle holds it now */
> drm_gem_object_unreference_unlocked(nvbo->gem);
> +out:
> + if (ret == -EIO)
> + ret = nouveau_reset_device(dev);
> return ret;
> }
>
> @@ -804,6 +807,9 @@ out_next:
> }
>
> nouveau_channel_put(&chan);
> +
> + if (ret == -EIO)
> + ret = nouveau_reset_device(dev);
> return ret;
> }
>
> @@ -839,6 +845,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
> ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
> spin_unlock(&nvbo->bo.bdev->fence_lock);
> drm_gem_object_unreference_unlocked(gem);
> +
> + if (ret == -EIO)
> + ret = nouveau_reset_device(dev);
> return ret;
> }
>
> @@ -863,6 +872,9 @@ nouveau_gem_ioctl_info(struct drm_device *dev, void *data,
>
> ret = nouveau_gem_info(file_priv, gem, req);
> drm_gem_object_unreference_unlocked(gem);
> +
> + if (ret == -EIO)
> + ret = nouveau_reset_device(dev);
> return ret;
> }
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_notifier.c b/drivers/gpu/drm/nouveau/nouveau_notifier.c
> index 2ef883c..e224b1c 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_notifier.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_notifier.c
> @@ -200,5 +200,8 @@ nouveau_ioctl_notifier_alloc(struct drm_device *dev, void *data,
> ret = nouveau_notifier_alloc(chan, na->handle, na->size, 0, 0x1000,
> &na->offset);
> nouveau_channel_put(&chan);
> +
> + if (ret == -EIO)
> + ret = nouveau_reset_device(dev);
> return ret;
> }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_object.c b/drivers/gpu/drm/nouveau/nouveau_object.c
> index cc419fa..ba592b0 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_object.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_object.c
> @@ -973,6 +973,9 @@ int nouveau_ioctl_grobj_alloc(struct drm_device *dev, void *data,
>
> out:
> nouveau_channel_put(&chan);
> +
> + if (ret == -EIO)
> + ret = nouveau_reset_device(dev);
> return ret;
> }
>
> @@ -992,6 +995,9 @@ int nouveau_ioctl_gpuobj_free(struct drm_device *dev, void *data,
>
> ret = nouveau_ramht_remove(chan, objfree->handle);
> nouveau_channel_put(&chan);
> +
> + if (ret == -EIO)
> + ret = nouveau_reset_device(dev);
> return ret;
> }
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c b/drivers/gpu/drm/nouveau/nouveau_reset.c
> new file mode 100644
> index 0000000..e893096
> --- /dev/null
> +++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
> @@ -0,0 +1,148 @@
> +/*
> + * Copyright (C) 2012 Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining
> + * a copy of this software and associated documentation files (the
> + * "Software"), to deal in the Software without restriction, including
> + * without limitation the rights to use, copy, modify, merge, publish,
> + * distribute, sublicense, and/or sell copies of the Software, and to
> + * permit persons to whom the Software is furnished to do so, subject to
> + * the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> + * next paragraph) shall be included in all copies or substantial
> + * portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
> + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
> + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
> + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
> + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/debugfs.h>
> +#include "drmP.h"
> +#include "nouveau_drv.h"
> +
> +static bool off(struct drm_device *dev)
> +{
> + struct pci_dev *pdev = dev->pdev;
> + struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> + pm_message_t pmm = { .event = PM_EVENT_SUSPEND };
> + atomic_inc(&dev_priv->gpureset_in_progress);
> + intr_rwsem_down_write(&dev_priv->ioctls_rwsem);
> +
> + dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
> + if (nouveau_pci_suspend(pdev, pmm))
> + goto fail;
> +
> + dev->switch_power_state = DRM_SWITCH_POWER_OFF;
> + return true;
> +
> +fail:
> + dev->switch_power_state = DRM_SWITCH_POWER_ON;
> + intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
> + return false;
> +}
> +
> +static void on(struct drm_device *dev)
> +{
> + struct pci_dev *pdev = dev->pdev;
> + struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> + dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
> + atomic_dec(&dev_priv->gpureset_in_progress);
> + nouveau_pci_resume(pdev);
> + dev->switch_power_state = DRM_SWITCH_POWER_ON;
> +
> + dev_priv->last_gpu_reset = jiffies;
> + intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
> +}
> +
> +#ifdef CONFIG_DRM_NOUVEAU_DEBUG
> +static ssize_t nouveau_reset_write(struct file *filp, const char __user *ubuf,
> + size_t cnt, loff_t *ppos)
> +{
> + struct drm_device *dev = filp->private_data;
> + struct drm_nouveau_private *dev_priv = dev->dev_private;
> + char usercmd[2];
> + if (cnt > 2)
> + cnt = 2;
> +
> + if (copy_from_user(usercmd, ubuf, cnt))
> + return -EFAULT;
> +
> + if (usercmd[0] == '1') {
> + int ret = intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem);
> + if (ret)
> + return ret;
> + nouveau_reset_device(dev);
> + intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
> + }
> +
> + return cnt;
> +}
> +
> +static const struct file_operations nouveau_reset_fops = {
> + .owner = THIS_MODULE,
> + .open = simple_open,
> + .write = nouveau_reset_write,
> + .llseek = noop_llseek,
> +};
> +
> +void nouveau_reset_debugfs_fini(struct drm_minor *minor)
> +{
> + struct drm_device *dev = minor->dev;
> + struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> + if (dev_priv->debugfs.reset) {
> + debugfs_remove(dev_priv->debugfs.reset);
> + dev_priv->debugfs.reset = NULL;
> + }
> +}
> +
> +
> +void nouveau_reset_debugfs_init(struct drm_minor *minor)
> +{
> + struct drm_device *dev = minor->dev;
> + struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> + dev_priv->debugfs.reset = debugfs_create_file("reset", 0200,
> + minor->debugfs_root, dev, &nouveau_reset_fops);
> + if (IS_ERR_OR_NULL(dev_priv->debugfs.reset))
> + dev_priv->debugfs.reset = NULL;
> +
> +}
> +#endif
> +
> +int nouveau_reset_device(struct drm_device *dev)
> +{
> + struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> + if (mutex_trylock(&dev_priv->reset_lock) == 0)
> + /* gpu reset in progress */
> + goto out;
> +
> + if (time_after_eq(jiffies, dev_priv->last_gpu_reset + 10 * DRM_HZ)) {
> + unsigned long start, end;
> +
> + intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
> + NV_INFO(dev, "GPU lockup detected, resetting...\n");
> + start = jiffies;
> + while (!off(dev))
> + ;
> + on(dev);
> + end = jiffies;
> + NV_INFO(dev, "GPU reset done, took %lu s\n", (end - start) / DRM_HZ);
> + while (intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem))
> + ; /* not possible, we are holding reset_lock */
> + }
> + mutex_unlock(&dev_priv->reset_lock);
> +
> +out:
> + return -EAGAIN;
> +}
> diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
> index afec760..2fac5e5 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_state.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_state.c
> @@ -697,6 +697,8 @@ nouveau_card_init(struct drm_device *dev)
> if (ret)
> goto out;
> engine = &dev_priv->engine;
> + intr_rwsem_init(&dev_priv->ioctls_rwsem);
> + mutex_init(&dev_priv->reset_lock);
> spin_lock_init(&dev_priv->channels.lock);
> spin_lock_init(&dev_priv->tile.lock);
> spin_lock_init(&dev_priv->context_switch_lock);
> @@ -886,6 +888,7 @@ nouveau_card_init(struct drm_device *dev)
>
> nouveau_fbcon_init(dev);
> }
> + nouveau_reset_debugfs_init(dev->primary);
>
> return 0;
>
> @@ -943,6 +946,8 @@ static void nouveau_card_takedown(struct drm_device *dev)
> struct nouveau_engine *engine = &dev_priv->engine;
> int e;
>
> + nouveau_reset_debugfs_fini(dev->primary);
> +
> if (dev->mode_config.num_crtc) {
> nouveau_fbcon_fini(dev);
> nouveau_display_fini(dev);
> @@ -1129,6 +1134,7 @@ int nouveau_load(struct drm_device *dev, unsigned long flags)
> }
> dev->dev_private = dev_priv;
> dev_priv->dev = dev;
> + atomic_set(&dev_priv->gpureset_in_progress, 0);
>
> pci_set_master(dev->pdev);
>
> diff --git a/drivers/gpu/drm/nouveau/nv50_graph.c b/drivers/gpu/drm/nouveau/nv50_graph.c
> index a61853f..d0a2e50 100644
> --- a/drivers/gpu/drm/nouveau/nv50_graph.c
> +++ b/drivers/gpu/drm/nouveau/nv50_graph.c
> @@ -440,13 +440,14 @@ nv84_graph_tlb_flush(struct drm_device *dev, int engine)
> ret = -ERESTARTSYS;
> break;
> }
> - } while (!idle && !(timeout = ptimer->read(dev) - start > 2000000000));
> + } while (!idle && !(timeout = ptimer->read(dev) - start > nv_timeout(dev)));
>
> if (timeout) {
> - NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
> - "0x%08x 0x%08x 0x%08x 0x%08x\n",
> - nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
> - nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
> + if (!nouveau_gpu_reset_in_progress(dev))
> + NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
> + "0x%08x 0x%08x 0x%08x 0x%08x\n",
> + nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
> + nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
> ret = -EIO;
> }
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
2012-04-26 7:32 ` Ben Skeggs
@ 2012-04-28 14:49 ` Marcin Slusarz
[not found] ` <20120428144956.GA10116-OI9uyE9O0yo@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Marcin Slusarz @ 2012-04-28 14:49 UTC (permalink / raw)
To: Ben Skeggs
Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Daniel Vetter
On Thu, Apr 26, 2012 at 05:32:29PM +1000, Ben Skeggs wrote:
> On Wed, 2012-04-25 at 23:20 +0200, Marcin Slusarz wrote:
> > Overall idea:
> > Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> > handle them at ioctl level, reset the GPU and repeat last ioctl.
> >
> > GPU reset is done by doing suspend / resume cycle with few tweaks:
> > - CPU-only bo eviction
> > - ignoring vm flush / fence timeouts
> > - shortening waits
> Okay. I've thought about this a bit for a couple of days and think I'll
> be able to coherently share my thoughts on this issue now :)
>
> Firstly, while I agree that we need to become more resilient to errors,
> I don't think that following in the radeon/intel footsteps with
> something (imo, hackish) like this is the right choice for us
> necessarily.
This is not only the radeon/intel way. Windows, since Vista SP1, does the
same - see http://msdn.microsoft.com/en-us/windows/hardware/gg487368.
It's funny how similar it is to this patch (I hadn't seen this page before).
If you fear people will stop reporting bugs - don't. GPU reset is painfully
slow and can take up to 50 seconds (BO eviction is the most time consuming
part), so people will be annoyed enough to report them.
Currently, GPU lockups make users so angry that they frequently switch to the
blob without even thinking about reporting anything.
> The *vast* majority of "lockups" we have are as a result of us badly
> mishandling exceptions reported to us by the GPU. There are a couple of
> exceptions, however, they're very rare..
> A very common example is where people gain DMA_PUSHERs for whatever
> reason, and things go haywire eventually.
Nope, I had tens of lockups during testing, and only once did I see DMA_PUSHER
before the GPU lockup was detected.
> To handle a DMA_PUSHER
> sanely, generally you have to drop all pending commands for the channel
> (set GET=PUT, etc) and continue on. However, this leaves us with fences
> and semaphores unsignalled etc, causing issues further up the stack with
> perfectly good channels hanging on attempting to sync with the crashed
> channel etc.
>
> The next most common example I can think of is nv4x hardware, getting a
> LIMIT_COLOR/ZETA exception from PGRAPH, and then a hang. The solution
> is simple, learn how to handle the exception, log it, and PGRAPH
> survives.
>
> I strongly believe that if we focused our efforts on dealing with what
> the GPU reports to us a lot better, we'll find we really don't need such
> "lockup recovery".
While I agree we need to improve on error handling to make "lockup recovery"
not needed, the reality is we can't predict everything and the driver needs to
cope with its own bugs.
> I am, however, considering pulling the vm flush timeout error
> propagation and break-out-of-waits-on-signals that builds on it. As we
> really do need to become better at having killable processes if things
> go wrong :)
Good :)
Marcin
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
[not found] ` <1335388836-13127-4-git-send-email-marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2012-04-26 7:32 ` Ben Skeggs
@ 2012-04-28 14:56 ` Marcin Slusarz
[not found] ` <20120428145615.GB10116-OI9uyE9O0yo@public.gmane.org>
2012-05-27 19:52 ` Marcin Slusarz
2012-08-05 21:15 ` Marcin Slusarz
3 siblings, 1 reply; 12+ messages in thread
From: Marcin Slusarz @ 2012-04-28 14:56 UTC (permalink / raw)
To: Martin Peres; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Ben Skeggs
On Wed, Apr 25, 2012 at 11:20:36PM +0200, Marcin Slusarz wrote:
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
>
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
>
> Signed-off-by: Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> ---
Martin,
I'm wondering how the patch below (which builds upon the above) affects
reclocking stability. I can't test it on my card, because it has only
one performance level. Can you test it on yours?
---
From: Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Subject: [PATCH] drm/nouveau: take ioctls_rwsem before reclocking
Signed-off-by: Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
drivers/gpu/drm/nouveau/nouveau_pm.c | 6 ++++++
drivers/gpu/drm/nouveau/nouveau_reset.c | 2 +-
2 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_pm.c b/drivers/gpu/drm/nouveau/nouveau_pm.c
index 34d591b..4716f39 100644
--- a/drivers/gpu/drm/nouveau/nouveau_pm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_pm.c
@@ -383,9 +383,15 @@ nouveau_pm_set_perflvl(struct device *d, struct device_attribute *a,
const char *buf, size_t count)
{
struct drm_device *dev = pci_get_drvdata(to_pci_dev(d));
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
int ret;
+ intr_rwsem_down_write(&dev_priv->ioctls_rwsem);
+
ret = nouveau_pm_profile_set(dev, buf);
+
+ intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
+
if (ret)
return ret;
return strlen(buf);
diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c b/drivers/gpu/drm/nouveau/nouveau_reset.c
index e893096..7c25a3c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_reset.c
+++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
@@ -139,7 +139,7 @@ int nouveau_reset_device(struct drm_device *dev)
end = jiffies;
NV_INFO(dev, "GPU reset done, took %lu s\n", (end - start) / DRM_HZ);
while (intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem))
- ; /* not possible, we are holding reset_lock */
+ ;
}
mutex_unlock(&dev_priv->reset_lock);
--
1.7.8.5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
[not found] ` <20120428145615.GB10116-OI9uyE9O0yo@public.gmane.org>
@ 2012-04-30 9:47 ` Martin Peres
0 siblings, 0 replies; 12+ messages in thread
From: Martin Peres @ 2012-04-30 9:47 UTC (permalink / raw)
To: Marcin Slusarz; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Ben Skeggs
Le 28/04/2012 16:56, Marcin Slusarz a écrit :
> On Wed, Apr 25, 2012 at 11:20:36PM +0200, Marcin Slusarz wrote:
>> Overall idea:
>> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
>> handle them at ioctl level, reset the GPU and repeat last ioctl.
>>
>> GPU reset is done by doing suspend / resume cycle with few tweaks:
>> - CPU-only bo eviction
>> - ignoring vm flush / fence timeouts
>> - shortening waits
>>
>> Signed-off-by: Marcin Slusarz<marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> ---
> Martin,
>
> I'm wondering how below patch (which builds upon the above) affects
> reclocking stability. I can't test it on my card, because it has only
> one performance level. Can you test it on yours?
Hi Marcin,
I was away from my computers for a few days. I'll test this patch tonight.
Martin
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
[not found] ` <20120428144956.GA10116-OI9uyE9O0yo@public.gmane.org>
@ 2012-05-02 11:28 ` Ben Skeggs
2012-05-02 13:33 ` Martin Peres
0 siblings, 1 reply; 12+ messages in thread
From: Ben Skeggs @ 2012-05-02 11:28 UTC (permalink / raw)
To: Marcin Slusarz; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
On Sat, 2012-04-28 at 16:49 +0200, Marcin Slusarz wrote:
> On Thu, Apr 26, 2012 at 05:32:29PM +1000, Ben Skeggs wrote:
> > On Wed, 2012-04-25 at 23:20 +0200, Marcin Slusarz wrote:
> > > Overall idea:
> > > Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> > > handle them at ioctl level, reset the GPU and repeat last ioctl.
> > >
> > > GPU reset is done by doing suspend / resume cycle with few tweaks:
> > > - CPU-only bo eviction
> > > - ignoring vm flush / fence timeouts
> > > - shortening waits
> > Okay. I've thought about this a bit for a couple of days and think I'll
> > be able to coherently share my thoughts on this issue now :)
> >
> > Firstly, while I agree that we need to become more resilient to errors,
> > I don't think that following in the radeon/intel footsteps with
> > something (imo, hackish) like this is the right choice for us
> > necessarily.
>
> This is not only radeon/intel way. Windows, since Vista SP1, does the
> same - see http://msdn.microsoft.com/en-us/windows/hardware/gg487368.
> It's funny how similar it is to this patch (I haven't seen this page earlier).
Yes, I am aware of this feature in Windows. And I'm not arguing that
something like it isn't necessary.
>
> If you fear people will stop reporting bugs - don't. GPU reset is painfully
> slow and can take up to 50 seconds (BO eviction is the most time consuming
> part), so people will be annoyed enough to report them.
> Currently, GPU lockups make users so angry, they frequently switch to blob
> without even thinking about reporting anything.
I'm not so concerned about the lost bug reports, I expect the same
people that are actually willing to report bugs now will continue to do
so :)
>
> > The *vast* majority of "lockups" we have are as a result of us badly
> > mishandling exceptions reported to us by the GPU. There are a couple of
> > exceptions, however, they're very rare..
>
> > A very common example is where people gain DMA_PUSHERs for whatever
> > reason, and things go haywire eventually.
>
> Nope, I had tens of lockups during testing, and only once I had DMA_PUSHER
> before detecting GPU lockup.
Out of curiosity, what were the lockup situations you were triggering
exactly?
>
> > To handle a DMA_PUSHER
> > sanely, generally you have to drop all pending commands for the channel
> > (set GET=PUT, etc) and continue on. However, this leaves us with fences
> > and semaphores unsignalled etc, causing issues further up the stack with
> > perfectly good channels hanging on attempting to sync with the crashed
> > channel etc.
> >
> > The next most common example I can think of is nv4x hardware, getting a
> > LIMIT_COLOR/ZETA exception from PGRAPH, and then a hang. The solution
> > is simple, learn how to handle the exception, log it, and PGRAPH
> > survives.
> >
> > I strongly believe that if we focused our efforts on dealing with what
> > the GPU reports to us a lot better, we'll find we really don't need such
> > "lockup recovery".
>
> While I agree we need to improve on error handling to make "lockup recovery"
> not needed, the reality is we can't predict everything and driver needs to
> cope with its own bugs.
Right, again, I don't disagree :) I think we can improve a lot on the
big-hammer-suspend-the-gpu solution though, and instead reset only the
faulting engine. It's (in theory) almost possible for us to do now, but
I have a couple of reworks to areas related to this pending (basically,
making the various driver subsystems more independent), which should be
ready soon. This'll go a long way to making it very easy to reset a
single engine, and likely result in *far* faster recovery from hangs.
>
> > I am, however, considering pulling the vm flush timeout error
> > propagation and break-out-of-waits-on-signals that builds on it. As we
> > really do need to become better at having killable processes if things
> > go wrong :)
>
> Good :)
>
> Marcin
> _______________________________________________
> Nouveau mailing list
> Nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> http://lists.freedesktop.org/mailman/listinfo/nouveau
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
2012-05-02 11:28 ` Ben Skeggs
@ 2012-05-02 13:33 ` Martin Peres
[not found] ` <4FA137C4.3000900-GANU6spQydw@public.gmane.org>
0 siblings, 1 reply; 12+ messages in thread
From: Martin Peres @ 2012-05-02 13:33 UTC (permalink / raw)
To: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Ben Skeggs,
Marcin Slusarz
On 02/05/2012 13:28, Ben Skeggs wrote:
> Right, again, I don't disagree :) I think we can improve a lot on the
> big-hammer-suspend-the-gpu solution though, and instead reset only the
> faulting engine. It's (in theory) almost possible for us to do now, but
> I have a couple of reworks to areas related to this pending (basically,
> making the various driver subsystems more independent), which should be
> ready soon. This'll go a long way to making it very easy to reset a
> single engine, and likely result in *far* faster recovery from hangs.
Hey,
What about kicking a channel that put the card in a bad state? Wouldn't
that be possible?
This way, we don't lose the context of other channels and only the
application that hung the card will be exited.
I wonder how pfifo handles commands sent to a non-existing channel, but
I'm sure it shouldn't hang or anything.
Anyway, if it is not possible to kick only one channel, then what
about kicking all channels, rePOSTING the card and using KMS to output
the lockup report (and send a notification of the report through udev
and store the report in a sysfs file)?
Let's not try to be perfect, let us just be able to do better bug reports.
Martin
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
[not found] ` <4FA137C4.3000900-GANU6spQydw@public.gmane.org>
@ 2012-05-02 13:48 ` Ben Skeggs
2012-05-02 13:53 ` Martin Peres
0 siblings, 1 reply; 12+ messages in thread
From: Ben Skeggs @ 2012-05-02 13:48 UTC (permalink / raw)
To: Martin Peres; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
On Wed, 2012-05-02 at 15:33 +0200, Martin Peres wrote:
> On 02/05/2012 13:28, Ben Skeggs wrote:
> > Right, again, I don't disagree :) I think we can improve a lot on the
> > big-hammer-suspend-the-gpu solution though, and instead reset only the
> > faulting engine. It's (in theory) almost possible for us to do now, but
> > I have a couple of reworks to areas related to this pending (basically,
> > making the various driver subsystems more independent), which should be
> > ready soon. This'll go a long way to making it very easy to reset a
> > single engine, and likely result in *far* faster recovery from hangs.
> Hey,
>
> What about kicking a channel that put the card in a bad state? Wouldn't
> that be possible?
>
> This way, we don't loose the context of other channels and only the
> application that hang the card will be exited.
That's pretty much the idea. The trouble comes in where PFIFO will hang
waiting for the stuck engine to report that it's done (eg. it will wait
for PGRAPH to go "i've finished unloading my context now" after it's
told PGRAPH to do so).
Hence why it's important to be able to (preferably) un-stick the stuck
engine (usually handling the appropriate interrupts properly will
achieve this), and failing that, reset it and lose the context for just
that channel.
The work I'm doing at the moment will, among other nice things, make
handling all of this a lot nicer. And it should be nice and speedy in
comparison to the suspend/resume option, we won't have to evict all
buffers from vram without accel, which can take quite a while (not to
mention that it might not even be possible to get to the VRAM not mapped
into the FB BAR on earlier chipsets if accel dies).
>
> I wonder how pfifo handles commands sent to a non-existing channel, but
> I'm sure it shouldn't hang or anything.
It can't happen anyway, if we destroyed the fifo context for a channel
we wouldn't be telling it to execute commands still :)
>
> Anyway, if this is not possible to only kick one channel, then what
> about kicking all channels, rePOSTING the card and using KMS to output
> the lockup report (and send a notification of the report through udev
> and store the report in a sysfs file)?
>
> Let's not try to be perfect, let us just be able to do better bug reports.
I'm still skeptical about how useful any kind of generic "lockup report"
can possibly be, beyond kernel logs.. However, as part of the work I'm
working on, there may be some additional information available via
debugfs.. I don't want to elaborate on this too much yet until I wrap
my head around what exactly I want to achieve, but I'll give you a
heads-up once I do :)
Ben.
>
> Martin
> _______________________________________________
> Nouveau mailing list
> Nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> http://lists.freedesktop.org/mailman/listinfo/nouveau
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
2012-05-02 13:48 ` Ben Skeggs
@ 2012-05-02 13:53 ` Martin Peres
0 siblings, 0 replies; 12+ messages in thread
From: Martin Peres @ 2012-05-02 13:53 UTC (permalink / raw)
To: bskeggs-H+wXaHxf7aLQT0dZR+AlfA; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
On 02/05/2012 15:48, Ben Skeggs wrote:
> On Wed, 2012-05-02 at 15:33 +0200, Martin Peres wrote:
>> On 02/05/2012 13:28, Ben Skeggs wrote:
>>> Right, again, I don't disagree :) I think we can improve a lot on the
>>> big-hammer-suspend-the-gpu solution though, and instead reset only the
>>> faulting engine. It's (in theory) almost possible for us to do now, but
>>> I have a couple of reworks to areas related to this pending (basically,
>>> making the various driver subsystems more independent), which should be
>>> ready soon. This'll go a long way to making it very easy to reset a
>>> single engine, and likely result in *far* faster recovery from hangs.
>> Hey,
>>
>> What about kicking a channel that put the card in a bad state? Wouldn't
>> that be possible?
>>
>> This way, we don't loose the context of other channels and only the
>> application that hang the card will be exited.
> That's pretty much the idea. The trouble comes in where PFIFO will hang
> waiting for the stuck engine to report that it's done (eg. it will wait
> for PGRAPH to go "i've finished unloading my context now" after it's
> told PGRAPH to do so).
>
> Hence why it's important to be able to (preferably) un-stick the stuck
> engine (usually handling the appropriate interrupts properly will
> achieve this), and failing that, reset it and lose the context for just
> that channel.
>
> The work I'm doing at the moment will, among other nice things, make
> handling all of this a lot nicer. And it should be nice and speedy in
> comparison to the suspend/resume option, we won't have to evict all
> buffers from vram without accel, which can take quite a while (not to
> mention that it might not even be possible to get to the VRAM not mapped
> into the FB BAR on earlier chipsets if accel dies).
I get it, that seems nice and good.
>
>> I wonder how pfifo handles commands sent to a non-existing channel, but
>> I'm sure it shouldn't hang or anything.
> It can't happen anyway, if we destroyed the fifo context for a channel
> we wouldn't be telling it to execute commands still :)
Right, but there may still be some commands left in the IB ring buffer,
right?
>
>> Anyway, if this is not possible to only kick one channel, then what
>> about kicking all channels, rePOSTING the card and using KMS to output
>> the lockup report (and send a notification of the report through udev
>> and store the report in a sysfs file)?
>>
>> Let's not try to be perfect, let us just be able to do better bug reports.
> I'm still skeptical about how useful any kind of generic "lockup report"
> can possibly be, beyond kernel logs.. However, as part of the work I'm
> working on, there may be some additional information available via
> debugfs.. I don't wan't to elaborate on this too much yet until I wrap
> my head around what exactly I want to achieve, but I'll give you a
> heads-up once I do :)
Well, a good report is important so that we can have an idea of what went
wrong; it would also allow us to differentiate bug reports.
Basically, I'm now convinced that the nvaX random lockup is not actually
one issue.
Having such an enhanced bug report could allow us to verify this theory.
PS: Speaking of nvaX lockups: I still get lockups (nva3/5) and I suspect that
the problem comes from the context-switching microcode. Not losing the email
I'm writing simply because kwin's channel crashed would be a big win for me.
Martin
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
[not found] ` <1335388836-13127-4-git-send-email-marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2012-04-26 7:32 ` Ben Skeggs
2012-04-28 14:56 ` Marcin Slusarz
@ 2012-05-27 19:52 ` Marcin Slusarz
2012-08-05 21:15 ` Marcin Slusarz
3 siblings, 0 replies; 12+ messages in thread
From: Marcin Slusarz @ 2012-05-27 19:52 UTC (permalink / raw)
To: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Ben Skeggs
From: Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Subject: [PATCH v4] drm/nouveau: gpu lockup recovery
Detect lockups by watching for vm flush / fence timeouts and signal them by
returning EIO. When EIOs are met at ioctl level, reset the card and repeat
last ioctl.
GPU reset is done by going through suspend / resume cycle with few tweaks:
- CPU-only bo eviction
- ignoring vm flush / fence timeouts
- shortening wait times
v2:
- move ioctl locking from drm core to nouveau
- make ioctl-side locking interruptible
- fix build bug on 32-bit systems
v3:
- make reset-side locking interruptible
- add module parameter to disable lockup recovery
- move reset code to nouveau_ioctl
v4:
- rebased on top current nouveau-git
Signed-off-by: Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
I skipped posting v3 because of a possible alternative approach to the problem,
but I find this patch useful for debugging, so I'm posting a rebased version for
other devs.
---
drivers/gpu/drm/nouveau/Makefile | 2 +-
drivers/gpu/drm/nouveau/nouveau_bo.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_drv.c | 88 ++++++++++++++++-
drivers/gpu/drm/nouveau/nouveau_drv.h | 47 ++++++++-
drivers/gpu/drm/nouveau/nouveau_fence.c | 10 ++-
drivers/gpu/drm/nouveau/nouveau_reset.c | 166 +++++++++++++++++++++++++++++++
drivers/gpu/drm/nouveau/nouveau_state.c | 6 +
drivers/gpu/drm/nouveau/nv50_graph.c | 11 +-
8 files changed, 318 insertions(+), 14 deletions(-)
create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c
diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
index 338450e..1fa707c 100644
--- a/drivers/gpu/drm/nouveau/Makefile
+++ b/drivers/gpu/drm/nouveau/Makefile
@@ -10,7 +10,7 @@ nouveau-y := nouveau_device.o nouveau_subdev.o nouveau_engine.o \
nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
- nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
+ nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_therm.o \
nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
nouveau_fanctl.o nouveau_abi16.o nouveau_agp.o \
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index f30a75a..6827f2e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1133,7 +1133,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict, bool intr,
}
/* CPU copy if we have no accelerated method available */
- if (!ndev->ttm.move) {
+ if (!ndev->ttm.move || nouveau_gpu_reset_in_progress(ndev)) {
ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, no_wait_gpu, new_mem);
goto out;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 79b3236..1dccfcc 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -131,6 +131,10 @@ MODULE_PARM_DESC(mxmdcb, "Santise DCB table according to MXM-SIS");
int nouveau_mxmdcb = 1;
module_param_named(mxmdcb, nouveau_mxmdcb, int, 0400);
+MODULE_PARM_DESC(lockup_recovery, "Reset GPU on lockup (default: 1)\n");
+int nouveau_lockup_recovery = 1;
+module_param_named(lockup_recovery, nouveau_lockup_recovery, int, 0600);
+
int nouveau_fbpercrtc;
#if 0
module_param_named(fbpercrtc, nouveau_fbpercrtc, int, 0400);
@@ -222,7 +226,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state)
}
NV_INFO(ndev, "Disabling engines...\n");
- ret = nouveau_device_fini(ndev, true);
+ ret = nouveau_device_fini(ndev, !nouveau_gpu_reset_in_progress(ndev));
if (ret)
goto out_abort;
@@ -362,11 +366,91 @@ static struct drm_ioctl_desc nouveau_ioctls[] = {
DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info, DRM_UNLOCKED|DRM_AUTH),
};
+void intr_rwsem_init(struct intr_rwsem *r)
+{
+ atomic_set(&r->readers, 0);
+ mutex_init(&r->mutex);
+}
+
+int intr_rwsem_down_read_interruptible(struct intr_rwsem *r)
+{
+ int ret = mutex_lock_interruptible(&r->mutex);
+ if (ret)
+ return ret;
+ atomic_inc(&r->readers);
+ mutex_unlock(&r->mutex);
+ return 0;
+}
+
+void intr_rwsem_down_read(struct intr_rwsem *r)
+{
+ mutex_lock(&r->mutex);
+ atomic_inc(&r->readers);
+ mutex_unlock(&r->mutex);
+}
+
+void intr_rwsem_up_read(struct intr_rwsem *r)
+{
+ atomic_dec(&r->readers);
+}
+
+int intr_rwsem_down_write_interruptible(struct intr_rwsem *r)
+{
+ int ret = mutex_lock_interruptible(&r->mutex);
+ if (ret)
+ return ret;
+ while (atomic_read(&r->readers)) {
+ if (signal_pending(current)) {
+ mutex_unlock(&r->mutex);
+ return -EINTR;
+ }
+ cond_resched();
+ }
+
+ return 0;
+}
+
+void intr_rwsem_down_write(struct intr_rwsem *r)
+{
+ mutex_lock(&r->mutex);
+ while (atomic_read(&r->readers))
+ cond_resched();
+}
+
+void intr_rwsem_up_write(struct intr_rwsem *r)
+{
+ mutex_unlock(&r->mutex);
+}
+
+static long nouveau_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct drm_file *file_priv = filp->private_data;
+ struct drm_device *dev = file_priv->minor->dev;
+ struct nouveau_device *ndev = dev->dev_private;
+
+ long ret = intr_rwsem_down_read_interruptible(&ndev->ioctls_rwsem);
+ if (ret)
+ return -ERESTARTSYS;
+
+ ret = drm_ioctl(filp, cmd, arg);
+
+ intr_rwsem_up_read(&ndev->ioctls_rwsem);
+
+ if (unlikely(ret == -EIO)) {
+ ret = nouveau_reset_device(ndev);
+ if (ret == -EINTR)
+ ret = -ERESTARTSYS;
+ }
+
+ return ret;
+}
+
static const struct file_operations nouveau_driver_fops = {
.owner = THIS_MODULE,
.open = drm_open,
.release = drm_release,
- .unlocked_ioctl = drm_ioctl,
+ .unlocked_ioctl = nouveau_ioctl,
.mmap = nouveau_ttm_mmap,
.poll = drm_poll,
.fasync = drm_fasync,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index c1539b5..83573b5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -481,8 +481,26 @@ enum nouveau_card_type {
NV_E0 = 0xe0,
};
+struct intr_rwsem {
+ struct mutex mutex;
+ atomic_t readers;
+};
+
+extern void intr_rwsem_init(struct intr_rwsem *r);
+extern void intr_rwsem_down_read(struct intr_rwsem *r);
+extern int intr_rwsem_down_read_interruptible(struct intr_rwsem *r);
+extern void intr_rwsem_up_read(struct intr_rwsem *r);
+extern void intr_rwsem_down_write(struct intr_rwsem *r);
+extern int intr_rwsem_down_write_interruptible(struct intr_rwsem *r);
+extern void intr_rwsem_up_write(struct intr_rwsem *r);
+
struct nouveau_device {
struct drm_device *dev;
+ struct intr_rwsem ioctls_rwsem;
+
+ struct mutex reset_lock;
+ atomic_t gpureset_in_progress;
+ unsigned long last_gpu_reset;
/* the card type, takes NV_* as values */
enum nouveau_card_type card_type;
@@ -575,6 +593,7 @@ struct nouveau_device {
struct {
struct dentry *channel_root;
+ struct dentry *reset;
} debugfs;
struct nouveau_fbdev *nfbdev;
@@ -652,6 +671,7 @@ extern int nouveau_perflvl_wr;
extern int nouveau_msi;
extern int nouveau_ctxfw;
extern int nouveau_mxmdcb;
+extern int nouveau_lockup_recovery;
int nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state);
int nouveau_pci_resume(struct pci_dev *pdev);
@@ -926,6 +946,19 @@ int nouveau_display_dumb_map_offset(struct drm_file *, struct drm_device *,
u32 handle, u64 *offset);
int nouveau_display_dumb_destroy(struct drm_file *, struct drm_device *,
u32 handle);
+/* nouveau_reset.c */
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+void nouveau_reset_debugfs_fini(struct drm_minor *minor);
+void nouveau_reset_debugfs_init(struct drm_minor *minor);
+#else
+static inline void nouveau_reset_debugfs_fini(struct drm_minor *minor) {}
+static inline void nouveau_reset_debugfs_init(struct drm_minor *minor) {}
+#endif
+int nouveau_reset_device(struct nouveau_device *ndev);
+static inline bool nouveau_gpu_reset_in_progress(struct nouveau_device *ndev)
+{
+ return atomic_read(&ndev->gpureset_in_progress) != 0;
+}
/* nv50_calc.c */
int nv50_calc_pll(struct nouveau_device *, struct pll_lims *, int clk,
@@ -1001,12 +1034,20 @@ static inline void nv_wr08(struct nouveau_device *ndev, unsigned reg, u8 val)
iowrite8(val, ndev->mmio + reg);
}
+static inline uint64_t nv_timeout(struct nouveau_device *ndev)
+{
+ uint64_t tm = 2000000000ULL;
+ if (nouveau_gpu_reset_in_progress(ndev))
+ tm = 50000000; /* 50ms */
+ return tm;
+}
+
#define nv_wait(dev, reg, mask, val) \
- nouveau_wait_eq(dev, 2000000000ULL, (reg), (mask), (val))
+ nouveau_wait_eq(dev, nv_timeout(dev), (reg), (mask), (val))
#define nv_wait_ne(dev, reg, mask, val) \
- nouveau_wait_ne(dev, 2000000000ULL, (reg), (mask), (val))
+ nouveau_wait_ne(dev, nv_timeout(dev), (reg), (mask), (val))
#define nv_wait_cb(dev, func, data) \
- nouveau_wait_cb(dev, 2000000000ULL, (func), (data))
+ nouveau_wait_cb(dev, nv_timeout(dev), (func), (data))
/* PRAMIN access */
static inline u32 nv_ri32(struct nouveau_device *ndev, unsigned offset)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 19a2534..e55fc52 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -114,13 +114,19 @@ nouveau_fence_done(struct nouveau_fence *fence)
int
nouveau_fence_wait(struct nouveau_fence *fence, bool lazy, bool intr)
{
+ struct nouveau_device *ndev = fence->channel->device;
+ unsigned long timeout = fence->timeout;
unsigned long sleep_time = NSEC_PER_MSEC / 1000;
ktime_t t;
int ret = 0;
+ if (nouveau_gpu_reset_in_progress(ndev))
+ timeout = jiffies + DRM_HZ / 5;
+
while (!nouveau_fence_done(fence)) {
- if (fence->timeout && time_after_eq(jiffies, fence->timeout)) {
- ret = -EBUSY;
+ if (fence->timeout && time_after_eq(jiffies, timeout)) {
+ if (!nouveau_gpu_reset_in_progress(ndev))
+ ret = -EIO;
break;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c b/drivers/gpu/drm/nouveau/nouveau_reset.c
new file mode 100644
index 0000000..9df93e6
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 Marcin Slusarz <marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include "drmP.h"
+#include "nouveau_drv.h"
+
+static int off(struct nouveau_device *ndev)
+{
+ struct drm_device *dev = ndev->dev;
+ struct pci_dev *pdev = dev->pdev;
+ int ret;
+
+ pm_message_t pmm = { .event = PM_EVENT_SUSPEND };
+ atomic_inc(&ndev->gpureset_in_progress);
+ ret = intr_rwsem_down_write_interruptible(&ndev->ioctls_rwsem);
+ if (ret)
+ goto fail2;
+
+ dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+ ret = nouveau_pci_suspend(pdev, pmm);
+ if (ret)
+ goto fail;
+
+ dev->switch_power_state = DRM_SWITCH_POWER_OFF;
+ return 0;
+
+fail:
+ dev->switch_power_state = DRM_SWITCH_POWER_ON;
+ intr_rwsem_up_write(&ndev->ioctls_rwsem);
+fail2:
+ atomic_dec(&ndev->gpureset_in_progress);
+ return ret;
+}
+
+static void on(struct nouveau_device *ndev)
+{
+ struct drm_device *dev = ndev->dev;
+ struct pci_dev *pdev = dev->pdev;
+
+ dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+ atomic_dec(&ndev->gpureset_in_progress);
+ nouveau_pci_resume(pdev);
+ dev->switch_power_state = DRM_SWITCH_POWER_ON;
+
+ ndev->last_gpu_reset = jiffies;
+ intr_rwsem_up_write(&ndev->ioctls_rwsem);
+}
+
+static int __nouveau_reset_device(struct nouveau_device *ndev, bool manual)
+{
+ int ret = -EAGAIN;
+ unsigned long start, end;
+ int offret;
+
+ if (mutex_trylock(&ndev->reset_lock) == 0)
+ /* gpu reset in progress */
+ return -EAGAIN;
+
+ if (time_before(jiffies, ndev->last_gpu_reset + 10 * DRM_HZ))
+ goto out;
+ if (!(nouveau_lockup_recovery || manual))
+ goto out;
+
+ if (manual)
+ NV_INFO(ndev, "Manual GPU reset invoked...\n");
+ else
+ NV_INFO(ndev, "GPU lockup detected, resetting... (process: %s[%d])\n",
+ current->comm, task_pid_nr(current));
+
+ start = jiffies;
+ do {
+ offret = off(ndev);
+ } while (offret != 0 && offret != -EINTR);
+
+ if (offret == 0) {
+ on(ndev);
+ end = jiffies;
+ NV_INFO(ndev, "GPU reset done, took %lus\n", (end - start) / DRM_HZ);
+ } else {
+ ret = offret;
+ end = jiffies;
+ NV_INFO(ndev, "GPU reset interrupted after %lus\n", (end - start) / DRM_HZ);
+ }
+
+out:
+ mutex_unlock(&ndev->reset_lock);
+ return ret;
+}
+
+int nouveau_reset_device(struct nouveau_device *ndev)
+{
+ return __nouveau_reset_device(ndev, false);
+}
+
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+static ssize_t nouveau_reset_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct nouveau_device *ndev = filp->private_data;
+ char usercmd[2];
+ if (cnt > 2)
+ cnt = 2;
+
+ if (copy_from_user(usercmd, ubuf, cnt))
+ return -EFAULT;
+
+ if (usercmd[0] == '1')
+ __nouveau_reset_device(ndev, true);
+
+ return cnt;
+}
+
+static const struct file_operations nouveau_reset_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .write = nouveau_reset_write,
+ .llseek = noop_llseek,
+};
+
+void nouveau_reset_debugfs_fini(struct drm_minor *minor)
+{
+ struct drm_device *dev = minor->dev;
+ struct nouveau_device *ndev = dev->dev_private;
+
+ if (ndev->debugfs.reset) {
+ debugfs_remove(ndev->debugfs.reset);
+ ndev->debugfs.reset = NULL;
+ }
+}
+
+
+void nouveau_reset_debugfs_init(struct drm_minor *minor)
+{
+ struct drm_device *dev = minor->dev;
+ struct nouveau_device *ndev = dev->dev_private;
+
+ ndev->debugfs.reset = debugfs_create_file("reset", 0200,
+ minor->debugfs_root, ndev, &nouveau_reset_fops);
+ if (IS_ERR_OR_NULL(ndev->debugfs.reset))
+ ndev->debugfs.reset = NULL;
+
+}
+#endif
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
index 628c46c..304b6a1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -241,6 +241,8 @@ nouveau_card_init(struct nouveau_device *ndev)
if (ret)
goto out;
engine = &ndev->subsys;
+ intr_rwsem_init(&ndev->ioctls_rwsem);
+ mutex_init(&ndev->reset_lock);
spin_lock_init(&ndev->channels.lock);
spin_lock_init(&ndev->tile.lock);
spin_lock_init(&ndev->context_switch_lock);
@@ -323,6 +325,7 @@ nouveau_card_init(struct nouveau_device *ndev)
nouveau_fbcon_init(ndev);
}
+ nouveau_reset_debugfs_init(dev->primary);
return 0;
@@ -354,6 +357,8 @@ static void nouveau_card_takedown(struct nouveau_device *ndev)
struct nouveau_subsys *engine = &ndev->subsys;
struct drm_device *dev = ndev->dev;
+ nouveau_reset_debugfs_fini(dev->primary);
+
if (dev->mode_config.num_crtc) {
nouveau_fbcon_fini(ndev);
nouveau_display_fini(ndev);
@@ -528,6 +533,7 @@ int nouveau_load(struct drm_device *dev, unsigned long flags)
}
dev->dev_private = ndev;
ndev->dev = dev;
+ atomic_set(&ndev->gpureset_in_progress, 0);
pci_set_master(dev->pdev);
diff --git a/drivers/gpu/drm/nouveau/nv50_graph.c b/drivers/gpu/drm/nouveau/nv50_graph.c
index ef6757f..26728100 100644
--- a/drivers/gpu/drm/nouveau/nv50_graph.c
+++ b/drivers/gpu/drm/nouveau/nv50_graph.c
@@ -247,13 +247,14 @@ nv84_graph_tlb_flush(struct nouveau_device *ndev, int engine)
break;
}
} while (!idle &&
- !(timeout = ptimer->read(ptimer) - start > 2000000000));
+ !(timeout = ptimer->read(ptimer) - start > nv_timeout(ndev)));
if (timeout) {
- NV_ERROR(ndev, "PGRAPH TLB flush idle timeout fail: "
- "0x%08x 0x%08x 0x%08x 0x%08x\n",
- nv_rd32(ndev, 0x400700), nv_rd32(ndev, 0x400380),
- nv_rd32(ndev, 0x400384), nv_rd32(ndev, 0x400388));
+ if (!nouveau_gpu_reset_in_progress(ndev))
+ NV_ERROR(ndev, "PGRAPH TLB flush idle timeout fail: "
+ "0x%08x 0x%08x 0x%08x 0x%08x\n",
+ nv_rd32(ndev, 0x400700), nv_rd32(ndev, 0x400380),
+ nv_rd32(ndev, 0x400384), nv_rd32(ndev, 0x400388));
ret = -EIO;
}
--
1.7.8.6
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v2 4/4] drm/nouveau: gpu lockup recovery
[not found] ` <1335388836-13127-4-git-send-email-marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
` (2 preceding siblings ...)
2012-05-27 19:52 ` Marcin Slusarz
@ 2012-08-05 21:15 ` Marcin Slusarz
3 siblings, 0 replies; 12+ messages in thread
From: Marcin Slusarz @ 2012-08-05 21:15 UTC (permalink / raw)
To: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Hi
I refreshed this patchset to current nouveau git.
http://people.freedesktop.org/~mslusarz/gpu-lockup-recovery/
Marcin
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2012-08-05 21:15 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-25 21:20 [PATCH v2 4/4] drm/nouveau: gpu lockup recovery Marcin Slusarz
2012-04-25 21:32 ` Marcin Slusarz
[not found] ` <1335388836-13127-4-git-send-email-marcin.slusarz-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2012-04-26 7:32 ` Ben Skeggs
2012-04-28 14:49 ` Marcin Slusarz
[not found] ` <20120428144956.GA10116-OI9uyE9O0yo@public.gmane.org>
2012-05-02 11:28 ` Ben Skeggs
2012-05-02 13:33 ` Martin Peres
[not found] ` <4FA137C4.3000900-GANU6spQydw@public.gmane.org>
2012-05-02 13:48 ` Ben Skeggs
2012-05-02 13:53 ` Martin Peres
2012-04-28 14:56 ` Marcin Slusarz
[not found] ` <20120428145615.GB10116-OI9uyE9O0yo@public.gmane.org>
2012-04-30 9:47 ` Martin Peres
2012-05-27 19:52 ` Marcin Slusarz
2012-08-05 21:15 ` Marcin Slusarz
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.