All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
@ 2012-04-22 22:18 Marcin Slusarz
  2012-04-23  8:43 ` Martin Peres
  2012-04-23 16:46 ` Martin Peres
  0 siblings, 2 replies; 9+ messages in thread
From: Marcin Slusarz @ 2012-04-22 22:18 UTC (permalink / raw)
  To: nouveau, dri-devel, Ben Skeggs

Overall idea:
Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
handle them at ioctl level, reset the GPU and repeat last ioctl.

GPU reset is done by doing a suspend / resume cycle with a few tweaks:
- CPU-only bo eviction
- ignoring vm flush / fence timeouts
- shortening waits

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
---
Tested only on nv92.
---
 drivers/gpu/drm/nouveau/Makefile           |    2 +-
 drivers/gpu/drm/nouveau/nouveau_bo.c       |    2 +-
 drivers/gpu/drm/nouveau/nouveau_channel.c  |    5 +-
 drivers/gpu/drm/nouveau/nouveau_drv.c      |    3 +-
 drivers/gpu/drm/nouveau/nouveau_drv.h      |   33 ++++++-
 drivers/gpu/drm/nouveau/nouveau_fence.c    |    7 +-
 drivers/gpu/drm/nouveau/nouveau_gem.c      |   14 +++-
 drivers/gpu/drm/nouveau/nouveau_notifier.c |    3 +
 drivers/gpu/drm/nouveau/nouveau_object.c   |    6 +
 drivers/gpu/drm/nouveau/nouveau_reset.c    |  144 ++++++++++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_state.c    |    5 +
 drivers/gpu/drm/nouveau/nv50_graph.c       |   11 +-
 12 files changed, 221 insertions(+), 14 deletions(-)
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c

diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
index 03860f5..77d0c33 100644
--- a/drivers/gpu/drm/nouveau/Makefile
+++ b/drivers/gpu/drm/nouveau/Makefile
@@ -9,7 +9,7 @@ nouveau-y := nouveau_drv.o nouveau_state.o nouveau_channel.o nouveau_mem.o \
              nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
              nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
              nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
-             nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
+             nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
 	     nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_temp.o \
 	     nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
              nv04_timer.o \
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 5b0dc50..7de6cad 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -936,7 +936,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict, bool intr,
 	}
 
 	/* Software copy if the card isn't up and running yet. */
-	if (!dev_priv->channel) {
+	if (!dev_priv->channel || nouveau_gpu_reset_in_progress(dev_priv->dev)) {
 		ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, no_wait_gpu, new_mem);
 		goto out;
 	}
diff --git a/drivers/gpu/drm/nouveau/nouveau_channel.c b/drivers/gpu/drm/nouveau/nouveau_channel.c
index 846afb0..c0fa5a7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_channel.c
+++ b/drivers/gpu/drm/nouveau/nouveau_channel.c
@@ -420,7 +420,7 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
 				    init->fb_ctxdma_handle,
 				    init->tt_ctxdma_handle);
 	if (ret)
-		return ret;
+		goto out;
 	init->channel  = chan->id;
 
 	if (nouveau_vram_pushbuf == 0) {
@@ -450,6 +450,9 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
 	if (ret == 0)
 		atomic_inc(&chan->users); /* userspace reference */
 	nouveau_channel_put(&chan);
+out:
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 090fff6..22c435f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -237,7 +237,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state)
 		if (!dev_priv->eng[e])
 			continue;
 
-		ret = dev_priv->eng[e]->fini(dev, e, true);
+		ret = dev_priv->eng[e]->fini(dev, e, !nouveau_gpu_reset_in_progress(dev));
 		if (ret) {
 			NV_ERROR(dev, "... engine %d failed: %d\n", e, ret);
 			goto out_abort;
@@ -483,6 +483,7 @@ static struct drm_driver driver = {
 	.disable_vblank = nouveau_vblank_disable,
 	.reclaim_buffers = drm_core_reclaim_buffers,
 	.ioctls = nouveau_ioctls,
+	.ioctls_need_rwsem = true,
 	.fops = &nouveau_driver_fops,
 	.gem_init_object = nouveau_gem_object_new,
 	.gem_free_object = nouveau_gem_object_del,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index d120baf..01500e1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -708,6 +708,10 @@ struct drm_nouveau_private {
 	struct drm_device *dev;
 	bool noaccel;
 
+	struct mutex reset_lock;
+	atomic_t gpureset_in_progress;
+	unsigned long last_gpu_reset;
+
 	/* the card type, takes NV_* as values */
 	enum nouveau_card_type card_type;
 	/* exact chipset, derived from NV_PMC_BOOT_0 */
@@ -841,6 +845,7 @@ struct drm_nouveau_private {
 
 	struct {
 		struct dentry *channel_root;
+		struct dentry *reset;
 	} debugfs;
 
 	struct nouveau_fbdev *nfbdev;
@@ -1537,6 +1542,20 @@ int nouveau_display_dumb_map_offset(struct drm_file *, struct drm_device *,
 				    uint32_t handle, uint64_t *offset);
 int nouveau_display_dumb_destroy(struct drm_file *, struct drm_device *,
 				 uint32_t handle);
+/* nouveau_reset.c: GPU lockup recovery (reset entry point, debugfs trigger) */
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+void nouveau_reset_debugfs_fini(struct drm_minor *minor);
+void nouveau_reset_debugfs_init(struct drm_minor *minor);
+#else /* !CONFIG_DRM_NOUVEAU_DEBUG: the debugfs hooks become empty stubs */
+static inline void nouveau_reset_debugfs_fini(struct drm_minor *minor) {}
+static inline void nouveau_reset_debugfs_init(struct drm_minor *minor) {}
+#endif /* CONFIG_DRM_NOUVEAU_DEBUG */
+int  nouveau_reset_device(struct drm_device *dev);
+static inline bool nouveau_gpu_reset_in_progress(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	return atomic_read(&dev_priv->gpureset_in_progress) != 0; /* nonzero = resetting */
+}
 
 /* nv10_gpio.c */
 int nv10_gpio_init(struct drm_device *dev);
@@ -1632,12 +1651,20 @@ static inline void nv_wr08(struct drm_device *dev, unsigned reg, u8 val)
 	iowrite8(val, dev_priv->mmio + reg);
 }
 
+static inline uint64_t nv_timeout(struct drm_device *dev)
+{
+	/* ns: 50 ms during a GPU reset, else 2 s; constants avoid a u64 divide */
+	if (nouveau_gpu_reset_in_progress(dev))
+		return 50000000ULL;
+	return 2000000000ULL;
+}
+
 #define nv_wait(dev, reg, mask, val) \
-	nouveau_wait_eq(dev, 2000000000ULL, (reg), (mask), (val))
+	nouveau_wait_eq(dev, nv_timeout(dev), (reg), (mask), (val))
 #define nv_wait_ne(dev, reg, mask, val) \
-	nouveau_wait_ne(dev, 2000000000ULL, (reg), (mask), (val))
+	nouveau_wait_ne(dev, nv_timeout(dev), (reg), (mask), (val))
 #define nv_wait_cb(dev, func, data) \
-	nouveau_wait_cb(dev, 2000000000ULL, (func), (data))
+	nouveau_wait_cb(dev, nv_timeout(dev), (func), (data))
 
 /* PRAMIN access */
 static inline u32 nv_ri32(struct drm_device *dev, unsigned offset)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 59f92e9..8c973ab 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -233,17 +233,22 @@ int
 __nouveau_fence_wait(void *sync_obj, void *sync_arg, bool lazy, bool intr)
 {
 	struct nouveau_fence *fence = nouveau_fence(sync_obj);
+	struct drm_device *dev = fence->channel->dev;
 	unsigned long timeout = fence->emitted_at + 3 * DRM_HZ;
 	unsigned long sleep_time = NSEC_PER_MSEC / 1000;
 	ktime_t t;
 	int ret = 0;
 
+	if (nouveau_gpu_reset_in_progress(dev))
+		timeout = fence->emitted_at + DRM_HZ / 5;
+
 	while (1) {
 		if (__nouveau_fence_signalled(sync_obj, sync_arg))
 			break;
 
 		if (time_after_eq(jiffies, timeout)) {
-			ret = -EBUSY;
+			if (!nouveau_gpu_reset_in_progress(dev))
+				ret = -EIO;
 			break;
 		}
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index ed52a6f..f9bbcc0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -214,7 +214,7 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
 			      req->info.domain, req->info.tile_mode,
 			      req->info.tile_flags, &nvbo);
 	if (ret)
-		return ret;
+		goto out;
 
 	ret = drm_gem_handle_create(file_priv, nvbo->gem, &req->info.handle);
 	if (ret == 0) {
@@ -225,6 +225,9 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
 
 	/* drop reference from allocate - handle holds it now */
 	drm_gem_object_unreference_unlocked(nvbo->gem);
+out:
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
@@ -804,6 +807,9 @@ out_next:
 	}
 
 	nouveau_channel_put(&chan);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
@@ -839,6 +845,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
 	ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
 	spin_unlock(&nvbo->bo.bdev->fence_lock);
 	drm_gem_object_unreference_unlocked(gem);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
@@ -863,6 +872,9 @@ nouveau_gem_ioctl_info(struct drm_device *dev, void *data,
 
 	ret = nouveau_gem_info(file_priv, gem, req);
 	drm_gem_object_unreference_unlocked(gem);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_notifier.c b/drivers/gpu/drm/nouveau/nouveau_notifier.c
index 2ef883c..e224b1c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_notifier.c
+++ b/drivers/gpu/drm/nouveau/nouveau_notifier.c
@@ -200,5 +200,8 @@ nouveau_ioctl_notifier_alloc(struct drm_device *dev, void *data,
 	ret = nouveau_notifier_alloc(chan, na->handle, na->size, 0, 0x1000,
 				     &na->offset);
 	nouveau_channel_put(&chan);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_object.c b/drivers/gpu/drm/nouveau/nouveau_object.c
index cc419fa..ba592b0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_object.c
+++ b/drivers/gpu/drm/nouveau/nouveau_object.c
@@ -973,6 +973,9 @@ int nouveau_ioctl_grobj_alloc(struct drm_device *dev, void *data,
 
 out:
 	nouveau_channel_put(&chan);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
@@ -992,6 +995,9 @@ int nouveau_ioctl_gpuobj_free(struct drm_device *dev, void *data,
 
 	ret = nouveau_ramht_remove(chan, objfree->handle);
 	nouveau_channel_put(&chan);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c b/drivers/gpu/drm/nouveau/nouveau_reset.c
new file mode 100644
index 0000000..93af3a1
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2012 Marcin Slusarz <marcin.slusarz@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include "drmP.h"
+#include "nouveau_drv.h"
+
+/* Suspend half of a GPU reset; returns false with all state rolled back. */
+static bool off(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	pm_message_t pmm = { .event = PM_EVENT_SUSPEND };
+
+	atomic_inc(&dev_priv->gpureset_in_progress);
+	down_write(&dev->ioctls_rwsem);
+
+	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+	if (nouveau_pci_suspend(dev->pdev, pmm)) {
+		/* the caller retries off(), so the increment must be undone too */
+		dev->switch_power_state = DRM_SWITCH_POWER_ON;
+		up_write(&dev->ioctls_rwsem);
+		atomic_dec(&dev_priv->gpureset_in_progress);
+		return false;
+	}
+
+	dev->switch_power_state = DRM_SWITCH_POWER_OFF;
+	return true;
+}
+
+/* Resume half of a GPU reset; pairs with a successful off(). */
+static void on(struct drm_device *dev)
+{
+	struct drm_nouveau_private *priv = dev->dev_private;
+
+	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+	/* the decrement precedes the resume call; keep this ordering */
+	atomic_dec(&priv->gpureset_in_progress);
+	nouveau_pci_resume(dev->pdev);
+	dev->switch_power_state = DRM_SWITCH_POWER_ON;
+	priv->last_gpu_reset = jiffies;
+	up_write(&dev->ioctls_rwsem); /* write side was taken in off() */
+}
+
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+/* debugfs "reset" write handler: input starting with '1' forces a GPU reset. */
+static ssize_t nouveau_reset_write(struct file *filp, const char __user *ubuf,
+			     size_t cnt, loff_t *ppos)
+{
+	struct drm_device *dev = filp->private_data;
+	char usercmd[2] = { 0 }; /* zeroed so a 0-byte write reads no garbage */
+
+	if (cnt > sizeof(usercmd))
+		cnt = sizeof(usercmd);
+	if (copy_from_user(usercmd, ubuf, cnt))
+		return -EFAULT;
+
+	if (usercmd[0] == '1') {
+		down_read(&dev->ioctls_rwsem);
+		nouveau_reset_device(dev);
+		up_read(&dev->ioctls_rwsem);
+	}
+	return cnt;
+}
+
+static const struct file_operations nouveau_reset_fops = {
+	.owner	= THIS_MODULE,
+	.llseek	= noop_llseek,
+	.open	= simple_open,	/* presumably sets filp->private_data; confirm */
+	.write	= nouveau_reset_write,	/* writing '1' requests a reset */
+};
+
+/* Remove the debugfs "reset" file, if one was created for this minor. */
+void nouveau_reset_debugfs_fini(struct drm_minor *minor)
+{
+	struct drm_nouveau_private *priv = minor->dev->dev_private;
+
+	if (!priv->debugfs.reset)
+		return;
+	debugfs_remove(priv->debugfs.reset);
+	priv->debugfs.reset = NULL;
+}
+
+
+/* Create the write-only (0200) debugfs "reset" file; NULL is kept on failure. */
+void nouveau_reset_debugfs_init(struct drm_minor *minor)
+{
+	struct drm_device *dev = minor->dev;
+	struct drm_nouveau_private *priv = dev->dev_private;
+	struct dentry *entry;
+
+	entry = debugfs_create_file("reset", 0200, minor->debugfs_root, dev,
+				    &nouveau_reset_fops);
+	priv->debugfs.reset = IS_ERR_OR_NULL(entry) ? NULL : entry;
+}
+#endif
+
+/* Reset a hung GPU with a suspend/resume cycle; always returns -EAGAIN. */
+int nouveau_reset_device(struct drm_device *dev)
+{
+	struct drm_nouveau_private *priv = dev->dev_private;
+	unsigned long began, took;
+
+	/* trylock failed: another task is already resetting the GPU */
+	if (!mutex_trylock(&priv->reset_lock))
+		return -EAGAIN;
+
+	/* at most one reset per 10 s; on() stamps last_gpu_reset */
+	if (time_after_eq(jiffies, priv->last_gpu_reset + 10 * DRM_HZ)) {
+		/* off() takes ioctls_rwsem for write, so drop our read hold */
+		up_read(&dev->ioctls_rwsem);
+		NV_INFO(dev, "GPU lockup detected, resetting...\n");
+		began = jiffies;
+		while (!off(dev))
+			;
+		on(dev);
+		took = (jiffies - began) / DRM_HZ;
+		NV_INFO(dev, "GPU reset done, took %lu s\n", took);
+		down_read(&dev->ioctls_rwsem);
+	}
+	mutex_unlock(&priv->reset_lock);
+	return -EAGAIN;
+}
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
index afec760..2e981a8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -697,6 +697,7 @@ nouveau_card_init(struct drm_device *dev)
 	if (ret)
 		goto out;
 	engine = &dev_priv->engine;
+	mutex_init(&dev_priv->reset_lock);
 	spin_lock_init(&dev_priv->channels.lock);
 	spin_lock_init(&dev_priv->tile.lock);
 	spin_lock_init(&dev_priv->context_switch_lock);
@@ -886,6 +887,7 @@ nouveau_card_init(struct drm_device *dev)
 
 		nouveau_fbcon_init(dev);
 	}
+	nouveau_reset_debugfs_init(dev->primary);
 
 	return 0;
 
@@ -943,6 +945,8 @@ static void nouveau_card_takedown(struct drm_device *dev)
 	struct nouveau_engine *engine = &dev_priv->engine;
 	int e;
 
+	nouveau_reset_debugfs_fini(dev->primary);
+
 	if (dev->mode_config.num_crtc) {
 		nouveau_fbcon_fini(dev);
 		nouveau_display_fini(dev);
@@ -1129,6 +1133,7 @@ int nouveau_load(struct drm_device *dev, unsigned long flags)
 	}
 	dev->dev_private = dev_priv;
 	dev_priv->dev = dev;
+	atomic_set(&dev_priv->gpureset_in_progress, 0);
 
 	pci_set_master(dev->pdev);
 
diff --git a/drivers/gpu/drm/nouveau/nv50_graph.c b/drivers/gpu/drm/nouveau/nv50_graph.c
index a61853f..d0a2e50 100644
--- a/drivers/gpu/drm/nouveau/nv50_graph.c
+++ b/drivers/gpu/drm/nouveau/nv50_graph.c
@@ -440,13 +440,14 @@ nv84_graph_tlb_flush(struct drm_device *dev, int engine)
 			ret = -ERESTARTSYS;
 			break;
 		}
-	} while (!idle && !(timeout = ptimer->read(dev) - start > 2000000000));
+	} while (!idle && !(timeout = ptimer->read(dev) - start > nv_timeout(dev)));
 
 	if (timeout) {
-		NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
-			      "0x%08x 0x%08x 0x%08x 0x%08x\n",
-			 nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
-			 nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
+		if (!nouveau_gpu_reset_in_progress(dev))
+			NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
+				"0x%08x 0x%08x 0x%08x 0x%08x\n",
+				nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
+				nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
 		ret = -EIO;
 	}
 
-- 
1.7.8.5

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
  2012-04-22 22:18 [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery Marcin Slusarz
@ 2012-04-23  8:43 ` Martin Peres
  2012-04-23 16:32   ` Marcin Slusarz
  2012-04-23 16:46 ` Martin Peres
  1 sibling, 1 reply; 9+ messages in thread
From: Martin Peres @ 2012-04-23  8:43 UTC (permalink / raw)
  To: dri-devel

Le 23/04/2012 00:18, Marcin Slusarz a écrit :
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
>
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
>
> Signed-off-by: Marcin Slusarz<marcin.slusarz@gmail.com>
> ---
> Tested only on nv92.
Hi Marcin,

I'm really busy at the moment but I'm glad to see such patches coming out.
I'll try them out ASAP!

Do you have a recommended way to test your patch set?

Thanks,

Martin

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
  2012-04-23  8:43 ` Martin Peres
@ 2012-04-23 16:32   ` Marcin Slusarz
       [not found]     ` <20120423163257.GA2886-OI9uyE9O0yo@public.gmane.org>
  2012-04-25 11:06     ` [Nouveau] " Christoph Bumiller
  0 siblings, 2 replies; 9+ messages in thread
From: Marcin Slusarz @ 2012-04-23 16:32 UTC (permalink / raw)
  To: Martin Peres; +Cc: nouveau, dri-devel

On Mon, Apr 23, 2012 at 10:43:08AM +0200, Martin Peres wrote:
> Le 23/04/2012 00:18, Marcin Slusarz a écrit :
> > Overall idea:
> > Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> > handle them at ioctl level, reset the GPU and repeat last ioctl.
> >
> > GPU reset is done by doing suspend / resume cycle with few tweaks:
> > - CPU-only bo eviction
> > - ignoring vm flush / fence timeouts
> > - shortening waits
> >
> > Signed-off-by: Marcin Slusarz<marcin.slusarz@gmail.com>
> > ---
> > Tested only on nv92.
> Hi Marcin,
> 
> I'm really busy at the moment but I'm glad to see such patches coming out.
> I'll try them out ASAP!
> 
> Do you have a recommended way to test your patch set?

Just run piglit. Even "quick" tests can cause ~5 lockups (it eventually messes
up DDX channel, but this patchset can't fix this case).
You can run fs-discard-exit-2 test first - for me it causes instant GPU lockup.

Marcin
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
  2012-04-22 22:18 [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery Marcin Slusarz
  2012-04-23  8:43 ` Martin Peres
@ 2012-04-23 16:46 ` Martin Peres
  2012-04-23 17:33   ` Marcin Slusarz
  1 sibling, 1 reply; 9+ messages in thread
From: Martin Peres @ 2012-04-23 16:46 UTC (permalink / raw)
  To: dri-devel

Hey,

Just a minor mistake spotted while skimming through the patch.

Le 23/04/2012 00:18, Marcin Slusarz a écrit :
> +static inline uint64_t nv_timeout(struct drm_device *dev)
> +{
> +	uint64_t tm = 2000000000ULL;
> +	if (nouveau_gpu_reset_in_progress(dev))
> +		tm /= 40; /* 50ms */
This will cause a problem on 32 bit kernels. You should use do_div.

Martin

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
       [not found]     ` <20120423163257.GA2886-OI9uyE9O0yo@public.gmane.org>
@ 2012-04-23 16:56       ` Martin Peres
  2012-04-24 19:31         ` Marcin Slusarz
  0 siblings, 1 reply; 9+ messages in thread
From: Martin Peres @ 2012-04-23 16:56 UTC (permalink / raw)
  To: Marcin Slusarz
  Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Le 23/04/2012 18:32, Marcin Slusarz a écrit :
>
> Just run piglit. Even "quick" tests can cause ~5 lockups (it eventually messes
> up DDX channel, but this patchset can't fix this case).
> You can run fs-discard-exit-2 test first - for me it causes instant GPU lockup.
>
> Marcin
Great, Thanks.

Did you have a look at 
https://bugs.freedesktop.org/show_bug.cgi?id=40886 and 
http://xorg.freedesktop.org/wiki/SummerOfCodeIdeas ?
The Ubuntu xorg devs were looking for something like this, but they also 
wanted a lockup report. Are you also interested in working on it?

Martin
_______________________________________________
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
  2012-04-23 16:46 ` Martin Peres
@ 2012-04-23 17:33   ` Marcin Slusarz
  0 siblings, 0 replies; 9+ messages in thread
From: Marcin Slusarz @ 2012-04-23 17:33 UTC (permalink / raw)
  To: Martin Peres; +Cc: nouveau, dri-devel

On Mon, Apr 23, 2012 at 06:46:41PM +0200, Martin Peres wrote:
> Hey,
> 
> Just a minor mistake spotted while skimming through the patch.
> 
> Le 23/04/2012 00:18, Marcin Slusarz a écrit :
> > +static inline uint64_t nv_timeout(struct drm_device *dev)
> > +{
> > +	uint64_t tm = 2000000000ULL;
> > +	if (nouveau_gpu_reset_in_progress(dev))
> > +		tm /= 40; /* 50ms */
> This will cause a problem on 32 bit kernels. You should use do_div.
> 

Thanks. I'll fix this later.

Marcin
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
  2012-04-23 16:56       ` Martin Peres
@ 2012-04-24 19:31         ` Marcin Slusarz
       [not found]           ` <20120424193150.GC2857-OI9uyE9O0yo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Marcin Slusarz @ 2012-04-24 19:31 UTC (permalink / raw)
  To: Martin Peres; +Cc: nouveau, dri-devel

On Mon, Apr 23, 2012 at 06:56:44PM +0200, Martin Peres wrote:
> Le 23/04/2012 18:32, Marcin Slusarz a écrit :
> >
> > Just run piglit. Even "quick" tests can cause ~5 lockups (it eventually messes
> > up DDX channel, but this patchset can't fix this case).
> > You can run fs-discard-exit-2 test first - for me it causes instant GPU lockup.
> >
> > Marcin
> Great, Thanks.
> 
> Did you have a look at 
> https://bugs.freedesktop.org/show_bug.cgi?id=40886 and 
> http://xorg.freedesktop.org/wiki/SummerOfCodeIdeas ?

Yeah, I've seen them some time ago.

> The Ubuntu xorg devs were looking for something like this, but they also 
> wanted a lockup report. Are you also interested on working on it ?

Yes, once this patchset is applied, I'm going to work on improving
error reporting.

Marcin
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
       [not found]           ` <20120424193150.GC2857-OI9uyE9O0yo@public.gmane.org>
@ 2012-04-25  0:32             ` Ben Skeggs
  0 siblings, 0 replies; 9+ messages in thread
From: Ben Skeggs @ 2012-04-25  0:32 UTC (permalink / raw)
  To: Marcin Slusarz
  Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Tue, 2012-04-24 at 21:31 +0200, Marcin Slusarz wrote:
> On Mon, Apr 23, 2012 at 06:56:44PM +0200, Martin Peres wrote:
> > Le 23/04/2012 18:32, Marcin Slusarz a écrit :
> > >
> > > Just run piglit. Even "quick" tests can cause ~5 lockups (it eventually messes
> > > up DDX channel, but this patchset can't fix this case).
> > > You can run fs-discard-exit-2 test first - for me it causes instant GPU lockup.
> > >
> > > Marcin
> > Great, Thanks.
> > 
> > Did you have a look at 
> > https://bugs.freedesktop.org/show_bug.cgi?id=40886 and 
> > http://xorg.freedesktop.org/wiki/SummerOfCodeIdeas ?
> 
> Yeah, I've seen them some time ago.
> 
> > The Ubuntu xorg devs were looking for something like this, but they also 
> > wanted a lockup report. Are you also interested on working on it ?
As I argued at XDC last year, I really question the usefulness of
something like this.  We have stupidly HUGE amounts of state that could
be relevant, and the situations where we even need something like this
are RARE.

I don't want this useless crap in our kernel module just because some
random distro thinks it's so useful, when it's not.  On the very very
rare (I can think of one situation where we've wanted these register
dumps, and they weren't useful even then) occasions we need this info,
we can ask people to install envytools and grab it..

We have a GPU with *very* good error reporting, and we log this to
dmesg.  This is good enough.  Any random errorless lockups are much
harder, and unless you dump *all* the card state right from the memory
controllers, to the clocks, to PFIFO to the particular engine that's
involved.. It's going to be useless.  The problem could be anything.

> 
> Yes, when this patchset will be applied, I'm going to work on improving
> error reporting.
Assuming you're not talking about a register-dump style lockup report
like above, this could be good.  Particularly, fleshing out and
improving/completing each engine's IRQ handlers (which will probably
have the nice side-effect of surviving a few more errors without locking
up) :)

Cheers,
Ben.

> 
> Marcin
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel


_______________________________________________
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Nouveau] [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery
  2012-04-23 16:32   ` Marcin Slusarz
       [not found]     ` <20120423163257.GA2886-OI9uyE9O0yo@public.gmane.org>
@ 2012-04-25 11:06     ` Christoph Bumiller
  1 sibling, 0 replies; 9+ messages in thread
From: Christoph Bumiller @ 2012-04-25 11:06 UTC (permalink / raw)
  To: Marcin Slusarz; +Cc: nouveau, dri-devel

On 04/23/2012 06:32 PM, Marcin Slusarz wrote:
> You can run fs-discard-exit-2 test first - for me it causes instant GPU lockup.
> 

That's because it's designed (but not supposed) to do that, it also
locks up with the blob, it's a harmless shader infinite loop.

(May be a bug in the MPs or a wrong setting somewhere, but they don't
pop the control flow stack if all but killed pixels of a partially
killed quad remain on the current execution path).

Activated the watchdog on nv50 now, too, if you want to produce the
lockup with current mesa git run with NOUVEAU_SHADER_WATCHDOG=0 (yes
when we use COMPUTE it will have to be deactivated as well).

> Marcin
> _______________________________________________
> Nouveau mailing list
> Nouveau@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/nouveau
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2012-04-25 11:06 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-22 22:18 [RFC PATCH 5/5] drm/nouveau: gpu lockup recovery Marcin Slusarz
2012-04-23  8:43 ` Martin Peres
2012-04-23 16:32   ` Marcin Slusarz
     [not found]     ` <20120423163257.GA2886-OI9uyE9O0yo@public.gmane.org>
2012-04-23 16:56       ` Martin Peres
2012-04-24 19:31         ` Marcin Slusarz
     [not found]           ` <20120424193150.GC2857-OI9uyE9O0yo@public.gmane.org>
2012-04-25  0:32             ` Ben Skeggs
2012-04-25 11:06     ` [Nouveau] " Christoph Bumiller
2012-04-23 16:46 ` Martin Peres
2012-04-23 17:33   ` Marcin Slusarz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.