public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Maarten Lankhorst <maarten.lankhorst@canonical.com>
To: airlied@linux.ie
Cc: thellstrom@vmware.com, nouveau@lists.freedesktop.org,
	linux-kernel@vger.kernel.org, dri-devel@lists.freedesktop.org,
	bskeggs@redhat.com, alexander.deucher@amd.com,
	christian.koenig@amd.com
Subject: [PATCH 09/19] drm/radeon: handle lockup in delayed work, v2
Date: Thu, 31 Jul 2014 17:33:42 +0200	[thread overview]
Message-ID: <20140731153342.15061.54264.stgit@patser> (raw)
In-Reply-To: <20140731153245.15061.63023.stgit@patser>

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
V1 had a nasty bug that broke GPU lockup recovery. The fix is to not
let radeon_fence_driver_check_lockup take exclusive_lock, and to
cancel the delayed check during lockup recovery instead.
---
 drivers/gpu/drm/radeon/radeon.h        |    3 +
 drivers/gpu/drm/radeon/radeon_device.c |    5 +
 drivers/gpu/drm/radeon/radeon_fence.c  |  124 ++++++++++++++++++--------------
 drivers/gpu/drm/radeon/radeon_ring.c   |    1 
 4 files changed, 77 insertions(+), 56 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 60c47f829122..b01d88fc10cb 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -347,6 +347,8 @@ struct radeon_fence_driver {
 	uint64_t			sync_seq[RADEON_NUM_RINGS];
 	atomic64_t			last_seq;
 	bool				initialized;
+	struct delayed_work	fence_check_work;
+	struct radeon_device	*rdev;
 };
 
 struct radeon_fence {
@@ -360,6 +362,7 @@ struct radeon_fence {
 
 int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring);
 int radeon_fence_driver_init(struct radeon_device *rdev);
+void radeon_fence_cancel_delayed_check(struct radeon_device *rdev, int ring);
 void radeon_fence_driver_fini(struct radeon_device *rdev);
 void radeon_fence_driver_force_completion(struct radeon_device *rdev);
 int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 697add2cd4e3..21efd32d07ee 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1637,6 +1637,11 @@ int radeon_gpu_reset(struct radeon_device *rdev)
 	radeon_save_bios_scratch_regs(rdev);
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
+
+	/* kill all the hangcheck tests too */
+	for (i = 0; i < RADEON_NUM_RINGS; ++i)
+		radeon_fence_cancel_delayed_check(rdev, i);
+
 	radeon_pm_suspend(rdev);
 	radeon_suspend(rdev);
 
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 913787085dfa..c055acc3a271 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev,
 	return 0;
 }
 
-/**
- * radeon_fence_process - process a fence
- *
- * @rdev: radeon_device pointer
- * @ring: ring index the fence is associated with
- *
- * Checks the current fence value and wakes the fence queue
- * if the sequence number has increased (all asics).
- */
-void radeon_fence_process(struct radeon_device *rdev, int ring)
+static bool __radeon_fence_process(struct radeon_device *rdev, int ring)
 {
 	uint64_t seq, last_seq, last_emitted;
 	unsigned count_loop = 0;
@@ -190,7 +181,51 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
 		}
 	} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
 
-	if (wake)
+	if (seq < last_emitted)
+		mod_delayed_work(system_power_efficient_wq,
+				 &rdev->fence_drv[ring].fence_check_work,
+				 RADEON_FENCE_JIFFIES_TIMEOUT);
+
+	return wake;
+}
+
+static void radeon_fence_driver_check_lockup(struct work_struct *work)
+{
+	struct radeon_fence_driver *fence_drv;
+	struct radeon_device *rdev;
+	unsigned long iring;
+
+	fence_drv = container_of(work, struct radeon_fence_driver, fence_check_work.work);
+	rdev = fence_drv->rdev;
+	iring = fence_drv - &rdev->fence_drv[0];
+
+	if (__radeon_fence_process(rdev, iring))
+		wake_up_all(&rdev->fence_queue);
+	else if (radeon_ring_is_lockup(rdev, iring, &rdev->ring[iring])) {
+		/* good news we believe it's a lockup */
+		dev_warn(rdev->dev, "GPU lockup (current fence id "
+			 "0x%016llx last fence id 0x%016llx on ring %ld)\n",
+			 (uint64_t)atomic64_read(&fence_drv->last_seq),
+			 fence_drv->sync_seq[iring], iring);
+
+		/* remember that we need a reset */
+		rdev->needs_reset = true;
+		wake_up_all(&rdev->fence_queue);
+	}
+}
+
+/**
+ * radeon_fence_process - process a fence
+ *
+ * @rdev: radeon_device pointer
+ * @ring: ring index the fence is associated with
+ *
+ * Checks the current fence value and wakes the fence queue
+ * if the sequence number has increased (all asics).
+ */
+void radeon_fence_process(struct radeon_device *rdev, int ring)
+{
+	if (__radeon_fence_process(rdev, ring))
 		wake_up_all(&rdev->fence_queue);
 }
 
@@ -302,9 +337,10 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
 {
 	uint64_t last_seq[RADEON_NUM_RINGS];
 	bool signaled;
-	int i, r;
+	int i;
 
 	while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
+		long r;
 
 		/* Save current sequence values, used to check for GPU lockups */
 		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -319,11 +355,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
 		if (intr) {
 			r = wait_event_interruptible_timeout(rdev->fence_queue, (
 				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
-				 || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
+				 || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
 		} else {
 			r = wait_event_timeout(rdev->fence_queue, (
 				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
-				 || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
+				 || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
 		}
 
 		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -334,50 +370,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
 			trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
 		}
 
-		if (unlikely(r < 0))
+		if (r < 0)
 			return r;
 
-		if (unlikely(!signaled)) {
-			if (rdev->needs_reset)
-				return -EDEADLK;
-
-			/* we were interrupted for some reason and fence
-			 * isn't signaled yet, resume waiting */
-			if (r)
-				continue;
-
-			for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-				if (!target_seq[i])
-					continue;
-
-				if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
-					break;
-			}
-
-			if (i != RADEON_NUM_RINGS)
-				continue;
-
-			for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-				if (!target_seq[i])
-					continue;
-
-				if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
-					break;
-			}
-
-			if (i < RADEON_NUM_RINGS) {
-				/* good news we believe it's a lockup */
-				dev_warn(rdev->dev, "GPU lockup (waiting for "
-					 "0x%016llx last fence id 0x%016llx on"
-					 " ring %d)\n",
-					 target_seq[i], last_seq[i], i);
-
-				/* remember that we need an reset */
-				rdev->needs_reset = true;
-				wake_up_all(&rdev->fence_queue);
-				return -EDEADLK;
-			}
-		}
+		if (rdev->needs_reset)
+			return -EDEADLK;
 	}
 	return 0;
 }
@@ -711,6 +708,8 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
 		rdev->fence_drv[ring].sync_seq[i] = 0;
 	atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
 	rdev->fence_drv[ring].initialized = false;
+	INIT_DELAYED_WORK(&rdev->fence_drv[ring].fence_check_work, radeon_fence_driver_check_lockup);
+	rdev->fence_drv[ring].rdev = rdev;
 }
 
 /**
@@ -740,6 +739,17 @@ int radeon_fence_driver_init(struct radeon_device *rdev)
 }
 
 /**
+ * radeon_fence_cancel_delayed_check - cancel all delayed checks on a ring during lockup
+ *
+ * This prevents the lockup check from being done while suspend is running
+ * during a recovery from a lockup.
+ */
+void radeon_fence_cancel_delayed_check(struct radeon_device *rdev, int ring)
+{
+	cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work);
+}
+
+/**
  * radeon_fence_driver_fini - tear down the fence driver
  * for all possible rings.
  *
@@ -755,6 +765,8 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
 	for (ring = 0; ring < RADEON_NUM_RINGS; ring++) {
 		if (!rdev->fence_drv[ring].initialized)
 			continue;
+
+		cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work);
 		r = radeon_fence_wait_empty(rdev, ring);
 		if (r) {
 			/* no need to trigger GPU reset as we are unloading */
diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
index f8050f5429e2..594d871a6fce 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -261,6 +261,7 @@ int radeon_ib_ring_tests(struct radeon_device *rdev)
 
 		r = radeon_ib_test(rdev, i, ring);
 		if (r) {
+			radeon_fence_cancel_delayed_check(rdev, i);
 			ring->ready = false;
 			rdev->needs_reset = false;
 


  parent reply	other threads:[~2014-07-31 15:34 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-07-31 15:32 [PATCH 01/19] fence: add debugging lines to fence_is_signaled for the callback Maarten Lankhorst
2014-07-31 15:32 ` [PATCH 02/19] drm/ttm: add interruptible parameter to ttm_eu_reserve_buffers Maarten Lankhorst
2014-07-31 15:33 ` [PATCH 03/19] drm/ttm: kill off some members to ttm_validate_buffer Maarten Lankhorst
2014-07-31 15:33 ` [PATCH 04/19] drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep Maarten Lankhorst
2014-07-31 15:33 ` [PATCH 05/19] drm/nouveau: require reservations for nouveau_fence_sync and nouveau_bo_fence Maarten Lankhorst
2014-07-31 15:33 ` [PATCH 06/19] drm/ttm: call ttm_bo_wait while inside a reservation Maarten Lankhorst
2014-07-31 15:33 ` [PATCH 07/19] drm/ttm: kill fence_lock Maarten Lankhorst
2014-07-31 15:33 ` [PATCH 08/19] drm/nouveau: rework to new fence interface Maarten Lankhorst
2014-07-31 15:33 ` Maarten Lankhorst [this message]
2014-08-01 16:35   ` [PATCH 09/19] drm/radeon: handle lockup in delayed work, v2 Christian König
2014-08-01 17:46     ` Maarten Lankhorst
2014-08-04  8:36       ` Christian König
2014-08-04  8:55         ` Maarten Lankhorst
2014-08-04 11:57           ` Christian König
2014-08-04 13:34             ` Maarten Lankhorst
2014-08-04 14:37               ` Christian König
2014-08-04 14:40                 ` Maarten Lankhorst
2014-08-04 14:45                   ` Christian König
2014-08-04 14:58                     ` Maarten Lankhorst
2014-08-04 15:04                       ` Christian König
2014-08-04 15:09                         ` Maarten Lankhorst
2014-08-04 17:04                           ` Christian König
2014-08-05  8:16                             ` Daniel Vetter
2014-08-05  9:34                             ` Maarten Lankhorst
2014-07-31 15:33 ` [PATCH 10/19] drm/radeon: add timeout argument to radeon_fence_wait_seq Maarten Lankhorst
2014-07-31 15:33 ` [PATCH 11/19] drm/radeon: use common fence implementation for fences, v2 Maarten Lankhorst
2014-07-31 15:34 ` [PATCH 12/19] drm/qxl: rework to new fence interface Maarten Lankhorst
2014-07-31 15:34 ` [PATCH 13/19] drm/vmwgfx: get rid of different types of fence_flags entirely Maarten Lankhorst
2014-07-31 15:34 ` [PATCH 14/19] drm/vmwgfx: rework to new fence interface Maarten Lankhorst
2014-07-31 15:34 ` [PATCH 15/19] drm/ttm: flip the switch, and convert to dma_fence Maarten Lankhorst
2014-07-31 15:34 ` [PATCH 16/19] drm/nouveau: use rcu in nouveau_gem_ioctl_cpu_prep Maarten Lankhorst
2014-07-31 15:34 ` [PATCH 17/19] drm/radeon: use rcu waits in some ioctls Maarten Lankhorst
2014-07-31 15:34 ` [PATCH 18/19] drm/vmwgfx: use rcu in vmw_user_dmabuf_synccpu_grab Maarten Lankhorst
2014-07-31 15:34 ` [PATCH 19/19] drm/ttm: use rcu in core ttm Maarten Lankhorst

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140731153342.15061.54264.stgit@patser \
    --to=maarten.lankhorst@canonical.com \
    --cc=airlied@linux.ie \
    --cc=alexander.deucher@amd.com \
    --cc=bskeggs@redhat.com \
    --cc=christian.koenig@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=nouveau@lists.freedesktop.org \
    --cc=thellstrom@vmware.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox