All of lore.kernel.org
 help / color / mirror / Atom feed
From: Maarten Lankhorst <maarten.lankhorst@canonical.com>
To: "dri-devel@lists.freedesktop.org"
	<dri-devel@lists.freedesktop.org>,
	"Christian König" <deathsimple@vodafone.de>
Subject: [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3
Date: Mon, 18 Aug 2014 16:45:59 +0200	[thread overview]
Message-ID: <53F211A7.5080506@canonical.com> (raw)
In-Reply-To: <53F2118F.8000706@canonical.com>

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
V1 had a nasty bug breaking gpu lockup recovery. The fix is not
allowing radeon_fence_driver_check_lockup to take exclusive_lock,
and kill it during lockup recovery instead.
V2 used delayed work that ran during lockup recovery, but required
read lock. I've fixed this by downgrading the write, and retrying if
recovery fails.
---
 drivers/gpu/drm/radeon/radeon.h       |   2 +
 drivers/gpu/drm/radeon/radeon_fence.c | 115 +++++++++++++++++-----------------
 2 files changed, 61 insertions(+), 56 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 1d806983ec7b..29504efe8971 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -355,6 +355,8 @@ struct radeon_fence_driver {
 	uint64_t			sync_seq[RADEON_NUM_RINGS];
 	atomic64_t			last_seq;
 	bool				initialized;
+	struct delayed_work		fence_check_work;
+	struct radeon_device		*rdev;
 };
 
 struct radeon_fence {
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 913787085dfa..94eca53d99f8 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev,
 	return 0;
 }
 
-/**
- * radeon_fence_process - process a fence
- *
- * @rdev: radeon_device pointer
- * @ring: ring index the fence is associated with
- *
- * Checks the current fence value and wakes the fence queue
- * if the sequence number has increased (all asics).
- */
-void radeon_fence_process(struct radeon_device *rdev, int ring)
+static bool __radeon_fence_process(struct radeon_device *rdev, int ring)
 {
 	uint64_t seq, last_seq, last_emitted;
 	unsigned count_loop = 0;
@@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
 		}
 	} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
 
-	if (wake)
+	if (seq < last_emitted && !rdev->in_reset)
+		mod_delayed_work(system_power_efficient_wq,
+				 &rdev->fence_drv[ring].fence_check_work,
+				 RADEON_FENCE_JIFFIES_TIMEOUT);
+
+	return wake;
+}
+
+static void radeon_fence_driver_check_lockup(struct work_struct *work)
+{
+	struct radeon_fence_driver *fence_drv;
+	struct radeon_device *rdev;
+	unsigned long iring;
+
+	fence_drv = container_of(work, struct radeon_fence_driver, fence_check_work.work);
+	rdev = fence_drv->rdev;
+	iring = fence_drv - &rdev->fence_drv[0];
+
+	down_read(&rdev->exclusive_lock);
+	if (__radeon_fence_process(rdev, iring))
+		wake_up_all(&rdev->fence_queue);
+	else if (radeon_ring_is_lockup(rdev, iring, &rdev->ring[iring])) {
+		/* good news we believe it's a lockup */
+		dev_warn(rdev->dev, "GPU lockup (current fence id "
+			 "0x%016llx last fence id 0x%016llx on ring %ld)\n",
+			 (uint64_t)atomic64_read(&fence_drv->last_seq),
+			 fence_drv->sync_seq[iring], iring);
+
+		/* remember that we need an reset */
+		rdev->needs_reset = true;
+		wake_up_all(&rdev->fence_queue);
+	}
+	up_read(&rdev->exclusive_lock);
+}
+
+/**
+ * radeon_fence_process - process a fence
+ *
+ * @rdev: radeon_device pointer
+ * @ring: ring index the fence is associated with
+ *
+ * Checks the current fence value and wakes the fence queue
+ * if the sequence number has increased (all asics).
+ */
+void radeon_fence_process(struct radeon_device *rdev, int ring)
+{
+	if (__radeon_fence_process(rdev, ring))
 		wake_up_all(&rdev->fence_queue);
 }
 
@@ -302,9 +339,10 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
 {
 	uint64_t last_seq[RADEON_NUM_RINGS];
 	bool signaled;
-	int i, r;
+	int i;
 
 	while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
+		long r;
 
 		/* Save current sequence values, used to check for GPU lockups */
 		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -319,11 +357,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
 		if (intr) {
 			r = wait_event_interruptible_timeout(rdev->fence_queue, (
 				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
-				 || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
+				 || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
 		} else {
 			r = wait_event_timeout(rdev->fence_queue, (
 				(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
-				 || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
+				 || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
 		}
 
 		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -334,50 +372,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
 			trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
 		}
 
-		if (unlikely(r < 0))
+		if (r < 0)
 			return r;
 
-		if (unlikely(!signaled)) {
-			if (rdev->needs_reset)
-				return -EDEADLK;
-
-			/* we were interrupted for some reason and fence
-			 * isn't signaled yet, resume waiting */
-			if (r)
-				continue;
-
-			for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-				if (!target_seq[i])
-					continue;
-
-				if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
-					break;
-			}
-
-			if (i != RADEON_NUM_RINGS)
-				continue;
-
-			for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-				if (!target_seq[i])
-					continue;
-
-				if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
-					break;
-			}
-
-			if (i < RADEON_NUM_RINGS) {
-				/* good news we believe it's a lockup */
-				dev_warn(rdev->dev, "GPU lockup (waiting for "
-					 "0x%016llx last fence id 0x%016llx on"
-					 " ring %d)\n",
-					 target_seq[i], last_seq[i], i);
-
-				/* remember that we need an reset */
-				rdev->needs_reset = true;
-				wake_up_all(&rdev->fence_queue);
-				return -EDEADLK;
-			}
-		}
+		if (rdev->needs_reset)
+			return -EDEADLK;
 	}
 	return 0;
 }
@@ -711,6 +710,9 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
 		rdev->fence_drv[ring].sync_seq[i] = 0;
 	atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
 	rdev->fence_drv[ring].initialized = false;
+	INIT_DELAYED_WORK(&rdev->fence_drv[ring].fence_check_work,
+			  radeon_fence_driver_check_lockup);
+	rdev->fence_drv[ring].rdev = rdev;
 }
 
 /**
@@ -760,6 +762,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
 			/* no need to trigger GPU reset as we are unloading */
 			radeon_fence_driver_force_completion(rdev);
 		}
+		cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work);
 		wake_up_all(&rdev->fence_queue);
 		radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
 		rdev->fence_drv[ring].initialized = false;
-- 
2.0.4

  reply	other threads:[~2014-08-18 14:45 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-08-18 14:45 [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Maarten Lankhorst
2014-08-18 14:45 ` Maarten Lankhorst [this message]
2014-08-18 15:12   ` [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3 Christian König
2014-08-18 15:28     ` Maarten Lankhorst
2014-08-18 16:20       ` Christian König
2014-08-18 14:46 ` [PATCH 3/3] drm/radeon: add timeout argument to radeon_fence_wait_seq Maarten Lankhorst
2014-08-18 15:02 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Christian König
2014-08-18 15:24   ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests, v2 Maarten Lankhorst
2014-08-18 16:03   ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Alex Deucher
2014-08-18 16:07     ` Christian König
2014-08-18 16:10       ` Alex Deucher
2014-08-18 20:02         ` Maarten Lankhorst
2014-08-18 20:56           ` Alex Deucher

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=53F211A7.5080506@canonical.com \
    --to=maarten.lankhorst@canonical.com \
    --cc=deathsimple@vodafone.de \
    --cc=dri-devel@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.