* [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests
@ 2014-08-18 14:45 Maarten Lankhorst
2014-08-18 14:45 ` [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3 Maarten Lankhorst
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Maarten Lankhorst @ 2014-08-18 14:45 UTC (permalink / raw)
To: dri-devel@lists.freedesktop.org, Christian König
This is needed for the next commit, because the lockup detection
will need the read lock to run.
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
drivers/gpu/drm/radeon/radeon.h | 2 +-
drivers/gpu/drm/radeon/radeon_device.c | 61 ++++++++++++++++++++--------------
2 files changed, 37 insertions(+), 26 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 9e1732eb402c..1d806983ec7b 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -2312,7 +2312,7 @@ struct radeon_device {
bool need_dma32;
bool accel_working;
bool fastfb_working; /* IGP feature*/
- bool needs_reset;
+ bool needs_reset, in_reset;
struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
const struct firmware *me_fw; /* all family ME firmware */
const struct firmware *pfp_fw; /* r6/700 PFP firmware */
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index c8ea050c8fa4..82633fdd399d 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev)
down_write(&rdev->exclusive_lock);
if (!rdev->needs_reset) {
+ WARN_ON(rdev->in_reset);
up_write(&rdev->exclusive_lock);
return 0;
}
rdev->needs_reset = false;
-
- radeon_save_bios_scratch_regs(rdev);
- /* block TTM */
resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
- radeon_pm_suspend(rdev);
- radeon_suspend(rdev);
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
- &ring_data[i]);
- if (ring_sizes[i]) {
- saved = true;
- dev_info(rdev->dev, "Saved %d dwords of commands "
- "on ring %d.\n", ring_sizes[i], i);
+ if (!rdev->in_reset) {
+ rdev->in_reset = true;
+
+ radeon_save_bios_scratch_regs(rdev);
+ /* block TTM */
+ radeon_pm_suspend(rdev);
+ radeon_suspend(rdev);
+
+ for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+ ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
+ &ring_data[i]);
+ if (ring_sizes[i]) {
+ saved = true;
+ dev_info(rdev->dev, "Saved %d dwords of commands "
+ "on ring %d.\n", ring_sizes[i], i);
+ }
}
- }
+ } else
+ memset(ring_data, 0, sizeof(ring_data));
-retry:
r = radeon_asic_reset(rdev);
if (!r) {
dev_info(rdev->dev, "GPU reset succeeded, trying to resume\n");
@@ -1702,40 +1707,46 @@ retry:
radeon_restore_bios_scratch_regs(rdev);
- if (!r) {
+ if (!r && saved) {
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
radeon_ring_restore(rdev, &rdev->ring[i],
ring_sizes[i], ring_data[i]);
- ring_sizes[i] = 0;
ring_data[i] = NULL;
}
+ } else {
+ radeon_fence_driver_force_completion(rdev);
+
+ for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+ kfree(ring_data[i]);
+ }
+ }
+ downgrade_write(&rdev->exclusive_lock);
+ ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
+ if (!r) {
r = radeon_ib_ring_tests(rdev);
if (r) {
dev_err(rdev->dev, "ib ring test failed (%d).\n", r);
if (saved) {
- saved = false;
+ /* if reset fails, try without saving data */
+ rdev->needs_reset = true;
radeon_suspend(rdev);
- goto retry;
+ up_read(&rdev->exclusive_lock);
+ return -EAGAIN;
}
}
- } else {
- radeon_fence_driver_force_completion(rdev);
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- kfree(ring_data[i]);
- }
}
radeon_pm_resume(rdev);
drm_helper_resume_force_mode(rdev->ddev);
- ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
if (r) {
/* bad news, how to tell it to userspace ? */
dev_info(rdev->dev, "GPU reset failed\n");
}
- up_write(&rdev->exclusive_lock);
+ rdev->in_reset = false;
+ up_read(&rdev->exclusive_lock);
return r;
}
--
2.0.4
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3
2014-08-18 14:45 [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Maarten Lankhorst
@ 2014-08-18 14:45 ` Maarten Lankhorst
2014-08-18 15:12 ` Christian König
2014-08-18 14:46 ` [PATCH 3/3] drm/radeon: add timeout argument to radeon_fence_wait_seq Maarten Lankhorst
2014-08-18 15:02 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Christian König
2 siblings, 1 reply; 13+ messages in thread
From: Maarten Lankhorst @ 2014-08-18 14:45 UTC (permalink / raw)
To: dri-devel@lists.freedesktop.org, Christian König
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
V1 had a nasty bug breaking gpu lockup recovery. The fix is not
allowing radeon_fence_driver_check_lockup to take exclusive_lock,
and kill it during lockup recovery instead.
V2 used delayed work that ran during lockup recovery, but required
read lock. I've fixed this by downgrading the write, and retrying if
recovery fails.
---
drivers/gpu/drm/radeon/radeon.h | 2 +
drivers/gpu/drm/radeon/radeon_fence.c | 115 +++++++++++++++++-----------------
2 files changed, 61 insertions(+), 56 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 1d806983ec7b..29504efe8971 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -355,6 +355,8 @@ struct radeon_fence_driver {
uint64_t sync_seq[RADEON_NUM_RINGS];
atomic64_t last_seq;
bool initialized;
+ struct delayed_work fence_check_work;
+ struct radeon_device *rdev;
};
struct radeon_fence {
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 913787085dfa..94eca53d99f8 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev,
return 0;
}
-/**
- * radeon_fence_process - process a fence
- *
- * @rdev: radeon_device pointer
- * @ring: ring index the fence is associated with
- *
- * Checks the current fence value and wakes the fence queue
- * if the sequence number has increased (all asics).
- */
-void radeon_fence_process(struct radeon_device *rdev, int ring)
+static bool __radeon_fence_process(struct radeon_device *rdev, int ring)
{
uint64_t seq, last_seq, last_emitted;
unsigned count_loop = 0;
@@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
}
} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
- if (wake)
+ if (seq < last_emitted && !rdev->in_reset)
+ mod_delayed_work(system_power_efficient_wq,
+ &rdev->fence_drv[ring].fence_check_work,
+ RADEON_FENCE_JIFFIES_TIMEOUT);
+
+ return wake;
+}
+
+static void radeon_fence_driver_check_lockup(struct work_struct *work)
+{
+ struct radeon_fence_driver *fence_drv;
+ struct radeon_device *rdev;
+ unsigned long iring;
+
+ fence_drv = container_of(work, struct radeon_fence_driver, fence_check_work.work);
+ rdev = fence_drv->rdev;
+ iring = fence_drv - &rdev->fence_drv[0];
+
+ down_read(&rdev->exclusive_lock);
+ if (__radeon_fence_process(rdev, iring))
+ wake_up_all(&rdev->fence_queue);
+ else if (radeon_ring_is_lockup(rdev, iring, &rdev->ring[iring])) {
+ /* good news we believe it's a lockup */
+ dev_warn(rdev->dev, "GPU lockup (current fence id "
+ "0x%016llx last fence id 0x%016llx on ring %ld)\n",
+ (uint64_t)atomic64_read(&fence_drv->last_seq),
+ fence_drv->sync_seq[iring], iring);
+
+ /* remember that we need an reset */
+ rdev->needs_reset = true;
+ wake_up_all(&rdev->fence_queue);
+ }
+ up_read(&rdev->exclusive_lock);
+}
+
+/**
+ * radeon_fence_process - process a fence
+ *
+ * @rdev: radeon_device pointer
+ * @ring: ring index the fence is associated with
+ *
+ * Checks the current fence value and wakes the fence queue
+ * if the sequence number has increased (all asics).
+ */
+void radeon_fence_process(struct radeon_device *rdev, int ring)
+{
+ if (__radeon_fence_process(rdev, ring))
wake_up_all(&rdev->fence_queue);
}
@@ -302,9 +339,10 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
{
uint64_t last_seq[RADEON_NUM_RINGS];
bool signaled;
- int i, r;
+ int i;
while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
+ long r;
/* Save current sequence values, used to check for GPU lockups */
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -319,11 +357,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
if (intr) {
r = wait_event_interruptible_timeout(rdev->fence_queue, (
(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
- || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
+ || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
} else {
r = wait_event_timeout(rdev->fence_queue, (
(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
- || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
+ || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
}
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -334,50 +372,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
}
- if (unlikely(r < 0))
+ if (r < 0)
return r;
- if (unlikely(!signaled)) {
- if (rdev->needs_reset)
- return -EDEADLK;
-
- /* we were interrupted for some reason and fence
- * isn't signaled yet, resume waiting */
- if (r)
- continue;
-
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- if (!target_seq[i])
- continue;
-
- if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
- break;
- }
-
- if (i != RADEON_NUM_RINGS)
- continue;
-
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- if (!target_seq[i])
- continue;
-
- if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
- break;
- }
-
- if (i < RADEON_NUM_RINGS) {
- /* good news we believe it's a lockup */
- dev_warn(rdev->dev, "GPU lockup (waiting for "
- "0x%016llx last fence id 0x%016llx on"
- " ring %d)\n",
- target_seq[i], last_seq[i], i);
-
- /* remember that we need an reset */
- rdev->needs_reset = true;
- wake_up_all(&rdev->fence_queue);
- return -EDEADLK;
- }
- }
+ if (rdev->needs_reset)
+ return -EDEADLK;
}
return 0;
}
@@ -711,6 +710,9 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
rdev->fence_drv[ring].sync_seq[i] = 0;
atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
rdev->fence_drv[ring].initialized = false;
+ INIT_DELAYED_WORK(&rdev->fence_drv[ring].fence_check_work,
+ radeon_fence_driver_check_lockup);
+ rdev->fence_drv[ring].rdev = rdev;
}
/**
@@ -760,6 +762,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
/* no need to trigger GPU reset as we are unloading */
radeon_fence_driver_force_completion(rdev);
}
+ cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work);
wake_up_all(&rdev->fence_queue);
radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
rdev->fence_drv[ring].initialized = false;
--
2.0.4
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 3/3] drm/radeon: add timeout argument to radeon_fence_wait_seq
2014-08-18 14:45 [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Maarten Lankhorst
2014-08-18 14:45 ` [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3 Maarten Lankhorst
@ 2014-08-18 14:46 ` Maarten Lankhorst
2014-08-18 15:02 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Christian König
2 siblings, 0 replies; 13+ messages in thread
From: Maarten Lankhorst @ 2014-08-18 14:46 UTC (permalink / raw)
To: dri-devel@lists.freedesktop.org, Christian König
This makes it possible to wait for a specific amount of time,
rather than wait until infinity.
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Reviewed-by: Christian König <deathsimple@vodafone.de>
---
drivers/gpu/drm/radeon/radeon_fence.c | 50 +++++++++++++++++++++--------------
1 file changed, 30 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 94eca53d99f8..e8ea423648ab 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -320,22 +320,25 @@ static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
}
/**
- * radeon_fence_wait_seq - wait for a specific sequence numbers
+ * radeon_fence_wait_seq_timeout - wait for a specific sequence numbers
*
* @rdev: radeon device pointer
* @target_seq: sequence number(s) we want to wait for
* @intr: use interruptable sleep
+ * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
*
* Wait for the requested sequence number(s) to be written by any ring
* (all asics). Sequnce number array is indexed by ring id.
* @intr selects whether to use interruptable (true) or non-interruptable
* (false) sleep when waiting for the sequence number. Helper function
* for radeon_fence_wait_*().
- * Returns 0 if the sequence number has passed, error for all other cases.
+ * Returns remaining time if the sequence number has passed, 0 when
+ * the wait timeout, or an error for all other cases.
* -EDEADLK is returned when a GPU lockup has been detected.
*/
-static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
- bool intr)
+static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
+ u64 *target_seq, bool intr,
+ long timeout)
{
uint64_t last_seq[RADEON_NUM_RINGS];
bool signaled;
@@ -357,11 +360,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
if (intr) {
r = wait_event_interruptible_timeout(rdev->fence_queue, (
(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
- || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
+ || rdev->needs_reset), timeout);
} else {
r = wait_event_timeout(rdev->fence_queue, (
(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
- || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
+ || rdev->needs_reset), timeout);
}
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -372,20 +375,22 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
}
- if (r < 0)
+ if (r <= 0)
return r;
if (rdev->needs_reset)
return -EDEADLK;
+
+ timeout = r;
}
- return 0;
+ return timeout;
}
/**
* radeon_fence_wait - wait for a fence to signal
*
* @fence: radeon fence object
- * @intr: use interruptable sleep
+ * @intr: use interruptible sleep
*
* Wait for the requested fence to signal (all asics).
* @intr selects whether to use interruptable (true) or non-interruptable
@@ -395,7 +400,7 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
int radeon_fence_wait(struct radeon_fence *fence, bool intr)
{
uint64_t seq[RADEON_NUM_RINGS] = {};
- int r;
+ long r;
if (fence == NULL) {
WARN(1, "Querying an invalid fence : %p !\n", fence);
@@ -406,9 +411,10 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
if (seq[fence->ring] == RADEON_FENCE_SIGNALED_SEQ)
return 0;
- r = radeon_fence_wait_seq(fence->rdev, seq, intr);
- if (r)
+ r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
+ if (r < 0) {
return r;
+ }
fence->seq = RADEON_FENCE_SIGNALED_SEQ;
return 0;
@@ -433,7 +439,7 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
{
uint64_t seq[RADEON_NUM_RINGS];
unsigned i, num_rings = 0;
- int r;
+ long r;
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
seq[i] = 0;
@@ -454,8 +460,8 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
if (num_rings == 0)
return -ENOENT;
- r = radeon_fence_wait_seq(rdev, seq, intr);
- if (r) {
+ r = radeon_fence_wait_seq_timeout(rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
+ if (r < 0) {
return r;
}
return 0;
@@ -474,6 +480,7 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
int radeon_fence_wait_next(struct radeon_device *rdev, int ring)
{
uint64_t seq[RADEON_NUM_RINGS] = {};
+ long r;
seq[ring] = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
if (seq[ring] >= rdev->fence_drv[ring].sync_seq[ring]) {
@@ -481,7 +488,10 @@ int radeon_fence_wait_next(struct radeon_device *rdev, int ring)
already the last emited fence */
return -ENOENT;
}
- return radeon_fence_wait_seq(rdev, seq, false);
+ r = radeon_fence_wait_seq_timeout(rdev, seq, false, MAX_SCHEDULE_TIMEOUT);
+ if (r < 0)
+ return r;
+ return 0;
}
/**
@@ -497,18 +507,18 @@ int radeon_fence_wait_next(struct radeon_device *rdev, int ring)
int radeon_fence_wait_empty(struct radeon_device *rdev, int ring)
{
uint64_t seq[RADEON_NUM_RINGS] = {};
- int r;
+ long r;
seq[ring] = rdev->fence_drv[ring].sync_seq[ring];
if (!seq[ring])
return 0;
- r = radeon_fence_wait_seq(rdev, seq, false);
- if (r) {
+ r = radeon_fence_wait_seq_timeout(rdev, seq, false, MAX_SCHEDULE_TIMEOUT);
+ if (r < 0) {
if (r == -EDEADLK)
return -EDEADLK;
- dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%d)\n",
+ dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%ld)\n",
ring, r);
}
return 0;
--
2.0.4
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests
2014-08-18 14:45 [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Maarten Lankhorst
2014-08-18 14:45 ` [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3 Maarten Lankhorst
2014-08-18 14:46 ` [PATCH 3/3] drm/radeon: add timeout argument to radeon_fence_wait_seq Maarten Lankhorst
@ 2014-08-18 15:02 ` Christian König
2014-08-18 15:24 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests, v2 Maarten Lankhorst
2014-08-18 16:03 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Alex Deucher
2 siblings, 2 replies; 13+ messages in thread
From: Christian König @ 2014-08-18 15:02 UTC (permalink / raw)
To: Maarten Lankhorst, dri-devel@lists.freedesktop.org
Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst:
> This is needed for the next commit, because the lockup detection
> will need the read lock to run.
>
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
> ---
> drivers/gpu/drm/radeon/radeon.h | 2 +-
> drivers/gpu/drm/radeon/radeon_device.c | 61 ++++++++++++++++++++--------------
> 2 files changed, 37 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 9e1732eb402c..1d806983ec7b 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -2312,7 +2312,7 @@ struct radeon_device {
> bool need_dma32;
> bool accel_working;
> bool fastfb_working; /* IGP feature*/
> - bool needs_reset;
> + bool needs_reset, in_reset;
> struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
> const struct firmware *me_fw; /* all family ME firmware */
> const struct firmware *pfp_fw; /* r6/700 PFP firmware */
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
> index c8ea050c8fa4..82633fdd399d 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev)
> down_write(&rdev->exclusive_lock);
>
> if (!rdev->needs_reset) {
> + WARN_ON(rdev->in_reset);
> up_write(&rdev->exclusive_lock);
> return 0;
> }
>
> rdev->needs_reset = false;
> -
> - radeon_save_bios_scratch_regs(rdev);
> - /* block TTM */
> resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
> - radeon_pm_suspend(rdev);
> - radeon_suspend(rdev);
>
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
> - &ring_data[i]);
> - if (ring_sizes[i]) {
> - saved = true;
> - dev_info(rdev->dev, "Saved %d dwords of commands "
> - "on ring %d.\n", ring_sizes[i], i);
> + if (!rdev->in_reset) {
> + rdev->in_reset = true;
> +
> + radeon_save_bios_scratch_regs(rdev);
> + /* block TTM */
> + radeon_pm_suspend(rdev);
> + radeon_suspend(rdev);
That won't work correctly because you might end up with calling
pm_resume more often than suspend and that can only lead to a crash.
Saying this we probably already have a bug in the reset code at this
point anyway, but see below.
> +
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
> + &ring_data[i]);
> + if (ring_sizes[i]) {
> + saved = true;
> + dev_info(rdev->dev, "Saved %d dwords of commands "
> + "on ring %d.\n", ring_sizes[i], i);
> + }
> }
> - }
> + } else
> + memset(ring_data, 0, sizeof(ring_data));
>
> -retry:
> r = radeon_asic_reset(rdev);
> if (!r) {
> dev_info(rdev->dev, "GPU reset succeeded, trying to resume\n");
> @@ -1702,40 +1707,46 @@ retry:
>
> radeon_restore_bios_scratch_regs(rdev);
We should resume PM here as well.
>
> - if (!r) {
> + if (!r && saved) {
> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> radeon_ring_restore(rdev, &rdev->ring[i],
> ring_sizes[i], ring_data[i]);
> - ring_sizes[i] = 0;
> ring_data[i] = NULL;
> }
> + } else {
> + radeon_fence_driver_force_completion(rdev);
> +
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + kfree(ring_data[i]);
> + }
> + }
> + downgrade_write(&rdev->exclusive_lock);
> + ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
I would unlock the delayed_workqueue first and then downgrade the readlock.
>
> + if (!r) {
> r = radeon_ib_ring_tests(rdev);
> if (r) {
> dev_err(rdev->dev, "ib ring test failed (%d).\n", r);
> if (saved) {
> - saved = false;
> + /* if reset fails, try without saving data */
> + rdev->needs_reset = true;
> radeon_suspend(rdev);
> - goto retry;
> + up_read(&rdev->exclusive_lock);
> + return -EAGAIN;
> }
> }
> - } else {
> - radeon_fence_driver_force_completion(rdev);
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - kfree(ring_data[i]);
> - }
> }
>
> radeon_pm_resume(rdev);
Move this more up.
Alex is more into this, but it's probably a bug in the current reset
code that this is after the IB tests, cause the IB tests needs
everything powered up and with PM handling suspended it is possible that
individual blocks are powered down.
Thanks,
Christian.
> drm_helper_resume_force_mode(rdev->ddev);
>
> - ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
> if (r) {
> /* bad news, how to tell it to userspace ? */
> dev_info(rdev->dev, "GPU reset failed\n");
> }
>
> - up_write(&rdev->exclusive_lock);
> + rdev->in_reset = false;
> + up_read(&rdev->exclusive_lock);
> return r;
> }
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3
2014-08-18 14:45 ` [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3 Maarten Lankhorst
@ 2014-08-18 15:12 ` Christian König
2014-08-18 15:28 ` Maarten Lankhorst
0 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2014-08-18 15:12 UTC (permalink / raw)
To: Maarten Lankhorst, dri-devel@lists.freedesktop.org
Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst:
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
> ---
> V1 had a nasty bug breaking gpu lockup recovery. The fix is not
> allowing radeon_fence_driver_check_lockup to take exclusive_lock,
> and kill it during lockup recovery instead.
> V2 used delayed work that ran during lockup recovery, but required
> read lock. I've fixed this by downgrading the write, and retrying if
> recovery fails.
> ---
> drivers/gpu/drm/radeon/radeon.h | 2 +
> drivers/gpu/drm/radeon/radeon_fence.c | 115 +++++++++++++++++-----------------
> 2 files changed, 61 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 1d806983ec7b..29504efe8971 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -355,6 +355,8 @@ struct radeon_fence_driver {
> uint64_t sync_seq[RADEON_NUM_RINGS];
> atomic64_t last_seq;
> bool initialized;
> + struct delayed_work fence_check_work;
> + struct radeon_device *rdev;
Put the reference to the device as the first field in the structure.
> };
>
> struct radeon_fence {
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index 913787085dfa..94eca53d99f8 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev,
> return 0;
> }
>
> -/**
> - * radeon_fence_process - process a fence
> - *
> - * @rdev: radeon_device pointer
> - * @ring: ring index the fence is associated with
> - *
> - * Checks the current fence value and wakes the fence queue
> - * if the sequence number has increased (all asics).
> - */
> -void radeon_fence_process(struct radeon_device *rdev, int ring)
> +static bool __radeon_fence_process(struct radeon_device *rdev, int ring)
Don't use "__" for internal radeon function names and especially don't
remove the function documentation.
> {
> uint64_t seq, last_seq, last_emitted;
> unsigned count_loop = 0;
> @@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
> }
> } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
>
> - if (wake)
> + if (seq < last_emitted && !rdev->in_reset)
> + mod_delayed_work(system_power_efficient_wq,
> + &rdev->fence_drv[ring].fence_check_work,
> + RADEON_FENCE_JIFFIES_TIMEOUT);
Am I wrong or do you queue the work only after radeon_fence_process is
called for the first time?
Might be a good idea to have an explicit queue_delayed_work in
radeon_fence_emit as well.
> +
> + return wake;
> +}
> +
> +static void radeon_fence_driver_check_lockup(struct work_struct *work)
> +{
> + struct radeon_fence_driver *fence_drv;
> + struct radeon_device *rdev;
> + unsigned long iring;
> +
> + fence_drv = container_of(work, struct radeon_fence_driver, fence_check_work.work);
> + rdev = fence_drv->rdev;
> + iring = fence_drv - &rdev->fence_drv[0];
> +
> + down_read(&rdev->exclusive_lock);
> + if (__radeon_fence_process(rdev, iring))
> + wake_up_all(&rdev->fence_queue);
> + else if (radeon_ring_is_lockup(rdev, iring, &rdev->ring[iring])) {
> + /* good news we believe it's a lockup */
> + dev_warn(rdev->dev, "GPU lockup (current fence id "
> + "0x%016llx last fence id 0x%016llx on ring %ld)\n",
> + (uint64_t)atomic64_read(&fence_drv->last_seq),
> + fence_drv->sync_seq[iring], iring);
> +
> + /* remember that we need an reset */
> + rdev->needs_reset = true;
> + wake_up_all(&rdev->fence_queue);
> + }
> + up_read(&rdev->exclusive_lock);
> +}
> +
> +/**
> + * radeon_fence_process - process a fence
> + *
> + * @rdev: radeon_device pointer
> + * @ring: ring index the fence is associated with
> + *
> + * Checks the current fence value and wakes the fence queue
> + * if the sequence number has increased (all asics).
> + */
> +void radeon_fence_process(struct radeon_device *rdev, int ring)
> +{
> + if (__radeon_fence_process(rdev, ring))
> wake_up_all(&rdev->fence_queue);
> }
>
> @@ -302,9 +339,10 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
> {
> uint64_t last_seq[RADEON_NUM_RINGS];
> bool signaled;
> - int i, r;
> + int i;
>
> while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
> + long r;
>
> /* Save current sequence values, used to check for GPU lockups */
> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> @@ -319,11 +357,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
> if (intr) {
> r = wait_event_interruptible_timeout(rdev->fence_queue, (
> (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
> - || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
> + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
> } else {
> r = wait_event_timeout(rdev->fence_queue, (
> (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
> - || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
> + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
> }
>
> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> @@ -334,50 +372,11 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
> trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
> }
>
> - if (unlikely(r < 0))
> + if (r < 0)
> return r;
>
> - if (unlikely(!signaled)) {
> - if (rdev->needs_reset)
> - return -EDEADLK;
> -
> - /* we were interrupted for some reason and fence
> - * isn't signaled yet, resume waiting */
> - if (r)
> - continue;
> -
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (!target_seq[i])
> - continue;
> -
> - if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
> - break;
> - }
> -
> - if (i != RADEON_NUM_RINGS)
> - continue;
> -
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (!target_seq[i])
> - continue;
> -
> - if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
> - break;
> - }
> -
> - if (i < RADEON_NUM_RINGS) {
> - /* good news we believe it's a lockup */
> - dev_warn(rdev->dev, "GPU lockup (waiting for "
> - "0x%016llx last fence id 0x%016llx on"
> - " ring %d)\n",
> - target_seq[i], last_seq[i], i);
> -
> - /* remember that we need an reset */
> - rdev->needs_reset = true;
> - wake_up_all(&rdev->fence_queue);
> - return -EDEADLK;
> - }
> - }
> + if (rdev->needs_reset)
> + return -EDEADLK;
> }
> return 0;
> }
> @@ -711,6 +710,9 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
> rdev->fence_drv[ring].sync_seq[i] = 0;
> atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
> rdev->fence_drv[ring].initialized = false;
> + INIT_DELAYED_WORK(&rdev->fence_drv[ring].fence_check_work,
> + radeon_fence_driver_check_lockup);
> + rdev->fence_drv[ring].rdev = rdev;
> }
>
> /**
> @@ -760,6 +762,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
> /* no need to trigger GPU reset as we are unloading */
> radeon_fence_driver_force_completion(rdev);
> }
> + cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work);
> wake_up_all(&rdev->fence_queue);
> radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
> rdev->fence_drv[ring].initialized = false;
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests, v2
2014-08-18 15:02 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Christian König
@ 2014-08-18 15:24 ` Maarten Lankhorst
2014-08-18 16:03 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Alex Deucher
1 sibling, 0 replies; 13+ messages in thread
From: Maarten Lankhorst @ 2014-08-18 15:24 UTC (permalink / raw)
To: Christian König, dri-devel@lists.freedesktop.org
This is needed for the next commit, because the lockup detection
will need the read lock to run.
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
Changes since v1:
Changed order of resuming a bit according to ckoenig's feedback.
Reset handling seems just as unreliable as before this commit,
but at least it restores mode correctly now. :-)
drivers/gpu/drm/radeon/radeon.h | 2 +-
drivers/gpu/drm/radeon/radeon_device.c | 64 ++++++++++++++++++++--------------
2 files changed, 39 insertions(+), 27 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 9e1732eb402c..1d806983ec7b 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -2312,7 +2312,7 @@ struct radeon_device {
bool need_dma32;
bool accel_working;
bool fastfb_working; /* IGP feature*/
- bool needs_reset;
+ bool needs_reset, in_reset;
struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
const struct firmware *me_fw; /* all family ME firmware */
const struct firmware *pfp_fw; /* r6/700 PFP firmware */
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index c8ea050c8fa4..0e9541acccd5 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1671,29 +1671,35 @@ int radeon_gpu_reset(struct radeon_device *rdev)
down_write(&rdev->exclusive_lock);
if (!rdev->needs_reset) {
+ WARN_ON(rdev->in_reset);
up_write(&rdev->exclusive_lock);
return 0;
}
rdev->needs_reset = false;
+ resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
- radeon_save_bios_scratch_regs(rdev);
/* block TTM */
- resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
radeon_pm_suspend(rdev);
- radeon_suspend(rdev);
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
- &ring_data[i]);
- if (ring_sizes[i]) {
- saved = true;
- dev_info(rdev->dev, "Saved %d dwords of commands "
- "on ring %d.\n", ring_sizes[i], i);
+ if (!rdev->in_reset) {
+ rdev->in_reset = true;
+
+ radeon_save_bios_scratch_regs(rdev);
+ radeon_suspend(rdev);
+
+ for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+ ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
+ &ring_data[i]);
+ if (ring_sizes[i]) {
+ saved = true;
+ dev_info(rdev->dev, "Saved %d dwords of commands "
+ "on ring %d.\n", ring_sizes[i], i);
+ }
}
- }
+ } else
+ memset(ring_data, 0, sizeof(ring_data));
-retry:
r = radeon_asic_reset(rdev);
if (!r) {
dev_info(rdev->dev, "GPU reset succeeded, trying to resume\n");
@@ -1701,41 +1707,47 @@ retry:
}
radeon_restore_bios_scratch_regs(rdev);
+ radeon_pm_resume(rdev);
- if (!r) {
+ if (!r && saved) {
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
radeon_ring_restore(rdev, &rdev->ring[i],
ring_sizes[i], ring_data[i]);
- ring_sizes[i] = 0;
ring_data[i] = NULL;
}
+ } else {
+ radeon_fence_driver_force_completion(rdev);
+ for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+ kfree(ring_data[i]);
+ }
+ }
+
+ drm_helper_resume_force_mode(rdev->ddev);
+ ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
+ downgrade_write(&rdev->exclusive_lock);
+
+ if (!r) {
r = radeon_ib_ring_tests(rdev);
if (r) {
dev_err(rdev->dev, "ib ring test failed (%d).\n", r);
if (saved) {
- saved = false;
+ /* if reset fails, try without saving data */
+ rdev->needs_reset = true;
radeon_suspend(rdev);
- goto retry;
+ up_read(&rdev->exclusive_lock);
+ return -EAGAIN;
}
}
- } else {
- radeon_fence_driver_force_completion(rdev);
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- kfree(ring_data[i]);
- }
}
- radeon_pm_resume(rdev);
- drm_helper_resume_force_mode(rdev->ddev);
-
- ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
if (r) {
/* bad news, how to tell it to userspace ? */
dev_info(rdev->dev, "GPU reset failed\n");
}
- up_write(&rdev->exclusive_lock);
+ rdev->in_reset = false;
+ up_read(&rdev->exclusive_lock);
return r;
}
--
2.0.4
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3
2014-08-18 15:12 ` Christian König
@ 2014-08-18 15:28 ` Maarten Lankhorst
2014-08-18 16:20 ` Christian König
0 siblings, 1 reply; 13+ messages in thread
From: Maarten Lankhorst @ 2014-08-18 15:28 UTC (permalink / raw)
To: Christian König, dri-devel@lists.freedesktop.org
Op 18-08-14 om 17:12 schreef Christian König:
> Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst:
>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
>> ---
>> V1 had a nasty bug breaking gpu lockup recovery. The fix is not
>> allowing radeon_fence_driver_check_lockup to take exclusive_lock,
>> and kill it during lockup recovery instead.
>> V2 used delayed work that ran during lockup recovery, but required
>> read lock. I've fixed this by downgrading the write, and retrying if
>> recovery fails.
>> ---
>> drivers/gpu/drm/radeon/radeon.h | 2 +
>> drivers/gpu/drm/radeon/radeon_fence.c | 115 +++++++++++++++++-----------------
>> 2 files changed, 61 insertions(+), 56 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
>> index 1d806983ec7b..29504efe8971 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -355,6 +355,8 @@ struct radeon_fence_driver {
>> uint64_t sync_seq[RADEON_NUM_RINGS];
>> atomic64_t last_seq;
>> bool initialized;
>> + struct delayed_work fence_check_work;
>> + struct radeon_device *rdev;
>
> Put the reference to the device as the first field in the structure.
>
>> };
>> struct radeon_fence {
>> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
>> index 913787085dfa..94eca53d99f8 100644
>> --- a/drivers/gpu/drm/radeon/radeon_fence.c
>> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
>> @@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev,
>> return 0;
>> }
>> -/**
>> - * radeon_fence_process - process a fence
>> - *
>> - * @rdev: radeon_device pointer
>> - * @ring: ring index the fence is associated with
>> - *
>> - * Checks the current fence value and wakes the fence queue
>> - * if the sequence number has increased (all asics).
>> - */
>> -void radeon_fence_process(struct radeon_device *rdev, int ring)
>> +static bool __radeon_fence_process(struct radeon_device *rdev, int ring)
>
> Don't use "__" for internal radeon function names and especially don't remove the function documentation.
I've moved the documentation to the new radeon_fence_process function that calls the __radeon_fence_process one,
which is an internal function that is only used by the driver. I guess I can rename it to __radeon_fence_process_nowake or something instead,
but documentation doesn't make sense since it's not used outside of radeon_fence.c
>> {
>> uint64_t seq, last_seq, last_emitted;
>> unsigned count_loop = 0;
>> @@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
>> }
>> } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
>> - if (wake)
>> + if (seq < last_emitted && !rdev->in_reset)
>> + mod_delayed_work(system_power_efficient_wq,
>> + &rdev->fence_drv[ring].fence_check_work,
>> + RADEON_FENCE_JIFFIES_TIMEOUT);
>
> Am I wrong or do you queue the work only after radeon_fence_process is called for the first time?
>
> Might be a good idea to have an explicit queue_delayed_work in radeon_fence_emit as well.
Yeah might as well, with these changes it makes sense to run it as soon as possible.
But if there are no waiters, there's no real need. It's a tree falling in a forest with no-one around to hear it. ;-)
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests
2014-08-18 15:02 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Christian König
2014-08-18 15:24 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests, v2 Maarten Lankhorst
@ 2014-08-18 16:03 ` Alex Deucher
2014-08-18 16:07 ` Christian König
1 sibling, 1 reply; 13+ messages in thread
From: Alex Deucher @ 2014-08-18 16:03 UTC (permalink / raw)
To: Christian König; +Cc: dri-devel@lists.freedesktop.org
[-- Attachment #1: Type: text/plain, Size: 6896 bytes --]
On Mon, Aug 18, 2014 at 11:02 AM, Christian König
<deathsimple@vodafone.de> wrote:
> Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst:
>
>> This is needed for the next commit, because the lockup detection
>> will need the read lock to run.
>>
>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
>> ---
>> drivers/gpu/drm/radeon/radeon.h | 2 +-
>> drivers/gpu/drm/radeon/radeon_device.c | 61
>> ++++++++++++++++++++--------------
>> 2 files changed, 37 insertions(+), 26 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>> b/drivers/gpu/drm/radeon/radeon.h
>> index 9e1732eb402c..1d806983ec7b 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -2312,7 +2312,7 @@ struct radeon_device {
>> bool need_dma32;
>> bool accel_working;
>> bool fastfb_working; /* IGP feature*/
>> - bool needs_reset;
>> + bool needs_reset, in_reset;
>> struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
>> const struct firmware *me_fw; /* all family ME firmware */
>> const struct firmware *pfp_fw; /* r6/700 PFP firmware */
>> diff --git a/drivers/gpu/drm/radeon/radeon_device.c
>> b/drivers/gpu/drm/radeon/radeon_device.c
>> index c8ea050c8fa4..82633fdd399d 100644
>> --- a/drivers/gpu/drm/radeon/radeon_device.c
>> +++ b/drivers/gpu/drm/radeon/radeon_device.c
>> @@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>> down_write(&rdev->exclusive_lock);
>> if (!rdev->needs_reset) {
>> + WARN_ON(rdev->in_reset);
>> up_write(&rdev->exclusive_lock);
>> return 0;
>> }
>> rdev->needs_reset = false;
>> -
>> - radeon_save_bios_scratch_regs(rdev);
>> - /* block TTM */
>> resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
>> - radeon_pm_suspend(rdev);
>> - radeon_suspend(rdev);
>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> - ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
>> - &ring_data[i]);
>> - if (ring_sizes[i]) {
>> - saved = true;
>> - dev_info(rdev->dev, "Saved %d dwords of commands "
>> - "on ring %d.\n", ring_sizes[i], i);
>> + if (!rdev->in_reset) {
>> + rdev->in_reset = true;
>> +
>> + radeon_save_bios_scratch_regs(rdev);
>> + /* block TTM */
>> + radeon_pm_suspend(rdev);
>> + radeon_suspend(rdev);
>
>
> That won't work correctly because you might end up with calling pm_resume
> more often than suspend and that can only lead to a crash. Saying this we
> probably already have a bug in the reset code at this point anyway, but see
> below.
>
>
>> +
>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> + ring_sizes[i] = radeon_ring_backup(rdev,
>> &rdev->ring[i],
>> + &ring_data[i]);
>> + if (ring_sizes[i]) {
>> + saved = true;
>> + dev_info(rdev->dev, "Saved %d dwords of
>> commands "
>> + "on ring %d.\n", ring_sizes[i],
>> i);
>> + }
>> }
>> - }
>> + } else
>> + memset(ring_data, 0, sizeof(ring_data));
>> -retry:
>> r = radeon_asic_reset(rdev);
>> if (!r) {
>> dev_info(rdev->dev, "GPU reset succeeded, trying to
>> resume\n");
>> @@ -1702,40 +1707,46 @@ retry:
>> radeon_restore_bios_scratch_regs(rdev);
>
>
> We should resume PM here as well.
>
>
>> - if (!r) {
>> + if (!r && saved) {
>> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> radeon_ring_restore(rdev, &rdev->ring[i],
>> ring_sizes[i], ring_data[i]);
>> - ring_sizes[i] = 0;
>> ring_data[i] = NULL;
>> }
>> + } else {
>> + radeon_fence_driver_force_completion(rdev);
>> +
>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> + kfree(ring_data[i]);
>> + }
>> + }
>> + downgrade_write(&rdev->exclusive_lock);
>> + ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>
>
> I would unlock the delayed_workqueue first and then downgrade the readlock.
>
>
>> + if (!r) {
>> r = radeon_ib_ring_tests(rdev);
>> if (r) {
>> dev_err(rdev->dev, "ib ring test failed (%d).\n",
>> r);
>> if (saved) {
>> - saved = false;
>> + /* if reset fails, try without saving data
>> */
>> + rdev->needs_reset = true;
>> radeon_suspend(rdev);
>> - goto retry;
>> + up_read(&rdev->exclusive_lock);
>> + return -EAGAIN;
>> }
>> }
>> - } else {
>> - radeon_fence_driver_force_completion(rdev);
>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> - kfree(ring_data[i]);
>> - }
>> }
>>
>
>
>> radeon_pm_resume(rdev);
>
>
> Move this more up.
>
> Alex is more into this, but it's probably a bug in the current reset code
> that this is after the IB tests, cause the IB tests needs everything powered
> up and with PM handling suspended it is possible that individual blocks are
> powered down.
Yeah, looks like a bug. I think the attached patch should fix it.
Alex
>
> Thanks,
> Christian.
>
>
>> drm_helper_resume_force_mode(rdev->ddev);
>> - ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>> if (r) {
>> /* bad news, how to tell it to userspace ? */
>> dev_info(rdev->dev, "GPU reset failed\n");
>> }
>> - up_write(&rdev->exclusive_lock);
>> + rdev->in_reset = false;
>> + up_read(&rdev->exclusive_lock);
>> return r;
>> }
>>
>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
[-- Attachment #2: 0001-drm-radeon-fix-pm-handling-in-radeon_gpu_reset.patch --]
[-- Type: text/x-diff, Size: 1883 bytes --]
From 43021a83cf04064e9771964233487fe9f49fa3db Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Mon, 18 Aug 2014 11:57:28 -0400
Subject: [PATCH] drm/radeon: fix pm handling in radeon_gpu_reset
pm_suspend is handled in the radeon_suspend callbacks.
pm_resume has special handling depending on whether
dpm or legacy pm is enabled. Change radeon_gpu_reset
to mirror the behavior in the suspend and resume
pathes.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
---
drivers/gpu/drm/radeon/radeon_device.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index c8ea050..8e61870 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1680,7 +1680,6 @@ int radeon_gpu_reset(struct radeon_device *rdev)
radeon_save_bios_scratch_regs(rdev);
/* block TTM */
resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
- radeon_pm_suspend(rdev);
radeon_suspend(rdev);
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -1726,9 +1725,24 @@ retry:
}
}
- radeon_pm_resume(rdev);
+ if ((rdev->pm.pm_method == PM_METHOD_DPM) && rdev->pm.dpm_enabled) {
+ /* do dpm late init */
+ r = radeon_pm_late_init(rdev);
+ if (r) {
+ rdev->pm.dpm_enabled = false;
+ DRM_ERROR("radeon_pm_late_init failed, disabling dpm\n");
+ }
+ } else {
+ /* resume old pm late */
+ radeon_pm_resume(rdev);
+ }
+
drm_helper_resume_force_mode(rdev->ddev);
+ /* set the power state here in case we are a PX system or headless */
+ if ((rdev->pm.pm_method == PM_METHOD_DPM) && rdev->pm.dpm_enabled)
+ radeon_pm_compute_clocks(rdev);
+
ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
if (r) {
/* bad news, how to tell it to userspace ? */
--
1.8.3.1
[-- Attachment #3: Type: text/plain, Size: 159 bytes --]
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests
2014-08-18 16:03 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Alex Deucher
@ 2014-08-18 16:07 ` Christian König
2014-08-18 16:10 ` Alex Deucher
0 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2014-08-18 16:07 UTC (permalink / raw)
To: Alex Deucher; +Cc: dri-devel@lists.freedesktop.org
> Yeah, looks like a bug. I think the attached patch should fix it.
Sounds logical and the patch is Reviewed-by: Christian König <christian.koenig@amd.com>
Going to apply Maartens patch on top and test that one a bit to make sure it works as expected.
Regards,
Christian.
Am 18.08.2014 um 18:03 schrieb Alex Deucher:
> On Mon, Aug 18, 2014 at 11:02 AM, Christian König
> <deathsimple@vodafone.de> wrote:
>> Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst:
>>
>>> This is needed for the next commit, because the lockup detection
>>> will need the read lock to run.
>>>
>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
>>> ---
>>> drivers/gpu/drm/radeon/radeon.h | 2 +-
>>> drivers/gpu/drm/radeon/radeon_device.c | 61
>>> ++++++++++++++++++++--------------
>>> 2 files changed, 37 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>>> b/drivers/gpu/drm/radeon/radeon.h
>>> index 9e1732eb402c..1d806983ec7b 100644
>>> --- a/drivers/gpu/drm/radeon/radeon.h
>>> +++ b/drivers/gpu/drm/radeon/radeon.h
>>> @@ -2312,7 +2312,7 @@ struct radeon_device {
>>> bool need_dma32;
>>> bool accel_working;
>>> bool fastfb_working; /* IGP feature*/
>>> - bool needs_reset;
>>> + bool needs_reset, in_reset;
>>> struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
>>> const struct firmware *me_fw; /* all family ME firmware */
>>> const struct firmware *pfp_fw; /* r6/700 PFP firmware */
>>> diff --git a/drivers/gpu/drm/radeon/radeon_device.c
>>> b/drivers/gpu/drm/radeon/radeon_device.c
>>> index c8ea050c8fa4..82633fdd399d 100644
>>> --- a/drivers/gpu/drm/radeon/radeon_device.c
>>> +++ b/drivers/gpu/drm/radeon/radeon_device.c
>>> @@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>>> down_write(&rdev->exclusive_lock);
>>> if (!rdev->needs_reset) {
>>> + WARN_ON(rdev->in_reset);
>>> up_write(&rdev->exclusive_lock);
>>> return 0;
>>> }
>>> rdev->needs_reset = false;
>>> -
>>> - radeon_save_bios_scratch_regs(rdev);
>>> - /* block TTM */
>>> resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
>>> - radeon_pm_suspend(rdev);
>>> - radeon_suspend(rdev);
>>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>> - ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
>>> - &ring_data[i]);
>>> - if (ring_sizes[i]) {
>>> - saved = true;
>>> - dev_info(rdev->dev, "Saved %d dwords of commands "
>>> - "on ring %d.\n", ring_sizes[i], i);
>>> + if (!rdev->in_reset) {
>>> + rdev->in_reset = true;
>>> +
>>> + radeon_save_bios_scratch_regs(rdev);
>>> + /* block TTM */
>>> + radeon_pm_suspend(rdev);
>>> + radeon_suspend(rdev);
>>
>> That won't work correctly because you might end up with calling pm_resume
>> more often than suspend and that can only lead to a crash. Saying this we
>> probably already have a bug in the reset code at this point anyway, but see
>> below.
>>
>>
>>> +
>>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>> + ring_sizes[i] = radeon_ring_backup(rdev,
>>> &rdev->ring[i],
>>> + &ring_data[i]);
>>> + if (ring_sizes[i]) {
>>> + saved = true;
>>> + dev_info(rdev->dev, "Saved %d dwords of
>>> commands "
>>> + "on ring %d.\n", ring_sizes[i],
>>> i);
>>> + }
>>> }
>>> - }
>>> + } else
>>> + memset(ring_data, 0, sizeof(ring_data));
>>> -retry:
>>> r = radeon_asic_reset(rdev);
>>> if (!r) {
>>> dev_info(rdev->dev, "GPU reset succeeded, trying to
>>> resume\n");
>>> @@ -1702,40 +1707,46 @@ retry:
>>> radeon_restore_bios_scratch_regs(rdev);
>>
>> We should resume PM here as well.
>>
>>
>>> - if (!r) {
>>> + if (!r && saved) {
>>> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>> radeon_ring_restore(rdev, &rdev->ring[i],
>>> ring_sizes[i], ring_data[i]);
>>> - ring_sizes[i] = 0;
>>> ring_data[i] = NULL;
>>> }
>>> + } else {
>>> + radeon_fence_driver_force_completion(rdev);
>>> +
>>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>> + kfree(ring_data[i]);
>>> + }
>>> + }
>>> + downgrade_write(&rdev->exclusive_lock);
>>> + ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>>
>> I would unlock the delayed_workqueue first and then downgrade the readlock.
>>
>>
>>> + if (!r) {
>>> r = radeon_ib_ring_tests(rdev);
>>> if (r) {
>>> dev_err(rdev->dev, "ib ring test failed (%d).\n",
>>> r);
>>> if (saved) {
>>> - saved = false;
>>> + /* if reset fails, try without saving data
>>> */
>>> + rdev->needs_reset = true;
>>> radeon_suspend(rdev);
>>> - goto retry;
>>> + up_read(&rdev->exclusive_lock);
>>> + return -EAGAIN;
>>> }
>>> }
>>> - } else {
>>> - radeon_fence_driver_force_completion(rdev);
>>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>> - kfree(ring_data[i]);
>>> - }
>>> }
>>>
>>
>>> radeon_pm_resume(rdev);
>>
>> Move this more up.
>>
>> Alex is more into this, but it's probably a bug in the current reset code
>> that this is after the IB tests, cause the IB tests needs everything powered
>> up and with PM handling suspended it is possible that individual blocks are
>> powered down.
> Yeah, looks like a bug. I think the attached patch should fix it.
>
> Alex
>
>> Thanks,
>> Christian.
>>
>>
>>> drm_helper_resume_force_mode(rdev->ddev);
>>> - ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>>> if (r) {
>>> /* bad news, how to tell it to userspace ? */
>>> dev_info(rdev->dev, "GPU reset failed\n");
>>> }
>>> - up_write(&rdev->exclusive_lock);
>>> + rdev->in_reset = false;
>>> + up_read(&rdev->exclusive_lock);
>>> return r;
>>> }
>>>
>>
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests
2014-08-18 16:07 ` Christian König
@ 2014-08-18 16:10 ` Alex Deucher
2014-08-18 20:02 ` Maarten Lankhorst
0 siblings, 1 reply; 13+ messages in thread
From: Alex Deucher @ 2014-08-18 16:10 UTC (permalink / raw)
To: Christian König; +Cc: dri-devel@lists.freedesktop.org
On Mon, Aug 18, 2014 at 12:07 PM, Christian König
<deathsimple@vodafone.de> wrote:
>> Yeah, looks like a bug. I think the attached patch should fix it.
>
> Sounds logical and the patch is Reviewed-by: Christian König
> <christian.koenig@amd.com>
>
> Going to apply Maartens patch on top and test that one a bit to make sure it
> works as expected.
pushed my current -fixes queue to my drm-fixes-3.17-wip branch if that helps.
Alex
>
> Regards,
> Christian.
>
> Am 18.08.2014 um 18:03 schrieb Alex Deucher:
>
>> On Mon, Aug 18, 2014 at 11:02 AM, Christian König
>> <deathsimple@vodafone.de> wrote:
>>>
>>> Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst:
>>>
>>>> This is needed for the next commit, because the lockup detection
>>>> will need the read lock to run.
>>>>
>>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
>>>> ---
>>>> drivers/gpu/drm/radeon/radeon.h | 2 +-
>>>> drivers/gpu/drm/radeon/radeon_device.c | 61
>>>> ++++++++++++++++++++--------------
>>>> 2 files changed, 37 insertions(+), 26 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>>>> b/drivers/gpu/drm/radeon/radeon.h
>>>> index 9e1732eb402c..1d806983ec7b 100644
>>>> --- a/drivers/gpu/drm/radeon/radeon.h
>>>> +++ b/drivers/gpu/drm/radeon/radeon.h
>>>> @@ -2312,7 +2312,7 @@ struct radeon_device {
>>>> bool need_dma32;
>>>> bool accel_working;
>>>> bool fastfb_working; /* IGP
>>>> feature*/
>>>> - bool needs_reset;
>>>> + bool needs_reset, in_reset;
>>>> struct radeon_surface_reg
>>>> surface_regs[RADEON_GEM_MAX_SURFACES];
>>>> const struct firmware *me_fw; /* all family ME firmware */
>>>> const struct firmware *pfp_fw; /* r6/700 PFP firmware */
>>>> diff --git a/drivers/gpu/drm/radeon/radeon_device.c
>>>> b/drivers/gpu/drm/radeon/radeon_device.c
>>>> index c8ea050c8fa4..82633fdd399d 100644
>>>> --- a/drivers/gpu/drm/radeon/radeon_device.c
>>>> +++ b/drivers/gpu/drm/radeon/radeon_device.c
>>>> @@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>>>> down_write(&rdev->exclusive_lock);
>>>> if (!rdev->needs_reset) {
>>>> + WARN_ON(rdev->in_reset);
>>>> up_write(&rdev->exclusive_lock);
>>>> return 0;
>>>> }
>>>> rdev->needs_reset = false;
>>>> -
>>>> - radeon_save_bios_scratch_regs(rdev);
>>>> - /* block TTM */
>>>> resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
>>>> - radeon_pm_suspend(rdev);
>>>> - radeon_suspend(rdev);
>>>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> - ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
>>>> - &ring_data[i]);
>>>> - if (ring_sizes[i]) {
>>>> - saved = true;
>>>> - dev_info(rdev->dev, "Saved %d dwords of commands
>>>> "
>>>> - "on ring %d.\n", ring_sizes[i], i);
>>>> + if (!rdev->in_reset) {
>>>> + rdev->in_reset = true;
>>>> +
>>>> + radeon_save_bios_scratch_regs(rdev);
>>>> + /* block TTM */
>>>> + radeon_pm_suspend(rdev);
>>>> + radeon_suspend(rdev);
>>>
>>>
>>> That won't work correctly because you might end up with calling pm_resume
>>> more often than suspend and that can only lead to a crash. Saying this we
>>> probably already have a bug in the reset code at this point anyway, but
>>> see
>>> below.
>>>
>>>
>>>> +
>>>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> + ring_sizes[i] = radeon_ring_backup(rdev,
>>>> &rdev->ring[i],
>>>> +
>>>> &ring_data[i]);
>>>> + if (ring_sizes[i]) {
>>>> + saved = true;
>>>> + dev_info(rdev->dev, "Saved %d dwords of
>>>> commands "
>>>> + "on ring %d.\n", ring_sizes[i],
>>>> i);
>>>> + }
>>>> }
>>>> - }
>>>> + } else
>>>> + memset(ring_data, 0, sizeof(ring_data));
>>>> -retry:
>>>> r = radeon_asic_reset(rdev);
>>>> if (!r) {
>>>> dev_info(rdev->dev, "GPU reset succeeded, trying to
>>>> resume\n");
>>>> @@ -1702,40 +1707,46 @@ retry:
>>>> radeon_restore_bios_scratch_regs(rdev);
>>>
>>>
>>> We should resume PM here as well.
>>>
>>>
>>>> - if (!r) {
>>>> + if (!r && saved) {
>>>> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> radeon_ring_restore(rdev, &rdev->ring[i],
>>>> ring_sizes[i],
>>>> ring_data[i]);
>>>> - ring_sizes[i] = 0;
>>>> ring_data[i] = NULL;
>>>> }
>>>> + } else {
>>>> + radeon_fence_driver_force_completion(rdev);
>>>> +
>>>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> + kfree(ring_data[i]);
>>>> + }
>>>> + }
>>>> + downgrade_write(&rdev->exclusive_lock);
>>>> + ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>>>
>>>
>>> I would unlock the delayed_workqueue first and then downgrade the
>>> readlock.
>>>
>>>
>>>> + if (!r) {
>>>> r = radeon_ib_ring_tests(rdev);
>>>> if (r) {
>>>> dev_err(rdev->dev, "ib ring test failed
>>>> (%d).\n",
>>>> r);
>>>> if (saved) {
>>>> - saved = false;
>>>> + /* if reset fails, try without saving
>>>> data
>>>> */
>>>> + rdev->needs_reset = true;
>>>> radeon_suspend(rdev);
>>>> - goto retry;
>>>> + up_read(&rdev->exclusive_lock);
>>>> + return -EAGAIN;
>>>> }
>>>> }
>>>> - } else {
>>>> - radeon_fence_driver_force_completion(rdev);
>>>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> - kfree(ring_data[i]);
>>>> - }
>>>> }
>>>>
>>>
>>>> radeon_pm_resume(rdev);
>>>
>>>
>>> Move this more up.
>>>
>>> Alex is more into this, but it's probably a bug in the current reset code
>>> that this is after the IB tests, cause the IB tests needs everything
>>> powered
>>> up and with PM handling suspended it is possible that individual blocks
>>> are
>>> powered down.
>>
>> Yeah, looks like a bug. I think the attached patch should fix it.
>>
>> Alex
>>
>>> Thanks,
>>> Christian.
>>>
>>>
>>>> drm_helper_resume_force_mode(rdev->ddev);
>>>> - ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>>>> if (r) {
>>>> /* bad news, how to tell it to userspace ? */
>>>> dev_info(rdev->dev, "GPU reset failed\n");
>>>> }
>>>> - up_write(&rdev->exclusive_lock);
>>>> + rdev->in_reset = false;
>>>> + up_read(&rdev->exclusive_lock);
>>>> return r;
>>>> }
>>>>
>>>
>>> _______________________________________________
>>> dri-devel mailing list
>>> dri-devel@lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>
>
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3
2014-08-18 15:28 ` Maarten Lankhorst
@ 2014-08-18 16:20 ` Christian König
0 siblings, 0 replies; 13+ messages in thread
From: Christian König @ 2014-08-18 16:20 UTC (permalink / raw)
To: Maarten Lankhorst, dri-devel@lists.freedesktop.org
Am 18.08.2014 um 17:28 schrieb Maarten Lankhorst:
> Op 18-08-14 om 17:12 schreef Christian König:
>> Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst:
>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
>>> ---
>>> V1 had a nasty bug breaking gpu lockup recovery. The fix is not
>>> allowing radeon_fence_driver_check_lockup to take exclusive_lock,
>>> and kill it during lockup recovery instead.
>>> V2 used delayed work that ran during lockup recovery, but required
>>> read lock. I've fixed this by downgrading the write, and retrying if
>>> recovery fails.
>>> ---
>>> drivers/gpu/drm/radeon/radeon.h | 2 +
>>> drivers/gpu/drm/radeon/radeon_fence.c | 115 +++++++++++++++++-----------------
>>> 2 files changed, 61 insertions(+), 56 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
>>> index 1d806983ec7b..29504efe8971 100644
>>> --- a/drivers/gpu/drm/radeon/radeon.h
>>> +++ b/drivers/gpu/drm/radeon/radeon.h
>>> @@ -355,6 +355,8 @@ struct radeon_fence_driver {
>>> uint64_t sync_seq[RADEON_NUM_RINGS];
>>> atomic64_t last_seq;
>>> bool initialized;
>>> + struct delayed_work fence_check_work;
>>> + struct radeon_device *rdev;
>> Put the reference to the device as the first field in the structure.
>>
>>> };
>>> struct radeon_fence {
>>> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
>>> index 913787085dfa..94eca53d99f8 100644
>>> --- a/drivers/gpu/drm/radeon/radeon_fence.c
>>> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
>>> @@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev,
>>> return 0;
>>> }
>>> -/**
>>> - * radeon_fence_process - process a fence
>>> - *
>>> - * @rdev: radeon_device pointer
>>> - * @ring: ring index the fence is associated with
>>> - *
>>> - * Checks the current fence value and wakes the fence queue
>>> - * if the sequence number has increased (all asics).
>>> - */
>>> -void radeon_fence_process(struct radeon_device *rdev, int ring)
>>> +static bool __radeon_fence_process(struct radeon_device *rdev, int ring)
>> Don't use "__" for internal radeon function names and especially don't remove the function documentation.
> I've moved the documentation to the new radeon_fence_process function that calls the __radeon_fence_process one,
> which is an internal function that is only used by the driver. I guess I can rename it to __radeon_fence_process_nowake or something instead,
> but documentation doesn't make sense since it's not used outside of radeon_fence.c
We try to document even static functions in doxygen style even if you
can't see them outside the C file.
>>> {
>>> uint64_t seq, last_seq, last_emitted;
>>> unsigned count_loop = 0;
>>> @@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
>>> }
>>> } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
>>> - if (wake)
>>> + if (seq < last_emitted && !rdev->in_reset)
>>> + mod_delayed_work(system_power_efficient_wq,
>>> + &rdev->fence_drv[ring].fence_check_work,
>>> + RADEON_FENCE_JIFFIES_TIMEOUT);
>> Am I wrong or do you queue the work only after radeon_fence_process is called for the first time?
>>
>> Might be a good idea to have an explicit queue_delayed_work in radeon_fence_emit as well.
> Yeah might as well, with these changes it makes sense to run it as soon as possible.
>
> But if there are no waiters, there's no real need. It's a tree falling in a forest with no-one around to hear it. ;-)
That's why I suggested to schedule the work item only when the IRQ is
enabled for waiting, otherwise it indeed doesn't make much sense.
Christian.
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests
2014-08-18 16:10 ` Alex Deucher
@ 2014-08-18 20:02 ` Maarten Lankhorst
2014-08-18 20:56 ` Alex Deucher
0 siblings, 1 reply; 13+ messages in thread
From: Maarten Lankhorst @ 2014-08-18 20:02 UTC (permalink / raw)
To: Alex Deucher, Christian König; +Cc: dri-devel@lists.freedesktop.org
Hey,
On 18-08-14 18:10, Alex Deucher wrote:
> On Mon, Aug 18, 2014 at 12:07 PM, Christian König
> <deathsimple@vodafone.de> wrote:
>>> Yeah, looks like a bug. I think the attached patch should fix it.
>>
>> Sounds logical and the patch is Reviewed-by: Christian König
>> <christian.koenig@amd.com>
>>
>> Going to apply Maartens patch on top and test that one a bit to make sure it
>> works as expected.
>
> pushed my current -fixes queue to my drm-fixes-3.17-wip branch if that helps.
Thanks, maybe that fixes uvd on resume for me. :-)
I'll have to rework it to include the changes, but does resuming everything in the order of my v2 patch look sane?
Then as a final act I'm downgrading to read, and run the tests.
~Maarten
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests
2014-08-18 20:02 ` Maarten Lankhorst
@ 2014-08-18 20:56 ` Alex Deucher
0 siblings, 0 replies; 13+ messages in thread
From: Alex Deucher @ 2014-08-18 20:56 UTC (permalink / raw)
To: Maarten Lankhorst; +Cc: dri-devel@lists.freedesktop.org
[-- Attachment #1: Type: text/plain, Size: 1063 bytes --]
On Mon, Aug 18, 2014 at 4:02 PM, Maarten Lankhorst
<maarten.lankhorst@canonical.com> wrote:
> Hey,
>
> On 18-08-14 18:10, Alex Deucher wrote:
>> On Mon, Aug 18, 2014 at 12:07 PM, Christian König
>> <deathsimple@vodafone.de> wrote:
>>>> Yeah, looks like a bug. I think the attached patch should fix it.
>>>
>>> Sounds logical and the patch is Reviewed-by: Christian König
>>> <christian.koenig@amd.com>
>>>
>>> Going to apply Maartens patch on top and test that one a bit to make sure it
>>> works as expected.
>>
>> pushed my current -fixes queue to my drm-fixes-3.17-wip branch if that helps.
>
> Thanks, maybe that fixes uvd on resume for me. :-)
>
> I'll have to rework it to include the changes, but does resuming everything in the order of my v2 patch look sane?
> Then as a final act I'm downgrading to read, and run the tests.
Seems sane. Looks like we probably also need this attached patch as
well to be on the safe side for displays. It would be nice to unify
the suspend/resume and gpu_reset paths at some point.
Alex
[-- Attachment #2: 0001-drm-radeon-fix-display-handling-in-radeon_gpu_reset.patch --]
[-- Type: text/x-diff, Size: 1677 bytes --]
From e8cfbe410871de7a4805f13c6f18c3551741c639 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Mon, 18 Aug 2014 16:51:46 -0400
Subject: [PATCH] drm/radeon: fix display handling in radeon_gpu_reset
If the display hw was reset or a hard reset was used,
we need to re-init some of the common display hardware as well.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
---
drivers/gpu/drm/radeon/radeon_device.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 8e61870..6a219bc 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1681,6 +1681,7 @@ int radeon_gpu_reset(struct radeon_device *rdev)
/* block TTM */
resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
radeon_suspend(rdev);
+ radeon_hpd_fini(rdev);
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
@@ -1737,6 +1738,21 @@ retry:
radeon_pm_resume(rdev);
}
+ /* init dig PHYs, disp eng pll */
+ if (rdev->is_atom_bios) {
+ radeon_atom_encoder_init(rdev);
+ radeon_atom_disp_eng_pll_init(rdev);
+ /* turn on the BL */
+ if (rdev->mode_info.bl_encoder) {
+ u8 bl_level = radeon_get_backlight_level(rdev,
+ rdev->mode_info.bl_encoder);
+ radeon_set_backlight_level(rdev, rdev->mode_info.bl_encoder,
+ bl_level);
+ }
+ }
+ /* reset hpd state */
+ radeon_hpd_init(rdev);
+
drm_helper_resume_force_mode(rdev->ddev);
/* set the power state here in case we are a PX system or headless */
--
1.8.3.1
[-- Attachment #3: Type: text/plain, Size: 159 bytes --]
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 13+ messages in thread
end of thread, other threads:[~2014-08-18 20:56 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-08-18 14:45 [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Maarten Lankhorst
2014-08-18 14:45 ` [PATCH 2/3] drm/radeon: handle lockup in delayed work, v3 Maarten Lankhorst
2014-08-18 15:12 ` Christian König
2014-08-18 15:28 ` Maarten Lankhorst
2014-08-18 16:20 ` Christian König
2014-08-18 14:46 ` [PATCH 3/3] drm/radeon: add timeout argument to radeon_fence_wait_seq Maarten Lankhorst
2014-08-18 15:02 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Christian König
2014-08-18 15:24 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests, v2 Maarten Lankhorst
2014-08-18 16:03 ` [PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests Alex Deucher
2014-08-18 16:07 ` Christian König
2014-08-18 16:10 ` Alex Deucher
2014-08-18 20:02 ` Maarten Lankhorst
2014-08-18 20:56 ` Alex Deucher
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.