* [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test
@ 2013-10-14 9:32 Christian König
2013-10-14 9:32 ` [PATCH 2/2] drm/radeon: rework and fix reset detection v2 Christian König
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Christian König @ 2013-10-14 9:32 UTC (permalink / raw)
To: dri-devel
From: Christian König <christian.koenig@amd.com>
Stop leaking IB memory and scratch register space when the test fails.
Signed-off-by: Christian König <christian.koenig@amd.com>
---
drivers/gpu/drm/radeon/cik.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
index b874ccd..8f393df 100644
--- a/drivers/gpu/drm/radeon/cik.c
+++ b/drivers/gpu/drm/radeon/cik.c
@@ -3182,6 +3182,7 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
r = radeon_ib_get(rdev, ring->idx, &ib, NULL, 256);
if (r) {
DRM_ERROR("radeon: failed to get ib (%d).\n", r);
+ radeon_scratch_free(rdev, scratch);
return r;
}
ib.ptr[0] = PACKET3(PACKET3_SET_UCONFIG_REG, 1);
@@ -3198,6 +3199,8 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
r = radeon_fence_wait(ib.fence, false);
if (r) {
DRM_ERROR("radeon: fence wait failed (%d).\n", r);
+ radeon_scratch_free(rdev, scratch);
+ radeon_ib_free(rdev, &ib);
return r;
}
for (i = 0; i < rdev->usec_timeout; i++) {
--
1.8.1.2
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 2/2] drm/radeon: rework and fix reset detection v2
2013-10-14 9:32 [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test Christian König
@ 2013-10-14 9:32 ` Christian König
2013-10-14 13:20 ` Dieter Nützel
2013-10-14 17:35 ` [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test Alex Deucher
2013-10-14 19:13 ` Marek Olšák
2 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2013-10-14 9:32 UTC (permalink / raw)
To: dri-devel
From: Christian König <christian.koenig@amd.com>
Stop fiddling with jiffies, always wait for RADEON_FENCE_JIFFIES_TIMEOUT.
Consolidate the two wait sequence implementations into just one function.
Activate all waiters and remember if the reset was already done instead of
trying to reset from only one thread.
v2: clear reset flag earlier to avoid timeout in IB test
Signed-off-by: Christian König <christian.koenig@amd.com>
---
drivers/gpu/drm/radeon/radeon.h | 2 +-
drivers/gpu/drm/radeon/radeon_device.c | 8 +
drivers/gpu/drm/radeon/radeon_fence.c | 347 +++++++++++----------------------
3 files changed, 127 insertions(+), 230 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index a400ac1..0201c6e 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -327,7 +327,6 @@ struct radeon_fence_driver {
/* sync_seq is protected by ring emission lock */
uint64_t sync_seq[RADEON_NUM_RINGS];
atomic64_t last_seq;
- unsigned long last_activity;
bool initialized;
};
@@ -2170,6 +2169,7 @@ struct radeon_device {
bool need_dma32;
bool accel_working;
bool fastfb_working; /* IGP feature*/
+ bool needs_reset;
struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
const struct firmware *me_fw; /* all family ME firmware */
const struct firmware *pfp_fw; /* r6/700 PFP firmware */
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 841d0e0..3f35f21 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1549,6 +1549,14 @@ int radeon_gpu_reset(struct radeon_device *rdev)
int resched;
down_write(&rdev->exclusive_lock);
+
+ if (!rdev->needs_reset) {
+ up_write(&rdev->exclusive_lock);
+ return 0;
+ }
+
+ rdev->needs_reset = false;
+
radeon_save_bios_scratch_regs(rdev);
/* block TTM */
resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index ddb8f8e..b8f68b2 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -190,10 +190,8 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
}
} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
- if (wake) {
- rdev->fence_drv[ring].last_activity = jiffies;
+ if (wake)
wake_up_all(&rdev->fence_queue);
- }
}
/**
@@ -212,13 +210,13 @@ static void radeon_fence_destroy(struct kref *kref)
}
/**
- * radeon_fence_seq_signaled - check if a fence sequeuce number has signaled
+ * radeon_fence_seq_signaled - check if a fence sequence number has signaled
*
* @rdev: radeon device pointer
* @seq: sequence number
* @ring: ring index the fence is associated with
*
- * Check if the last singled fence sequnce number is >= the requested
+ * Check if the last signaled fence sequnce number is >= the requested
* sequence number (all asics).
* Returns true if the fence has signaled (current fence value
* is >= requested value) or false if it has not (current fence
@@ -263,113 +261,131 @@ bool radeon_fence_signaled(struct radeon_fence *fence)
}
/**
- * radeon_fence_wait_seq - wait for a specific sequence number
+ * radeon_fence_any_seq_signaled - check if any sequence number is signaled
*
* @rdev: radeon device pointer
- * @target_seq: sequence number we want to wait for
- * @ring: ring index the fence is associated with
+ * @seq: sequence numbers
+ *
+ * Check if the last signaled fence sequnce number is >= the requested
+ * sequence number (all asics).
+ * Returns true if any has signaled (current value is >= requested value)
+ * or false if it has not. Helper function for radeon_fence_wait_seq.
+ */
+static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
+{
+ unsigned i;
+
+ for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+ if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i))
+ return true;
+ }
+ return false;
+}
+
+/**
+ * radeon_fence_wait_seq - wait for a specific sequence numbers
+ *
+ * @rdev: radeon device pointer
+ * @target_seq: sequence number(s) we want to wait for
* @intr: use interruptable sleep
* @lock_ring: whether the ring should be locked or not
*
- * Wait for the requested sequence number to be written (all asics).
+ * Wait for the requested sequence number(s) to be written by any ring
+ * (all asics). Sequnce number array is indexed by ring id.
* @intr selects whether to use interruptable (true) or non-interruptable
* (false) sleep when waiting for the sequence number. Helper function
- * for radeon_fence_wait(), et al.
+ * for radeon_fence_wait_*().
* Returns 0 if the sequence number has passed, error for all other cases.
- * -EDEADLK is returned when a GPU lockup has been detected and the ring is
- * marked as not ready so no further jobs get scheduled until a successful
- * reset.
+ * -EDEADLK is returned when a GPU lockup has been detected.
*/
-static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 target_seq,
- unsigned ring, bool intr, bool lock_ring)
+static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
+ bool intr, bool lock_ring)
{
- unsigned long timeout, last_activity;
- uint64_t seq;
- unsigned i;
+ uint64_t last_seq[RADEON_NUM_RINGS];
bool signaled;
- int r;
+ int i, r;
+
+ while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
+
+ /* Save current sequence values, used to check for GPU lockups */
+ for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+ if (!target_seq[i])
+ continue;
- while (target_seq > atomic64_read(&rdev->fence_drv[ring].last_seq)) {
- if (!rdev->ring[ring].ready) {
- return -EBUSY;
+ last_seq[i] = atomic64_read(&rdev->fence_drv[i].last_seq);
+ trace_radeon_fence_wait_begin(rdev->ddev, target_seq[i]);
+ radeon_irq_kms_sw_irq_get(rdev, i);
}
- timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
- if (time_after(rdev->fence_drv[ring].last_activity, timeout)) {
- /* the normal case, timeout is somewhere before last_activity */
- timeout = rdev->fence_drv[ring].last_activity - timeout;
+ if (intr) {
+ r = wait_event_interruptible_timeout(rdev->fence_queue, (
+ (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
+ || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
} else {
- /* either jiffies wrapped around, or no fence was signaled in the last 500ms
- * anyway we will just wait for the minimum amount and then check for a lockup
- */
- timeout = 1;
+ r = wait_event_timeout(rdev->fence_queue, (
+ (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
+ || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
}
- seq = atomic64_read(&rdev->fence_drv[ring].last_seq);
- /* Save current last activity valuee, used to check for GPU lockups */
- last_activity = rdev->fence_drv[ring].last_activity;
- trace_radeon_fence_wait_begin(rdev->ddev, seq);
- radeon_irq_kms_sw_irq_get(rdev, ring);
- if (intr) {
- r = wait_event_interruptible_timeout(rdev->fence_queue,
- (signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
- timeout);
- } else {
- r = wait_event_timeout(rdev->fence_queue,
- (signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
- timeout);
+ for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+ if (!target_seq[i])
+ continue;
+
+ radeon_irq_kms_sw_irq_put(rdev, i);
+ trace_radeon_fence_wait_end(rdev->ddev, target_seq[i]);
}
- radeon_irq_kms_sw_irq_put(rdev, ring);
- if (unlikely(r < 0)) {
+
+ if (unlikely(r < 0))
return r;
- }
- trace_radeon_fence_wait_end(rdev->ddev, seq);
if (unlikely(!signaled)) {
+ if (rdev->needs_reset)
+ return -EDEADLK;
+
/* we were interrupted for some reason and fence
* isn't signaled yet, resume waiting */
- if (r) {
+ if (r)
continue;
+
+ for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+ if (!target_seq[i])
+ continue;
+
+ if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
+ break;
}
- /* check if sequence value has changed since last_activity */
- if (seq != atomic64_read(&rdev->fence_drv[ring].last_seq)) {
+ if (i != RADEON_NUM_RINGS)
continue;
- }
- if (lock_ring) {
+ if (lock_ring)
mutex_lock(&rdev->ring_lock);
- }
- /* test if somebody else has already decided that this is a lockup */
- if (last_activity != rdev->fence_drv[ring].last_activity) {
- if (lock_ring) {
- mutex_unlock(&rdev->ring_lock);
- }
- continue;
+ for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+ if (!target_seq[i])
+ continue;
+
+ if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
+ break;
}
- if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
+ if (i < RADEON_NUM_RINGS) {
/* good news we believe it's a lockup */
- dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx last fence id 0x%016llx)\n",
- target_seq, seq);
-
- /* change last activity so nobody else think there is a lockup */
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- rdev->fence_drv[i].last_activity = jiffies;
- }
-
- /* mark the ring as not ready any more */
- rdev->ring[ring].ready = false;
- if (lock_ring) {
+ dev_warn(rdev->dev, "GPU lockup (waiting for "
+ "0x%016llx last fence id 0x%016llx on"
+ " ring %d)\n",
+ target_seq[i], last_seq[i], i);
+
+ /* remember that we need an reset */
+ rdev->needs_reset = true;
+ if (lock_ring)
mutex_unlock(&rdev->ring_lock);
- }
+ wake_up_all(&rdev->fence_queue);
return -EDEADLK;
}
- if (lock_ring) {
+ if (lock_ring)
mutex_unlock(&rdev->ring_lock);
- }
}
}
return 0;
@@ -388,6 +404,7 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 target_seq,
*/
int radeon_fence_wait(struct radeon_fence *fence, bool intr)
{
+ uint64_t seq[RADEON_NUM_RINGS] = {};
int r;
if (fence == NULL) {
@@ -395,147 +412,15 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
return -EINVAL;
}
- r = radeon_fence_wait_seq(fence->rdev, fence->seq,
- fence->ring, intr, true);
- if (r) {
- return r;
- }
- fence->seq = RADEON_FENCE_SIGNALED_SEQ;
- return 0;
-}
-
-static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
-{
- unsigned i;
-
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i)) {
- return true;
- }
- }
- return false;
-}
-
-/**
- * radeon_fence_wait_any_seq - wait for a sequence number on any ring
- *
- * @rdev: radeon device pointer
- * @target_seq: sequence number(s) we want to wait for
- * @intr: use interruptable sleep
- *
- * Wait for the requested sequence number(s) to be written by any ring
- * (all asics). Sequnce number array is indexed by ring id.
- * @intr selects whether to use interruptable (true) or non-interruptable
- * (false) sleep when waiting for the sequence number. Helper function
- * for radeon_fence_wait_any(), et al.
- * Returns 0 if the sequence number has passed, error for all other cases.
- */
-static int radeon_fence_wait_any_seq(struct radeon_device *rdev,
- u64 *target_seq, bool intr)
-{
- unsigned long timeout, last_activity, tmp;
- unsigned i, ring = RADEON_NUM_RINGS;
- bool signaled;
- int r;
-
- for (i = 0, last_activity = 0; i < RADEON_NUM_RINGS; ++i) {
- if (!target_seq[i]) {
- continue;
- }
-
- /* use the most recent one as indicator */
- if (time_after(rdev->fence_drv[i].last_activity, last_activity)) {
- last_activity = rdev->fence_drv[i].last_activity;
- }
-
- /* For lockup detection just pick the lowest ring we are
- * actively waiting for
- */
- if (i < ring) {
- ring = i;
- }
- }
-
- /* nothing to wait for ? */
- if (ring == RADEON_NUM_RINGS) {
- return -ENOENT;
- }
-
- while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
- timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
- if (time_after(last_activity, timeout)) {
- /* the normal case, timeout is somewhere before last_activity */
- timeout = last_activity - timeout;
- } else {
- /* either jiffies wrapped around, or no fence was signaled in the last 500ms
- * anyway we will just wait for the minimum amount and then check for a lockup
- */
- timeout = 1;
- }
+ seq[fence->ring] = fence->seq;
+ if (seq[fence->ring] == RADEON_FENCE_SIGNALED_SEQ)
+ return 0;
- trace_radeon_fence_wait_begin(rdev->ddev, target_seq[ring]);
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- if (target_seq[i]) {
- radeon_irq_kms_sw_irq_get(rdev, i);
- }
- }
- if (intr) {
- r = wait_event_interruptible_timeout(rdev->fence_queue,
- (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
- timeout);
- } else {
- r = wait_event_timeout(rdev->fence_queue,
- (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
- timeout);
- }
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- if (target_seq[i]) {
- radeon_irq_kms_sw_irq_put(rdev, i);
- }
- }
- if (unlikely(r < 0)) {
- return r;
- }
- trace_radeon_fence_wait_end(rdev->ddev, target_seq[ring]);
-
- if (unlikely(!signaled)) {
- /* we were interrupted for some reason and fence
- * isn't signaled yet, resume waiting */
- if (r) {
- continue;
- }
-
- mutex_lock(&rdev->ring_lock);
- for (i = 0, tmp = 0; i < RADEON_NUM_RINGS; ++i) {
- if (time_after(rdev->fence_drv[i].last_activity, tmp)) {
- tmp = rdev->fence_drv[i].last_activity;
- }
- }
- /* test if somebody else has already decided that this is a lockup */
- if (last_activity != tmp) {
- last_activity = tmp;
- mutex_unlock(&rdev->ring_lock);
- continue;
- }
-
- if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
- /* good news we believe it's a lockup */
- dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx)\n",
- target_seq[ring]);
-
- /* change last activity so nobody else think there is a lockup */
- for (i = 0; i < RADEON_NUM_RINGS; ++i) {
- rdev->fence_drv[i].last_activity = jiffies;
- }
+ r = radeon_fence_wait_seq(fence->rdev, seq, intr, true);
+ if (r)
+ return r;
- /* mark the ring as not ready any more */
- rdev->ring[ring].ready = false;
- mutex_unlock(&rdev->ring_lock);
- return -EDEADLK;
- }
- mutex_unlock(&rdev->ring_lock);
- }
- }
+ fence->seq = RADEON_FENCE_SIGNALED_SEQ;
return 0;
}
@@ -557,7 +442,7 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
bool intr)
{
uint64_t seq[RADEON_NUM_RINGS];
- unsigned i;
+ unsigned i, num_rings = 0;
int r;
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -567,15 +452,19 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
continue;
}
- if (fences[i]->seq == RADEON_FENCE_SIGNALED_SEQ) {
- /* something was allready signaled */
- return 0;
- }
-
seq[i] = fences[i]->seq;
+ ++num_rings;
+
+ /* test if something was allready signaled */
+ if (seq[i] == RADEON_FENCE_SIGNALED_SEQ)
+ return 0;
}
- r = radeon_fence_wait_any_seq(rdev, seq, intr);
+ /* nothing to wait for ? */
+ if (num_rings == 0)
+ return -ENOENT;
+
+ r = radeon_fence_wait_seq(rdev, seq, intr, true);
if (r) {
return r;
}
@@ -594,15 +483,15 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
*/
int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
{
- uint64_t seq;
+ uint64_t seq[RADEON_NUM_RINGS] = {};
- seq = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
- if (seq >= rdev->fence_drv[ring].sync_seq[ring]) {
+ seq[ring] = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
+ if (seq[ring] >= rdev->fence_drv[ring].sync_seq[ring]) {
/* nothing to wait for, last_seq is
already the last emited fence */
return -ENOENT;
}
- return radeon_fence_wait_seq(rdev, seq, ring, false, false);
+ return radeon_fence_wait_seq(rdev, seq, false, false);
}
/**
@@ -617,14 +506,15 @@ int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
*/
int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
{
- uint64_t seq = rdev->fence_drv[ring].sync_seq[ring];
+ uint64_t seq[RADEON_NUM_RINGS] = {};
int r;
- r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
+ seq[ring] = rdev->fence_drv[ring].sync_seq[ring];
+ r = radeon_fence_wait_seq(rdev, seq, false, false);
if (r) {
- if (r == -EDEADLK) {
+ if (r == -EDEADLK)
return -EDEADLK;
- }
+
dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%d)\n",
ring, r);
}
@@ -826,7 +716,6 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
for (i = 0; i < RADEON_NUM_RINGS; ++i)
rdev->fence_drv[ring].sync_seq[i] = 0;
atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
- rdev->fence_drv[ring].last_activity = jiffies;
rdev->fence_drv[ring].initialized = false;
}
--
1.8.1.2
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] drm/radeon: rework and fix reset detection v2
2013-10-14 9:32 ` [PATCH 2/2] drm/radeon: rework and fix reset detection v2 Christian König
@ 2013-10-14 13:20 ` Dieter Nützel
0 siblings, 0 replies; 9+ messages in thread
From: Dieter Nützel @ 2013-10-14 13:20 UTC (permalink / raw)
To: Christian König, Dan Carpenter
Cc: dri-devel, dri-devel-bounces+dieter=nuetzel-hh.de
Am 14.10.2013 11:32, schrieb Christian König:
> From: Christian König <christian.koenig@amd.com>
>
> Stop fiddling with jiffies, always wait for
> RADEON_FENCE_JIFFIES_TIMEOUT.
> Consolidate the two wait sequence implementations into just one
> function.
> Activate all waiters and remember if the reset was already done instead
> of
> trying to reset from only one thread.
>
> v2: clear reset flag earlier to avoid timeout in IB test
Hello Christian,
even for radeon_fence.c Dan Carpenter had a question/patch on October,
6
[patch] drm/radeon: tweak a timeout condition slightly
and nobody replied.
Greetings,
Dieter
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
> drivers/gpu/drm/radeon/radeon.h | 2 +-
> drivers/gpu/drm/radeon/radeon_device.c | 8 +
> drivers/gpu/drm/radeon/radeon_fence.c | 347
> +++++++++++----------------------
> 3 files changed, 127 insertions(+), 230 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h
> b/drivers/gpu/drm/radeon/radeon.h
> index a400ac1..0201c6e 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -327,7 +327,6 @@ struct radeon_fence_driver {
> /* sync_seq is protected by ring emission lock */
> uint64_t sync_seq[RADEON_NUM_RINGS];
> atomic64_t last_seq;
> - unsigned long last_activity;
> bool initialized;
> };
>
> @@ -2170,6 +2169,7 @@ struct radeon_device {
> bool need_dma32;
> bool accel_working;
> bool fastfb_working; /* IGP feature*/
> + bool needs_reset;
> struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
> const struct firmware *me_fw; /* all family ME firmware */
> const struct firmware *pfp_fw; /* r6/700 PFP firmware */
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c
> b/drivers/gpu/drm/radeon/radeon_device.c
> index 841d0e0..3f35f21 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -1549,6 +1549,14 @@ int radeon_gpu_reset(struct radeon_device *rdev)
> int resched;
>
> down_write(&rdev->exclusive_lock);
> +
> + if (!rdev->needs_reset) {
> + up_write(&rdev->exclusive_lock);
> + return 0;
> + }
> +
> + rdev->needs_reset = false;
> +
> radeon_save_bios_scratch_regs(rdev);
> /* block TTM */
> resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c
> b/drivers/gpu/drm/radeon/radeon_fence.c
> index ddb8f8e..b8f68b2 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -190,10 +190,8 @@ void radeon_fence_process(struct radeon_device
> *rdev, int ring)
> }
> } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
>
> - if (wake) {
> - rdev->fence_drv[ring].last_activity = jiffies;
> + if (wake)
> wake_up_all(&rdev->fence_queue);
> - }
> }
>
> /**
> @@ -212,13 +210,13 @@ static void radeon_fence_destroy(struct kref
> *kref)
> }
>
> /**
> - * radeon_fence_seq_signaled - check if a fence sequeuce number has
> signaled
> + * radeon_fence_seq_signaled - check if a fence sequence number has
> signaled
> *
> * @rdev: radeon device pointer
> * @seq: sequence number
> * @ring: ring index the fence is associated with
> *
> - * Check if the last singled fence sequnce number is >= the requested
> + * Check if the last signaled fence sequnce number is >= the requested
> * sequence number (all asics).
> * Returns true if the fence has signaled (current fence value
> * is >= requested value) or false if it has not (current fence
> @@ -263,113 +261,131 @@ bool radeon_fence_signaled(struct radeon_fence
> *fence)
> }
>
> /**
> - * radeon_fence_wait_seq - wait for a specific sequence number
> + * radeon_fence_any_seq_signaled - check if any sequence number is
> signaled
> *
> * @rdev: radeon device pointer
> - * @target_seq: sequence number we want to wait for
> - * @ring: ring index the fence is associated with
> + * @seq: sequence numbers
> + *
> + * Check if the last signaled fence sequnce number is >= the requested
> + * sequence number (all asics).
> + * Returns true if any has signaled (current value is >= requested
> value)
> + * or false if it has not. Helper function for radeon_fence_wait_seq.
> + */
> +static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev,
> u64 *seq)
> +{
> + unsigned i;
> +
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i))
> + return true;
> + }
> + return false;
> +}
> +
> +/**
> + * radeon_fence_wait_seq - wait for a specific sequence numbers
> + *
> + * @rdev: radeon device pointer
> + * @target_seq: sequence number(s) we want to wait for
> * @intr: use interruptable sleep
> * @lock_ring: whether the ring should be locked or not
> *
> - * Wait for the requested sequence number to be written (all asics).
> + * Wait for the requested sequence number(s) to be written by any ring
> + * (all asics). Sequnce number array is indexed by ring id.
> * @intr selects whether to use interruptable (true) or
> non-interruptable
> * (false) sleep when waiting for the sequence number. Helper
> function
> - * for radeon_fence_wait(), et al.
> + * for radeon_fence_wait_*().
> * Returns 0 if the sequence number has passed, error for all other
> cases.
> - * -EDEADLK is returned when a GPU lockup has been detected and the
> ring is
> - * marked as not ready so no further jobs get scheduled until a
> successful
> - * reset.
> + * -EDEADLK is returned when a GPU lockup has been detected.
> */
> -static int radeon_fence_wait_seq(struct radeon_device *rdev, u64
> target_seq,
> - unsigned ring, bool intr, bool lock_ring)
> +static int radeon_fence_wait_seq(struct radeon_device *rdev, u64
> *target_seq,
> + bool intr, bool lock_ring)
> {
> - unsigned long timeout, last_activity;
> - uint64_t seq;
> - unsigned i;
> + uint64_t last_seq[RADEON_NUM_RINGS];
> bool signaled;
> - int r;
> + int i, r;
> +
> + while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
> +
> + /* Save current sequence values, used to check for GPU lockups */
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (!target_seq[i])
> + continue;
>
> - while (target_seq > atomic64_read(&rdev->fence_drv[ring].last_seq)) {
> - if (!rdev->ring[ring].ready) {
> - return -EBUSY;
> + last_seq[i] = atomic64_read(&rdev->fence_drv[i].last_seq);
> + trace_radeon_fence_wait_begin(rdev->ddev, target_seq[i]);
> + radeon_irq_kms_sw_irq_get(rdev, i);
> }
>
> - timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
> - if (time_after(rdev->fence_drv[ring].last_activity, timeout)) {
> - /* the normal case, timeout is somewhere before last_activity */
> - timeout = rdev->fence_drv[ring].last_activity - timeout;
> + if (intr) {
> + r = wait_event_interruptible_timeout(rdev->fence_queue, (
> + (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
> + || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
> } else {
> - /* either jiffies wrapped around, or no fence was signaled in the
> last 500ms
> - * anyway we will just wait for the minimum amount and then check
> for a lockup
> - */
> - timeout = 1;
> + r = wait_event_timeout(rdev->fence_queue, (
> + (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
> + || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
> }
> - seq = atomic64_read(&rdev->fence_drv[ring].last_seq);
> - /* Save current last activity valuee, used to check for GPU lockups
> */
> - last_activity = rdev->fence_drv[ring].last_activity;
>
> - trace_radeon_fence_wait_begin(rdev->ddev, seq);
> - radeon_irq_kms_sw_irq_get(rdev, ring);
> - if (intr) {
> - r = wait_event_interruptible_timeout(rdev->fence_queue,
> - (signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
> - timeout);
> - } else {
> - r = wait_event_timeout(rdev->fence_queue,
> - (signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
> - timeout);
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (!target_seq[i])
> + continue;
> +
> + radeon_irq_kms_sw_irq_put(rdev, i);
> + trace_radeon_fence_wait_end(rdev->ddev, target_seq[i]);
> }
> - radeon_irq_kms_sw_irq_put(rdev, ring);
> - if (unlikely(r < 0)) {
> +
> + if (unlikely(r < 0))
> return r;
> - }
> - trace_radeon_fence_wait_end(rdev->ddev, seq);
>
> if (unlikely(!signaled)) {
> + if (rdev->needs_reset)
> + return -EDEADLK;
> +
> /* we were interrupted for some reason and fence
> * isn't signaled yet, resume waiting */
> - if (r) {
> + if (r)
> continue;
> +
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (!target_seq[i])
> + continue;
> +
> + if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
> + break;
> }
>
> - /* check if sequence value has changed since last_activity */
> - if (seq != atomic64_read(&rdev->fence_drv[ring].last_seq)) {
> + if (i != RADEON_NUM_RINGS)
> continue;
> - }
>
> - if (lock_ring) {
> + if (lock_ring)
> mutex_lock(&rdev->ring_lock);
> - }
>
> - /* test if somebody else has already decided that this is a lockup
> */
> - if (last_activity != rdev->fence_drv[ring].last_activity) {
> - if (lock_ring) {
> - mutex_unlock(&rdev->ring_lock);
> - }
> - continue;
> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> + if (!target_seq[i])
> + continue;
> +
> + if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
> + break;
> }
>
> - if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
> + if (i < RADEON_NUM_RINGS) {
> /* good news we believe it's a lockup */
> - dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx last fence
> id 0x%016llx)\n",
> - target_seq, seq);
> -
> - /* change last activity so nobody else think there is a lockup */
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - rdev->fence_drv[i].last_activity = jiffies;
> - }
> -
> - /* mark the ring as not ready any more */
> - rdev->ring[ring].ready = false;
> - if (lock_ring) {
> + dev_warn(rdev->dev, "GPU lockup (waiting for "
> + "0x%016llx last fence id 0x%016llx on"
> + " ring %d)\n",
> + target_seq[i], last_seq[i], i);
> +
> + /* remember that we need an reset */
> + rdev->needs_reset = true;
> + if (lock_ring)
> mutex_unlock(&rdev->ring_lock);
> - }
> + wake_up_all(&rdev->fence_queue);
> return -EDEADLK;
> }
>
> - if (lock_ring) {
> + if (lock_ring)
> mutex_unlock(&rdev->ring_lock);
> - }
> }
> }
> return 0;
> @@ -388,6 +404,7 @@ static int radeon_fence_wait_seq(struct
> radeon_device *rdev, u64 target_seq,
> */
> int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> {
> + uint64_t seq[RADEON_NUM_RINGS] = {};
> int r;
>
> if (fence == NULL) {
> @@ -395,147 +412,15 @@ int radeon_fence_wait(struct radeon_fence
> *fence, bool intr)
> return -EINVAL;
> }
>
> - r = radeon_fence_wait_seq(fence->rdev, fence->seq,
> - fence->ring, intr, true);
> - if (r) {
> - return r;
> - }
> - fence->seq = RADEON_FENCE_SIGNALED_SEQ;
> - return 0;
> -}
> -
> -static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev,
> u64 *seq)
> -{
> - unsigned i;
> -
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i)) {
> - return true;
> - }
> - }
> - return false;
> -}
> -
> -/**
> - * radeon_fence_wait_any_seq - wait for a sequence number on any ring
> - *
> - * @rdev: radeon device pointer
> - * @target_seq: sequence number(s) we want to wait for
> - * @intr: use interruptable sleep
> - *
> - * Wait for the requested sequence number(s) to be written by any ring
> - * (all asics). Sequnce number array is indexed by ring id.
> - * @intr selects whether to use interruptable (true) or
> non-interruptable
> - * (false) sleep when waiting for the sequence number. Helper
> function
> - * for radeon_fence_wait_any(), et al.
> - * Returns 0 if the sequence number has passed, error for all other
> cases.
> - */
> -static int radeon_fence_wait_any_seq(struct radeon_device *rdev,
> - u64 *target_seq, bool intr)
> -{
> - unsigned long timeout, last_activity, tmp;
> - unsigned i, ring = RADEON_NUM_RINGS;
> - bool signaled;
> - int r;
> -
> - for (i = 0, last_activity = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (!target_seq[i]) {
> - continue;
> - }
> -
> - /* use the most recent one as indicator */
> - if (time_after(rdev->fence_drv[i].last_activity, last_activity)) {
> - last_activity = rdev->fence_drv[i].last_activity;
> - }
> -
> - /* For lockup detection just pick the lowest ring we are
> - * actively waiting for
> - */
> - if (i < ring) {
> - ring = i;
> - }
> - }
> -
> - /* nothing to wait for ? */
> - if (ring == RADEON_NUM_RINGS) {
> - return -ENOENT;
> - }
> -
> - while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
> - timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
> - if (time_after(last_activity, timeout)) {
> - /* the normal case, timeout is somewhere before last_activity */
> - timeout = last_activity - timeout;
> - } else {
> - /* either jiffies wrapped around, or no fence was signaled in the
> last 500ms
> - * anyway we will just wait for the minimum amount and then check
> for a lockup
> - */
> - timeout = 1;
> - }
> + seq[fence->ring] = fence->seq;
> + if (seq[fence->ring] == RADEON_FENCE_SIGNALED_SEQ)
> + return 0;
>
> - trace_radeon_fence_wait_begin(rdev->ddev, target_seq[ring]);
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (target_seq[i]) {
> - radeon_irq_kms_sw_irq_get(rdev, i);
> - }
> - }
> - if (intr) {
> - r = wait_event_interruptible_timeout(rdev->fence_queue,
> - (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
> - timeout);
> - } else {
> - r = wait_event_timeout(rdev->fence_queue,
> - (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
> - timeout);
> - }
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (target_seq[i]) {
> - radeon_irq_kms_sw_irq_put(rdev, i);
> - }
> - }
> - if (unlikely(r < 0)) {
> - return r;
> - }
> - trace_radeon_fence_wait_end(rdev->ddev, target_seq[ring]);
> -
> - if (unlikely(!signaled)) {
> - /* we were interrupted for some reason and fence
> - * isn't signaled yet, resume waiting */
> - if (r) {
> - continue;
> - }
> -
> - mutex_lock(&rdev->ring_lock);
> - for (i = 0, tmp = 0; i < RADEON_NUM_RINGS; ++i) {
> - if (time_after(rdev->fence_drv[i].last_activity, tmp)) {
> - tmp = rdev->fence_drv[i].last_activity;
> - }
> - }
> - /* test if somebody else has already decided that this is a lockup
> */
> - if (last_activity != tmp) {
> - last_activity = tmp;
> - mutex_unlock(&rdev->ring_lock);
> - continue;
> - }
> -
> - if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
> - /* good news we believe it's a lockup */
> - dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx)\n",
> - target_seq[ring]);
> -
> - /* change last activity so nobody else think there is a lockup */
> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> - rdev->fence_drv[i].last_activity = jiffies;
> - }
> + r = radeon_fence_wait_seq(fence->rdev, seq, intr, true);
> + if (r)
> + return r;
>
> - /* mark the ring as not ready any more */
> - rdev->ring[ring].ready = false;
> - mutex_unlock(&rdev->ring_lock);
> - return -EDEADLK;
> - }
> - mutex_unlock(&rdev->ring_lock);
> - }
> - }
> + fence->seq = RADEON_FENCE_SIGNALED_SEQ;
> return 0;
> }
>
> @@ -557,7 +442,7 @@ int radeon_fence_wait_any(struct radeon_device
> *rdev,
> bool intr)
> {
> uint64_t seq[RADEON_NUM_RINGS];
> - unsigned i;
> + unsigned i, num_rings = 0;
> int r;
>
> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> @@ -567,15 +452,19 @@ int radeon_fence_wait_any(struct radeon_device
> *rdev,
> continue;
> }
>
> - if (fences[i]->seq == RADEON_FENCE_SIGNALED_SEQ) {
> - /* something was allready signaled */
> - return 0;
> - }
> -
> seq[i] = fences[i]->seq;
> + ++num_rings;
> +
> + /* test if something was allready signaled */
> + if (seq[i] == RADEON_FENCE_SIGNALED_SEQ)
> + return 0;
> }
>
> - r = radeon_fence_wait_any_seq(rdev, seq, intr);
> + /* nothing to wait for ? */
> + if (num_rings == 0)
> + return -ENOENT;
> +
> + r = radeon_fence_wait_seq(rdev, seq, intr, true);
> if (r) {
> return r;
> }
> @@ -594,15 +483,15 @@ int radeon_fence_wait_any(struct radeon_device
> *rdev,
> */
> int radeon_fence_wait_next_locked(struct radeon_device *rdev, int
> ring)
> {
> - uint64_t seq;
> + uint64_t seq[RADEON_NUM_RINGS] = {};
>
> - seq = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
> - if (seq >= rdev->fence_drv[ring].sync_seq[ring]) {
> + seq[ring] = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
> + if (seq[ring] >= rdev->fence_drv[ring].sync_seq[ring]) {
> /* nothing to wait for, last_seq is
> already the last emited fence */
> return -ENOENT;
> }
> - return radeon_fence_wait_seq(rdev, seq, ring, false, false);
> + return radeon_fence_wait_seq(rdev, seq, false, false);
> }
>
> /**
> @@ -617,14 +506,15 @@ int radeon_fence_wait_next_locked(struct
> radeon_device *rdev, int ring)
> */
> int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int
> ring)
> {
> - uint64_t seq = rdev->fence_drv[ring].sync_seq[ring];
> + uint64_t seq[RADEON_NUM_RINGS] = {};
> int r;
>
> - r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
> + seq[ring] = rdev->fence_drv[ring].sync_seq[ring];
> + r = radeon_fence_wait_seq(rdev, seq, false, false);
> if (r) {
> - if (r == -EDEADLK) {
> + if (r == -EDEADLK)
> return -EDEADLK;
> - }
> +
> dev_err(rdev->dev, "error waiting for ring[%d] to become idle
> (%d)\n",
> ring, r);
> }
> @@ -826,7 +716,6 @@ static void radeon_fence_driver_init_ring(struct
> radeon_device *rdev, int ring)
> for (i = 0; i < RADEON_NUM_RINGS; ++i)
> rdev->fence_drv[ring].sync_seq[i] = 0;
> atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
> - rdev->fence_drv[ring].last_activity = jiffies;
> rdev->fence_drv[ring].initialized = false;
> }
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test
2013-10-14 9:32 [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test Christian König
2013-10-14 9:32 ` [PATCH 2/2] drm/radeon: rework and fix reset detection v2 Christian König
@ 2013-10-14 17:35 ` Alex Deucher
2013-10-14 19:13 ` Marek Olšák
2 siblings, 0 replies; 9+ messages in thread
From: Alex Deucher @ 2013-10-14 17:35 UTC (permalink / raw)
To: Christian König; +Cc: Maling list - DRI developers
On Mon, Oct 14, 2013 at 5:32 AM, Christian König
<deathsimple@vodafone.de> wrote:
> From: Christian König <christian.koenig@amd.com>
>
> Stop leaking IB memory and scratch register space when the test fails.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
Both patches applied.
Thanks!
Alex
> ---
> drivers/gpu/drm/radeon/cik.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
> index b874ccd..8f393df 100644
> --- a/drivers/gpu/drm/radeon/cik.c
> +++ b/drivers/gpu/drm/radeon/cik.c
> @@ -3182,6 +3182,7 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> r = radeon_ib_get(rdev, ring->idx, &ib, NULL, 256);
> if (r) {
> DRM_ERROR("radeon: failed to get ib (%d).\n", r);
> + radeon_scratch_free(rdev, scratch);
> return r;
> }
> ib.ptr[0] = PACKET3(PACKET3_SET_UCONFIG_REG, 1);
> @@ -3198,6 +3199,8 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> r = radeon_fence_wait(ib.fence, false);
> if (r) {
> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> + radeon_scratch_free(rdev, scratch);
> + radeon_ib_free(rdev, &ib);
> return r;
> }
> for (i = 0; i < rdev->usec_timeout; i++) {
> --
> 1.8.1.2
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test
2013-10-14 9:32 [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test Christian König
2013-10-14 9:32 ` [PATCH 2/2] drm/radeon: rework and fix reset detection v2 Christian König
2013-10-14 17:35 ` [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test Alex Deucher
@ 2013-10-14 19:13 ` Marek Olšák
2013-10-14 19:34 ` Marek Olšák
2 siblings, 1 reply; 9+ messages in thread
From: Marek Olšák @ 2013-10-14 19:13 UTC (permalink / raw)
To: Christian König; +Cc: dri-devel
I tested this and had over 1546 lockups followed by a successful GPU
reset. Then the kernel probably crashed (judging by the fact ssh was
dead). Still, it's pretty impressive.
There is a new problem though. The X server sometimes gets stuck in
GEM_WAIT and waits forever, even if there were no lockups before. It
occurs very rarely though. I didn't see this issue without your
patches.
Marek
On Mon, Oct 14, 2013 at 11:32 AM, Christian König
<deathsimple@vodafone.de> wrote:
> From: Christian König <christian.koenig@amd.com>
>
> Stop leaking IB memory and scratch register space when the test fails.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
> drivers/gpu/drm/radeon/cik.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
> index b874ccd..8f393df 100644
> --- a/drivers/gpu/drm/radeon/cik.c
> +++ b/drivers/gpu/drm/radeon/cik.c
> @@ -3182,6 +3182,7 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> r = radeon_ib_get(rdev, ring->idx, &ib, NULL, 256);
> if (r) {
> DRM_ERROR("radeon: failed to get ib (%d).\n", r);
> + radeon_scratch_free(rdev, scratch);
> return r;
> }
> ib.ptr[0] = PACKET3(PACKET3_SET_UCONFIG_REG, 1);
> @@ -3198,6 +3199,8 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
> r = radeon_fence_wait(ib.fence, false);
> if (r) {
> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> + radeon_scratch_free(rdev, scratch);
> + radeon_ib_free(rdev, &ib);
> return r;
> }
> for (i = 0; i < rdev->usec_timeout; i++) {
> --
> 1.8.1.2
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test
2013-10-14 19:13 ` Marek Olšák
@ 2013-10-14 19:34 ` Marek Olšák
2013-10-15 9:11 ` Christian König
0 siblings, 1 reply; 9+ messages in thread
From: Marek Olšák @ 2013-10-14 19:34 UTC (permalink / raw)
To: Christian König; +Cc: dri-devel
Ooops, the new problem is not so rare. It has now happened to me 3
times in an hour.
Marek
On Mon, Oct 14, 2013 at 9:13 PM, Marek Olšák <maraeo@gmail.com> wrote:
> I tested this and had over 1546 lockups followed by a successful GPU
> reset. Then the kernel probably crashed (judging by the fact ssh was
> dead). Still, it's pretty impressive.
>
> There is a new problem though. The X server sometimes gets stuck in
> GEM_WAIT and waits forever, even if there were no lockups before. It
> occurs very rarely though. I didn't see this issue without your
> patches.
>
> Marek
>
> On Mon, Oct 14, 2013 at 11:32 AM, Christian König
> <deathsimple@vodafone.de> wrote:
>> From: Christian König <christian.koenig@amd.com>
>>
>> Stop leaking IB memory and scratch register space when the test fails.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>> drivers/gpu/drm/radeon/cik.c | 3 +++
>> 1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
>> index b874ccd..8f393df 100644
>> --- a/drivers/gpu/drm/radeon/cik.c
>> +++ b/drivers/gpu/drm/radeon/cik.c
>> @@ -3182,6 +3182,7 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>> r = radeon_ib_get(rdev, ring->idx, &ib, NULL, 256);
>> if (r) {
>> DRM_ERROR("radeon: failed to get ib (%d).\n", r);
>> + radeon_scratch_free(rdev, scratch);
>> return r;
>> }
>> ib.ptr[0] = PACKET3(PACKET3_SET_UCONFIG_REG, 1);
>> @@ -3198,6 +3199,8 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>> r = radeon_fence_wait(ib.fence, false);
>> if (r) {
>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> + radeon_scratch_free(rdev, scratch);
>> + radeon_ib_free(rdev, &ib);
>> return r;
>> }
>> for (i = 0; i < rdev->usec_timeout; i++) {
>> --
>> 1.8.1.2
>>
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test
2013-10-14 19:34 ` Marek Olšák
@ 2013-10-15 9:11 ` Christian König
2013-10-15 10:57 ` Marek Olšák
0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2013-10-15 9:11 UTC (permalink / raw)
To: Marek Olšák; +Cc: dri-devel
Mhm hard to say what's going wrong this time, but we probably need to
fix it before the final release.
Do you have a kernel backtrace from the lockups? Or at least some way to
reproduce it?
Christian.
Am 14.10.2013 21:34, schrieb Marek Olšák:
> Ooops, the new problem is not so rare. It has now happened to me 3
> times in an hour.
>
> Marek
>
> On Mon, Oct 14, 2013 at 9:13 PM, Marek Olšák <maraeo@gmail.com> wrote:
>> I tested this and had over 1546 lockups followed by a successful GPU
>> reset. Then the kernel probably crashed (judging by the fact ssh was
>> dead). Still, it's pretty impressive.
>>
>> There is a new problem though. The X server sometimes gets stuck in
>> GEM_WAIT and waits forever, even if there were no lockups before. It
>> occurs very rarely though. I didn't see this issue without your
>> patches.
>>
>> Marek
>>
>> On Mon, Oct 14, 2013 at 11:32 AM, Christian König
>> <deathsimple@vodafone.de> wrote:
>>> From: Christian König <christian.koenig@amd.com>
>>>
>>> Stop leaking IB memory and scratch register space when the test fails.
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> ---
>>> drivers/gpu/drm/radeon/cik.c | 3 +++
>>> 1 file changed, 3 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
>>> index b874ccd..8f393df 100644
>>> --- a/drivers/gpu/drm/radeon/cik.c
>>> +++ b/drivers/gpu/drm/radeon/cik.c
>>> @@ -3182,6 +3182,7 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>>> r = radeon_ib_get(rdev, ring->idx, &ib, NULL, 256);
>>> if (r) {
>>> DRM_ERROR("radeon: failed to get ib (%d).\n", r);
>>> + radeon_scratch_free(rdev, scratch);
>>> return r;
>>> }
>>> ib.ptr[0] = PACKET3(PACKET3_SET_UCONFIG_REG, 1);
>>> @@ -3198,6 +3199,8 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>>> r = radeon_fence_wait(ib.fence, false);
>>> if (r) {
>>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>> + radeon_scratch_free(rdev, scratch);
>>> + radeon_ib_free(rdev, &ib);
>>> return r;
>>> }
>>> for (i = 0; i < rdev->usec_timeout; i++) {
>>> --
>>> 1.8.1.2
>>>
>>> _______________________________________________
>>> dri-devel mailing list
>>> dri-devel@lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test
2013-10-15 9:11 ` Christian König
@ 2013-10-15 10:57 ` Marek Olšák
2013-10-16 11:31 ` Christian König
0 siblings, 1 reply; 9+ messages in thread
From: Marek Olšák @ 2013-10-15 10:57 UTC (permalink / raw)
To: Christian König; +Cc: dri-devel
They are not lockups. X just freezes in GEM_WAIT. The only way to
reproduce it is to apply the patches, use the computer and wait. It
looks like a fence is not signalled and the process calling GEM_WAIT
is not woken up.
Marek
On Tue, Oct 15, 2013 at 11:11 AM, Christian König
<deathsimple@vodafone.de> wrote:
> Mhm hard to say what's going wrong this time, but we probably need to fix it
> before the final release.
>
> Do you have a kernel backtrace from the lockups? Or at least some way to
> reproduce it?
>
> Christian.
>
> Am 14.10.2013 21:34, schrieb Marek Olšák:
>
>> Ooops, the new problem is not so rare. It has now happened to me 3
>> times in an hour.
>>
>> Marek
>>
>> On Mon, Oct 14, 2013 at 9:13 PM, Marek Olšák <maraeo@gmail.com> wrote:
>>>
>>> I tested this and had over 1546 lockups followed by a successful GPU
>>> reset. Then the kernel probably crashed (judging by the fact ssh was
>>> dead). Still, it's pretty impressive.
>>>
>>> There is a new problem though. The X server sometimes gets stuck in
>>> GEM_WAIT and waits forever, even if there were no lockups before. It
>>> occurs very rarely though. I didn't see this issue without your
>>> patches.
>>>
>>> Marek
>>>
>>> On Mon, Oct 14, 2013 at 11:32 AM, Christian König
>>> <deathsimple@vodafone.de> wrote:
>>>>
>>>> From: Christian König <christian.koenig@amd.com>
>>>>
>>>> Stop leaking IB memory and scratch register space when the test fails.
>>>>
>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>> ---
>>>> drivers/gpu/drm/radeon/cik.c | 3 +++
>>>> 1 file changed, 3 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
>>>> index b874ccd..8f393df 100644
>>>> --- a/drivers/gpu/drm/radeon/cik.c
>>>> +++ b/drivers/gpu/drm/radeon/cik.c
>>>> @@ -3182,6 +3182,7 @@ int cik_ib_test(struct radeon_device *rdev, struct
>>>> radeon_ring *ring)
>>>> r = radeon_ib_get(rdev, ring->idx, &ib, NULL, 256);
>>>> if (r) {
>>>> DRM_ERROR("radeon: failed to get ib (%d).\n", r);
>>>> + radeon_scratch_free(rdev, scratch);
>>>> return r;
>>>> }
>>>> ib.ptr[0] = PACKET3(PACKET3_SET_UCONFIG_REG, 1);
>>>> @@ -3198,6 +3199,8 @@ int cik_ib_test(struct radeon_device *rdev, struct
>>>> radeon_ring *ring)
>>>> r = radeon_fence_wait(ib.fence, false);
>>>> if (r) {
>>>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>>> + radeon_scratch_free(rdev, scratch);
>>>> + radeon_ib_free(rdev, &ib);
>>>> return r;
>>>> }
>>>> for (i = 0; i < rdev->usec_timeout; i++) {
>>>> --
>>>> 1.8.1.2
>>>>
>>>> _______________________________________________
>>>> dri-devel mailing list
>>>> dri-devel@lists.freedesktop.org
>>>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>
>
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test
2013-10-15 10:57 ` Marek Olšák
@ 2013-10-16 11:31 ` Christian König
0 siblings, 0 replies; 9+ messages in thread
From: Christian König @ 2013-10-16 11:31 UTC (permalink / raw)
To: Marek Olšák; +Cc: dri-devel
Strange. I tested X with ~120 glxgears instances which got killed and
restarted every 60-120 seconds for the whole night, but without any
lockup or freeze.
What's the kernel backtrace when this happens? If I understand you
correctly X is killable in that situation, is that right?
Please try the following:
echo 1 >
/sys/kernel/debug/tracing/events/radeon/radeon_fence_wait_begin/enable
echo 1 >
/sys/kernel/debug/tracing/events/radeon/radeon_fence_wait_end/enable
before starting X. And when X freezed "cat /sys/kernel/debug/tracing/trace".
Thanks,
Christian.
Am 15.10.2013 12:57, schrieb Marek Olšák:
> They are not lockups. X just freezes in GEM_WAIT. The only way to
> reproduce it is to apply the patches, use the computer and wait. It
> looks like a fence is not signalled and the process calling GEM_WAIT
> is not woken up.
>
> Marek
>
> On Tue, Oct 15, 2013 at 11:11 AM, Christian König
> <deathsimple@vodafone.de> wrote:
>> Mhm hard to say what's going wrong this time, but we probably need to fix it
>> before the final release.
>>
>> Do you have a kernel backtrace from the lockups? Or at least some way to
>> reproduce it?
>>
>> Christian.
>>
>> Am 14.10.2013 21:34, schrieb Marek Olšák:
>>
>>> Ooops, the new problem is not so rare. It has now happened to me 3
>>> times in an hour.
>>>
>>> Marek
>>>
>>> On Mon, Oct 14, 2013 at 9:13 PM, Marek Olšák <maraeo@gmail.com> wrote:
>>>> I tested this and had over 1546 lockups followed by a successful GPU
>>>> reset. Then the kernel probably crashed (judging by the fact ssh was
>>>> dead). Still, it's pretty impressive.
>>>>
>>>> There is a new problem though. The X server sometimes gets stuck in
>>>> GEM_WAIT and waits forever, even if there were no lockups before. It
>>>> occurs very rarely though. I didn't see this issue without your
>>>> patches.
>>>>
>>>> Marek
>>>>
>>>> On Mon, Oct 14, 2013 at 11:32 AM, Christian König
>>>> <deathsimple@vodafone.de> wrote:
>>>>> From: Christian König <christian.koenig@amd.com>
>>>>>
>>>>> Stop leaking IB memory and scratch register space when the test fails.
>>>>>
>>>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>>>> ---
>>>>> drivers/gpu/drm/radeon/cik.c | 3 +++
>>>>> 1 file changed, 3 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
>>>>> index b874ccd..8f393df 100644
>>>>> --- a/drivers/gpu/drm/radeon/cik.c
>>>>> +++ b/drivers/gpu/drm/radeon/cik.c
>>>>> @@ -3182,6 +3182,7 @@ int cik_ib_test(struct radeon_device *rdev, struct
>>>>> radeon_ring *ring)
>>>>> r = radeon_ib_get(rdev, ring->idx, &ib, NULL, 256);
>>>>> if (r) {
>>>>> DRM_ERROR("radeon: failed to get ib (%d).\n", r);
>>>>> + radeon_scratch_free(rdev, scratch);
>>>>> return r;
>>>>> }
>>>>> ib.ptr[0] = PACKET3(PACKET3_SET_UCONFIG_REG, 1);
>>>>> @@ -3198,6 +3199,8 @@ int cik_ib_test(struct radeon_device *rdev, struct
>>>>> radeon_ring *ring)
>>>>> r = radeon_fence_wait(ib.fence, false);
>>>>> if (r) {
>>>>> DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>>>> + radeon_scratch_free(rdev, scratch);
>>>>> + radeon_ib_free(rdev, &ib);
>>>>> return r;
>>>>> }
>>>>> for (i = 0; i < rdev->usec_timeout; i++) {
>>>>> --
>>>>> 1.8.1.2
>>>>>
>>>>> _______________________________________________
>>>>> dri-devel mailing list
>>>>> dri-devel@lists.freedesktop.org
>>>>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2013-10-16 11:31 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-10-14 9:32 [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test Christian König
2013-10-14 9:32 ` [PATCH 2/2] drm/radeon: rework and fix reset detection v2 Christian König
2013-10-14 13:20 ` Dieter Nützel
2013-10-14 17:35 ` [PATCH 1/2] drm/radeon: stop the leaks in cik_ib_test Alex Deucher
2013-10-14 19:13 ` Marek Olšák
2013-10-14 19:34 ` Marek Olšák
2013-10-15 9:11 ` Christian König
2013-10-15 10:57 ` Marek Olšák
2013-10-16 11:31 ` Christian König
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.