* [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads
@ 2019-02-15 10:27 Chris Wilson
2019-02-15 10:27 ` [PATCH 2/2] drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset() Chris Wilson
` (3 more replies)
0 siblings, 4 replies; 10+ messages in thread
From: Chris Wilson @ 2019-02-15 10:27 UTC (permalink / raw)
To: intel-gfx; +Cc: Mika Kuoppala
In selftests/live_hangcheck, we have a lot of tests for resetting simple
spinners, but nothing quite prepared us for how the GPU reacted to
triggering a reset outside of the safe spinner. These two subtests fill
the ring with plain old empty, non-spinning requests, and then triggers
a reset. Without a user-payload to blame, these requests will exercise
the 'non-started' paths and mostly be replayed verbatim.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
.../gpu/drm/i915/selftests/intel_hangcheck.c | 217 ++++++++++++++++++
1 file changed, 217 insertions(+)
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 92475596ff40..f75cb56ff8ad 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -415,6 +415,221 @@ static bool wait_for_idle(struct intel_engine_cs *engine)
return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}
+static int igt_reset_nop(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct intel_engine_cs *engine;
+ struct i915_gem_context *ctx;
+ unsigned int reset_count, count;
+ enum intel_engine_id id;
+ intel_wakeref_t wakeref;
+ struct drm_file *file;
+ IGT_TIMEOUT(end_time);
+ int err = 0;
+
+ /* Check that we can reset during non-user portions of requests */
+
+ file = mock_file(i915);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ mutex_lock(&i915->drm.struct_mutex);
+ ctx = live_context(i915, file);
+ mutex_unlock(&i915->drm.struct_mutex);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto out;
+ }
+
+ wakeref = intel_runtime_pm_get(i915);
+
+ reset_count = i915_reset_count(&i915->gpu_error);
+ count = 0;
+ do {
+ mutex_lock(&i915->drm.struct_mutex);
+ for_each_engine(engine, i915, id) {
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_alloc(engine, ctx);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ break;
+ }
+
+ i915_request_add(rq);
+ }
+ }
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ igt_global_reset_lock(i915);
+ i915_reset(i915, ALL_ENGINES, NULL);
+ igt_global_reset_unlock(i915);
+ if (i915_terminally_wedged(&i915->gpu_error)) {
+ err = -EIO;
+ break;
+ }
+
+ if (i915_reset_count(&i915->gpu_error) !=
+ reset_count + ++count) {
+ pr_err("Full GPU reset not recorded!\n");
+ err = -EINVAL;
+ break;
+ }
+
+ if (!i915_reset_flush(i915)) {
+ struct drm_printer p =
+ drm_info_printer(i915->drm.dev);
+
+ pr_err("%s failed to idle after reset\n",
+ engine->name);
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
+
+ err = -EIO;
+ break;
+ }
+
+ err = igt_flush_test(i915, 0);
+ if (err)
+ break;
+ } while (time_before(jiffies, end_time));
+ pr_info("%s: %d resets\n", __func__, count);
+
+ mutex_lock(&i915->drm.struct_mutex);
+ err = igt_flush_test(i915, I915_WAIT_LOCKED);
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ intel_runtime_pm_put(i915, wakeref);
+
+out:
+ mock_file_free(i915, file);
+ if (i915_terminally_wedged(&i915->gpu_error))
+ err = -EIO;
+ return err;
+}
+
+static int igt_reset_nop_engine(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct intel_engine_cs *engine;
+ struct i915_gem_context *ctx;
+ enum intel_engine_id id;
+ intel_wakeref_t wakeref;
+ struct drm_file *file;
+ int err = 0;
+
+ /* Check that we can engine-reset during non-user portions */
+
+ if (!intel_has_reset_engine(i915))
+ return 0;
+
+ file = mock_file(i915);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ mutex_lock(&i915->drm.struct_mutex);
+ ctx = live_context(i915, file);
+ mutex_unlock(&i915->drm.struct_mutex);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto out;
+ }
+
+ wakeref = intel_runtime_pm_get(i915);
+ for_each_engine(engine, i915, id) {
+ unsigned int reset_count, reset_engine_count;
+ unsigned int count;
+ IGT_TIMEOUT(end_time);
+
+ reset_count = i915_reset_count(&i915->gpu_error);
+ reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
+ engine);
+ count = 0;
+
+ set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+ do {
+ int i;
+
+ if (!wait_for_idle(engine)) {
+ pr_err("%s failed to idle before reset\n",
+ engine->name);
+ err = -EIO;
+ break;
+ }
+
+ mutex_lock(&i915->drm.struct_mutex);
+ for (i = 0; i < 16; i++) {
+ struct i915_request *rq;
+
+ rq = i915_request_alloc(engine, ctx);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ break;
+ }
+
+ i915_request_add(rq);
+ }
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ err = i915_reset_engine(engine, NULL);
+ if (err) {
+ pr_err("i915_reset_engine failed\n");
+ break;
+ }
+
+ if (i915_reset_count(&i915->gpu_error) != reset_count) {
+ pr_err("Full GPU reset recorded! (engine reset expected)\n");
+ err = -EINVAL;
+ break;
+ }
+
+ if (i915_reset_engine_count(&i915->gpu_error, engine) !=
+ reset_engine_count + ++count) {
+ pr_err("%s engine reset not recorded!\n",
+ engine->name);
+ err = -EINVAL;
+ break;
+ }
+
+ if (!i915_reset_flush(i915)) {
+ struct drm_printer p =
+ drm_info_printer(i915->drm.dev);
+
+ pr_err("%s failed to idle after reset\n",
+ engine->name);
+ intel_engine_dump(engine, &p,
+ "%s\n", engine->name);
+
+ err = -EIO;
+ break;
+ }
+ } while (time_before(jiffies, end_time));
+ clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+ pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
+
+ if (err)
+ break;
+
+ err = igt_flush_test(i915, 0);
+ if (err)
+ break;
+ }
+
+ mutex_lock(&i915->drm.struct_mutex);
+ err = igt_flush_test(i915, I915_WAIT_LOCKED);
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ intel_runtime_pm_put(i915, wakeref);
+out:
+ mock_file_free(i915, file);
+ if (i915_terminally_wedged(&i915->gpu_error))
+ err = -EIO;
+ return err;
+}
+
static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
struct intel_engine_cs *engine;
@@ -1647,6 +1862,8 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
SUBTEST(igt_global_reset), /* attempt to recover GPU first */
SUBTEST(igt_wedged_reset),
SUBTEST(igt_hang_sanitycheck),
+ SUBTEST(igt_reset_nop),
+ SUBTEST(igt_reset_nop_engine),
SUBTEST(igt_reset_idle_engine),
SUBTEST(igt_reset_active_engine),
SUBTEST(igt_reset_engines),
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 2/2] drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset()
2019-02-15 10:27 [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads Chris Wilson
@ 2019-02-15 10:27 ` Chris Wilson
2019-02-15 15:21 ` Mika Kuoppala
2019-02-15 15:18 ` [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads Mika Kuoppala
` (2 subsequent siblings)
3 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2019-02-15 10:27 UTC (permalink / raw)
To: intel-gfx
Since we no longer need to hold struct_mutex to perform a global device
reset, don't do so for igt_reset_wedge().
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index f75cb56ff8ad..57e02840be9e 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -399,10 +399,8 @@ static int igt_wedged_reset(void *arg)
i915_gem_set_wedged(i915);
- mutex_lock(&i915->drm.struct_mutex);
GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
i915_reset(i915, ALL_ENGINES, NULL);
- mutex_unlock(&i915->drm.struct_mutex);
intel_runtime_pm_put(i915, wakeref);
igt_global_reset_unlock(i915);
--
2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads
2019-02-15 10:27 [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads Chris Wilson
2019-02-15 10:27 ` [PATCH 2/2] drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset() Chris Wilson
@ 2019-02-15 15:18 ` Mika Kuoppala
2019-02-15 15:33 ` Chris Wilson
2019-02-15 17:35 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] " Patchwork
2019-02-15 18:11 ` ✗ Fi.CI.BAT: failure " Patchwork
3 siblings, 1 reply; 10+ messages in thread
From: Mika Kuoppala @ 2019-02-15 15:18 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
Chris Wilson <chris@chris-wilson.co.uk> writes:
> In selftests/live_hangcheck, we have a lot of tests for resetting simple
> spinners, but nothing quite prepared us for how the GPU reacted to
> triggering a reset outside of the safe spinner. These two subtests fill
> the ring with plain old empty, non-spinning requests, and then triggers
> a reset. Without a user-payload to blame, these requests will exercise
> the 'non-started' paths and mostly be replayed verbatim.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
> .../gpu/drm/i915/selftests/intel_hangcheck.c | 217 ++++++++++++++++++
> 1 file changed, 217 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 92475596ff40..f75cb56ff8ad 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -415,6 +415,221 @@ static bool wait_for_idle(struct intel_engine_cs *engine)
> return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
> }
>
> +static int igt_reset_nop(void *arg)
> +{
> + struct drm_i915_private *i915 = arg;
> + struct intel_engine_cs *engine;
> + struct i915_gem_context *ctx;
> + unsigned int reset_count, count;
> + enum intel_engine_id id;
> + intel_wakeref_t wakeref;
> + struct drm_file *file;
> + IGT_TIMEOUT(end_time);
> + int err = 0;
> +
> + /* Check that we can reset during non-user portions of requests */
> +
> + file = mock_file(i915);
> + if (IS_ERR(file))
> + return PTR_ERR(file);
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + ctx = live_context(i915, file);
> + mutex_unlock(&i915->drm.struct_mutex);
> + if (IS_ERR(ctx)) {
> + err = PTR_ERR(ctx);
> + goto out;
> + }
> +
> + wakeref = intel_runtime_pm_get(i915);
> +
> + reset_count = i915_reset_count(&i915->gpu_error);
> + count = 0;
> + do {
> + mutex_lock(&i915->drm.struct_mutex);
> + for_each_engine(engine, i915, id) {
> + int i;
> +
> + for (i = 0; i < 16; i++) {
> + struct i915_request *rq;
> +
> + rq = i915_request_alloc(engine, ctx);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + break;
> + }
> +
> + i915_request_add(rq);
> + }
> + }
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + igt_global_reset_lock(i915);
> + i915_reset(i915, ALL_ENGINES, NULL);
> + igt_global_reset_unlock(i915);
> + if (i915_terminally_wedged(&i915->gpu_error)) {
> + err = -EIO;
> + break;
> + }
> +
> + if (i915_reset_count(&i915->gpu_error) !=
> + reset_count + ++count) {
> + pr_err("Full GPU reset not recorded!\n");
> + err = -EINVAL;
> + break;
> + }
> +
> + if (!i915_reset_flush(i915)) {
> + struct drm_printer p =
> + drm_info_printer(i915->drm.dev);
> +
> + pr_err("%s failed to idle after reset\n",
> + engine->name);
> + intel_engine_dump(engine, &p,
> + "%s\n", engine->name);
> +
> + err = -EIO;
> + break;
> + }
> +
> + err = igt_flush_test(i915, 0);
> + if (err)
> + break;
> + } while (time_before(jiffies, end_time));
> + pr_info("%s: %d resets\n", __func__, count);
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + err = igt_flush_test(i915, I915_WAIT_LOCKED);
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + intel_runtime_pm_put(i915, wakeref);
> +
> +out:
> + mock_file_free(i915, file);
> + if (i915_terminally_wedged(&i915->gpu_error))
> + err = -EIO;
> + return err;
> +}
> +
> +static int igt_reset_nop_engine(void *arg)
> +{
> + struct drm_i915_private *i915 = arg;
> + struct intel_engine_cs *engine;
> + struct i915_gem_context *ctx;
> + enum intel_engine_id id;
> + intel_wakeref_t wakeref;
> + struct drm_file *file;
> + int err = 0;
err = 0 for the gcc as it seems unnecessary. Regardless
I didn't spot anything out of place in this patch.
The amount of 16 requests feels low as a number but
we should have plenty of time to hit the button during.
Nice addition to reset tests. And as you mentioned
we should have plenty of resets going in on idle engines
on other tests. Not that someone would object one explicit
test into this file :)
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> +
> + /* Check that we can engine-reset during non-user portions */
> +
> + if (!intel_has_reset_engine(i915))
> + return 0;
> +
> + file = mock_file(i915);
> + if (IS_ERR(file))
> + return PTR_ERR(file);
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + ctx = live_context(i915, file);
> + mutex_unlock(&i915->drm.struct_mutex);
> + if (IS_ERR(ctx)) {
> + err = PTR_ERR(ctx);
> + goto out;
> + }
> +
> + wakeref = intel_runtime_pm_get(i915);
> + for_each_engine(engine, i915, id) {
> + unsigned int reset_count, reset_engine_count;
> + unsigned int count;
> + IGT_TIMEOUT(end_time);
> +
> + reset_count = i915_reset_count(&i915->gpu_error);
> + reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
> + engine);
> + count = 0;
> +
> + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> + do {
> + int i;
> +
> + if (!wait_for_idle(engine)) {
> + pr_err("%s failed to idle before reset\n",
> + engine->name);
> + err = -EIO;
> + break;
> + }
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + for (i = 0; i < 16; i++) {
> + struct i915_request *rq;
> +
> + rq = i915_request_alloc(engine, ctx);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + break;
> + }
> +
> + i915_request_add(rq);
> + }
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + err = i915_reset_engine(engine, NULL);
> + if (err) {
> + pr_err("i915_reset_engine failed\n");
> + break;
> + }
> +
> + if (i915_reset_count(&i915->gpu_error) != reset_count) {
> + pr_err("Full GPU reset recorded! (engine reset expected)\n");
> + err = -EINVAL;
> + break;
> + }
> +
> + if (i915_reset_engine_count(&i915->gpu_error, engine) !=
> + reset_engine_count + ++count) {
> + pr_err("%s engine reset not recorded!\n",
> + engine->name);
> + err = -EINVAL;
> + break;
> + }
> +
> + if (!i915_reset_flush(i915)) {
> + struct drm_printer p =
> + drm_info_printer(i915->drm.dev);
> +
> + pr_err("%s failed to idle after reset\n",
> + engine->name);
> + intel_engine_dump(engine, &p,
> + "%s\n", engine->name);
> +
> + err = -EIO;
> + break;
> + }
> + } while (time_before(jiffies, end_time));
> + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> + pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
> +
> + if (err)
> + break;
> +
> + err = igt_flush_test(i915, 0);
> + if (err)
> + break;
> + }
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + err = igt_flush_test(i915, I915_WAIT_LOCKED);
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + intel_runtime_pm_put(i915, wakeref);
> +out:
> + mock_file_free(i915, file);
> + if (i915_terminally_wedged(&i915->gpu_error))
> + err = -EIO;
> + return err;
> +}
> +
> static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
> {
> struct intel_engine_cs *engine;
> @@ -1647,6 +1862,8 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> SUBTEST(igt_global_reset), /* attempt to recover GPU first */
> SUBTEST(igt_wedged_reset),
> SUBTEST(igt_hang_sanitycheck),
> + SUBTEST(igt_reset_nop),
> + SUBTEST(igt_reset_nop_engine),
> SUBTEST(igt_reset_idle_engine),
> SUBTEST(igt_reset_active_engine),
> SUBTEST(igt_reset_engines),
> --
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset()
2019-02-15 10:27 ` [PATCH 2/2] drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset() Chris Wilson
@ 2019-02-15 15:21 ` Mika Kuoppala
2019-02-15 15:28 ` Chris Wilson
0 siblings, 1 reply; 10+ messages in thread
From: Mika Kuoppala @ 2019-02-15 15:21 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
Chris Wilson <chris@chris-wilson.co.uk> writes:
> Since we no longer need to hold struct_mutex to perform a global device
> reset, don't do so for igt_reset_wedge().
>
Oh but the interesting question is not about need, but should =)
Do it both ways?
-Mika
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
> drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 --
> 1 file changed, 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index f75cb56ff8ad..57e02840be9e 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -399,10 +399,8 @@ static int igt_wedged_reset(void *arg)
>
> i915_gem_set_wedged(i915);
>
> - mutex_lock(&i915->drm.struct_mutex);
> GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
> i915_reset(i915, ALL_ENGINES, NULL);
> - mutex_unlock(&i915->drm.struct_mutex);
>
> intel_runtime_pm_put(i915, wakeref);
> igt_global_reset_unlock(i915);
> --
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset()
2019-02-15 15:21 ` Mika Kuoppala
@ 2019-02-15 15:28 ` Chris Wilson
2019-02-15 15:42 ` Mika Kuoppala
0 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2019-02-15 15:28 UTC (permalink / raw)
To: Mika Kuoppala, intel-gfx
Quoting Mika Kuoppala (2019-02-15 15:21:11)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
>
> > Since we no longer need to hold struct_mutex to perform a global device
> > reset, don't do so for igt_reset_wedge().
> >
>
> Oh but the interesting question is not about need, but should =)
> Do it both ways?
Just doing it without should be enough for lockdep, and specifically
remember to guard everything with lockdep_assert_held() when required.
The reverse case of testing with it held can only prove a deadlock by
hitting it! (At which point CI reboots the machine.)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads
2019-02-15 15:18 ` [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads Mika Kuoppala
@ 2019-02-15 15:33 ` Chris Wilson
0 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2019-02-15 15:33 UTC (permalink / raw)
To: Mika Kuoppala, intel-gfx
Quoting Mika Kuoppala (2019-02-15 15:18:08)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
>
> > In selftests/live_hangcheck, we have a lot of tests for resetting simple
> > spinners, but nothing quite prepared us for how the GPU reacted to
> > triggering a reset outside of the safe spinner. These two subtests fill
> > the ring with plain old empty, non-spinning requests, and then triggers
> > a reset. Without a user-payload to blame, these requests will exercise
> > the 'non-started' paths and mostly be replayed verbatim.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> > .../gpu/drm/i915/selftests/intel_hangcheck.c | 217 ++++++++++++++++++
> > 1 file changed, 217 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > index 92475596ff40..f75cb56ff8ad 100644
> > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > @@ -415,6 +415,221 @@ static bool wait_for_idle(struct intel_engine_cs *engine)
> > return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
> > }
> >
> > +static int igt_reset_nop(void *arg)
> > +{
> > + struct drm_i915_private *i915 = arg;
> > + struct intel_engine_cs *engine;
> > + struct i915_gem_context *ctx;
> > + unsigned int reset_count, count;
> > + enum intel_engine_id id;
> > + intel_wakeref_t wakeref;
> > + struct drm_file *file;
> > + IGT_TIMEOUT(end_time);
> > + int err = 0;
> > +
> > + /* Check that we can reset during non-user portions of requests */
> > +
> > + file = mock_file(i915);
> > + if (IS_ERR(file))
> > + return PTR_ERR(file);
> > +
> > + mutex_lock(&i915->drm.struct_mutex);
> > + ctx = live_context(i915, file);
> > + mutex_unlock(&i915->drm.struct_mutex);
> > + if (IS_ERR(ctx)) {
> > + err = PTR_ERR(ctx);
> > + goto out;
> > + }
> > +
> > + wakeref = intel_runtime_pm_get(i915);
> > +
> > + reset_count = i915_reset_count(&i915->gpu_error);
> > + count = 0;
> > + do {
> > + mutex_lock(&i915->drm.struct_mutex);
> > + for_each_engine(engine, i915, id) {
> > + int i;
> > +
> > + for (i = 0; i < 16; i++) {
> > + struct i915_request *rq;
> > +
> > + rq = i915_request_alloc(engine, ctx);
> > + if (IS_ERR(rq)) {
> > + err = PTR_ERR(rq);
> > + break;
> > + }
> > +
> > + i915_request_add(rq);
> > + }
> > + }
> > + mutex_unlock(&i915->drm.struct_mutex);
> > +
> > + igt_global_reset_lock(i915);
> > + i915_reset(i915, ALL_ENGINES, NULL);
> > + igt_global_reset_unlock(i915);
> > + if (i915_terminally_wedged(&i915->gpu_error)) {
> > + err = -EIO;
> > + break;
> > + }
> > +
> > + if (i915_reset_count(&i915->gpu_error) !=
> > + reset_count + ++count) {
> > + pr_err("Full GPU reset not recorded!\n");
> > + err = -EINVAL;
> > + break;
> > + }
> > +
> > + if (!i915_reset_flush(i915)) {
> > + struct drm_printer p =
> > + drm_info_printer(i915->drm.dev);
> > +
> > + pr_err("%s failed to idle after reset\n",
> > + engine->name);
> > + intel_engine_dump(engine, &p,
> > + "%s\n", engine->name);
> > +
> > + err = -EIO;
> > + break;
> > + }
> > +
> > + err = igt_flush_test(i915, 0);
> > + if (err)
> > + break;
> > + } while (time_before(jiffies, end_time));
> > + pr_info("%s: %d resets\n", __func__, count);
> > +
> > + mutex_lock(&i915->drm.struct_mutex);
> > + err = igt_flush_test(i915, I915_WAIT_LOCKED);
> > + mutex_unlock(&i915->drm.struct_mutex);
> > +
> > + intel_runtime_pm_put(i915, wakeref);
> > +
> > +out:
> > + mock_file_free(i915, file);
> > + if (i915_terminally_wedged(&i915->gpu_error))
> > + err = -EIO;
> > + return err;
> > +}
> > +
> > +static int igt_reset_nop_engine(void *arg)
> > +{
> > + struct drm_i915_private *i915 = arg;
> > + struct intel_engine_cs *engine;
> > + struct i915_gem_context *ctx;
> > + enum intel_engine_id id;
> > + intel_wakeref_t wakeref;
> > + struct drm_file *file;
> > + int err = 0;
>
> err = 0 for the gcc as it seems unnecessary. Regardless
> I didn't spot anything out of place in this patch.
It's copy'n'paste. There probably isn't a path where it isn't set, but
I've been burnt by sparse/smatch enough around selftests not to push too
hard.
> The amount of 16 requests feels low as a number but
> we should have plenty of time to hit the button during.
Yeah. 16 was a figure to avoid running out of ring space, given that
we're using small rings (4K rather than the usual 16K) and that some
platforms are getting quite fat in their requests.
> Nice addition to reset tests. And as you mentioned
> we should have plenty of resets going in on idle engines
> on other tests. Not that someone would object one explicit
> test into this file :)
Where to go next? I was thinking of scribbling over the context images
with garbage and/or executing garbage batches... But that strays into HW
validation and away from proving the driver can handle the recovery.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset()
2019-02-15 15:28 ` Chris Wilson
@ 2019-02-15 15:42 ` Mika Kuoppala
0 siblings, 0 replies; 10+ messages in thread
From: Mika Kuoppala @ 2019-02-15 15:42 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
Chris Wilson <chris@chris-wilson.co.uk> writes:
> Quoting Mika Kuoppala (2019-02-15 15:21:11)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>>
>> > Since we no longer need to hold struct_mutex to perform a global device
>> > reset, don't do so for igt_reset_wedge().
>> >
>>
>> Oh but the interesting question is not about need, but should =)
>> Do it both ways?
>
> Just doing it without should be enough for lockdep, and specifically
> remember to guard everything with lockdep_assert_held() when required.
> The reverse case of testing with it held can only prove a deadlock by
> hitting it! (At which point CI reboots the machine.)
Not much value to prove that the old model works with new rules,
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/i915/selftests: Exercise resetting during non-user payloads
2019-02-15 10:27 [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads Chris Wilson
2019-02-15 10:27 ` [PATCH 2/2] drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset() Chris Wilson
2019-02-15 15:18 ` [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads Mika Kuoppala
@ 2019-02-15 17:35 ` Patchwork
2019-02-15 18:11 ` ✗ Fi.CI.BAT: failure " Patchwork
3 siblings, 0 replies; 10+ messages in thread
From: Patchwork @ 2019-02-15 17:35 UTC (permalink / raw)
To: intel-gfx
== Series Details ==
Series: series starting with [1/2] drm/i915/selftests: Exercise resetting during non-user payloads
URL : https://patchwork.freedesktop.org/series/56716/
State : warning
== Summary ==
$ dim checkpatch origin/drm-tip
3473db45d1ed drm/i915/selftests: Exercise resetting during non-user payloads
-:35: WARNING:LINE_SPACING: Missing a blank line after declarations
#35: FILE: drivers/gpu/drm/i915/selftests/intel_hangcheck.c:427:
+ struct drm_file *file;
+ IGT_TIMEOUT(end_time);
-:153: WARNING:LINE_SPACING: Missing a blank line after declarations
#153: FILE: drivers/gpu/drm/i915/selftests/intel_hangcheck.c:545:
+ unsigned int count;
+ IGT_TIMEOUT(end_time);
total: 0 errors, 2 warnings, 0 checks, 229 lines checked
fc54b0e0ee61 drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset()
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* ✗ Fi.CI.BAT: failure for series starting with [1/2] drm/i915/selftests: Exercise resetting during non-user payloads
2019-02-15 10:27 [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads Chris Wilson
` (2 preceding siblings ...)
2019-02-15 17:35 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] " Patchwork
@ 2019-02-15 18:11 ` Patchwork
2019-02-15 18:41 ` Chris Wilson
3 siblings, 1 reply; 10+ messages in thread
From: Patchwork @ 2019-02-15 18:11 UTC (permalink / raw)
To: intel-gfx
== Series Details ==
Series: series starting with [1/2] drm/i915/selftests: Exercise resetting during non-user payloads
URL : https://patchwork.freedesktop.org/series/56716/
State : failure
== Summary ==
CI Bug Log - changes from CI_DRM_5608 -> Patchwork_12226
====================================================
Summary
-------
**FAILURE**
Serious unknown changes coming with Patchwork_12226 absolutely need to be
verified manually.
If you think the reported changes have nothing to do with the changes
introduced in Patchwork_12226, please notify your bug team to allow them
to document this new failure mode, which will reduce false positives in CI.
External URL: https://patchwork.freedesktop.org/api/1.0/series/56716/revisions/1/mbox/
Possible new issues
-------------------
Here are the unknown changes that may have been introduced in Patchwork_12226:
### IGT changes ###
#### Possible regressions ####
* igt@i915_selftest@live_hangcheck:
- fi-bdw-gvtdvm: PASS -> DMESG-FAIL
- fi-bdw-5557u: PASS -> DMESG-FAIL
- fi-glk-j4005: PASS -> DMESG-FAIL
- fi-cfl-8109u: PASS -> DMESG-FAIL
- fi-cfl-8700k: PASS -> DMESG-FAIL
- fi-skl-iommu: PASS -> DMESG-FAIL
- fi-skl-gvtdvm: PASS -> DMESG-FAIL
- fi-bxt-j4205: PASS -> DMESG-FAIL
- fi-skl-6600u: PASS -> DMESG-FAIL
- fi-bsw-n3050: PASS -> DMESG-FAIL
- fi-skl-6770hq: PASS -> DMESG-FAIL
- fi-skl-6700k2: PASS -> DMESG-FAIL
- fi-skl-6260u: PASS -> DMESG-FAIL
#### Suppressed ####
The following results come from untrusted machines, tests, or statuses.
They do not affect the overall result.
* igt@i915_selftest@live_hangcheck:
- {fi-whl-u}: PASS -> DMESG-FAIL
- {fi-icl-u2}: PASS -> DMESG-FAIL
- {fi-icl-u3}: PASS -> DMESG-FAIL
* {igt@runner@aborted}:
- {fi-icl-y}: NOTRUN -> FAIL
Known issues
------------
Here are the changes found in Patchwork_12226 that come from known issues:
### IGT changes ###
#### Issues hit ####
* igt@i915_selftest@live_hangcheck:
- fi-kbl-r: PASS -> DMESG-FAIL [fdo#107860]
- fi-kbl-7567u: PASS -> DMESG-FAIL [fdo#107860]
- fi-kbl-x1275: PASS -> DMESG-FAIL [fdo#107860]
- fi-kbl-7500u: PASS -> DMESG-FAIL [fdo#107860]
- fi-kbl-8809g: PASS -> DMESG-FAIL [fdo#107860]
- fi-kbl-7560u: PASS -> DMESG-FAIL [fdo#107860]
* igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b:
- fi-byt-clapper: PASS -> FAIL [fdo#103191] / [fdo#107362]
#### Possible fixes ####
* igt@gem_exec_suspend@basic-s3:
- {fi-icl-u2}: FAIL [fdo#103375] -> PASS
- fi-blb-e6850: INCOMPLETE [fdo#107718] -> PASS
* igt@i915_selftest@live_evict:
- fi-bsw-kefka: DMESG-WARN [fdo#107709] -> PASS
* igt@i915_selftest@live_execlists:
- fi-apl-guc: INCOMPLETE [fdo#103927] -> PASS
* igt@kms_chamelium@hdmi-hpd-fast:
- fi-kbl-7500u: FAIL [fdo#109485] -> PASS
* igt@kms_pipe_crc_basic@nonblocking-crc-pipe-a-frame-sequence:
- fi-byt-clapper: FAIL [fdo#103191] / [fdo#107362] -> PASS
* igt@pm_rpm@basic-pci-d3-state:
- fi-byt-j1900: {SKIP} [fdo#109271] -> PASS
* igt@pm_rpm@basic-rte:
- fi-byt-j1900: FAIL [fdo#108800] -> PASS
{name}: This element is suppressed. This means it is ignored when computing
the status of the difference (SUCCESS, WARNING, or FAILURE).
[fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
[fdo#103375]: https://bugs.freedesktop.org/show_bug.cgi?id=103375
[fdo#103927]: https://bugs.freedesktop.org/show_bug.cgi?id=103927
[fdo#105998]: https://bugs.freedesktop.org/show_bug.cgi?id=105998
[fdo#107362]: https://bugs.freedesktop.org/show_bug.cgi?id=107362
[fdo#107709]: https://bugs.freedesktop.org/show_bug.cgi?id=107709
[fdo#107718]: https://bugs.freedesktop.org/show_bug.cgi?id=107718
[fdo#107860]: https://bugs.freedesktop.org/show_bug.cgi?id=107860
[fdo#108569]: https://bugs.freedesktop.org/show_bug.cgi?id=108569
[fdo#108800]: https://bugs.freedesktop.org/show_bug.cgi?id=108800
[fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
[fdo#109276]: https://bugs.freedesktop.org/show_bug.cgi?id=109276
[fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
[fdo#109284]: https://bugs.freedesktop.org/show_bug.cgi?id=109284
[fdo#109285]: https://bugs.freedesktop.org/show_bug.cgi?id=109285
[fdo#109289]: https://bugs.freedesktop.org/show_bug.cgi?id=109289
[fdo#109294]: https://bugs.freedesktop.org/show_bug.cgi?id=109294
[fdo#109315]: https://bugs.freedesktop.org/show_bug.cgi?id=109315
[fdo#109485]: https://bugs.freedesktop.org/show_bug.cgi?id=109485
[fdo#109527]: https://bugs.freedesktop.org/show_bug.cgi?id=109527
[fdo#109528]: https://bugs.freedesktop.org/show_bug.cgi?id=109528
[fdo#109530]: https://bugs.freedesktop.org/show_bug.cgi?id=109530
Participating hosts (48 -> 44)
------------------------------
Additional (2): fi-icl-y fi-pnv-d510
Missing (6): fi-kbl-soraka fi-ilk-m540 fi-hsw-4200u fi-bsw-cyan fi-ctg-p8600 fi-bdw-samus
Build changes
-------------
* Linux: CI_DRM_5608 -> Patchwork_12226
CI_DRM_5608: a6301f3b868a688c97ae63d9ad4cb69ff62596bb @ git://anongit.freedesktop.org/gfx-ci/linux
IGT_4830: 2fa3de65d6a29bfe0f86da391c7da90791a2a767 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
Patchwork_12226: fc54b0e0ee61cb1572c3eaf253eff959b1c55e97 @ git://anongit.freedesktop.org/gfx-ci/linux
== Linux commits ==
fc54b0e0ee61 drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset()
3473db45d1ed drm/i915/selftests: Exercise resetting during non-user payloads
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12226/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: ✗ Fi.CI.BAT: failure for series starting with [1/2] drm/i915/selftests: Exercise resetting during non-user payloads
2019-02-15 18:11 ` ✗ Fi.CI.BAT: failure " Patchwork
@ 2019-02-15 18:41 ` Chris Wilson
0 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2019-02-15 18:41 UTC (permalink / raw)
To: Patchwork, intel-gfx
Quoting Patchwork (2019-02-15 18:11:23)
> == Series Details ==
>
> Series: series starting with [1/2] drm/i915/selftests: Exercise resetting during non-user payloads
> URL : https://patchwork.freedesktop.org/series/56716/
> State : failure
>
> == Summary ==
>
> CI Bug Log - changes from CI_DRM_5608 -> Patchwork_12226
> ====================================================
>
> Summary
> -------
>
> **FAILURE**
>
> Serious unknown changes coming with Patchwork_12226 absolutely need to be
> verified manually.
>
> If you think the reported changes have nothing to do with the changes
> introduced in Patchwork_12226, please notify your bug team to allow them
> to document this new failure mode, which will reduce false positives in CI.
>
> External URL: https://patchwork.freedesktop.org/api/1.0/series/56716/revisions/1/mbox/
>
> Possible new issues
> -------------------
>
> Here are the unknown changes that may have been introduced in Patchwork_12226:
>
> ### IGT changes ###
>
> #### Possible regressions ####
>
> * igt@i915_selftest@live_hangcheck:
> - fi-bdw-gvtdvm: PASS -> DMESG-FAIL
> - fi-bdw-5557u: PASS -> DMESG-FAIL
> - fi-glk-j4005: PASS -> DMESG-FAIL
> - fi-cfl-8109u: PASS -> DMESG-FAIL
> - fi-cfl-8700k: PASS -> DMESG-FAIL
> - fi-skl-iommu: PASS -> DMESG-FAIL
> - fi-skl-gvtdvm: PASS -> DMESG-FAIL
> - fi-bxt-j4205: PASS -> DMESG-FAIL
> - fi-skl-6600u: PASS -> DMESG-FAIL
> - fi-bsw-n3050: PASS -> DMESG-FAIL
> - fi-skl-6770hq: PASS -> DMESG-FAIL
> - fi-skl-6700k2: PASS -> DMESG-FAIL
> - fi-skl-6260u: PASS -> DMESG-FAIL
That does actually appear to be a genuine bug with us not writing the
global_seqno (context seqno is fine), and has occurred in the wild
https://bugs.freedesktop.org/show_bug.cgi?id=109606
Removal of global_seqno fixes \o/ (at least locally)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2019-02-15 18:41 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-02-15 10:27 [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads Chris Wilson
2019-02-15 10:27 ` [PATCH 2/2] drm/i915/selftests: Drop unnecessary struct_mutex around i915_reset() Chris Wilson
2019-02-15 15:21 ` Mika Kuoppala
2019-02-15 15:28 ` Chris Wilson
2019-02-15 15:42 ` Mika Kuoppala
2019-02-15 15:18 ` [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads Mika Kuoppala
2019-02-15 15:33 ` Chris Wilson
2019-02-15 17:35 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] " Patchwork
2019-02-15 18:11 ` ✗ Fi.CI.BAT: failure " Patchwork
2019-02-15 18:41 ` Chris Wilson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox