From mboxrd@z Thu Jan  1 00:00:00 1970
From: Ville =?iso-8859-1?Q?Syrj=E4l=E4?= <ville.syrjala@linux.intel.com>
Subject: Re: [PATCH] drm/i915: Convert hangcheck from a timer
 into a delayed work item
Date: Thu, 4 Sep 2014 18:25:03 +0300
Message-ID: <20140904152503.GM4193@intel.com>
References: <1409832275-14943-1-git-send-email-chris@chris-wilson.co.uk>
 <1409843342-437-1-git-send-email-chris@chris-wilson.co.uk>
Mime-Version: 1.0
Content-Type: text/plain; charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable
Return-path: <intel-gfx-bounces@lists.freedesktop.org>
Received: from mga11.intel.com (mga11.intel.com [192.55.52.93])
 by gabe.freedesktop.org (Postfix) with ESMTP id C365A6E7A9
 for <intel-gfx@lists.freedesktop.org>; Thu,  4 Sep 2014 08:26:05 -0700 (PDT)
Content-Disposition: inline
In-Reply-To: <1409843342-437-1-git-send-email-chris@chris-wilson.co.uk>
List-Unsubscribe: <http://lists.freedesktop.org/mailman/options/intel-gfx>,
 <mailto:intel-gfx-request@lists.freedesktop.org?subject=unsubscribe>
List-Archive: <http://lists.freedesktop.org/archives/intel-gfx>
List-Post: <mailto:intel-gfx@lists.freedesktop.org>
List-Help: <mailto:intel-gfx-request@lists.freedesktop.org?subject=help>
List-Subscribe: <http://lists.freedesktop.org/mailman/listinfo/intel-gfx>,
 <mailto:intel-gfx-request@lists.freedesktop.org?subject=subscribe>
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
To: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jani Nikula <jani.nikula@intel.com>, Daniel Vetter <dnaiel.vetter@ffwll.chm>, intel-gfx@lists.freedesktop.org
List-Id: intel-gfx@lists.freedesktop.org

On Thu, Sep 04, 2014 at 04:09:02PM +0100, Chris Wilson wrote:
> When run as a timer, i915_hangcheck_elapsed() must adhere to all the
> rules of running in a softirq context. This is advantageous to us as we
> want to minimise the risk that a driver bug will prevent us from
> detecting a hung GPU. However, that is irrelevant if the driver bug
> prevents us from resetting and recovering. Still, it is prudent not to
> rely on mutexes inside the checker, but given the coarseness of
> dev->struct_mutex, doing so is extremely hard.
> =

> Give in and run from a work queue, i.e. outside of softirq.
> =

> v2:
> =

> The conversion does have one significant change: switching from
> mod_timer to schedule_delayed_work means that the time at which we execute
> the first hangcheck is fixed and not continually deferred by later work.
> This has the advantage of not allowing userspace to fill the ring before
> hangcheck can finally run. At the same time, it removes the ability for
> the interrupt to defer the hangcheck as well. This is sensible because
> an interrupt is only for a single engine, whereas we perform hangcheck
> globally, so whilst one ring may have hung, the other could be running
> normally and preventing the hangcheck from firing.

But doesn't this mean that we may fail to detect a hang unless more
work keeps getting submitted? E.g.:

1. execbuffer batch 1 -> queue hangcheck schedules work
2. execbuffer batch 2 -> queue hangcheck does nothing
3. execbuffer batch 3 -> queue hangcheck does nothing
4. hangcheck expires and sees progress up to batch 2 -> everything is fine
5. batch 3 hangs

> =

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Jani Nikula <jani.nikula@intel.com>
> Cc: Daniel Vetter <dnaiel.vetter@ffwll.chm>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.h |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c |  2 +-
>  drivers/gpu/drm/i915/i915_irq.c | 16 ++++++++--------
>  5 files changed, 12 insertions(+), 12 deletions(-)
> =

> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_=
dma.c
> index 6502ae2d7b7d..6a8e71cb2be8 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
>  	}
>  =

>  	/* Free error state after interrupts are fully disabled. */
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	cancel_work_sync(&dev_priv->gpu_error.work);
>  	i915_destroy_error_state(dev);
>  =

> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_=
drv.c
> index f7cc6a9c14fd..ea9224a977c1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *dev=
ice)
>  		return ret;
>  	}
>  =

> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	intel_uncore_forcewake_reset(dev, false);
>  	dev_priv->pm.suspended =3D true;
>  =

> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_=
drv.h
> index a920ca3789d8..e1f8ffcb2cf3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1209,7 +1209,7 @@ struct i915_gpu_error {
>  	/* Hang gpu twice in this window and your context gets banned */
>  #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD=
, 1000)
>  =

> -	struct timer_list hangcheck_timer;
> +	struct delayed_work hangcheck_work;
>  =

>  	/* For reset and error_state handling. */
>  	spinlock_t lock;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_=
gem.c
> index b37177afc3c0..3e80c777bf12 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
>  							     DRIVER_MODESET);
>  	mutex_unlock(&dev->struct_mutex);
>  =

> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
>  	flush_delayed_work(&dev_priv->mm.idle_work);
>  =

> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_=
irq.c
> index 2ade9efe078c..d295f546b58d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1274,7 +1274,6 @@ static void notify_ring(struct drm_device *dev,
>  	atomic_inc(&engine->interrupts);
>  =

>  	wake_up_all(&engine->irq_queue);
> -	i915_queue_hangcheck(dev);
>  }
>  =

>  static void vlv_c0_read(struct drm_i915_private *dev_priv,
> @@ -3189,9 +3188,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 a=
cthd)
>   * we kick the ring. If we see no progress on three subsequent calls
>   * we assume chip is wedged and try to fix it by resetting the chip.
>   */
> -static void i915_hangcheck_elapsed(unsigned long data)
> +static void i915_hangcheck_elapsed(struct work_struct *work)
>  {
> -	struct drm_i915_private *dev_priv =3D (struct drm_i915_private *)data;
> +	struct drm_i915_private *dev_priv =3D
> +		container_of(work, typeof(*dev_priv),
> +			     gpu_error.hangcheck_work.work);
>  	struct intel_engine_cs *engine;
>  	int i;
>  	int busy_count =3D 0, rings_hung =3D 0;
> @@ -3312,8 +3313,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
>  	if (!i915_module.enable_hangcheck)
>  		return;
>  =

> -	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
> -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> +	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
> +			      round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES));
>  }
>  =

>  static void ibx_irq_reset(struct drm_device *dev)
> @@ -4615,9 +4616,8 @@ void intel_irq_init(struct drm_device *dev)
>  	else
>  		dev_priv->rps.pm_events =3D GEN6_PM_RPS_EVENTS;
>  =

> -	setup_timer(&dev_priv->gpu_error.hangcheck_timer,
> -		    i915_hangcheck_elapsed,
> -		    (unsigned long) dev_priv);
> +	INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
> +			  i915_hangcheck_elapsed);
>  	INIT_DELAYED_WORK(&dev_priv->hotplug_reenable_work,
>  			  intel_hpd_irq_reenable);
>  =

> -- =

> 2.1.0
> =

> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- =

Ville Syrj=E4l=E4
Intel OTC