From mboxrd@z Thu Jan  1 00:00:00 1970
From: Ville =?iso-8859-1?Q?Syrj=E4l=E4?= <ville.syrjala@linux.intel.com>
Subject: Re: [PATCH 1/2] drm/i915/bdw: Apply workarounds in
 render ring init function
Date: Tue, 26 Aug 2014 17:37:05 +0300
Message-ID: <20140826143705.GN4193@intel.com>
References: <1409060691-4916-1-git-send-email-arun.siluvery@linux.intel.com>
 <1409060691-4916-2-git-send-email-arun.siluvery@linux.intel.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable
Return-path: <intel-gfx-bounces@lists.freedesktop.org>
Received: from mga11.intel.com (mga11.intel.com [192.55.52.93])
 by gabe.freedesktop.org (Postfix) with ESMTP id A67266E08C
 for <intel-gfx@lists.freedesktop.org>; Tue, 26 Aug 2014 07:39:50 -0700 (PDT)
Content-Disposition: inline
In-Reply-To: <1409060691-4916-2-git-send-email-arun.siluvery@linux.intel.com>
List-Unsubscribe: <http://lists.freedesktop.org/mailman/options/intel-gfx>,
 <mailto:intel-gfx-request@lists.freedesktop.org?subject=unsubscribe>
List-Archive: <http://lists.freedesktop.org/archives/intel-gfx>
List-Post: <mailto:intel-gfx@lists.freedesktop.org>
List-Help: <mailto:intel-gfx-request@lists.freedesktop.org?subject=help>
List-Subscribe: <http://lists.freedesktop.org/mailman/listinfo/intel-gfx>,
 <mailto:intel-gfx-request@lists.freedesktop.org?subject=subscribe>
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>
To: Arun Siluvery <arun.siluvery@linux.intel.com>
Cc: intel-gfx@lists.freedesktop.org
List-Id: intel-gfx@lists.freedesktop.org

On Tue, Aug 26, 2014 at 02:44:50PM +0100, Arun Siluvery wrote:
> For BDW workarounds are currently initialized in init_clock_gating() but
> they are lost during reset, suspend/resume etc; this patch moves the WAs
> that are part of register state context to render ring init fn otherwise
> default context ends up with incorrect values as they don't get initializ=
ed
> until init_clock_gating fn.
> =

> v2: Add workarounds to golden render state
> This method has its own issues, first of all this is different for
> each gen and it is generated using a tool so adding new workaround
> and mainitaining them across gens is not a straightforward process.
> =

> v3: Use LRIs to emit these workarounds (Ville)
> Instead of modifying the golden render state the same LRIs are
> emitted from within the driver.
> =

> v4: Use abstract name when exporting gen specific routines (Chris)
> =

> For: VIZ-4092
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>

This one looks good as far as I'm concerned.
Reviewed-by: Ville Syrj=E4l=E4 <ville.syrjala@linux.intel.com>

Do you plan to give other platforms the same treatment? We need at least
CHV converted ASAP. But if you don't have a test machine I can take care
of that myself.

> ---
>  drivers/gpu/drm/i915/i915_gem_context.c |  6 +++
>  drivers/gpu/drm/i915/intel_pm.c         | 48 --------------------
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 79 +++++++++++++++++++++++++++=
++++++
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  2 +
>  4 files changed, 87 insertions(+), 48 deletions(-)
> =

> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i9=
15/i915_gem_context.c
> index 9683e62..0a9bb0e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -631,20 +631,26 @@ static int do_switch(struct intel_engine_cs *ring,
>  	}
>  =

>  	uninitialized =3D !to->legacy_hw_ctx.initialized && from =3D=3D NULL;
>  	to->legacy_hw_ctx.initialized =3D true;
>  =

>  done:
>  	i915_gem_context_reference(to);
>  	ring->last_context =3D to;
>  =

>  	if (uninitialized) {
> +		if (ring->init_context) {
> +			ret =3D ring->init_context(ring);
> +			if (ret)
> +				DRM_ERROR("ring init context: %d\n", ret);
> +		}
> +
>  		ret =3D i915_gem_render_state_init(ring);
>  		if (ret)
>  			DRM_ERROR("init render state: %d\n", ret);
>  	}
>  =

>  	return 0;
>  =

>  unpin_out:
>  	if (ring->id =3D=3D RCS)
>  		i915_gem_object_ggtt_unpin(to->legacy_hw_ctx.rcs_state);
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel=
_pm.c
> index c8f744c..668acd9 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -5507,101 +5507,53 @@ static void gen8_init_clock_gating(struct drm_de=
vice *dev)
>  	struct drm_i915_private *dev_priv =3D dev->dev_private;
>  	enum pipe pipe;
>  =

>  	I915_WRITE(WM3_LP_ILK, 0);
>  	I915_WRITE(WM2_LP_ILK, 0);
>  	I915_WRITE(WM1_LP_ILK, 0);
>  =

>  	/* FIXME(BDW): Check all the w/a, some might only apply to
>  	 * pre-production hw. */
>  =

> -	/* WaDisablePartialInstShootdown:bdw */
> -	I915_WRITE(GEN8_ROW_CHICKEN,
> -		   _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
> -
> -	/* WaDisableThreadStallDopClockGating:bdw */
> -	/* FIXME: Unclear whether we really need this on production bdw. */
> -	I915_WRITE(GEN8_ROW_CHICKEN,
> -		   _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
>  =

> -	/*
> -	 * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
> -	 * pre-production hardware
> -	 */
> -	I915_WRITE(HALF_SLICE_CHICKEN3,
> -		   _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
> -	I915_WRITE(HALF_SLICE_CHICKEN3,
> -		   _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
>  	I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
>  =

>  	I915_WRITE(_3D_CHICKEN3,
>  		   _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
>  =

> -	I915_WRITE(COMMON_SLICE_CHICKEN2,
> -		   _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
> -
> -	I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
> -		   _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
> -
> -	/* WaDisableDopClockGating:bdw May not be needed for production */
> -	I915_WRITE(GEN7_ROW_CHICKEN2,
> -		   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
>  =

>  	/* WaSwitchSolVfFArbitrationPriority:bdw */
>  	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
>  =

>  	/* WaPsrDPAMaskVBlankInSRD:bdw */
>  	I915_WRITE(CHICKEN_PAR1_1,
>  		   I915_READ(CHICKEN_PAR1_1) | DPA_MASK_VBLANK_SRD);
>  =

>  	/* WaPsrDPRSUnmaskVBlankInSRD:bdw */
>  	for_each_pipe(pipe) {
>  		I915_WRITE(CHICKEN_PIPESL_1(pipe),
>  			   I915_READ(CHICKEN_PIPESL_1(pipe)) |
>  			   BDW_DPRS_MASK_VBLANK_SRD);
>  	}
>  =

> -	/* Use Force Non-Coherent whenever executing a 3D context. This is a
> -	 * workaround for for a possible hang in the unlikely event a TLB
> -	 * invalidation occurs during a PSD flush.
> -	 */
> -	I915_WRITE(HDC_CHICKEN0,
> -		   I915_READ(HDC_CHICKEN0) |
> -		   _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
> -
>  	/* WaVSRefCountFullforceMissDisable:bdw */
>  	/* WaDSRefCountFullforceMissDisable:bdw */
>  	I915_WRITE(GEN7_FF_THREAD_MODE,
>  		   I915_READ(GEN7_FF_THREAD_MODE) &
>  		   ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
>  =

> -	/*
> -	 * BSpec recommends 8x4 when MSAA is used,
> -	 * however in practice 16x4 seems fastest.
> -	 *
> -	 * Note that PS/WM thread counts depend on the WIZ hashing
> -	 * disable bit, which we don't touch here, but it's good
> -	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
> -	 */
> -	I915_WRITE(GEN7_GT_MODE,
> -		   GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
> -
>  	I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
>  		   _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
>  =

>  	/* WaDisableSDEUnitClockGating:bdw */
>  	I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
>  		   GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
> -
> -	/* Wa4x4STCOptimizationDisable:bdw */
> -	I915_WRITE(CACHE_MODE_1,
> -		   _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
>  }
>  =

>  static void haswell_init_clock_gating(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv =3D dev->dev_private;
>  =

>  	ilk_init_lp_watermarks(dev);
>  =

>  	/* L3 caching of data atomics doesn't work -- disable it. */
>  	I915_WRITE(HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i9=
15/intel_ringbuffer.c
> index 13543f8..4146582 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -643,20 +643,98 @@ intel_init_pipe_control(struct intel_engine_cs *rin=
g)
>  	return 0;
>  =

>  err_unpin:
>  	i915_gem_object_ggtt_unpin(ring->scratch.obj);
>  err_unref:
>  	drm_gem_object_unreference(&ring->scratch.obj->base);
>  err:
>  	return ret;
>  }
>  =

> +static inline void intel_ring_emit_wa(struct intel_engine_cs *ring,
> +				       u32 addr, u32 value)
> +{
> +	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
> +	intel_ring_emit(ring, addr);
> +	intel_ring_emit(ring, value);
> +}
> +
> +static int gen8_init_workarounds(struct intel_engine_cs *ring)
> +{
> +	int ret;
> +
> +	/*
> +	 * workarounds applied in this fn are part of register state context,
> +	 * they need to be re-initialized followed by gpu reset, suspend/resume,
> +	 * module reload.
> +	 */
> +
> +	/*
> +	 * update the number of dwords required based on the
> +	 * actual number of workarounds applied
> +	 */
> +	ret =3D intel_ring_begin(ring, 24);
> +	if (ret)
> +		return ret;
> +
> +	/* WaDisablePartialInstShootdown:bdw */
> +	/* WaDisableThreadStallDopClockGating:bdw */
> +	/* FIXME: Unclear whether we really need this on production bdw. */
> +	intel_ring_emit_wa(ring, GEN8_ROW_CHICKEN,
> +			   _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE
> +					     | STALL_DOP_GATING_DISABLE));
> +
> +	/* WaDisableDopClockGating:bdw May not be needed for production */
> +	intel_ring_emit_wa(ring, GEN7_ROW_CHICKEN2,
> +			   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
> +
> +	/*
> +	 * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
> +	 * pre-production hardware
> +	 */
> +	intel_ring_emit_wa(ring, HALF_SLICE_CHICKEN3,
> +			   _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
> +					      | GEN8_SAMPLER_POWER_BYPASS_DIS));
> +
> +	intel_ring_emit_wa(ring, GEN7_HALF_SLICE_CHICKEN1,
> +			   _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
> +
> +	intel_ring_emit_wa(ring, COMMON_SLICE_CHICKEN2,
> +			   _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
> +
> +	/* Use Force Non-Coherent whenever executing a 3D context. This is a
> +	 * workaround for for a possible hang in the unlikely event a TLB
> +	 * invalidation occurs during a PSD flush.
> +	 */
> +	intel_ring_emit_wa(ring, HDC_CHICKEN0,
> +			   _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
> +
> +	/* Wa4x4STCOptimizationDisable:bdw */
> +	intel_ring_emit_wa(ring, CACHE_MODE_1,
> +			   _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
> +
> +	/*
> +	 * BSpec recommends 8x4 when MSAA is used,
> +	 * however in practice 16x4 seems fastest.
> +	 *
> +	 * Note that PS/WM thread counts depend on the WIZ hashing
> +	 * disable bit, which we don't touch here, but it's good
> +	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
> +	 */
> +	intel_ring_emit_wa(ring, GEN7_GT_MODE,
> +			   GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
> +
> +	intel_ring_advance(ring);
> +
> +	return 0;
> +}
> +
>  static int init_render_ring(struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev =3D ring->dev;
>  	struct drm_i915_private *dev_priv =3D dev->dev_private;
>  	int ret =3D init_ring_common(ring);
>  	if (ret)
>  		return ret;
>  =

>  	/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
>  	if (INTEL_INFO(dev)->gen >=3D 4 && INTEL_INFO(dev)->gen < 7)
> @@ -2128,20 +2206,21 @@ int intel_init_render_ring_buffer(struct drm_devi=
ce *dev)
>  				i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
>  				ret =3D i915_gem_obj_ggtt_pin(obj, 0, PIN_NONBLOCK);
>  				if (ret !=3D 0) {
>  					drm_gem_object_unreference(&obj->base);
>  					DRM_ERROR("Failed to pin semaphore bo. Disabling semaphores\n");
>  					i915.semaphores =3D 0;
>  				} else
>  					dev_priv->semaphore_obj =3D obj;
>  			}
>  		}
> +		ring->init_context =3D gen8_init_workarounds;
>  		ring->add_request =3D gen6_add_request;
>  		ring->flush =3D gen8_render_ring_flush;
>  		ring->irq_get =3D gen8_ring_get_irq;
>  		ring->irq_put =3D gen8_ring_put_irq;
>  		ring->irq_enable_mask =3D GT_RENDER_USER_INTERRUPT;
>  		ring->get_seqno =3D gen6_ring_get_seqno;
>  		ring->set_seqno =3D ring_set_seqno;
>  		if (i915_semaphore_is_enabled(dev)) {
>  			WARN_ON(!dev_priv->semaphore_obj);
>  			ring->semaphore.sync_to =3D gen8_ring_sync;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i9=
15/intel_ringbuffer.h
> index 9cbf7b0..96479c8 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -141,20 +141,22 @@ struct  intel_engine_cs {
>  	struct intel_hw_status_page status_page;
>  =

>  	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
>  	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */
>  	u32		trace_irq_seqno;
>  	bool __must_check (*irq_get)(struct intel_engine_cs *ring);
>  	void		(*irq_put)(struct intel_engine_cs *ring);
>  =

>  	int		(*init)(struct intel_engine_cs *ring);
>  =

> +	int		(*init_context)(struct intel_engine_cs *ring);
> +
>  	void		(*write_tail)(struct intel_engine_cs *ring,
>  				      u32 value);
>  	int __must_check (*flush)(struct intel_engine_cs *ring,
>  				  u32	invalidate_domains,
>  				  u32	flush_domains);
>  	int		(*add_request)(struct intel_engine_cs *ring);
>  	/* Some chipsets are not quite as coherent as advertised and need
>  	 * an expensive kick to force a true read of the up-to-date seqno.
>  	 * However, the up-to-date seqno is not always required and the last
>  	 * seen value is good enough. Note that the seqno will always be
> -- =

> 2.0.4
> =

> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- =

Ville Syrj=E4l=E4
Intel OTC