From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Michael H. Nguyen" Subject: Re: [PATCH] drm/i915/bdw: Initialize workarounds in render ring init fn. Date: Fri, 15 Aug 2014 16:26:26 -0700 Message-ID: <53EE9722.5020606@intel.com> References: <1408035060-26456-1-git-send-email-arun.siluvery@linux.intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii"; Format="flowed" Content-Transfer-Encoding: 7bit Return-path: Received: from mga09.intel.com (mga09.intel.com [134.134.136.24]) by gabe.freedesktop.org (Postfix) with ESMTP id 1B0A76E089 for ; Fri, 15 Aug 2014 16:26:26 -0700 (PDT) In-Reply-To: <1408035060-26456-1-git-send-email-arun.siluvery@linux.intel.com> List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" To: arun.siluvery@linux.intel.com, intel-gfx@lists.freedesktop.org List-Id: intel-gfx@lists.freedesktop.org On 08/14/2014 09:51 AM, arun.siluvery@linux.intel.com wrote: > From: Arun Siluvery > > The workarounds at the moment are initialized in init_clock_gating() but > they are lost during reset, suspend/resume; this patch moves workarounds > that are part of register state context to render ring init fn otherwise > default context ends up with incorrect values as they don't get initialized > until init_clock_gating fn. This should be ok as they are not related to > display clock gating in the first place. > > v2: Add macro to generate w/a table (Daniel) > > Signed-off-by: Arun Siluvery > --- > drivers/gpu/drm/i915/i915_debugfs.c | 38 +++++++++++++++++++ > drivers/gpu/drm/i915/i915_drv.h | 32 ++++++++++++++++ > drivers/gpu/drm/i915/intel_pm.c | 50 ------------------------- > drivers/gpu/drm/i915/intel_ringbuffer.c | 66 +++++++++++++++++++++++++++++++++ > 4 files changed, 136 insertions(+), 50 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 9e737b7..fc28835 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -2406,6 +2406,43 @@ static int i915_shared_dplls_info(struct seq_file *m, void *unused) > return 0; > } > > +static int intel_wa_registers(struct seq_file *m, void *unused) > +{ > + struct drm_info_node *node = (struct drm_info_node *) m->private; > + struct drm_device *dev = node->minor->dev; > + struct drm_i915_private *dev_priv = dev->dev_private; > + int i; > + int ret; > + > + if (!dev_priv->num_wa) { > + DRM_DEBUG_DRIVER("Workaround table not available\n"); > + return -EINVAL; > + } > + > + if (dev_priv->num_wa > I915_MAX_WA_REGS) > + dev_priv->num_wa = I915_MAX_WA_REGS; > + > + ret = mutex_lock_interruptible(&dev->struct_mutex); > + if (ret) > + return ret; > + > + intel_runtime_pm_get(dev_priv); > + > + seq_printf(m, "Workarounds applied: %d\n", dev_priv->num_wa); > + for (i = 0; i < dev_priv->num_wa; ++i) { > + if (dev_priv->wa_regs[i].addr) > + seq_printf(m, "0x%X: 0x%08X, mask: 0x%08X\n", > + dev_priv->wa_regs[i].addr, > + I915_READ(dev_priv->wa_regs[i].addr), > + dev_priv->wa_regs[i].mask); > + } > + > + intel_runtime_pm_put(dev_priv); > + mutex_unlock(&dev->struct_mutex); > + > + return 0; > +} > + > struct pipe_crc_info { > const char *name; > struct drm_device *dev; > @@ -3936,6 +3973,7 @@ static const struct drm_info_list i915_debugfs_list[] = { > {"i915_semaphore_status", i915_semaphore_status, 0}, > {"i915_shared_dplls_info", i915_shared_dplls_info, 0}, > {"i915_dp_mst_info", i915_dp_mst_info, 0}, > + {"intel_wa_registers", intel_wa_registers, 0}, > }; > #define I915_DEBUGFS_ENTRIES ARRAY_SIZE(i915_debugfs_list) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 18c9ad8..c4190a9 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -1222,6 +1222,24 @@ struct i915_gpu_error { > unsigned int test_irq_rings; > }; > > +/* > + * workarounds are currently applied at different places and changes > + * are being done to consolidate them at a single place hence the > + * exact count is not clear at this point; replace this with accurate > + * number based on gen later. > + */ > +#define I915_MAX_WA_REGS 16 > + > +/* > + * structure to verify workarounds status > + * mask: represent w/a bits > + */ > +struct intel_wa { > + u32 addr; > + u32 val; > + u32 mask; > +}; Can we rename this to intel_reg_wa or similar? intel_wa confused me for a second. Personally when I think about wa's, they can be simple register writes like is the case here or more complex requiring some software logic. Also, should this be defined in a userspace accessible header? I see this defined again locally in the IGT test. > + > enum modeset_restore { > MODESET_ON_LID_OPEN, > MODESET_DONE, > @@ -1422,6 +1440,9 @@ struct drm_i915_private { > struct drm_i915_gem_object *semaphore_obj; > uint32_t last_seqno, next_seqno; > > + uint32_t num_wa; > + struct intel_wa wa_regs[I915_MAX_WA_REGS]; > + > drm_dma_handle_t *status_page_dmah; > struct resource mch_res; > > @@ -2803,6 +2824,17 @@ int vlv_freq_opcode(struct drm_i915_private *dev_priv, int val); > #define POSTING_READ(reg) (void)I915_READ_NOTRACE(reg) > #define POSTING_READ16(reg) (void)I915_READ16_NOTRACE(reg) > > +#define I915_WRITE_WA(reg, val) ({ \ > + if (dev_priv->num_wa >= 0 \ > + && dev_priv->num_wa < I915_MAX_WA_REGS) { \ > + dev_priv->wa_regs[dev_priv->num_wa].addr = reg; \ > + dev_priv->wa_regs[dev_priv->num_wa].mask \ > + = (val) & 0xFFFF; \ > + dev_priv->num_wa++; \ > + I915_WRITE(reg, val); \ > + } \ If this macro is called too many times, there will be silent failures I believe. Should we add an else case that does a WARN_ON or prints a message? This will help developers catch bugs. Thanks, Mike > + }) > + > /* "Broadcast RGB" property */ > #define INTEL_BROADCAST_RGB_AUTO 0 > #define INTEL_BROADCAST_RGB_FULL 1 > diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c > index 1ddd4df..ab64b64 100644 > --- a/drivers/gpu/drm/i915/intel_pm.c > +++ b/drivers/gpu/drm/i915/intel_pm.c > @@ -5402,38 +5402,11 @@ static void gen8_init_clock_gating(struct drm_device *dev) > /* FIXME(BDW): Check all the w/a, some might only apply to > * pre-production hw. */ > > - /* WaDisablePartialInstShootdown:bdw */ > - I915_WRITE(GEN8_ROW_CHICKEN, > - _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE)); > - > - /* WaDisableThreadStallDopClockGating:bdw */ > - /* FIXME: Unclear whether we really need this on production bdw. */ > - I915_WRITE(GEN8_ROW_CHICKEN, > - _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE)); > - > - /* > - * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for > - * pre-production hardware > - */ > - I915_WRITE(HALF_SLICE_CHICKEN3, > - _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS)); > - I915_WRITE(HALF_SLICE_CHICKEN3, > - _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS)); > I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE)); > > I915_WRITE(_3D_CHICKEN3, > _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2))); > > - I915_WRITE(COMMON_SLICE_CHICKEN2, > - _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE)); > - > - I915_WRITE(GEN7_HALF_SLICE_CHICKEN1, > - _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE)); > - > - /* WaDisableDopClockGating:bdw May not be needed for production */ > - I915_WRITE(GEN7_ROW_CHICKEN2, > - _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); > - > /* WaSwitchSolVfFArbitrationPriority:bdw */ > I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL); > > @@ -5448,41 +5421,18 @@ static void gen8_init_clock_gating(struct drm_device *dev) > BDW_DPRS_MASK_VBLANK_SRD); > } > > - /* Use Force Non-Coherent whenever executing a 3D context. This is a > - * workaround for for a possible hang in the unlikely event a TLB > - * invalidation occurs during a PSD flush. > - */ > - I915_WRITE(HDC_CHICKEN0, > - I915_READ(HDC_CHICKEN0) | > - _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT)); > - > /* WaVSRefCountFullforceMissDisable:bdw */ > /* WaDSRefCountFullforceMissDisable:bdw */ > I915_WRITE(GEN7_FF_THREAD_MODE, > I915_READ(GEN7_FF_THREAD_MODE) & > ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME)); > > - /* > - * BSpec recommends 8x4 when MSAA is used, > - * however in practice 16x4 seems fastest. > - * > - * Note that PS/WM thread counts depend on the WIZ hashing > - * disable bit, which we don't touch here, but it's good > - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). > - */ > - I915_WRITE(GEN7_GT_MODE, > - GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4); > - > I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL, > _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE)); > > /* WaDisableSDEUnitClockGating:bdw */ > I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) | > GEN8_SDEUNIT_CLOCK_GATE_DISABLE); > - > - /* Wa4x4STCOptimizationDisable:bdw */ > - I915_WRITE(CACHE_MODE_1, > - _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE)); > } > > static void haswell_init_clock_gating(struct drm_device *dev) > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > index b3d8f76..abec00c 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > @@ -598,6 +598,69 @@ err: > return ret; > } > > +static void bdw_init_workarounds(struct drm_device *dev) > +{ > + struct drm_i915_private *dev_priv = dev->dev_private; > + > + /* > + * workarounds applied in this fn are part of register state context, > + * they need to be re-initialized followed by gpu reset, suspend/resume, > + * module reload. > + */ > + dev_priv->num_wa = 0; > + memset(dev_priv->wa_regs, 0, sizeof(dev_priv->wa_regs)); > + > + /* WaDisablePartialInstShootdown:bdw */ > + /* WaDisableThreadStallDopClockGating:bdw */ > + /* FIXME: Unclear whether we really need this on production bdw. */ > + > + I915_WRITE_WA(GEN8_ROW_CHICKEN, > + _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE > + | STALL_DOP_GATING_DISABLE)); > + > + /* WaDisableDopClockGating:bdw May not be needed for production */ > + I915_WRITE_WA(GEN7_ROW_CHICKEN2, > + _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); > + > + /* > + * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for > + * pre-production hardware > + */ > + I915_WRITE_WA(HALF_SLICE_CHICKEN3, > + _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS > + | GEN8_SAMPLER_POWER_BYPASS_DIS)); > + > + I915_WRITE_WA(GEN7_HALF_SLICE_CHICKEN1, > + _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE)); > + > + /* Wa4x4STCOptimizationDisable:bdw */ > + I915_WRITE_WA(CACHE_MODE_1, > + _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE)); > + > + /* > + * BSpec recommends 8x4 when MSAA is used, > + * however in practice 16x4 seems fastest. > + * > + * Note that PS/WM thread counts depend on the WIZ hashing > + * disable bit, which we don't touch here, but it's good > + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). > + */ > + I915_WRITE_WA(GEN7_GT_MODE, > + GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4); > + > + I915_WRITE_WA(COMMON_SLICE_CHICKEN2, > + _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE)); > + > + /* Use Force Non-Coherent whenever executing a 3D context. This is a > + * workaround for for a possible hang in the unlikely event a TLB > + * invalidation occurs during a PSD flush. > + */ > + I915_WRITE_WA(HDC_CHICKEN0, > + _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT)); > + > + DRM_DEBUG_DRIVER("No. of workarounds applied: %d\n", dev_priv->num_wa); > +} > + > static int init_render_ring(struct intel_engine_cs *ring) > { > struct drm_device *dev = ring->dev; > @@ -653,6 +716,9 @@ static int init_render_ring(struct intel_engine_cs *ring) > if (HAS_L3_DPF(dev)) > I915_WRITE_IMR(ring, ~GT_PARITY_ERROR(dev)); > > + if (IS_BROADWELL(dev)) > + bdw_init_workarounds(dev); > + > return ret; > } > >