* Re: [PATCH] drm/i915: GFDT support for SNB/IVB
2013-03-01 18:32 [PATCH] drm/i915: GFDT support for SNB/IVB ville.syrjala
@ 2013-03-03 16:28 ` Daniel Vetter
2013-03-03 17:39 ` Chris Wilson
2013-03-04 14:03 ` Ville Syrjälä
2013-03-06 16:28 ` Chris Wilson
1 sibling, 2 replies; 10+ messages in thread
From: Daniel Vetter @ 2013-03-03 16:28 UTC (permalink / raw)
To: ville.syrjala; +Cc: intel-gfx
On Fri, Mar 01, 2013 at 08:32:57PM +0200, ville.syrjala@linux.intel.com wrote:
> From: Ville Syrjälä <ville.syrjala@linux.intel.com>
>
> Currently all scanout buffers must be uncached because the
> display controller doesn't snoop the LLC. SNB introduced another
> method to guarantee coherency for the display controller. It's
> called the GFDT or graphics data type.
>
> Pages that have the GFDT bit enabled in their PTEs get flushed
> all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
> issued with the "synchronize GFDT" bit set.
>
> So rather than making all scanout buffers uncached, set the GFDT
> bit in their PTEs, and modify the ring flush functions to enable
> the "synchronize GFDT" bit.
>
> On HSW the GFDT bit was removed from the PTE, and it's only present in
> surface state, so we can't really set it from the kernel. Also the docs
> state that the hardware isn't actually guaranteed to respect the GFDT
> bit. So it looks like GFDT isn't all that useful on HSW.
>
> So far I've tried this very quickly on an IVB machine, and
> it seems to be working as advertised. No idea if it does any
> good though.
>
> TODO:
> - make sure there are no missing flushes (CPU access doesn't
> respect GFDT after all).
> - measure it and see if there's some real benefit
> - maybe we can track whether "synchronize GFDT" needs to be
> issued, and skip it when possible. needs some numbers to
> determine if it's worthwile.
>
> Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Iirc when I've tried this out a while back it regressed a few benchmarks.
Chris&me suspected cache trahsing, but hard to tell without proper
instrumentation support. Chris played around with a few tricks to mark
other giant bos as uncacheable, but he couldn't find any improved
workloads.
In short I think this needs to come with decent amounts of numbers
attached, like the TODO says ;-)
The other thing was that I didn't manage to get things to work properly,
leaving some random cachline dirt on the screen. Looking at your code, you
add the gfdt flush to every ring_flush, whereas I've tried to be clever
and only flushed after batches rendering to the frontbuffer. So probably a
bug in my code, or a flush on a given ring doesn't flush out caches for
one of the other engines.
And finally a bikeshed: I'd vote for a slightly more generice unsigned
long cache_flags instead of bool gfdt.
Cheers, Daniel
> ---
> drivers/gpu/drm/i915/i915_drv.h | 18 +++--
> drivers/gpu/drm/i915/i915_gem.c | 109 ++++++++++++++++++++++++-----
> drivers/gpu/drm/i915/i915_gem_context.c | 3 +-
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 10 +--
> drivers/gpu/drm/i915/i915_gem_gtt.c | 38 ++++++----
> drivers/gpu/drm/i915/i915_reg.h | 2 +
> drivers/gpu/drm/i915/intel_display.c | 4 +-
> drivers/gpu/drm/i915/intel_overlay.c | 2 +-
> drivers/gpu/drm/i915/intel_ringbuffer.c | 25 +++++++
> 9 files changed, 167 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e95337c..537b344 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -408,7 +408,8 @@ struct i915_gtt {
> void (*gtt_insert_entries)(struct drm_device *dev,
> struct sg_table *st,
> unsigned int pg_start,
> - enum i915_cache_level cache_level);
> + enum i915_cache_level cache_level,
> + bool gfdt);
> };
> #define gtt_total_entries(gtt) ((gtt).total >> PAGE_SHIFT)
>
> @@ -429,7 +430,8 @@ struct i915_hw_ppgtt {
> void (*insert_entries)(struct i915_hw_ppgtt *ppgtt,
> struct sg_table *st,
> unsigned int pg_start,
> - enum i915_cache_level cache_level);
> + enum i915_cache_level cache_level,
> + bool gfdt);
> void (*cleanup)(struct i915_hw_ppgtt *ppgtt);
> };
>
> @@ -1169,6 +1171,7 @@ struct drm_i915_gem_object {
> unsigned int fenced_gpu_access:1;
>
> unsigned int cache_level:2;
> + unsigned int gfdt:1;
>
> unsigned int has_aliasing_ppgtt_mapping:1;
> unsigned int has_global_gtt_mapping:1;
> @@ -1310,6 +1313,10 @@ struct drm_i915_file_private {
> #define HAS_LLC(dev) (INTEL_INFO(dev)->has_llc)
> #define I915_NEED_GFX_HWS(dev) (INTEL_INFO(dev)->need_gfx_hws)
>
> +/* Only SNB and IVB have GFDT in PTEs */
> +#define HAS_GFDT(dev) (HAS_LLC(dev) && !IS_HASWELL(dev) && \
> + (IS_GEN6(dev) || IS_GEN7(dev)))
> +
> #define HAS_HW_CONTEXTS(dev) (INTEL_INFO(dev)->gen >= 6)
> #define HAS_ALIASING_PPGTT(dev) (INTEL_INFO(dev)->gen >=6 && !IS_VALLEYVIEW(dev))
>
> @@ -1639,7 +1646,8 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
> int __must_check
> i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write);
> int __must_check
> -i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
> +i915_gem_object_pin_to_display_plane(struct drm_device *dev,
> + struct drm_i915_gem_object *obj,
> u32 alignment,
> struct intel_ring_buffer *pipelined);
> int i915_gem_attach_phys_object(struct drm_device *dev,
> @@ -1681,14 +1689,14 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
> void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev);
> void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
> struct drm_i915_gem_object *obj,
> - enum i915_cache_level cache_level);
> + enum i915_cache_level cache_level, bool gfdt);
> void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
> struct drm_i915_gem_object *obj);
>
> void i915_gem_restore_gtt_mappings(struct drm_device *dev);
> int __must_check i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj);
> void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
> - enum i915_cache_level cache_level);
> + enum i915_cache_level cache_level, bool gfdt);
> void i915_gem_gtt_unbind_object(struct drm_i915_gem_object *obj);
> void i915_gem_gtt_finish_object(struct drm_i915_gem_object *obj);
> void i915_gem_init_global_gtt(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 8413ffc..c635430 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3035,8 +3035,13 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj)
> * and flushes when moving into and out of the RENDER domain, correct
> * snooping behaviour occurs naturally as the result of our domain
> * tracking.
> + *
> + * Also flush all pinned objects with GFDT enabled. Such objects
> + * may be used for scanout, and the CPU doesn't know anything
> + * about GFDT.
> */
> - if (obj->cache_level != I915_CACHE_NONE)
> + if (obj->cache_level != I915_CACHE_NONE &&
> + !(obj->gfdt && obj->pin_count))
> return;
>
> trace_i915_gem_object_clflush(obj);
> @@ -3154,6 +3159,11 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> struct drm_device *dev = obj->base.dev;
> drm_i915_private_t *dev_priv = dev->dev_private;
> int ret;
> + bool gfdt = obj->gfdt;
> +
> + /* Clear GFDT when moving to uncached */
> + if (cache_level == I915_CACHE_NONE)
> + gfdt = false;
>
> if (obj->cache_level == cache_level)
> return 0;
> @@ -3187,10 +3197,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> }
>
> if (obj->has_global_gtt_mapping)
> - i915_gem_gtt_bind_object(obj, cache_level);
> + i915_gem_gtt_bind_object(obj, cache_level, gfdt);
> if (obj->has_aliasing_ppgtt_mapping)
> i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
> - obj, cache_level);
> + obj, cache_level, gfdt);
>
> obj->gtt_space->color = cache_level;
> }
> @@ -3219,6 +3229,65 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> }
>
> obj->cache_level = cache_level;
> + obj->gfdt = gfdt;
> + i915_gem_verify_gtt(dev);
> + return 0;
> +}
> +
> +static int i915_gem_object_set_gfdt(struct drm_i915_gem_object *obj,
> + bool gfdt)
> +{
> + struct drm_device *dev = obj->base.dev;
> + drm_i915_private_t *dev_priv = dev->dev_private;
> + int ret;
> +
> + if (!HAS_GFDT(dev))
> + return -ENODEV;
> +
> + if (obj->gfdt == gfdt)
> + return 0;
> +
> + /* no point in setting GFDT on uncached object */
> + if (obj->cache_level == I915_CACHE_NONE)
> + return -EINVAL;
> +
> + if (obj->gtt_space) {
> + ret = i915_gem_object_finish_gpu(obj);
> + if (ret)
> + return ret;
> +
> + i915_gem_object_finish_gtt(obj);
> +
> + if (obj->has_global_gtt_mapping)
> + i915_gem_gtt_bind_object(obj, obj->cache_level, gfdt);
> + if (obj->has_aliasing_ppgtt_mapping)
> + i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
> + obj, obj->cache_level, gfdt);
> + }
> +
> + if (gfdt) {
> + u32 old_read_domains, old_write_domain;
> +
> + /*
> + * Since we're just now enabling GFDT there's most
> + * likely dirty data in the LLC. It must be flushed
> + * to memory the old fashined way.
> + */
> + WARN_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
> + WARN_ON(obj->base.read_domains & ~I915_GEM_DOMAIN_CPU);
> +
> + old_read_domains = obj->base.read_domains;
> + old_write_domain = obj->base.write_domain;
> +
> + obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> + obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +
> + trace_i915_gem_object_change_domain(obj,
> + old_read_domains,
> + old_write_domain);
> + }
> +
> + obj->gfdt = gfdt;
> i915_gem_verify_gtt(dev);
> return 0;
> }
> @@ -3291,7 +3360,8 @@ unlock:
> * any flushes to be pipelined (for pageflips).
> */
> int
> -i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
> +i915_gem_object_pin_to_display_plane(struct drm_device *dev,
> + struct drm_i915_gem_object *obj,
> u32 alignment,
> struct intel_ring_buffer *pipelined)
> {
> @@ -3304,18 +3374,23 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
> return ret;
> }
>
> - /* The display engine is not coherent with the LLC cache on gen6. As
> - * a result, we make sure that the pinning that is about to occur is
> - * done with uncached PTEs. This is lowest common denominator for all
> - * chipsets.
> - *
> - * However for gen6+, we could do better by using the GFDT bit instead
> - * of uncaching, which would allow us to flush all the LLC-cached data
> - * with that bit in the PTE to main memory with just one PIPE_CONTROL.
> + /*
> + * Try to set the GFDT bit instead of uncaching. This allow us to flush
> + * all the LLC-cached data with that bit in the PTE to main memory with
> + * just one PIPE_CONTROL.
> */
> - ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
> - if (ret)
> - return ret;
> + ret = i915_gem_object_set_gfdt(obj, true);
> + if (ret) {
> + /*
> + * The display engine is not coherent with the LLC cache on gen6. As
> + * a result, we make sure that the pinning that is about to occur is
> + * done with uncached PTEs. This is lowest common denominator for all
> + * chipsets.
> + */
> + ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
> + if (ret)
> + return ret;
> + }
>
> /* As the user may map the buffer once pinned in the display plane
> * (e.g. libkms for the bootup splash), we have to ensure that we
> @@ -3499,11 +3574,11 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj,
> return ret;
>
> if (!dev_priv->mm.aliasing_ppgtt)
> - i915_gem_gtt_bind_object(obj, obj->cache_level);
> + i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
> }
>
> if (!obj->has_global_gtt_mapping && map_and_fenceable)
> - i915_gem_gtt_bind_object(obj, obj->cache_level);
> + i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
>
> obj->pin_count++;
> obj->pin_mappable |= map_and_fenceable;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 21177d9..7244c0c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -382,7 +382,8 @@ static int do_switch(struct i915_hw_context *to)
> }
>
> if (!to->obj->has_global_gtt_mapping)
> - i915_gem_gtt_bind_object(to->obj, to->obj->cache_level);
> + i915_gem_gtt_bind_object(to->obj, to->obj->cache_level,
> + to->obj->gfdt);
>
> if (!to->is_initialized || is_default_context(to))
> hw_flags |= MI_RESTORE_INHIBIT;
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 2f2daeb..28a3c42 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -197,7 +197,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
> !target_i915_obj->has_global_gtt_mapping)) {
> i915_gem_gtt_bind_object(target_i915_obj,
> - target_i915_obj->cache_level);
> + target_i915_obj->cache_level,
> + target_i915_obj->gfdt);
> }
>
> /* Validate that the target is in a valid r/w GPU domain */
> @@ -432,7 +433,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
> /* Ensure ppgtt mapping exists if needed */
> if (dev_priv->mm.aliasing_ppgtt && !obj->has_aliasing_ppgtt_mapping) {
> i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
> - obj, obj->cache_level);
> + obj, obj->cache_level, obj->gfdt);
>
> obj->has_aliasing_ppgtt_mapping = 1;
> }
> @@ -449,7 +450,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>
> if (entry->flags & EXEC_OBJECT_NEEDS_GTT &&
> !obj->has_global_gtt_mapping)
> - i915_gem_gtt_bind_object(obj, obj->cache_level);
> + i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
>
> return 0;
> }
> @@ -1011,7 +1012,8 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> * hsw should have this fixed, but let's be paranoid and do it
> * unconditionally for now. */
> if (flags & I915_DISPATCH_SECURE && !batch_obj->has_global_gtt_mapping)
> - i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level);
> + i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level,
> + batch_obj->gfdt);
>
> ret = i915_gem_execbuffer_move_to_gpu(ring, &eb->objects);
> if (ret)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 926a1e2..3c7e48b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -42,15 +42,20 @@ typedef uint32_t gtt_pte_t;
> #define HSW_PTE_UNCACHED (0)
> #define GEN6_PTE_CACHE_LLC (2 << 1)
> #define GEN6_PTE_CACHE_LLC_MLC (3 << 1)
> +#define GEN6_PTE_GFDT (1 << 3)
> #define GEN6_PTE_ADDR_ENCODE(addr) GEN6_GTT_ADDR_ENCODE(addr)
>
> static inline gtt_pte_t gen6_pte_encode(struct drm_device *dev,
> dma_addr_t addr,
> - enum i915_cache_level level)
> + enum i915_cache_level level,
> + bool gfdt)
> {
> gtt_pte_t pte = GEN6_PTE_VALID;
> pte |= GEN6_PTE_ADDR_ENCODE(addr);
>
> + if (gfdt && HAS_GFDT(dev))
> + pte |= GEN6_PTE_GFDT;
> +
> switch (level) {
> case I915_CACHE_LLC_MLC:
> /* Haswell doesn't set L3 this way */
> @@ -89,7 +94,7 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
>
> scratch_pte = gen6_pte_encode(ppgtt->dev,
> ppgtt->scratch_page_dma_addr,
> - I915_CACHE_LLC);
> + I915_CACHE_LLC, false);
>
> while (num_entries) {
> last_pte = first_pte + num_entries;
> @@ -112,7 +117,8 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
> static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
> struct sg_table *pages,
> unsigned first_entry,
> - enum i915_cache_level cache_level)
> + enum i915_cache_level cache_level,
> + bool gfdt)
> {
> gtt_pte_t *pt_vaddr;
> unsigned act_pd = first_entry / I915_PPGTT_PT_ENTRIES;
> @@ -133,7 +139,7 @@ static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
> for (j = first_pte; j < I915_PPGTT_PT_ENTRIES; j++) {
> page_addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
> pt_vaddr[j] = gen6_pte_encode(ppgtt->dev, page_addr,
> - cache_level);
> + cache_level, gfdt);
>
> /* grab the next page */
> if (++m == segment_len) {
> @@ -279,11 +285,12 @@ void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev)
>
> void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
> struct drm_i915_gem_object *obj,
> - enum i915_cache_level cache_level)
> + enum i915_cache_level cache_level,
> + bool gfdt)
> {
> ppgtt->insert_entries(ppgtt, obj->pages,
> obj->gtt_space->start >> PAGE_SHIFT,
> - cache_level);
> + cache_level, gfdt);
> }
>
> void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
> @@ -401,7 +408,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
>
> list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
> i915_gem_clflush_object(obj);
> - i915_gem_gtt_bind_object(obj, obj->cache_level);
> + i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
> }
>
> i915_gem_chipset_flush(dev);
> @@ -429,7 +436,8 @@ int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
> static void gen6_ggtt_insert_entries(struct drm_device *dev,
> struct sg_table *st,
> unsigned int first_entry,
> - enum i915_cache_level level)
> + enum i915_cache_level level,
> + bool gfdt)
> {
> struct drm_i915_private *dev_priv = dev->dev_private;
> struct scatterlist *sg = st->sgl;
> @@ -443,7 +451,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
> len = sg_dma_len(sg) >> PAGE_SHIFT;
> for (m = 0; m < len; m++) {
> addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
> - iowrite32(gen6_pte_encode(dev, addr, level),
> + iowrite32(gen6_pte_encode(dev, addr, level, gfdt),
> >t_entries[i]);
> i++;
> }
> @@ -457,7 +465,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
> */
> if (i != 0)
> WARN_ON(readl(>t_entries[i-1])
> - != gen6_pte_encode(dev, addr, level));
> + != gen6_pte_encode(dev, addr, level, gfdt));
>
> /* This next bit makes the above posting read even more important. We
> * want to flush the TLBs only after we're certain all the PTE updates
> @@ -483,7 +491,7 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
> num_entries = max_entries;
>
> scratch_pte = gen6_pte_encode(dev, dev_priv->gtt.scratch_page_dma,
> - I915_CACHE_LLC);
> + I915_CACHE_LLC, false);
> for (i = 0; i < num_entries; i++)
> iowrite32(scratch_pte, >t_base[i]);
> readl(gtt_base);
> @@ -493,7 +501,8 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
> static void i915_ggtt_insert_entries(struct drm_device *dev,
> struct sg_table *st,
> unsigned int pg_start,
> - enum i915_cache_level cache_level)
> + enum i915_cache_level cache_level,
> + bool gfdt)
> {
> unsigned int flags = (cache_level == I915_CACHE_NONE) ?
> AGP_USER_MEMORY : AGP_USER_CACHED_MEMORY;
> @@ -511,14 +520,15 @@ static void i915_ggtt_clear_range(struct drm_device *dev,
>
>
> void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
> - enum i915_cache_level cache_level)
> + enum i915_cache_level cache_level,
> + bool gfdt)
> {
> struct drm_device *dev = obj->base.dev;
> struct drm_i915_private *dev_priv = dev->dev_private;
>
> dev_priv->gtt.gtt_insert_entries(dev, obj->pages,
> obj->gtt_space->start >> PAGE_SHIFT,
> - cache_level);
> + cache_level, gfdt);
>
> obj->has_global_gtt_mapping = 1;
> }
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index cd226c2..6aa29d5 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -242,6 +242,7 @@
> #define MI_FLUSH_DW MI_INSTR(0x26, 1) /* for GEN6 */
> #define MI_FLUSH_DW_STORE_INDEX (1<<21)
> #define MI_INVALIDATE_TLB (1<<18)
> +#define MI_SYNCHRONIZE_GFDT (1<<17)
> #define MI_FLUSH_DW_OP_STOREDW (1<<14)
> #define MI_INVALIDATE_BSD (1<<7)
> #define MI_FLUSH_DW_USE_GTT (1<<2)
> @@ -311,6 +312,7 @@
> #define PIPE_CONTROL_GLOBAL_GTT_IVB (1<<24) /* gen7+ */
> #define PIPE_CONTROL_CS_STALL (1<<20)
> #define PIPE_CONTROL_TLB_INVALIDATE (1<<18)
> +#define PIPE_CONTROL_SYNCHRONIZE_GFDT (1<<17)
> #define PIPE_CONTROL_QW_WRITE (1<<14)
> #define PIPE_CONTROL_DEPTH_STALL (1<<13)
> #define PIPE_CONTROL_WRITE_FLUSH (1<<12)
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index d912306..fc99d55 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -1968,7 +1968,7 @@ intel_pin_and_fence_fb_obj(struct drm_device *dev,
> }
>
> dev_priv->mm.interruptible = false;
> - ret = i915_gem_object_pin_to_display_plane(obj, alignment, pipelined);
> + ret = i915_gem_object_pin_to_display_plane(dev, obj, alignment, pipelined);
> if (ret)
> goto err_interruptible;
>
> @@ -6340,7 +6340,7 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
> goto fail_locked;
> }
>
> - ret = i915_gem_object_pin_to_display_plane(obj, 0, NULL);
> + ret = i915_gem_object_pin_to_display_plane(dev, obj, 0, NULL);
> if (ret) {
> DRM_ERROR("failed to move cursor bo into the GTT\n");
> goto fail_locked;
> diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
> index 67a2501..b4110a2 100644
> --- a/drivers/gpu/drm/i915/intel_overlay.c
> +++ b/drivers/gpu/drm/i915/intel_overlay.c
> @@ -695,7 +695,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay,
> if (ret != 0)
> return ret;
>
> - ret = i915_gem_object_pin_to_display_plane(new_bo, 0, NULL);
> + ret = i915_gem_object_pin_to_display_plane(dev, new_bo, 0, NULL);
> if (ret != 0)
> return ret;
>
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 1d5d613..8406247 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -234,6 +234,12 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
> * when the render cache is indeed flushed.
> */
> flags |= PIPE_CONTROL_CS_STALL;
> +
> + /* Flush GFDT out to memory */
> + if (HAS_GFDT(ring->dev)) {
> + flags |= PIPE_CONTROL_QW_WRITE;
> + flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
> + }
> }
> if (invalidate_domains) {
> flags |= PIPE_CONTROL_TLB_INVALIDATE;
> @@ -306,6 +312,13 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
> if (flush_domains) {
> flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
> flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
> +
> + /* Flush GFDT out to memory */
> + if (HAS_GFDT(ring->dev)) {
> + flags |= PIPE_CONTROL_QW_WRITE;
> + flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
> + flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
> + }
> }
> if (invalidate_domains) {
> flags |= PIPE_CONTROL_TLB_INVALIDATE;
> @@ -1557,6 +1570,12 @@ static int gen6_ring_flush(struct intel_ring_buffer *ring,
> return ret;
>
> cmd = MI_FLUSH_DW;
> +
> + /* Flush GFDT out to memory */
> + if (flush & I915_GEM_GPU_DOMAINS && HAS_GFDT(ring->dev))
> + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
> + MI_SYNCHRONIZE_GFDT;
> +
> /*
> * Bspec vol 1c.5 - video engine command streamer:
> * "If ENABLED, all TLBs will be invalidated once the flush
> @@ -1629,6 +1648,12 @@ static int blt_ring_flush(struct intel_ring_buffer *ring,
> return ret;
>
> cmd = MI_FLUSH_DW;
> +
> + /* Flush GFDT out to memory */
> + if (flush & I915_GEM_DOMAIN_RENDER && HAS_GFDT(ring->dev))
> + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
> + MI_SYNCHRONIZE_GFDT;
> +
> /*
> * Bspec vol 1c.3 - blitter engine command streamer:
> * "If ENABLED, all TLBs will be invalidated once the flush
> --
> 1.7.12.4
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
^ permalink raw reply [flat|nested] 10+ messages in thread* [PATCH] drm/i915: GFDT support for SNB/IVB
2013-03-01 18:32 [PATCH] drm/i915: GFDT support for SNB/IVB ville.syrjala
2013-03-03 16:28 ` Daniel Vetter
@ 2013-03-06 16:28 ` Chris Wilson
2013-04-04 9:01 ` Daniel Vetter
1 sibling, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2013-03-06 16:28 UTC (permalink / raw)
To: intel-gfx
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Currently all scanout buffers must be uncached because the
display controller doesn't snoop the LLC. SNB introduced another
method to guarantee coherency for the display controller. It's
called the GFDT or graphics data type.
Pages that have the GFDT bit enabled in their PTEs get flushed
all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
issued with the "synchronize GFDT" bit set.
So rather than making all scanout buffers uncached, set the GFDT
bit in their PTEs, and modify the ring flush functions to enable
the "synchronize GFDT" bit.
On HSW the GFDT bit was removed from the PTE, and it's only present in
surface state, so we can't really set it from the kernel. Also the docs
state that the hardware isn't actually guaranteed to respect the GFDT
bit. So it looks like GFDT isn't all that useful on HSW.
So far I've tried this very quickly on an IVB machine, and
it seems to be working as advertised. No idea if it does any
good though.
On an i5-2520m (laptop) running gnome-shell at 1366x768:
padman 140.78 -> 145.98 fps
openarena 183.72 -> 186.87 fps
gtkperf ComboBoxEntry 20.27 -> 22.14s
gtkperf pixbuf 1.12 -> 1.47s
x11perf -aa10text 13.40 -> 13.20 Mglyphs
which are well within the throttling noise.
v2 [ickle]: adapt to comply with existing userspace guarantees
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_drv.c | 4 +
drivers/gpu/drm/i915/i915_drv.h | 16 +++-
drivers/gpu/drm/i915/i915_gem.c | 111 ++++++++++++++++++++++++----
drivers/gpu/drm/i915/i915_gem_context.c | 5 +-
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 ++-
drivers/gpu/drm/i915/i915_gem_gtt.c | 38 ++++++----
drivers/gpu/drm/i915/i915_reg.h | 2 +
drivers/gpu/drm/i915/i915_trace.h | 10 ++-
drivers/gpu/drm/i915/intel_ringbuffer.c | 79 +++++++++++++++++---
drivers/gpu/drm/i915/intel_ringbuffer.h | 5 +-
10 files changed, 225 insertions(+), 57 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 66d06ac..ff935f1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -235,6 +235,7 @@ static const struct intel_device_info intel_sandybridge_d_info = {
.has_bsd_ring = 1,
.has_blt_ring = 1,
.has_llc = 1,
+ .has_gfdt = 1,
.has_force_wake = 1,
};
@@ -245,6 +246,7 @@ static const struct intel_device_info intel_sandybridge_m_info = {
.has_bsd_ring = 1,
.has_blt_ring = 1,
.has_llc = 1,
+ .has_gfdt = 1,
.has_force_wake = 1,
};
@@ -254,6 +256,7 @@ static const struct intel_device_info intel_ivybridge_d_info = {
.has_bsd_ring = 1,
.has_blt_ring = 1,
.has_llc = 1,
+ .has_gfdt = 1,
.has_force_wake = 1,
};
@@ -264,6 +267,7 @@ static const struct intel_device_info intel_ivybridge_m_info = {
.has_bsd_ring = 1,
.has_blt_ring = 1,
.has_llc = 1,
+ .has_gfdt = 1,
.has_force_wake = 1,
};
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 535bf29..9841dd7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -367,6 +367,7 @@ struct intel_device_info {
u8 has_bsd_ring:1;
u8 has_blt_ring:1;
u8 has_llc:1;
+ u8 has_gfdt:1;
};
enum i915_cache_level {
@@ -409,7 +410,8 @@ struct i915_gtt {
void (*gtt_insert_entries)(struct drm_device *dev,
struct sg_table *st,
unsigned int pg_start,
- enum i915_cache_level cache_level);
+ enum i915_cache_level cache_level,
+ bool gfdt);
};
#define gtt_total_entries(gtt) ((gtt).total >> PAGE_SHIFT)
@@ -430,7 +432,8 @@ struct i915_hw_ppgtt {
void (*insert_entries)(struct i915_hw_ppgtt *ppgtt,
struct sg_table *st,
unsigned int pg_start,
- enum i915_cache_level cache_level);
+ enum i915_cache_level cache_level,
+ bool gfdt);
void (*cleanup)(struct i915_hw_ppgtt *ppgtt);
};
@@ -1170,6 +1173,8 @@ struct drm_i915_gem_object {
unsigned int fenced_gpu_access:1;
unsigned int cache_level:2;
+ unsigned int gfdt:1;
+ unsigned int gfdt_dirty:1;
unsigned int has_aliasing_ppgtt_mapping:1;
unsigned int has_global_gtt_mapping:1;
@@ -1328,6 +1333,9 @@ struct drm_i915_file_private {
#define HAS_LLC(dev) (INTEL_INFO(dev)->has_llc)
#define I915_NEED_GFX_HWS(dev) (INTEL_INFO(dev)->need_gfx_hws)
+/* Only SNB and IVB have GFDT in PTEs */
+#define HAS_GFDT(dev) (INTEL_INFO(dev)->has_gfdt)
+
#define HAS_HW_CONTEXTS(dev) (INTEL_INFO(dev)->gen >= 6)
#define HAS_ALIASING_PPGTT(dev) (INTEL_INFO(dev)->gen >=6 && !IS_VALLEYVIEW(dev))
@@ -1702,14 +1710,14 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev);
void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
struct drm_i915_gem_object *obj,
- enum i915_cache_level cache_level);
+ enum i915_cache_level cache_level, bool gfdt);
void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
struct drm_i915_gem_object *obj);
void i915_gem_restore_gtt_mappings(struct drm_device *dev);
int __must_check i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj);
void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
- enum i915_cache_level cache_level);
+ enum i915_cache_level cache_level, bool gfdt);
void i915_gem_gtt_unbind_object(struct drm_i915_gem_object *obj);
void i915_gem_gtt_finish_object(struct drm_i915_gem_object *obj);
void i915_gem_init_global_gtt(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 19fc21b..2b4bc88 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2282,6 +2282,25 @@ i915_gem_retire_work_handler(struct work_struct *work)
mutex_unlock(&dev->struct_mutex);
}
+static int i915_gem_object_flush_gfdt(struct drm_i915_gem_object *obj,
+ struct intel_ring_buffer *ring)
+{
+ struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
+ int ret;
+
+ if (!obj->gfdt_dirty)
+ return 0;
+
+ if (ring == NULL)
+ ring = &dev_priv->ring[BCS];
+ ret = intel_ring_flush_internal(ring, I915_FLUSH_GFDT);
+ if (ret)
+ return ret;
+
+ obj->gfdt_dirty = false;
+ return 0;
+}
+
/**
* Ensures that an object will eventually get non-busy by flushing any required
* write domains, emitting any outstanding lazy request and retiring and
@@ -2292,6 +2311,10 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
{
int ret;
+ ret = i915_gem_object_flush_gfdt(obj, obj->ring);
+ if (ret)
+ return ret;
+
if (obj->active) {
ret = i915_gem_check_olr(obj->ring, obj->last_read_seqno);
if (ret)
@@ -3034,6 +3057,9 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj)
if (obj->stolen)
return;
+ if (obj->gfdt)
+ obj->gfdt_dirty = 1;
+
/* If the GPU is snooping the contents of the CPU cache,
* we do not need to manually clear the CPU cache lines. However,
* the caches are only snooped when the render cache is
@@ -3161,7 +3187,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
drm_i915_private_t *dev_priv = dev->dev_private;
int ret;
- if (obj->cache_level == cache_level)
+ if (obj->cache_level == cache_level && !obj->gfdt)
return 0;
if (obj->pin_count) {
@@ -3193,10 +3219,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
}
if (obj->has_global_gtt_mapping)
- i915_gem_gtt_bind_object(obj, cache_level);
+ i915_gem_gtt_bind_object(obj, cache_level, false);
if (obj->has_aliasing_ppgtt_mapping)
i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
- obj, cache_level);
+ obj, cache_level, false);
obj->gtt_space->color = cache_level;
}
@@ -3225,6 +3251,48 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
}
obj->cache_level = cache_level;
+ obj->gfdt_dirty = obj->gfdt = false;
+ i915_gem_verify_gtt(dev);
+ return 0;
+}
+
+static int i915_gem_object_set_gfdt(struct drm_i915_gem_object *obj,
+ bool gfdt)
+{
+ struct drm_device *dev = obj->base.dev;
+ drm_i915_private_t *dev_priv = dev->dev_private;
+ int ret;
+
+ if (!HAS_GFDT(dev))
+ return -ENODEV;
+
+ if (obj->gfdt == gfdt)
+ return 0;
+
+ /* no point in setting GFDT on uncached object */
+ if (obj->cache_level == I915_CACHE_NONE)
+ return -EINVAL;
+
+ if (obj->gtt_space) {
+ ret = i915_gem_object_finish_gpu(obj);
+ if (ret)
+ return ret;
+
+ i915_gem_object_finish_gtt(obj);
+
+ if (obj->has_global_gtt_mapping)
+ i915_gem_gtt_bind_object(obj, obj->cache_level, gfdt);
+ if (obj->has_aliasing_ppgtt_mapping)
+ i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
+ obj, obj->cache_level, gfdt);
+ }
+
+ /* Explicity perform the clflush to clear untagged dirty data */
+ if (obj->pages && !obj->stolen)
+ drm_clflush_sg(obj->pages);
+
+ obj->gfdt = gfdt;
+ obj->gfdt_dirty = false;
i915_gem_verify_gtt(dev);
return 0;
}
@@ -3310,18 +3378,23 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
return ret;
}
- /* The display engine is not coherent with the LLC cache on gen6. As
- * a result, we make sure that the pinning that is about to occur is
- * done with uncached PTEs. This is lowest common denominator for all
- * chipsets.
- *
- * However for gen6+, we could do better by using the GFDT bit instead
- * of uncaching, which would allow us to flush all the LLC-cached data
- * with that bit in the PTE to main memory with just one PIPE_CONTROL.
+ /*
+ * Try to set the GFDT bit instead of uncaching. This allow us to flush
+ * all the LLC-cached data with that bit in the PTE to main memory with
+ * just one PIPE_CONTROL.
*/
- ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
- if (ret)
- return ret;
+ ret = i915_gem_object_set_gfdt(obj, true);
+ if (ret) {
+ /*
+ * The display engine is not coherent with the LLC cache on gen6. As
+ * a result, we make sure that the pinning that is about to occur is
+ * done with uncached PTEs. This is lowest common denominator for all
+ * chipsets.
+ */
+ ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
+ if (ret)
+ return ret;
+ }
/* As the user may map the buffer once pinned in the display plane
* (e.g. libkms for the bootup splash), we have to ensure that we
@@ -3346,6 +3419,12 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
old_read_domains,
old_write_domain);
+ ret = i915_gem_object_flush_gfdt(obj, pipelined);
+ if (ret) {
+ i915_gem_object_unpin(obj);
+ return ret;
+ }
+
return 0;
}
@@ -3505,11 +3584,11 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj,
return ret;
if (!dev_priv->mm.aliasing_ppgtt)
- i915_gem_gtt_bind_object(obj, obj->cache_level);
+ i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
}
if (!obj->has_global_gtt_mapping && map_and_fenceable)
- i915_gem_gtt_bind_object(obj, obj->cache_level);
+ i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
obj->pin_count++;
obj->pin_mappable |= map_and_fenceable;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 94d873a..b8301d3 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -309,7 +309,7 @@ mi_set_context(struct intel_ring_buffer *ring,
* itlb_before_ctx_switch.
*/
if (IS_GEN6(ring->dev) && ring->itlb_before_ctx_switch) {
- ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, 0);
+ ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, 0, 0);
if (ret)
return ret;
}
@@ -371,7 +371,8 @@ static int do_switch(struct i915_hw_context *to)
}
if (!to->obj->has_global_gtt_mapping)
- i915_gem_gtt_bind_object(to->obj, to->obj->cache_level);
+ i915_gem_gtt_bind_object(to->obj, to->obj->cache_level,
+ to->obj->gfdt);
if (!to->is_initialized || is_default_context(to))
hw_flags |= MI_RESTORE_INHIBIT;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 6ae62ea..4ad0323 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -197,7 +197,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
!target_i915_obj->has_global_gtt_mapping)) {
i915_gem_gtt_bind_object(target_i915_obj,
- target_i915_obj->cache_level);
+ target_i915_obj->cache_level,
+ target_i915_obj->gfdt);
}
/* Validate that the target is in a valid r/w GPU domain */
@@ -433,7 +434,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
/* Ensure ppgtt mapping exists if needed */
if (dev_priv->mm.aliasing_ppgtt && !obj->has_aliasing_ppgtt_mapping) {
i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
- obj, obj->cache_level);
+ obj, obj->cache_level, obj->gfdt);
obj->has_aliasing_ppgtt_mapping = 1;
}
@@ -450,7 +451,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
if (entry->flags & EXEC_OBJECT_NEEDS_GTT &&
!obj->has_global_gtt_mapping)
- i915_gem_gtt_bind_object(obj, obj->cache_level);
+ i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
return 0;
}
@@ -778,6 +779,8 @@ i915_gem_execbuffer_move_to_active(struct list_head *objects,
i915_gem_object_move_to_active(obj, ring);
if (obj->base.write_domain) {
obj->dirty = 1;
+ if (obj->gfdt)
+ obj->gfdt_dirty = 1;
obj->last_write_seqno = intel_ring_get_seqno(ring);
if (obj->pin_count) /* check for potential scanout */
intel_mark_fb_busy(obj);
@@ -1011,7 +1014,8 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
* hsw should have this fixed, but let's be paranoid and do it
* unconditionally for now. */
if (flags & I915_DISPATCH_SECURE && !batch_obj->has_global_gtt_mapping)
- i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level);
+ i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level,
+ batch_obj->gfdt);
ret = i915_gem_execbuffer_move_to_gpu(ring, &eb->objects);
if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 926a1e2..3c7e48b 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -42,15 +42,20 @@ typedef uint32_t gtt_pte_t;
#define HSW_PTE_UNCACHED (0)
#define GEN6_PTE_CACHE_LLC (2 << 1)
#define GEN6_PTE_CACHE_LLC_MLC (3 << 1)
+#define GEN6_PTE_GFDT (1 << 3)
#define GEN6_PTE_ADDR_ENCODE(addr) GEN6_GTT_ADDR_ENCODE(addr)
static inline gtt_pte_t gen6_pte_encode(struct drm_device *dev,
dma_addr_t addr,
- enum i915_cache_level level)
+ enum i915_cache_level level,
+ bool gfdt)
{
gtt_pte_t pte = GEN6_PTE_VALID;
pte |= GEN6_PTE_ADDR_ENCODE(addr);
+ if (gfdt && HAS_GFDT(dev))
+ pte |= GEN6_PTE_GFDT;
+
switch (level) {
case I915_CACHE_LLC_MLC:
/* Haswell doesn't set L3 this way */
@@ -89,7 +94,7 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
scratch_pte = gen6_pte_encode(ppgtt->dev,
ppgtt->scratch_page_dma_addr,
- I915_CACHE_LLC);
+ I915_CACHE_LLC, false);
while (num_entries) {
last_pte = first_pte + num_entries;
@@ -112,7 +117,8 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
struct sg_table *pages,
unsigned first_entry,
- enum i915_cache_level cache_level)
+ enum i915_cache_level cache_level,
+ bool gfdt)
{
gtt_pte_t *pt_vaddr;
unsigned act_pd = first_entry / I915_PPGTT_PT_ENTRIES;
@@ -133,7 +139,7 @@ static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
for (j = first_pte; j < I915_PPGTT_PT_ENTRIES; j++) {
page_addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
pt_vaddr[j] = gen6_pte_encode(ppgtt->dev, page_addr,
- cache_level);
+ cache_level, gfdt);
/* grab the next page */
if (++m == segment_len) {
@@ -279,11 +285,12 @@ void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev)
void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
struct drm_i915_gem_object *obj,
- enum i915_cache_level cache_level)
+ enum i915_cache_level cache_level,
+ bool gfdt)
{
ppgtt->insert_entries(ppgtt, obj->pages,
obj->gtt_space->start >> PAGE_SHIFT,
- cache_level);
+ cache_level, gfdt);
}
void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
@@ -401,7 +408,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
i915_gem_clflush_object(obj);
- i915_gem_gtt_bind_object(obj, obj->cache_level);
+ i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
}
i915_gem_chipset_flush(dev);
@@ -429,7 +436,8 @@ int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
static void gen6_ggtt_insert_entries(struct drm_device *dev,
struct sg_table *st,
unsigned int first_entry,
- enum i915_cache_level level)
+ enum i915_cache_level level,
+ bool gfdt)
{
struct drm_i915_private *dev_priv = dev->dev_private;
struct scatterlist *sg = st->sgl;
@@ -443,7 +451,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
len = sg_dma_len(sg) >> PAGE_SHIFT;
for (m = 0; m < len; m++) {
addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
- iowrite32(gen6_pte_encode(dev, addr, level),
+ iowrite32(gen6_pte_encode(dev, addr, level, gfdt),
>t_entries[i]);
i++;
}
@@ -457,7 +465,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
*/
if (i != 0)
WARN_ON(readl(>t_entries[i-1])
- != gen6_pte_encode(dev, addr, level));
+ != gen6_pte_encode(dev, addr, level, gfdt));
/* This next bit makes the above posting read even more important. We
* want to flush the TLBs only after we're certain all the PTE updates
@@ -483,7 +491,7 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
num_entries = max_entries;
scratch_pte = gen6_pte_encode(dev, dev_priv->gtt.scratch_page_dma,
- I915_CACHE_LLC);
+ I915_CACHE_LLC, false);
for (i = 0; i < num_entries; i++)
iowrite32(scratch_pte, >t_base[i]);
readl(gtt_base);
@@ -493,7 +501,8 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
static void i915_ggtt_insert_entries(struct drm_device *dev,
struct sg_table *st,
unsigned int pg_start,
- enum i915_cache_level cache_level)
+ enum i915_cache_level cache_level,
+ bool gfdt)
{
unsigned int flags = (cache_level == I915_CACHE_NONE) ?
AGP_USER_MEMORY : AGP_USER_CACHED_MEMORY;
@@ -511,14 +520,15 @@ static void i915_ggtt_clear_range(struct drm_device *dev,
void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
- enum i915_cache_level cache_level)
+ enum i915_cache_level cache_level,
+ bool gfdt)
{
struct drm_device *dev = obj->base.dev;
struct drm_i915_private *dev_priv = dev->dev_private;
dev_priv->gtt.gtt_insert_entries(dev, obj->pages,
obj->gtt_space->start >> PAGE_SHIFT,
- cache_level);
+ cache_level, gfdt);
obj->has_global_gtt_mapping = 1;
}
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 4cf3ece..6675af2 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -242,6 +242,7 @@
#define MI_FLUSH_DW MI_INSTR(0x26, 1) /* for GEN6 */
#define MI_FLUSH_DW_STORE_INDEX (1<<21)
#define MI_INVALIDATE_TLB (1<<18)
+#define MI_SYNCHRONIZE_GFDT (1<<17)
#define MI_FLUSH_DW_OP_STOREDW (1<<14)
#define MI_INVALIDATE_BSD (1<<7)
#define MI_FLUSH_DW_USE_GTT (1<<2)
@@ -311,6 +312,7 @@
#define PIPE_CONTROL_GLOBAL_GTT_IVB (1<<24) /* gen7+ */
#define PIPE_CONTROL_CS_STALL (1<<20)
#define PIPE_CONTROL_TLB_INVALIDATE (1<<18)
+#define PIPE_CONTROL_SYNCHRONIZE_GFDT (1<<17)
#define PIPE_CONTROL_QW_WRITE (1<<14)
#define PIPE_CONTROL_DEPTH_STALL (1<<13)
#define PIPE_CONTROL_WRITE_FLUSH (1<<12)
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 3db4a68..ae2c7b5 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -252,14 +252,15 @@ TRACE_EVENT(i915_gem_ring_dispatch,
);
TRACE_EVENT(i915_gem_ring_flush,
- TP_PROTO(struct intel_ring_buffer *ring, u32 invalidate, u32 flush),
- TP_ARGS(ring, invalidate, flush),
+ TP_PROTO(struct intel_ring_buffer *ring, u32 invalidate, u32 flush, u32 internal),
+ TP_ARGS(ring, invalidate, flush, internal),
TP_STRUCT__entry(
__field(u32, dev)
__field(u32, ring)
__field(u32, invalidate)
__field(u32, flush)
+ __field(u32, internal)
),
TP_fast_assign(
@@ -267,11 +268,12 @@ TRACE_EVENT(i915_gem_ring_flush,
__entry->ring = ring->id;
__entry->invalidate = invalidate;
__entry->flush = flush;
+ __entry->internal = internal;
),
- TP_printk("dev=%u, ring=%x, invalidate=%04x, flush=%04x",
+ TP_printk("dev=%u, ring=%x, invalidate=%04x, flush=%04x, internal=%08x",
__entry->dev, __entry->ring,
- __entry->invalidate, __entry->flush)
+ __entry->invalidate, __entry->flush, __entry->internal)
);
DECLARE_EVENT_CLASS(i915_gem_request,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 1d5d613..9b1ae4d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -54,7 +54,8 @@ static inline int ring_space(struct intel_ring_buffer *ring)
static int
gen2_render_ring_flush(struct intel_ring_buffer *ring,
u32 invalidate_domains,
- u32 flush_domains)
+ u32 flush_domains,
+ u32 internal)
{
u32 cmd;
int ret;
@@ -80,7 +81,8 @@ gen2_render_ring_flush(struct intel_ring_buffer *ring,
static int
gen4_render_ring_flush(struct intel_ring_buffer *ring,
u32 invalidate_domains,
- u32 flush_domains)
+ u32 flush_domains,
+ u32 internal)
{
struct drm_device *dev = ring->dev;
u32 cmd;
@@ -210,7 +212,9 @@ intel_emit_post_sync_nonzero_flush(struct intel_ring_buffer *ring)
static int
gen6_render_ring_flush(struct intel_ring_buffer *ring,
- u32 invalidate_domains, u32 flush_domains)
+ u32 invalidate_domains,
+ u32 flush_domains,
+ u32 internal)
{
u32 flags = 0;
struct pipe_control *pc = ring->private;
@@ -248,6 +252,12 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
}
+ /* Flush GFDT out to memory */
+ if (internal & I915_FLUSH_GFDT) {
+ flags |= PIPE_CONTROL_QW_WRITE;
+ flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
+ }
+
ret = intel_ring_begin(ring, 4);
if (ret)
return ret;
@@ -282,7 +292,9 @@ gen7_render_ring_cs_stall_wa(struct intel_ring_buffer *ring)
static int
gen7_render_ring_flush(struct intel_ring_buffer *ring,
- u32 invalidate_domains, u32 flush_domains)
+ u32 invalidate_domains,
+ u32 flush_domains,
+ u32 internal)
{
u32 flags = 0;
struct pipe_control *pc = ring->private;
@@ -325,6 +337,12 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
* invalidate bit set. */
gen7_render_ring_cs_stall_wa(ring);
}
+ /* Flush GFDT out to memory */
+ if (internal & I915_FLUSH_GFDT) {
+ flags |= PIPE_CONTROL_QW_WRITE;
+ flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
+ flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
+ }
ret = intel_ring_begin(ring, 4);
if (ret)
@@ -912,8 +930,9 @@ void intel_ring_setup_status_page(struct intel_ring_buffer *ring)
static int
bsd_ring_flush(struct intel_ring_buffer *ring,
- u32 invalidate_domains,
- u32 flush_domains)
+ u32 invalidate_domains,
+ u32 flush_domains,
+ u32 internal)
{
int ret;
@@ -1547,7 +1566,9 @@ static void gen6_bsd_ring_write_tail(struct intel_ring_buffer *ring,
}
static int gen6_ring_flush(struct intel_ring_buffer *ring,
- u32 invalidate, u32 flush)
+ u32 invalidate,
+ u32 flush,
+ u32 internal)
{
uint32_t cmd;
int ret;
@@ -1557,6 +1578,12 @@ static int gen6_ring_flush(struct intel_ring_buffer *ring,
return ret;
cmd = MI_FLUSH_DW;
+
+ /* Flush GFDT out to memory */
+ if (internal & I915_FLUSH_GFDT)
+ cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
+ MI_SYNCHRONIZE_GFDT;
+
/*
* Bspec vol 1c.5 - video engine command streamer:
* "If ENABLED, all TLBs will be invalidated once the flush
@@ -1619,7 +1646,9 @@ gen6_ring_dispatch_execbuffer(struct intel_ring_buffer *ring,
/* Blitter support (SandyBridge+) */
static int blt_ring_flush(struct intel_ring_buffer *ring,
- u32 invalidate, u32 flush)
+ u32 invalidate,
+ u32 flush,
+ u32 internal)
{
uint32_t cmd;
int ret;
@@ -1629,6 +1658,12 @@ static int blt_ring_flush(struct intel_ring_buffer *ring,
return ret;
cmd = MI_FLUSH_DW;
+
+ /* Flush GFDT out to memory */
+ if (internal & I915_FLUSH_GFDT)
+ cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
+ MI_SYNCHRONIZE_GFDT;
+
/*
* Bspec vol 1c.3 - blitter engine command streamer:
* "If ENABLED, all TLBs will be invalidated once the flush
@@ -1889,11 +1924,31 @@ intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
if (!ring->gpu_caches_dirty)
return 0;
- ret = ring->flush(ring, 0, I915_GEM_GPU_DOMAINS);
+ ret = ring->flush(ring, 0, I915_GEM_GPU_DOMAINS, 0);
+ if (ret)
+ return ret;
+
+ trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS, 0);
+
+ ring->gpu_caches_dirty = false;
+ return 0;
+}
+
+int
+intel_ring_flush_internal(struct intel_ring_buffer *ring, u32 internal)
+{
+ uint32_t flush_domains;
+ int ret;
+
+ flush_domains = 0;
+ if (ring->gpu_caches_dirty)
+ flush_domains = I915_GEM_GPU_DOMAINS;
+
+ ret = ring->flush(ring, 0, flush_domains, internal);
if (ret)
return ret;
- trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
+ trace_i915_gem_ring_flush(ring, 0, flush_domains, internal);
ring->gpu_caches_dirty = false;
return 0;
@@ -1909,11 +1964,11 @@ intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring)
if (ring->gpu_caches_dirty)
flush_domains = I915_GEM_GPU_DOMAINS;
- ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+ ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, flush_domains, 0);
if (ret)
return ret;
- trace_i915_gem_ring_flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+ trace_i915_gem_ring_flush(ring, I915_GEM_GPU_DOMAINS, flush_domains, 0);
ring->gpu_caches_dirty = false;
return 0;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d66208c..b9aa76e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -80,7 +80,8 @@ struct intel_ring_buffer {
u32 value);
int __must_check (*flush)(struct intel_ring_buffer *ring,
u32 invalidate_domains,
- u32 flush_domains);
+ u32 flush_domains,
+ u32 internal);
int (*add_request)(struct intel_ring_buffer *ring);
/* Some chipsets are not quite as coherent as advertised and need
* an expensive kick to force a true read of the up-to-date seqno.
@@ -220,6 +221,8 @@ int __must_check intel_ring_idle(struct intel_ring_buffer *ring);
void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno);
int intel_ring_flush_all_caches(struct intel_ring_buffer *ring);
int intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring);
+int intel_ring_flush_internal(struct intel_ring_buffer *ring, u32 internal);
+#define I915_FLUSH_GFDT 0x1
int intel_init_render_ring_buffer(struct drm_device *dev);
int intel_init_bsd_ring_buffer(struct drm_device *dev);
--
1.7.10.4
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 10+ messages in thread