All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: GFDT support for SNB/IVB
@ 2013-03-01 18:32 ville.syrjala
  2013-03-03 16:28 ` Daniel Vetter
  2013-03-06 16:28 ` Chris Wilson
  0 siblings, 2 replies; 10+ messages in thread
From: ville.syrjala @ 2013-03-01 18:32 UTC (permalink / raw)
  To: intel-gfx

From: Ville Syrjälä <ville.syrjala@linux.intel.com>

Currently all scanout buffers must be uncached because the
display controller doesn't snoop the LLC. SNB introduced another
method to guarantee coherency for the display controller. It's
called the GFDT or graphics data type.

Pages that have the GFDT bit enabled in their PTEs get flushed
all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
issued with the "synchronize GFDT" bit set.

So rather than making all scanout buffers uncached, set the GFDT
bit in their PTEs, and modify the ring flush functions to enable
the "synchronize GFDT" bit.

On HSW the GFDT bit was removed from the PTE, and it's only present in
surface state, so we can't really set it from the kernel. Also the docs
state that the hardware isn't actually guaranteed to respect the GFDT
bit. So it looks like GFDT isn't all that useful on HSW.

So far I've tried this very quickly on an IVB machine, and
it seems to be working as advertised. No idea if it does any
good though.

TODO:
- make sure there are no missing flushes (CPU access doesn't
  respect GFDT after all).
- measure it and see if there's some real benefit
- maybe we can track whether "synchronize GFDT" needs to be
  issued, and skip it when possible. needs some numbers to
  determine if it's worthwile.

Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |  18 +++--
 drivers/gpu/drm/i915/i915_gem.c            | 109 ++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_gem_context.c    |   3 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  10 +--
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  38 ++++++----
 drivers/gpu/drm/i915/i915_reg.h            |   2 +
 drivers/gpu/drm/i915/intel_display.c       |   4 +-
 drivers/gpu/drm/i915/intel_overlay.c       |   2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c    |  25 +++++++
 9 files changed, 167 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e95337c..537b344 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -408,7 +408,8 @@ struct i915_gtt {
 	void (*gtt_insert_entries)(struct drm_device *dev,
 				   struct sg_table *st,
 				   unsigned int pg_start,
-				   enum i915_cache_level cache_level);
+				   enum i915_cache_level cache_level,
+				   bool gfdt);
 };
 #define gtt_total_entries(gtt) ((gtt).total >> PAGE_SHIFT)
 
@@ -429,7 +430,8 @@ struct i915_hw_ppgtt {
 	void (*insert_entries)(struct i915_hw_ppgtt *ppgtt,
 			       struct sg_table *st,
 			       unsigned int pg_start,
-			       enum i915_cache_level cache_level);
+			       enum i915_cache_level cache_level,
+			       bool gfdt);
 	void (*cleanup)(struct i915_hw_ppgtt *ppgtt);
 };
 
@@ -1169,6 +1171,7 @@ struct drm_i915_gem_object {
 	unsigned int fenced_gpu_access:1;
 
 	unsigned int cache_level:2;
+	unsigned int gfdt:1;
 
 	unsigned int has_aliasing_ppgtt_mapping:1;
 	unsigned int has_global_gtt_mapping:1;
@@ -1310,6 +1313,10 @@ struct drm_i915_file_private {
 #define HAS_LLC(dev)            (INTEL_INFO(dev)->has_llc)
 #define I915_NEED_GFX_HWS(dev)	(INTEL_INFO(dev)->need_gfx_hws)
 
+/* Only SNB and IVB have GFDT in PTEs */
+#define HAS_GFDT(dev)		(HAS_LLC(dev) && !IS_HASWELL(dev) && \
+				 (IS_GEN6(dev) || IS_GEN7(dev)))
+
 #define HAS_HW_CONTEXTS(dev)	(INTEL_INFO(dev)->gen >= 6)
 #define HAS_ALIASING_PPGTT(dev)	(INTEL_INFO(dev)->gen >=6 && !IS_VALLEYVIEW(dev))
 
@@ -1639,7 +1646,8 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
 int __must_check
 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write);
 int __must_check
-i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
+i915_gem_object_pin_to_display_plane(struct drm_device *dev,
+				     struct drm_i915_gem_object *obj,
 				     u32 alignment,
 				     struct intel_ring_buffer *pipelined);
 int i915_gem_attach_phys_object(struct drm_device *dev,
@@ -1681,14 +1689,14 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
 void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev);
 void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
 			    struct drm_i915_gem_object *obj,
-			    enum i915_cache_level cache_level);
+			    enum i915_cache_level cache_level, bool gfdt);
 void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
 			      struct drm_i915_gem_object *obj);
 
 void i915_gem_restore_gtt_mappings(struct drm_device *dev);
 int __must_check i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj);
 void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
-				enum i915_cache_level cache_level);
+			      enum i915_cache_level cache_level, bool gfdt);
 void i915_gem_gtt_unbind_object(struct drm_i915_gem_object *obj);
 void i915_gem_gtt_finish_object(struct drm_i915_gem_object *obj);
 void i915_gem_init_global_gtt(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 8413ffc..c635430 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3035,8 +3035,13 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj)
 	 * and flushes when moving into and out of the RENDER domain, correct
 	 * snooping behaviour occurs naturally as the result of our domain
 	 * tracking.
+	 *
+	 * Also flush all pinned objects with GFDT enabled. Such objects
+	 * may be used for scanout, and the CPU doesn't know anything
+	 * about GFDT.
 	 */
-	if (obj->cache_level != I915_CACHE_NONE)
+	if (obj->cache_level != I915_CACHE_NONE &&
+	    !(obj->gfdt && obj->pin_count))
 		return;
 
 	trace_i915_gem_object_clflush(obj);
@@ -3154,6 +3159,11 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 	struct drm_device *dev = obj->base.dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret;
+	bool gfdt = obj->gfdt;
+
+	/* Clear GFDT when moving to uncached */
+	if (cache_level == I915_CACHE_NONE)
+		gfdt = false;
 
 	if (obj->cache_level == cache_level)
 		return 0;
@@ -3187,10 +3197,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 		}
 
 		if (obj->has_global_gtt_mapping)
-			i915_gem_gtt_bind_object(obj, cache_level);
+			i915_gem_gtt_bind_object(obj, cache_level, gfdt);
 		if (obj->has_aliasing_ppgtt_mapping)
 			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
-					       obj, cache_level);
+					       obj, cache_level, gfdt);
 
 		obj->gtt_space->color = cache_level;
 	}
@@ -3219,6 +3229,65 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 	}
 
 	obj->cache_level = cache_level;
+	obj->gfdt = gfdt;
+	i915_gem_verify_gtt(dev);
+	return 0;
+}
+
+static int i915_gem_object_set_gfdt(struct drm_i915_gem_object *obj,
+				    bool gfdt)
+{
+	struct drm_device *dev = obj->base.dev;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	int ret;
+
+	if (!HAS_GFDT(dev))
+		return -ENODEV;
+
+	if (obj->gfdt == gfdt)
+		return 0;
+
+	/* no point in setting GFDT on uncached object */
+	if (obj->cache_level == I915_CACHE_NONE)
+		return -EINVAL;
+
+	if (obj->gtt_space) {
+		ret = i915_gem_object_finish_gpu(obj);
+		if (ret)
+			return ret;
+
+		i915_gem_object_finish_gtt(obj);
+
+		if (obj->has_global_gtt_mapping)
+			i915_gem_gtt_bind_object(obj, obj->cache_level, gfdt);
+		if (obj->has_aliasing_ppgtt_mapping)
+			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
+					       obj, obj->cache_level, gfdt);
+	}
+
+	if (gfdt) {
+		u32 old_read_domains, old_write_domain;
+
+		/*
+		 * Since we're just now enabling GFDT there's most
+		 * likely dirty data in the LLC. It must be flushed
+		 * to memory the old fashined way.
+		 */
+		WARN_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
+		WARN_ON(obj->base.read_domains & ~I915_GEM_DOMAIN_CPU);
+
+		old_read_domains = obj->base.read_domains;
+		old_write_domain = obj->base.write_domain;
+
+		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+
+		trace_i915_gem_object_change_domain(obj,
+						    old_read_domains,
+						    old_write_domain);
+	}
+
+	obj->gfdt = gfdt;
 	i915_gem_verify_gtt(dev);
 	return 0;
 }
@@ -3291,7 +3360,8 @@ unlock:
  * any flushes to be pipelined (for pageflips).
  */
 int
-i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
+i915_gem_object_pin_to_display_plane(struct drm_device *dev,
+				     struct drm_i915_gem_object *obj,
 				     u32 alignment,
 				     struct intel_ring_buffer *pipelined)
 {
@@ -3304,18 +3374,23 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 			return ret;
 	}
 
-	/* The display engine is not coherent with the LLC cache on gen6.  As
-	 * a result, we make sure that the pinning that is about to occur is
-	 * done with uncached PTEs. This is lowest common denominator for all
-	 * chipsets.
-	 *
-	 * However for gen6+, we could do better by using the GFDT bit instead
-	 * of uncaching, which would allow us to flush all the LLC-cached data
-	 * with that bit in the PTE to main memory with just one PIPE_CONTROL.
+	/*
+	 * Try to set the GFDT bit instead of uncaching. This allow us to flush
+	 * all the LLC-cached data with that bit in the PTE to main memory with
+	 * just one PIPE_CONTROL.
 	 */
-	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
-	if (ret)
-		return ret;
+	ret = i915_gem_object_set_gfdt(obj, true);
+	if (ret) {
+		/*
+		 * The display engine is not coherent with the LLC cache on gen6.  As
+		 * a result, we make sure that the pinning that is about to occur is
+		 * done with uncached PTEs. This is lowest common denominator for all
+		 * chipsets.
+		 */
+		ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
+		if (ret)
+			return ret;
+	}
 
 	/* As the user may map the buffer once pinned in the display plane
 	 * (e.g. libkms for the bootup splash), we have to ensure that we
@@ -3499,11 +3574,11 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj,
 			return ret;
 
 		if (!dev_priv->mm.aliasing_ppgtt)
-			i915_gem_gtt_bind_object(obj, obj->cache_level);
+			i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 	}
 
 	if (!obj->has_global_gtt_mapping && map_and_fenceable)
-		i915_gem_gtt_bind_object(obj, obj->cache_level);
+		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 
 	obj->pin_count++;
 	obj->pin_mappable |= map_and_fenceable;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 21177d9..7244c0c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -382,7 +382,8 @@ static int do_switch(struct i915_hw_context *to)
 	}
 
 	if (!to->obj->has_global_gtt_mapping)
-		i915_gem_gtt_bind_object(to->obj, to->obj->cache_level);
+		i915_gem_gtt_bind_object(to->obj, to->obj->cache_level,
+					 to->obj->gfdt);
 
 	if (!to->is_initialized || is_default_context(to))
 		hw_flags |= MI_RESTORE_INHIBIT;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 2f2daeb..28a3c42 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -197,7 +197,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	    reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
 	    !target_i915_obj->has_global_gtt_mapping)) {
 		i915_gem_gtt_bind_object(target_i915_obj,
-					 target_i915_obj->cache_level);
+					 target_i915_obj->cache_level,
+					 target_i915_obj->gfdt);
 	}
 
 	/* Validate that the target is in a valid r/w GPU domain */
@@ -432,7 +433,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 	/* Ensure ppgtt mapping exists if needed */
 	if (dev_priv->mm.aliasing_ppgtt && !obj->has_aliasing_ppgtt_mapping) {
 		i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
-				       obj, obj->cache_level);
+				       obj, obj->cache_level, obj->gfdt);
 
 		obj->has_aliasing_ppgtt_mapping = 1;
 	}
@@ -449,7 +450,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 
 	if (entry->flags & EXEC_OBJECT_NEEDS_GTT &&
 	    !obj->has_global_gtt_mapping)
-		i915_gem_gtt_bind_object(obj, obj->cache_level);
+		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 
 	return 0;
 }
@@ -1011,7 +1012,8 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 * hsw should have this fixed, but let's be paranoid and do it
 	 * unconditionally for now. */
 	if (flags & I915_DISPATCH_SECURE && !batch_obj->has_global_gtt_mapping)
-		i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level);
+		i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level,
+					 batch_obj->gfdt);
 
 	ret = i915_gem_execbuffer_move_to_gpu(ring, &eb->objects);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 926a1e2..3c7e48b 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -42,15 +42,20 @@ typedef uint32_t gtt_pte_t;
 #define HSW_PTE_UNCACHED		(0)
 #define GEN6_PTE_CACHE_LLC		(2 << 1)
 #define GEN6_PTE_CACHE_LLC_MLC		(3 << 1)
+#define GEN6_PTE_GFDT			(1 << 3)
 #define GEN6_PTE_ADDR_ENCODE(addr)	GEN6_GTT_ADDR_ENCODE(addr)
 
 static inline gtt_pte_t gen6_pte_encode(struct drm_device *dev,
 					dma_addr_t addr,
-					enum i915_cache_level level)
+					enum i915_cache_level level,
+					bool gfdt)
 {
 	gtt_pte_t pte = GEN6_PTE_VALID;
 	pte |= GEN6_PTE_ADDR_ENCODE(addr);
 
+	if (gfdt && HAS_GFDT(dev))
+		pte |= GEN6_PTE_GFDT;
+
 	switch (level) {
 	case I915_CACHE_LLC_MLC:
 		/* Haswell doesn't set L3 this way */
@@ -89,7 +94,7 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
 
 	scratch_pte = gen6_pte_encode(ppgtt->dev,
 				      ppgtt->scratch_page_dma_addr,
-				      I915_CACHE_LLC);
+				      I915_CACHE_LLC, false);
 
 	while (num_entries) {
 		last_pte = first_pte + num_entries;
@@ -112,7 +117,8 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
 static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
 				      struct sg_table *pages,
 				      unsigned first_entry,
-				      enum i915_cache_level cache_level)
+				      enum i915_cache_level cache_level,
+				      bool gfdt)
 {
 	gtt_pte_t *pt_vaddr;
 	unsigned act_pd = first_entry / I915_PPGTT_PT_ENTRIES;
@@ -133,7 +139,7 @@ static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
 		for (j = first_pte; j < I915_PPGTT_PT_ENTRIES; j++) {
 			page_addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
 			pt_vaddr[j] = gen6_pte_encode(ppgtt->dev, page_addr,
-						      cache_level);
+						      cache_level, gfdt);
 
 			/* grab the next page */
 			if (++m == segment_len) {
@@ -279,11 +285,12 @@ void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev)
 
 void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
 			    struct drm_i915_gem_object *obj,
-			    enum i915_cache_level cache_level)
+			    enum i915_cache_level cache_level,
+			    bool gfdt)
 {
 	ppgtt->insert_entries(ppgtt, obj->pages,
 			      obj->gtt_space->start >> PAGE_SHIFT,
-			      cache_level);
+			      cache_level, gfdt);
 }
 
 void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
@@ -401,7 +408,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
 		i915_gem_clflush_object(obj);
-		i915_gem_gtt_bind_object(obj, obj->cache_level);
+		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 	}
 
 	i915_gem_chipset_flush(dev);
@@ -429,7 +436,8 @@ int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
 static void gen6_ggtt_insert_entries(struct drm_device *dev,
 				     struct sg_table *st,
 				     unsigned int first_entry,
-				     enum i915_cache_level level)
+				     enum i915_cache_level level,
+				     bool gfdt)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct scatterlist *sg = st->sgl;
@@ -443,7 +451,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
 		len = sg_dma_len(sg) >> PAGE_SHIFT;
 		for (m = 0; m < len; m++) {
 			addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
-			iowrite32(gen6_pte_encode(dev, addr, level),
+			iowrite32(gen6_pte_encode(dev, addr, level, gfdt),
 				  &gtt_entries[i]);
 			i++;
 		}
@@ -457,7 +465,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
 	 */
 	if (i != 0)
 		WARN_ON(readl(&gtt_entries[i-1])
-			!= gen6_pte_encode(dev, addr, level));
+			!= gen6_pte_encode(dev, addr, level, gfdt));
 
 	/* This next bit makes the above posting read even more important. We
 	 * want to flush the TLBs only after we're certain all the PTE updates
@@ -483,7 +491,7 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
 		num_entries = max_entries;
 
 	scratch_pte = gen6_pte_encode(dev, dev_priv->gtt.scratch_page_dma,
-				      I915_CACHE_LLC);
+				      I915_CACHE_LLC, false);
 	for (i = 0; i < num_entries; i++)
 		iowrite32(scratch_pte, &gtt_base[i]);
 	readl(gtt_base);
@@ -493,7 +501,8 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
 static void i915_ggtt_insert_entries(struct drm_device *dev,
 				     struct sg_table *st,
 				     unsigned int pg_start,
-				     enum i915_cache_level cache_level)
+				     enum i915_cache_level cache_level,
+				     bool gfdt)
 {
 	unsigned int flags = (cache_level == I915_CACHE_NONE) ?
 		AGP_USER_MEMORY : AGP_USER_CACHED_MEMORY;
@@ -511,14 +520,15 @@ static void i915_ggtt_clear_range(struct drm_device *dev,
 
 
 void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
-			      enum i915_cache_level cache_level)
+			      enum i915_cache_level cache_level,
+			      bool gfdt)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	dev_priv->gtt.gtt_insert_entries(dev, obj->pages,
 					 obj->gtt_space->start >> PAGE_SHIFT,
-					 cache_level);
+					 cache_level, gfdt);
 
 	obj->has_global_gtt_mapping = 1;
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index cd226c2..6aa29d5 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -242,6 +242,7 @@
 #define MI_FLUSH_DW		MI_INSTR(0x26, 1) /* for GEN6 */
 #define   MI_FLUSH_DW_STORE_INDEX	(1<<21)
 #define   MI_INVALIDATE_TLB		(1<<18)
+#define   MI_SYNCHRONIZE_GFDT		(1<<17)
 #define   MI_FLUSH_DW_OP_STOREDW	(1<<14)
 #define   MI_INVALIDATE_BSD		(1<<7)
 #define   MI_FLUSH_DW_USE_GTT		(1<<2)
@@ -311,6 +312,7 @@
 #define   PIPE_CONTROL_GLOBAL_GTT_IVB			(1<<24) /* gen7+ */
 #define   PIPE_CONTROL_CS_STALL				(1<<20)
 #define   PIPE_CONTROL_TLB_INVALIDATE			(1<<18)
+#define   PIPE_CONTROL_SYNCHRONIZE_GFDT			(1<<17)
 #define   PIPE_CONTROL_QW_WRITE				(1<<14)
 #define   PIPE_CONTROL_DEPTH_STALL			(1<<13)
 #define   PIPE_CONTROL_WRITE_FLUSH			(1<<12)
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index d912306..fc99d55 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -1968,7 +1968,7 @@ intel_pin_and_fence_fb_obj(struct drm_device *dev,
 	}
 
 	dev_priv->mm.interruptible = false;
-	ret = i915_gem_object_pin_to_display_plane(obj, alignment, pipelined);
+	ret = i915_gem_object_pin_to_display_plane(dev, obj, alignment, pipelined);
 	if (ret)
 		goto err_interruptible;
 
@@ -6340,7 +6340,7 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
 			goto fail_locked;
 		}
 
-		ret = i915_gem_object_pin_to_display_plane(obj, 0, NULL);
+		ret = i915_gem_object_pin_to_display_plane(dev, obj, 0, NULL);
 		if (ret) {
 			DRM_ERROR("failed to move cursor bo into the GTT\n");
 			goto fail_locked;
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index 67a2501..b4110a2 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -695,7 +695,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay,
 	if (ret != 0)
 		return ret;
 
-	ret = i915_gem_object_pin_to_display_plane(new_bo, 0, NULL);
+	ret = i915_gem_object_pin_to_display_plane(dev, new_bo, 0, NULL);
 	if (ret != 0)
 		return ret;
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 1d5d613..8406247 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -234,6 +234,12 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 		 * when the render cache is indeed flushed.
 		 */
 		flags |= PIPE_CONTROL_CS_STALL;
+
+		/* Flush GFDT out to memory */
+		if (HAS_GFDT(ring->dev)) {
+			flags |= PIPE_CONTROL_QW_WRITE;
+			flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
+		}
 	}
 	if (invalidate_domains) {
 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
@@ -306,6 +312,13 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 	if (flush_domains) {
 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+
+		/* Flush GFDT out to memory */
+		if (HAS_GFDT(ring->dev)) {
+			flags |= PIPE_CONTROL_QW_WRITE;
+			flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
+			flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
+		}
 	}
 	if (invalidate_domains) {
 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
@@ -1557,6 +1570,12 @@ static int gen6_ring_flush(struct intel_ring_buffer *ring,
 		return ret;
 
 	cmd = MI_FLUSH_DW;
+
+	/* Flush GFDT out to memory */
+	if (flush & I915_GEM_GPU_DOMAINS && HAS_GFDT(ring->dev))
+		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
+			MI_SYNCHRONIZE_GFDT;
+
 	/*
 	 * Bspec vol 1c.5 - video engine command streamer:
 	 * "If ENABLED, all TLBs will be invalidated once the flush
@@ -1629,6 +1648,12 @@ static int blt_ring_flush(struct intel_ring_buffer *ring,
 		return ret;
 
 	cmd = MI_FLUSH_DW;
+
+	/* Flush GFDT out to memory */
+	if (flush & I915_GEM_DOMAIN_RENDER && HAS_GFDT(ring->dev))
+		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
+			MI_SYNCHRONIZE_GFDT;
+
 	/*
 	 * Bspec vol 1c.3 - blitter engine command streamer:
 	 * "If ENABLED, all TLBs will be invalidated once the flush
-- 
1.7.12.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/i915: GFDT support for SNB/IVB
  2013-03-01 18:32 [PATCH] drm/i915: GFDT support for SNB/IVB ville.syrjala
@ 2013-03-03 16:28 ` Daniel Vetter
  2013-03-03 17:39   ` Chris Wilson
  2013-03-04 14:03   ` Ville Syrjälä
  2013-03-06 16:28 ` Chris Wilson
  1 sibling, 2 replies; 10+ messages in thread
From: Daniel Vetter @ 2013-03-03 16:28 UTC (permalink / raw)
  To: ville.syrjala; +Cc: intel-gfx

On Fri, Mar 01, 2013 at 08:32:57PM +0200, ville.syrjala@linux.intel.com wrote:
> From: Ville Syrjälä <ville.syrjala@linux.intel.com>
> 
> Currently all scanout buffers must be uncached because the
> display controller doesn't snoop the LLC. SNB introduced another
> method to guarantee coherency for the display controller. It's
> called the GFDT or graphics data type.
> 
> Pages that have the GFDT bit enabled in their PTEs get flushed
> all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
> issued with the "synchronize GFDT" bit set.
> 
> So rather than making all scanout buffers uncached, set the GFDT
> bit in their PTEs, and modify the ring flush functions to enable
> the "synchronize GFDT" bit.
> 
> On HSW the GFDT bit was removed from the PTE, and it's only present in
> surface state, so we can't really set it from the kernel. Also the docs
> state that the hardware isn't actually guaranteed to respect the GFDT
> bit. So it looks like GFDT isn't all that useful on HSW.
> 
> So far I've tried this very quickly on an IVB machine, and
> it seems to be working as advertised. No idea if it does any
> good though.
> 
> TODO:
> - make sure there are no missing flushes (CPU access doesn't
>   respect GFDT after all).
> - measure it and see if there's some real benefit
> - maybe we can track whether "synchronize GFDT" needs to be
>   issued, and skip it when possible. needs some numbers to
>   determine if it's worthwile.
> 
> Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>

Iirc when I've tried this out a while back it regressed a few benchmarks.
Chris&me suspected cache trahsing, but hard to tell without proper
instrumentation support. Chris played around with a few tricks to mark
other giant bos as uncacheable, but he couldn't find any improved
workloads.

In short I think this needs to come with decent amounts of numbers
attached, like the TODO says ;-)

The other thing was that I didn't manage to get things to work properly,
leaving some random cachline dirt on the screen. Looking at your code, you
add the gfdt flush to every ring_flush, whereas I've tried to be clever
and only flushed after batches rendering to the frontbuffer. So probably a
bug in my code, or a flush on a given ring doesn't flush out caches for
one of the other engines.

And finally a bikeshed: I'd vote for a slightly more generice unsigned
long cache_flags instead of bool gfdt.

Cheers, Daniel

> ---
>  drivers/gpu/drm/i915/i915_drv.h            |  18 +++--
>  drivers/gpu/drm/i915/i915_gem.c            | 109 ++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_gem_context.c    |   3 +-
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  10 +--
>  drivers/gpu/drm/i915/i915_gem_gtt.c        |  38 ++++++----
>  drivers/gpu/drm/i915/i915_reg.h            |   2 +
>  drivers/gpu/drm/i915/intel_display.c       |   4 +-
>  drivers/gpu/drm/i915/intel_overlay.c       |   2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |  25 +++++++
>  9 files changed, 167 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e95337c..537b344 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -408,7 +408,8 @@ struct i915_gtt {
>  	void (*gtt_insert_entries)(struct drm_device *dev,
>  				   struct sg_table *st,
>  				   unsigned int pg_start,
> -				   enum i915_cache_level cache_level);
> +				   enum i915_cache_level cache_level,
> +				   bool gfdt);
>  };
>  #define gtt_total_entries(gtt) ((gtt).total >> PAGE_SHIFT)
>  
> @@ -429,7 +430,8 @@ struct i915_hw_ppgtt {
>  	void (*insert_entries)(struct i915_hw_ppgtt *ppgtt,
>  			       struct sg_table *st,
>  			       unsigned int pg_start,
> -			       enum i915_cache_level cache_level);
> +			       enum i915_cache_level cache_level,
> +			       bool gfdt);
>  	void (*cleanup)(struct i915_hw_ppgtt *ppgtt);
>  };
>  
> @@ -1169,6 +1171,7 @@ struct drm_i915_gem_object {
>  	unsigned int fenced_gpu_access:1;
>  
>  	unsigned int cache_level:2;
> +	unsigned int gfdt:1;
>  
>  	unsigned int has_aliasing_ppgtt_mapping:1;
>  	unsigned int has_global_gtt_mapping:1;
> @@ -1310,6 +1313,10 @@ struct drm_i915_file_private {
>  #define HAS_LLC(dev)            (INTEL_INFO(dev)->has_llc)
>  #define I915_NEED_GFX_HWS(dev)	(INTEL_INFO(dev)->need_gfx_hws)
>  
> +/* Only SNB and IVB have GFDT in PTEs */
> +#define HAS_GFDT(dev)		(HAS_LLC(dev) && !IS_HASWELL(dev) && \
> +				 (IS_GEN6(dev) || IS_GEN7(dev)))
> +
>  #define HAS_HW_CONTEXTS(dev)	(INTEL_INFO(dev)->gen >= 6)
>  #define HAS_ALIASING_PPGTT(dev)	(INTEL_INFO(dev)->gen >=6 && !IS_VALLEYVIEW(dev))
>  
> @@ -1639,7 +1646,8 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
>  int __must_check
>  i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write);
>  int __must_check
> -i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
> +i915_gem_object_pin_to_display_plane(struct drm_device *dev,
> +				     struct drm_i915_gem_object *obj,
>  				     u32 alignment,
>  				     struct intel_ring_buffer *pipelined);
>  int i915_gem_attach_phys_object(struct drm_device *dev,
> @@ -1681,14 +1689,14 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
>  void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev);
>  void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
>  			    struct drm_i915_gem_object *obj,
> -			    enum i915_cache_level cache_level);
> +			    enum i915_cache_level cache_level, bool gfdt);
>  void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
>  			      struct drm_i915_gem_object *obj);
>  
>  void i915_gem_restore_gtt_mappings(struct drm_device *dev);
>  int __must_check i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj);
>  void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
> -				enum i915_cache_level cache_level);
> +			      enum i915_cache_level cache_level, bool gfdt);
>  void i915_gem_gtt_unbind_object(struct drm_i915_gem_object *obj);
>  void i915_gem_gtt_finish_object(struct drm_i915_gem_object *obj);
>  void i915_gem_init_global_gtt(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 8413ffc..c635430 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3035,8 +3035,13 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj)
>  	 * and flushes when moving into and out of the RENDER domain, correct
>  	 * snooping behaviour occurs naturally as the result of our domain
>  	 * tracking.
> +	 *
> +	 * Also flush all pinned objects with GFDT enabled. Such objects
> +	 * may be used for scanout, and the CPU doesn't know anything
> +	 * about GFDT.
>  	 */
> -	if (obj->cache_level != I915_CACHE_NONE)
> +	if (obj->cache_level != I915_CACHE_NONE &&
> +	    !(obj->gfdt && obj->pin_count))
>  		return;
>  
>  	trace_i915_gem_object_clflush(obj);
> @@ -3154,6 +3159,11 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>  	struct drm_device *dev = obj->base.dev;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
>  	int ret;
> +	bool gfdt = obj->gfdt;
> +
> +	/* Clear GFDT when moving to uncached */
> +	if (cache_level == I915_CACHE_NONE)
> +		gfdt = false;
>  
>  	if (obj->cache_level == cache_level)
>  		return 0;
> @@ -3187,10 +3197,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>  		}
>  
>  		if (obj->has_global_gtt_mapping)
> -			i915_gem_gtt_bind_object(obj, cache_level);
> +			i915_gem_gtt_bind_object(obj, cache_level, gfdt);
>  		if (obj->has_aliasing_ppgtt_mapping)
>  			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
> -					       obj, cache_level);
> +					       obj, cache_level, gfdt);
>  
>  		obj->gtt_space->color = cache_level;
>  	}
> @@ -3219,6 +3229,65 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>  	}
>  
>  	obj->cache_level = cache_level;
> +	obj->gfdt = gfdt;
> +	i915_gem_verify_gtt(dev);
> +	return 0;
> +}
> +
> +static int i915_gem_object_set_gfdt(struct drm_i915_gem_object *obj,
> +				    bool gfdt)
> +{
> +	struct drm_device *dev = obj->base.dev;
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +	int ret;
> +
> +	if (!HAS_GFDT(dev))
> +		return -ENODEV;
> +
> +	if (obj->gfdt == gfdt)
> +		return 0;
> +
> +	/* no point in setting GFDT on uncached object */
> +	if (obj->cache_level == I915_CACHE_NONE)
> +		return -EINVAL;
> +
> +	if (obj->gtt_space) {
> +		ret = i915_gem_object_finish_gpu(obj);
> +		if (ret)
> +			return ret;
> +
> +		i915_gem_object_finish_gtt(obj);
> +
> +		if (obj->has_global_gtt_mapping)
> +			i915_gem_gtt_bind_object(obj, obj->cache_level, gfdt);
> +		if (obj->has_aliasing_ppgtt_mapping)
> +			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
> +					       obj, obj->cache_level, gfdt);
> +	}
> +
> +	if (gfdt) {
> +		u32 old_read_domains, old_write_domain;
> +
> +		/*
> +		 * Since we're just now enabling GFDT there's most
> +		 * likely dirty data in the LLC. It must be flushed
> +		 * to memory the old fashined way.
> +		 */
> +		WARN_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
> +		WARN_ON(obj->base.read_domains & ~I915_GEM_DOMAIN_CPU);
> +
> +		old_read_domains = obj->base.read_domains;
> +		old_write_domain = obj->base.write_domain;
> +
> +		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +
> +		trace_i915_gem_object_change_domain(obj,
> +						    old_read_domains,
> +						    old_write_domain);
> +	}
> +
> +	obj->gfdt = gfdt;
>  	i915_gem_verify_gtt(dev);
>  	return 0;
>  }
> @@ -3291,7 +3360,8 @@ unlock:
>   * any flushes to be pipelined (for pageflips).
>   */
>  int
> -i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
> +i915_gem_object_pin_to_display_plane(struct drm_device *dev,
> +				     struct drm_i915_gem_object *obj,
>  				     u32 alignment,
>  				     struct intel_ring_buffer *pipelined)
>  {
> @@ -3304,18 +3374,23 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
>  			return ret;
>  	}
>  
> -	/* The display engine is not coherent with the LLC cache on gen6.  As
> -	 * a result, we make sure that the pinning that is about to occur is
> -	 * done with uncached PTEs. This is lowest common denominator for all
> -	 * chipsets.
> -	 *
> -	 * However for gen6+, we could do better by using the GFDT bit instead
> -	 * of uncaching, which would allow us to flush all the LLC-cached data
> -	 * with that bit in the PTE to main memory with just one PIPE_CONTROL.
> +	/*
> +	 * Try to set the GFDT bit instead of uncaching. This allow us to flush
> +	 * all the LLC-cached data with that bit in the PTE to main memory with
> +	 * just one PIPE_CONTROL.
>  	 */
> -	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
> -	if (ret)
> -		return ret;
> +	ret = i915_gem_object_set_gfdt(obj, true);
> +	if (ret) {
> +		/*
> +		 * The display engine is not coherent with the LLC cache on gen6.  As
> +		 * a result, we make sure that the pinning that is about to occur is
> +		 * done with uncached PTEs. This is lowest common denominator for all
> +		 * chipsets.
> +		 */
> +		ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
> +		if (ret)
> +			return ret;
> +	}
>  
>  	/* As the user may map the buffer once pinned in the display plane
>  	 * (e.g. libkms for the bootup splash), we have to ensure that we
> @@ -3499,11 +3574,11 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj,
>  			return ret;
>  
>  		if (!dev_priv->mm.aliasing_ppgtt)
> -			i915_gem_gtt_bind_object(obj, obj->cache_level);
> +			i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
>  	}
>  
>  	if (!obj->has_global_gtt_mapping && map_and_fenceable)
> -		i915_gem_gtt_bind_object(obj, obj->cache_level);
> +		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
>  
>  	obj->pin_count++;
>  	obj->pin_mappable |= map_and_fenceable;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 21177d9..7244c0c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -382,7 +382,8 @@ static int do_switch(struct i915_hw_context *to)
>  	}
>  
>  	if (!to->obj->has_global_gtt_mapping)
> -		i915_gem_gtt_bind_object(to->obj, to->obj->cache_level);
> +		i915_gem_gtt_bind_object(to->obj, to->obj->cache_level,
> +					 to->obj->gfdt);
>  
>  	if (!to->is_initialized || is_default_context(to))
>  		hw_flags |= MI_RESTORE_INHIBIT;
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 2f2daeb..28a3c42 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -197,7 +197,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  	    reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
>  	    !target_i915_obj->has_global_gtt_mapping)) {
>  		i915_gem_gtt_bind_object(target_i915_obj,
> -					 target_i915_obj->cache_level);
> +					 target_i915_obj->cache_level,
> +					 target_i915_obj->gfdt);
>  	}
>  
>  	/* Validate that the target is in a valid r/w GPU domain */
> @@ -432,7 +433,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  	/* Ensure ppgtt mapping exists if needed */
>  	if (dev_priv->mm.aliasing_ppgtt && !obj->has_aliasing_ppgtt_mapping) {
>  		i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
> -				       obj, obj->cache_level);
> +				       obj, obj->cache_level, obj->gfdt);
>  
>  		obj->has_aliasing_ppgtt_mapping = 1;
>  	}
> @@ -449,7 +450,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
>  
>  	if (entry->flags & EXEC_OBJECT_NEEDS_GTT &&
>  	    !obj->has_global_gtt_mapping)
> -		i915_gem_gtt_bind_object(obj, obj->cache_level);
> +		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
>  
>  	return 0;
>  }
> @@ -1011,7 +1012,8 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	 * hsw should have this fixed, but let's be paranoid and do it
>  	 * unconditionally for now. */
>  	if (flags & I915_DISPATCH_SECURE && !batch_obj->has_global_gtt_mapping)
> -		i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level);
> +		i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level,
> +					 batch_obj->gfdt);
>  
>  	ret = i915_gem_execbuffer_move_to_gpu(ring, &eb->objects);
>  	if (ret)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 926a1e2..3c7e48b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -42,15 +42,20 @@ typedef uint32_t gtt_pte_t;
>  #define HSW_PTE_UNCACHED		(0)
>  #define GEN6_PTE_CACHE_LLC		(2 << 1)
>  #define GEN6_PTE_CACHE_LLC_MLC		(3 << 1)
> +#define GEN6_PTE_GFDT			(1 << 3)
>  #define GEN6_PTE_ADDR_ENCODE(addr)	GEN6_GTT_ADDR_ENCODE(addr)
>  
>  static inline gtt_pte_t gen6_pte_encode(struct drm_device *dev,
>  					dma_addr_t addr,
> -					enum i915_cache_level level)
> +					enum i915_cache_level level,
> +					bool gfdt)
>  {
>  	gtt_pte_t pte = GEN6_PTE_VALID;
>  	pte |= GEN6_PTE_ADDR_ENCODE(addr);
>  
> +	if (gfdt && HAS_GFDT(dev))
> +		pte |= GEN6_PTE_GFDT;
> +
>  	switch (level) {
>  	case I915_CACHE_LLC_MLC:
>  		/* Haswell doesn't set L3 this way */
> @@ -89,7 +94,7 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
>  
>  	scratch_pte = gen6_pte_encode(ppgtt->dev,
>  				      ppgtt->scratch_page_dma_addr,
> -				      I915_CACHE_LLC);
> +				      I915_CACHE_LLC, false);
>  
>  	while (num_entries) {
>  		last_pte = first_pte + num_entries;
> @@ -112,7 +117,8 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
>  static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
>  				      struct sg_table *pages,
>  				      unsigned first_entry,
> -				      enum i915_cache_level cache_level)
> +				      enum i915_cache_level cache_level,
> +				      bool gfdt)
>  {
>  	gtt_pte_t *pt_vaddr;
>  	unsigned act_pd = first_entry / I915_PPGTT_PT_ENTRIES;
> @@ -133,7 +139,7 @@ static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
>  		for (j = first_pte; j < I915_PPGTT_PT_ENTRIES; j++) {
>  			page_addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
>  			pt_vaddr[j] = gen6_pte_encode(ppgtt->dev, page_addr,
> -						      cache_level);
> +						      cache_level, gfdt);
>  
>  			/* grab the next page */
>  			if (++m == segment_len) {
> @@ -279,11 +285,12 @@ void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev)
>  
>  void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
>  			    struct drm_i915_gem_object *obj,
> -			    enum i915_cache_level cache_level)
> +			    enum i915_cache_level cache_level,
> +			    bool gfdt)
>  {
>  	ppgtt->insert_entries(ppgtt, obj->pages,
>  			      obj->gtt_space->start >> PAGE_SHIFT,
> -			      cache_level);
> +			      cache_level, gfdt);
>  }
>  
>  void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
> @@ -401,7 +408,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
>  
>  	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
>  		i915_gem_clflush_object(obj);
> -		i915_gem_gtt_bind_object(obj, obj->cache_level);
> +		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
>  	}
>  
>  	i915_gem_chipset_flush(dev);
> @@ -429,7 +436,8 @@ int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
>  static void gen6_ggtt_insert_entries(struct drm_device *dev,
>  				     struct sg_table *st,
>  				     unsigned int first_entry,
> -				     enum i915_cache_level level)
> +				     enum i915_cache_level level,
> +				     bool gfdt)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct scatterlist *sg = st->sgl;
> @@ -443,7 +451,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
>  		len = sg_dma_len(sg) >> PAGE_SHIFT;
>  		for (m = 0; m < len; m++) {
>  			addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
> -			iowrite32(gen6_pte_encode(dev, addr, level),
> +			iowrite32(gen6_pte_encode(dev, addr, level, gfdt),
>  				  &gtt_entries[i]);
>  			i++;
>  		}
> @@ -457,7 +465,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
>  	 */
>  	if (i != 0)
>  		WARN_ON(readl(&gtt_entries[i-1])
> -			!= gen6_pte_encode(dev, addr, level));
> +			!= gen6_pte_encode(dev, addr, level, gfdt));
>  
>  	/* This next bit makes the above posting read even more important. We
>  	 * want to flush the TLBs only after we're certain all the PTE updates
> @@ -483,7 +491,7 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
>  		num_entries = max_entries;
>  
>  	scratch_pte = gen6_pte_encode(dev, dev_priv->gtt.scratch_page_dma,
> -				      I915_CACHE_LLC);
> +				      I915_CACHE_LLC, false);
>  	for (i = 0; i < num_entries; i++)
>  		iowrite32(scratch_pte, &gtt_base[i]);
>  	readl(gtt_base);
> @@ -493,7 +501,8 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
>  static void i915_ggtt_insert_entries(struct drm_device *dev,
>  				     struct sg_table *st,
>  				     unsigned int pg_start,
> -				     enum i915_cache_level cache_level)
> +				     enum i915_cache_level cache_level,
> +				     bool gfdt)
>  {
>  	unsigned int flags = (cache_level == I915_CACHE_NONE) ?
>  		AGP_USER_MEMORY : AGP_USER_CACHED_MEMORY;
> @@ -511,14 +520,15 @@ static void i915_ggtt_clear_range(struct drm_device *dev,
>  
>  
>  void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
> -			      enum i915_cache_level cache_level)
> +			      enum i915_cache_level cache_level,
> +			      bool gfdt)
>  {
>  	struct drm_device *dev = obj->base.dev;
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  
>  	dev_priv->gtt.gtt_insert_entries(dev, obj->pages,
>  					 obj->gtt_space->start >> PAGE_SHIFT,
> -					 cache_level);
> +					 cache_level, gfdt);
>  
>  	obj->has_global_gtt_mapping = 1;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index cd226c2..6aa29d5 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -242,6 +242,7 @@
>  #define MI_FLUSH_DW		MI_INSTR(0x26, 1) /* for GEN6 */
>  #define   MI_FLUSH_DW_STORE_INDEX	(1<<21)
>  #define   MI_INVALIDATE_TLB		(1<<18)
> +#define   MI_SYNCHRONIZE_GFDT		(1<<17)
>  #define   MI_FLUSH_DW_OP_STOREDW	(1<<14)
>  #define   MI_INVALIDATE_BSD		(1<<7)
>  #define   MI_FLUSH_DW_USE_GTT		(1<<2)
> @@ -311,6 +312,7 @@
>  #define   PIPE_CONTROL_GLOBAL_GTT_IVB			(1<<24) /* gen7+ */
>  #define   PIPE_CONTROL_CS_STALL				(1<<20)
>  #define   PIPE_CONTROL_TLB_INVALIDATE			(1<<18)
> +#define   PIPE_CONTROL_SYNCHRONIZE_GFDT			(1<<17)
>  #define   PIPE_CONTROL_QW_WRITE				(1<<14)
>  #define   PIPE_CONTROL_DEPTH_STALL			(1<<13)
>  #define   PIPE_CONTROL_WRITE_FLUSH			(1<<12)
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index d912306..fc99d55 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -1968,7 +1968,7 @@ intel_pin_and_fence_fb_obj(struct drm_device *dev,
>  	}
>  
>  	dev_priv->mm.interruptible = false;
> -	ret = i915_gem_object_pin_to_display_plane(obj, alignment, pipelined);
> +	ret = i915_gem_object_pin_to_display_plane(dev, obj, alignment, pipelined);
>  	if (ret)
>  		goto err_interruptible;
>  
> @@ -6340,7 +6340,7 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
>  			goto fail_locked;
>  		}
>  
> -		ret = i915_gem_object_pin_to_display_plane(obj, 0, NULL);
> +		ret = i915_gem_object_pin_to_display_plane(dev, obj, 0, NULL);
>  		if (ret) {
>  			DRM_ERROR("failed to move cursor bo into the GTT\n");
>  			goto fail_locked;
> diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
> index 67a2501..b4110a2 100644
> --- a/drivers/gpu/drm/i915/intel_overlay.c
> +++ b/drivers/gpu/drm/i915/intel_overlay.c
> @@ -695,7 +695,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay,
>  	if (ret != 0)
>  		return ret;
>  
> -	ret = i915_gem_object_pin_to_display_plane(new_bo, 0, NULL);
> +	ret = i915_gem_object_pin_to_display_plane(dev, new_bo, 0, NULL);
>  	if (ret != 0)
>  		return ret;
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 1d5d613..8406247 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -234,6 +234,12 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
>  		 * when the render cache is indeed flushed.
>  		 */
>  		flags |= PIPE_CONTROL_CS_STALL;
> +
> +		/* Flush GFDT out to memory */
> +		if (HAS_GFDT(ring->dev)) {
> +			flags |= PIPE_CONTROL_QW_WRITE;
> +			flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
> +		}
>  	}
>  	if (invalidate_domains) {
>  		flags |= PIPE_CONTROL_TLB_INVALIDATE;
> @@ -306,6 +312,13 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
>  	if (flush_domains) {
>  		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
>  		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
> +
> +		/* Flush GFDT out to memory */
> +		if (HAS_GFDT(ring->dev)) {
> +			flags |= PIPE_CONTROL_QW_WRITE;
> +			flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
> +			flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
> +		}
>  	}
>  	if (invalidate_domains) {
>  		flags |= PIPE_CONTROL_TLB_INVALIDATE;
> @@ -1557,6 +1570,12 @@ static int gen6_ring_flush(struct intel_ring_buffer *ring,
>  		return ret;
>  
>  	cmd = MI_FLUSH_DW;
> +
> +	/* Flush GFDT out to memory */
> +	if (flush & I915_GEM_GPU_DOMAINS && HAS_GFDT(ring->dev))
> +		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
> +			MI_SYNCHRONIZE_GFDT;
> +
>  	/*
>  	 * Bspec vol 1c.5 - video engine command streamer:
>  	 * "If ENABLED, all TLBs will be invalidated once the flush
> @@ -1629,6 +1648,12 @@ static int blt_ring_flush(struct intel_ring_buffer *ring,
>  		return ret;
>  
>  	cmd = MI_FLUSH_DW;
> +
> +	/* Flush GFDT out to memory */
> +	if (flush & I915_GEM_DOMAIN_RENDER && HAS_GFDT(ring->dev))
> +		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
> +			MI_SYNCHRONIZE_GFDT;
> +
>  	/*
>  	 * Bspec vol 1c.3 - blitter engine command streamer:
>  	 * "If ENABLED, all TLBs will be invalidated once the flush
> -- 
> 1.7.12.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/i915: GFDT support for SNB/IVB
  2013-03-03 16:28 ` Daniel Vetter
@ 2013-03-03 17:39   ` Chris Wilson
  2013-03-04 13:51     ` Ville Syrjälä
  2013-03-04 14:03   ` Ville Syrjälä
  1 sibling, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2013-03-03 17:39 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Sun, Mar 03, 2013 at 05:28:52PM +0100, Daniel Vetter wrote:
> The other thing was that I didn't manage to get things to work properly,
> leaving some random cachline dirt on the screen. Looking at your code, you
> add the gfdt flush to every ring_flush, whereas I've tried to be clever
> and only flushed after batches rendering to the frontbuffer. So probably a
> bug in my code, or a flush on a given ring doesn't flush out caches for
> one of the other engines.

Right, we have a formalized interface for flushes to scanout rather than
always flushing GFDT.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/i915: GFDT support for SNB/IVB
  2013-03-03 17:39   ` Chris Wilson
@ 2013-03-04 13:51     ` Ville Syrjälä
  2013-03-04 14:01       ` Chris Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Ville Syrjälä @ 2013-03-04 13:51 UTC (permalink / raw)
  To: Chris Wilson, Daniel Vetter, intel-gfx

On Sun, Mar 03, 2013 at 05:39:09PM +0000, Chris Wilson wrote:
> On Sun, Mar 03, 2013 at 05:28:52PM +0100, Daniel Vetter wrote:
> > The other thing was that I didn't manage to get things to work properly,
> > leaving some random cachline dirt on the screen. Looking at your code, you
> > add the gfdt flush to every ring_flush, whereas I've tried to be clever
> > and only flushed after batches rendering to the frontbuffer. So probably a
> > bug in my code, or a flush on a given ring doesn't flush out caches for
> > one of the other engines.
> 
> Right, we have a formalized interface for flushes to scanout rather than
> always flushing GFDT.

We do? Where's that?

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/i915: GFDT support for SNB/IVB
  2013-03-04 13:51     ` Ville Syrjälä
@ 2013-03-04 14:01       ` Chris Wilson
  0 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2013-03-04 14:01 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: intel-gfx

On Mon, Mar 04, 2013 at 03:51:11PM +0200, Ville Syrjälä wrote:
> On Sun, Mar 03, 2013 at 05:39:09PM +0000, Chris Wilson wrote:
> > On Sun, Mar 03, 2013 at 05:28:52PM +0100, Daniel Vetter wrote:
> > > The other thing was that I didn't manage to get things to work properly,
> > > leaving some random cachline dirt on the screen. Looking at your code, you
> > > add the gfdt flush to every ring_flush, whereas I've tried to be clever
> > > and only flushed after batches rendering to the frontbuffer. So probably a
> > > bug in my code, or a flush on a given ring doesn't flush out caches for
> > > one of the other engines.
> > 
> > Right, we have a formalized interface for flushes to scanout rather than
> > always flushing GFDT.
> 
> We do? Where's that?

A side-effect of the API for busy_ioctl() is that all caches for the bo
are flushed and any outstanding requests queued in order for the
bo to become unbusy in the near-future. This semantic was required for
GL query objects to eventually complete without futher interaction. It
also solved the problem of having to flush the scanout periodically on
gen4+ (and on gen3 if we enable the optimisation bit) and is so
enshrined into lore.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/i915: GFDT support for SNB/IVB
  2013-03-03 16:28 ` Daniel Vetter
  2013-03-03 17:39   ` Chris Wilson
@ 2013-03-04 14:03   ` Ville Syrjälä
  1 sibling, 0 replies; 10+ messages in thread
From: Ville Syrjälä @ 2013-03-04 14:03 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Sun, Mar 03, 2013 at 05:28:52PM +0100, Daniel Vetter wrote:
> On Fri, Mar 01, 2013 at 08:32:57PM +0200, ville.syrjala@linux.intel.com wrote:
> > From: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > 
> > Currently all scanout buffers must be uncached because the
> > display controller doesn't snoop the LLC. SNB introduced another
> > method to guarantee coherency for the display controller. It's
> > called the GFDT or graphics data type.
> > 
> > Pages that have the GFDT bit enabled in their PTEs get flushed
> > all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
> > issued with the "synchronize GFDT" bit set.
> > 
> > So rather than making all scanout buffers uncached, set the GFDT
> > bit in their PTEs, and modify the ring flush functions to enable
> > the "synchronize GFDT" bit.
> > 
> > On HSW the GFDT bit was removed from the PTE, and it's only present in
> > surface state, so we can't really set it from the kernel. Also the docs
> > state that the hardware isn't actually guaranteed to respect the GFDT
> > bit. So it looks like GFDT isn't all that useful on HSW.
> > 
> > So far I've tried this very quickly on an IVB machine, and
> > it seems to be working as advertised. No idea if it does any
> > good though.
> > 
> > TODO:
> > - make sure there are no missing flushes (CPU access doesn't
> >   respect GFDT after all).
> > - measure it and see if there's some real benefit
> > - maybe we can track whether "synchronize GFDT" needs to be
> >   issued, and skip it when possible. needs some numbers to
> >   determine if it's worthwile.
> > 
> > Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> 
> Iirc when I've tried this out a while back it regressed a few benchmarks.
> Chris&me suspected cache trahsing, but hard to tell without proper
> instrumentation support. Chris played around with a few tricks to mark
> other giant bos as uncacheable, but he couldn't find any improved
> workloads.

I see. I didn't realize this was tried already. Not that I really
planned to implement this in the first place. I was just studying the
code and figured I'd learn better by trying to change things a bit
;) But if there's interest I can of course try to improve it further.

> In short I think this needs to come with decent amounts of numbers
> attached, like the TODO says ;-)
> 
> The other thing was that I didn't manage to get things to work properly,
> leaving some random cachline dirt on the screen. Looking at your code, you
> add the gfdt flush to every ring_flush, whereas I've tried to be clever
> and only flushed after batches rendering to the frontbuffer. So probably a
> bug in my code, or a flush on a given ring doesn't flush out caches for
> one of the other engines.

I had a bug in my first attempt where I forgot the initial clflush. That
left some random crap on the screen until I rendered the next frame w/
GFDT already enabled. Other than that I didn't see any corruption except
when I intentionally left out the flushes. But as stated my code does
too many flushes probably.

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] drm/i915: GFDT support for SNB/IVB
  2013-03-01 18:32 [PATCH] drm/i915: GFDT support for SNB/IVB ville.syrjala
  2013-03-03 16:28 ` Daniel Vetter
@ 2013-03-06 16:28 ` Chris Wilson
  2013-04-04  9:01   ` Daniel Vetter
  1 sibling, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2013-03-06 16:28 UTC (permalink / raw)
  To: intel-gfx

From: Ville Syrjälä <ville.syrjala@linux.intel.com>

Currently all scanout buffers must be uncached because the
display controller doesn't snoop the LLC. SNB introduced another
method to guarantee coherency for the display controller. It's
called the GFDT or graphics data type.

Pages that have the GFDT bit enabled in their PTEs get flushed
all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
issued with the "synchronize GFDT" bit set.

So rather than making all scanout buffers uncached, set the GFDT
bit in their PTEs, and modify the ring flush functions to enable
the "synchronize GFDT" bit.

On HSW the GFDT bit was removed from the PTE, and it's only present in
surface state, so we can't really set it from the kernel. Also the docs
state that the hardware isn't actually guaranteed to respect the GFDT
bit. So it looks like GFDT isn't all that useful on HSW.

So far I've tried this very quickly on an IVB machine, and
it seems to be working as advertised. No idea if it does any
good though.

On an i5-2520m (laptop) running gnome-shell at 1366x768:
  padman 		140.78 -> 145.98 fps
  openarena 		183.72 -> 186.87 fps
  gtkperf ComboBoxEntry	20.27 -> 22.14s
  gtkperf pixbuf	 1.12 ->  1.47s
  x11perf -aa10text	13.40 -> 13.20 Mglyphs
which are well within the throttling noise.

v2 [ickle]: adapt to comply with existing userspace guarantees

Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c            |    4 +
 drivers/gpu/drm/i915/i915_drv.h            |   16 +++-
 drivers/gpu/drm/i915/i915_gem.c            |  111 ++++++++++++++++++++++++----
 drivers/gpu/drm/i915/i915_gem_context.c    |    5 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   12 ++-
 drivers/gpu/drm/i915/i915_gem_gtt.c        |   38 ++++++----
 drivers/gpu/drm/i915/i915_reg.h            |    2 +
 drivers/gpu/drm/i915/i915_trace.h          |   10 ++-
 drivers/gpu/drm/i915/intel_ringbuffer.c    |   79 +++++++++++++++++---
 drivers/gpu/drm/i915/intel_ringbuffer.h    |    5 +-
 10 files changed, 225 insertions(+), 57 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 66d06ac..ff935f1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -235,6 +235,7 @@ static const struct intel_device_info intel_sandybridge_d_info = {
 	.has_bsd_ring = 1,
 	.has_blt_ring = 1,
 	.has_llc = 1,
+	.has_gfdt = 1,
 	.has_force_wake = 1,
 };
 
@@ -245,6 +246,7 @@ static const struct intel_device_info intel_sandybridge_m_info = {
 	.has_bsd_ring = 1,
 	.has_blt_ring = 1,
 	.has_llc = 1,
+	.has_gfdt = 1,
 	.has_force_wake = 1,
 };
 
@@ -254,6 +256,7 @@ static const struct intel_device_info intel_ivybridge_d_info = {
 	.has_bsd_ring = 1,
 	.has_blt_ring = 1,
 	.has_llc = 1,
+	.has_gfdt = 1,
 	.has_force_wake = 1,
 };
 
@@ -264,6 +267,7 @@ static const struct intel_device_info intel_ivybridge_m_info = {
 	.has_bsd_ring = 1,
 	.has_blt_ring = 1,
 	.has_llc = 1,
+	.has_gfdt = 1,
 	.has_force_wake = 1,
 };
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 535bf29..9841dd7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -367,6 +367,7 @@ struct intel_device_info {
 	u8 has_bsd_ring:1;
 	u8 has_blt_ring:1;
 	u8 has_llc:1;
+	u8 has_gfdt:1;
 };
 
 enum i915_cache_level {
@@ -409,7 +410,8 @@ struct i915_gtt {
 	void (*gtt_insert_entries)(struct drm_device *dev,
 				   struct sg_table *st,
 				   unsigned int pg_start,
-				   enum i915_cache_level cache_level);
+				   enum i915_cache_level cache_level,
+				   bool gfdt);
 };
 #define gtt_total_entries(gtt) ((gtt).total >> PAGE_SHIFT)
 
@@ -430,7 +432,8 @@ struct i915_hw_ppgtt {
 	void (*insert_entries)(struct i915_hw_ppgtt *ppgtt,
 			       struct sg_table *st,
 			       unsigned int pg_start,
-			       enum i915_cache_level cache_level);
+			       enum i915_cache_level cache_level,
+			       bool gfdt);
 	void (*cleanup)(struct i915_hw_ppgtt *ppgtt);
 };
 
@@ -1170,6 +1173,8 @@ struct drm_i915_gem_object {
 	unsigned int fenced_gpu_access:1;
 
 	unsigned int cache_level:2;
+	unsigned int gfdt:1;
+	unsigned int gfdt_dirty:1;
 
 	unsigned int has_aliasing_ppgtt_mapping:1;
 	unsigned int has_global_gtt_mapping:1;
@@ -1328,6 +1333,9 @@ struct drm_i915_file_private {
 #define HAS_LLC(dev)            (INTEL_INFO(dev)->has_llc)
 #define I915_NEED_GFX_HWS(dev)	(INTEL_INFO(dev)->need_gfx_hws)
 
+/* Only SNB and IVB have GFDT in PTEs */
+#define HAS_GFDT(dev)            (INTEL_INFO(dev)->has_gfdt)
+
 #define HAS_HW_CONTEXTS(dev)	(INTEL_INFO(dev)->gen >= 6)
 #define HAS_ALIASING_PPGTT(dev)	(INTEL_INFO(dev)->gen >=6 && !IS_VALLEYVIEW(dev))
 
@@ -1702,14 +1710,14 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
 void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev);
 void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
 			    struct drm_i915_gem_object *obj,
-			    enum i915_cache_level cache_level);
+			    enum i915_cache_level cache_level, bool gfdt);
 void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
 			      struct drm_i915_gem_object *obj);
 
 void i915_gem_restore_gtt_mappings(struct drm_device *dev);
 int __must_check i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj);
 void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
-				enum i915_cache_level cache_level);
+			      enum i915_cache_level cache_level, bool gfdt);
 void i915_gem_gtt_unbind_object(struct drm_i915_gem_object *obj);
 void i915_gem_gtt_finish_object(struct drm_i915_gem_object *obj);
 void i915_gem_init_global_gtt(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 19fc21b..2b4bc88 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2282,6 +2282,25 @@ i915_gem_retire_work_handler(struct work_struct *work)
 	mutex_unlock(&dev->struct_mutex);
 }
 
+static int i915_gem_object_flush_gfdt(struct drm_i915_gem_object *obj,
+				      struct intel_ring_buffer *ring)
+{
+	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
+	int ret;
+
+	if (!obj->gfdt_dirty)
+		return 0;
+
+	if (ring == NULL)
+		ring = &dev_priv->ring[BCS];
+	ret = intel_ring_flush_internal(ring, I915_FLUSH_GFDT);
+	if (ret)
+		return ret;
+
+	obj->gfdt_dirty = false;
+	return 0;
+}
+
 /**
  * Ensures that an object will eventually get non-busy by flushing any required
  * write domains, emitting any outstanding lazy request and retiring and
@@ -2292,6 +2311,10 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
 {
 	int ret;
 
+	ret = i915_gem_object_flush_gfdt(obj, obj->ring);
+	if (ret)
+		return ret;
+
 	if (obj->active) {
 		ret = i915_gem_check_olr(obj->ring, obj->last_read_seqno);
 		if (ret)
@@ -3034,6 +3057,9 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj)
 	if (obj->stolen)
 		return;
 
+	if (obj->gfdt)
+		obj->gfdt_dirty = 1;
+
 	/* If the GPU is snooping the contents of the CPU cache,
 	 * we do not need to manually clear the CPU cache lines.  However,
 	 * the caches are only snooped when the render cache is
@@ -3161,7 +3187,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret;
 
-	if (obj->cache_level == cache_level)
+	if (obj->cache_level == cache_level && !obj->gfdt)
 		return 0;
 
 	if (obj->pin_count) {
@@ -3193,10 +3219,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 		}
 
 		if (obj->has_global_gtt_mapping)
-			i915_gem_gtt_bind_object(obj, cache_level);
+			i915_gem_gtt_bind_object(obj, cache_level, false);
 		if (obj->has_aliasing_ppgtt_mapping)
 			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
-					       obj, cache_level);
+					       obj, cache_level, false);
 
 		obj->gtt_space->color = cache_level;
 	}
@@ -3225,6 +3251,48 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 	}
 
 	obj->cache_level = cache_level;
+	obj->gfdt_dirty = obj->gfdt = false;
+	i915_gem_verify_gtt(dev);
+	return 0;
+}
+
+static int i915_gem_object_set_gfdt(struct drm_i915_gem_object *obj,
+				    bool gfdt)
+{
+	struct drm_device *dev = obj->base.dev;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	int ret;
+
+	if (!HAS_GFDT(dev))
+		return -ENODEV;
+
+	if (obj->gfdt == gfdt)
+		return 0;
+
+	/* no point in setting GFDT on uncached object */
+	if (obj->cache_level == I915_CACHE_NONE)
+		return -EINVAL;
+
+	if (obj->gtt_space) {
+		ret = i915_gem_object_finish_gpu(obj);
+		if (ret)
+			return ret;
+
+		i915_gem_object_finish_gtt(obj);
+
+		if (obj->has_global_gtt_mapping)
+			i915_gem_gtt_bind_object(obj, obj->cache_level, gfdt);
+		if (obj->has_aliasing_ppgtt_mapping)
+			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
+					       obj, obj->cache_level, gfdt);
+	}
+
+	/* Explicity perform the clflush to clear untagged dirty data */
+	if (obj->pages && !obj->stolen)
+		drm_clflush_sg(obj->pages);
+
+	obj->gfdt = gfdt;
+	obj->gfdt_dirty = false;
 	i915_gem_verify_gtt(dev);
 	return 0;
 }
@@ -3310,18 +3378,23 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 			return ret;
 	}
 
-	/* The display engine is not coherent with the LLC cache on gen6.  As
-	 * a result, we make sure that the pinning that is about to occur is
-	 * done with uncached PTEs. This is lowest common denominator for all
-	 * chipsets.
-	 *
-	 * However for gen6+, we could do better by using the GFDT bit instead
-	 * of uncaching, which would allow us to flush all the LLC-cached data
-	 * with that bit in the PTE to main memory with just one PIPE_CONTROL.
+	/*
+	 * Try to set the GFDT bit instead of uncaching. This allow us to flush
+	 * all the LLC-cached data with that bit in the PTE to main memory with
+	 * just one PIPE_CONTROL.
 	 */
-	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
-	if (ret)
-		return ret;
+	ret = i915_gem_object_set_gfdt(obj, true);
+	if (ret) {
+		/*
+		 * The display engine is not coherent with the LLC cache on gen6.  As
+		 * a result, we make sure that the pinning that is about to occur is
+		 * done with uncached PTEs. This is lowest common denominator for all
+		 * chipsets.
+		 */
+		ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
+		if (ret)
+			return ret;
+	}
 
 	/* As the user may map the buffer once pinned in the display plane
 	 * (e.g. libkms for the bootup splash), we have to ensure that we
@@ -3346,6 +3419,12 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 					    old_read_domains,
 					    old_write_domain);
 
+	ret = i915_gem_object_flush_gfdt(obj, pipelined);
+	if (ret) {
+		i915_gem_object_unpin(obj);
+		return ret;
+	}
+
 	return 0;
 }
 
@@ -3505,11 +3584,11 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj,
 			return ret;
 
 		if (!dev_priv->mm.aliasing_ppgtt)
-			i915_gem_gtt_bind_object(obj, obj->cache_level);
+			i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 	}
 
 	if (!obj->has_global_gtt_mapping && map_and_fenceable)
-		i915_gem_gtt_bind_object(obj, obj->cache_level);
+		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 
 	obj->pin_count++;
 	obj->pin_mappable |= map_and_fenceable;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 94d873a..b8301d3 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -309,7 +309,7 @@ mi_set_context(struct intel_ring_buffer *ring,
 	 * itlb_before_ctx_switch.
 	 */
 	if (IS_GEN6(ring->dev) && ring->itlb_before_ctx_switch) {
-		ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, 0);
+		ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, 0, 0);
 		if (ret)
 			return ret;
 	}
@@ -371,7 +371,8 @@ static int do_switch(struct i915_hw_context *to)
 	}
 
 	if (!to->obj->has_global_gtt_mapping)
-		i915_gem_gtt_bind_object(to->obj, to->obj->cache_level);
+		i915_gem_gtt_bind_object(to->obj, to->obj->cache_level,
+					 to->obj->gfdt);
 
 	if (!to->is_initialized || is_default_context(to))
 		hw_flags |= MI_RESTORE_INHIBIT;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 6ae62ea..4ad0323 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -197,7 +197,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	    reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
 	    !target_i915_obj->has_global_gtt_mapping)) {
 		i915_gem_gtt_bind_object(target_i915_obj,
-					 target_i915_obj->cache_level);
+					 target_i915_obj->cache_level,
+					 target_i915_obj->gfdt);
 	}
 
 	/* Validate that the target is in a valid r/w GPU domain */
@@ -433,7 +434,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 	/* Ensure ppgtt mapping exists if needed */
 	if (dev_priv->mm.aliasing_ppgtt && !obj->has_aliasing_ppgtt_mapping) {
 		i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
-				       obj, obj->cache_level);
+				       obj, obj->cache_level, obj->gfdt);
 
 		obj->has_aliasing_ppgtt_mapping = 1;
 	}
@@ -450,7 +451,7 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 
 	if (entry->flags & EXEC_OBJECT_NEEDS_GTT &&
 	    !obj->has_global_gtt_mapping)
-		i915_gem_gtt_bind_object(obj, obj->cache_level);
+		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 
 	return 0;
 }
@@ -778,6 +779,8 @@ i915_gem_execbuffer_move_to_active(struct list_head *objects,
 		i915_gem_object_move_to_active(obj, ring);
 		if (obj->base.write_domain) {
 			obj->dirty = 1;
+			if (obj->gfdt)
+				obj->gfdt_dirty = 1;
 			obj->last_write_seqno = intel_ring_get_seqno(ring);
 			if (obj->pin_count) /* check for potential scanout */
 				intel_mark_fb_busy(obj);
@@ -1011,7 +1014,8 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	 * hsw should have this fixed, but let's be paranoid and do it
 	 * unconditionally for now. */
 	if (flags & I915_DISPATCH_SECURE && !batch_obj->has_global_gtt_mapping)
-		i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level);
+		i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level,
+					 batch_obj->gfdt);
 
 	ret = i915_gem_execbuffer_move_to_gpu(ring, &eb->objects);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 926a1e2..3c7e48b 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -42,15 +42,20 @@ typedef uint32_t gtt_pte_t;
 #define HSW_PTE_UNCACHED		(0)
 #define GEN6_PTE_CACHE_LLC		(2 << 1)
 #define GEN6_PTE_CACHE_LLC_MLC		(3 << 1)
+#define GEN6_PTE_GFDT			(1 << 3)
 #define GEN6_PTE_ADDR_ENCODE(addr)	GEN6_GTT_ADDR_ENCODE(addr)
 
 static inline gtt_pte_t gen6_pte_encode(struct drm_device *dev,
 					dma_addr_t addr,
-					enum i915_cache_level level)
+					enum i915_cache_level level,
+					bool gfdt)
 {
 	gtt_pte_t pte = GEN6_PTE_VALID;
 	pte |= GEN6_PTE_ADDR_ENCODE(addr);
 
+	if (gfdt && HAS_GFDT(dev))
+		pte |= GEN6_PTE_GFDT;
+
 	switch (level) {
 	case I915_CACHE_LLC_MLC:
 		/* Haswell doesn't set L3 this way */
@@ -89,7 +94,7 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
 
 	scratch_pte = gen6_pte_encode(ppgtt->dev,
 				      ppgtt->scratch_page_dma_addr,
-				      I915_CACHE_LLC);
+				      I915_CACHE_LLC, false);
 
 	while (num_entries) {
 		last_pte = first_pte + num_entries;
@@ -112,7 +117,8 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt *ppgtt,
 static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
 				      struct sg_table *pages,
 				      unsigned first_entry,
-				      enum i915_cache_level cache_level)
+				      enum i915_cache_level cache_level,
+				      bool gfdt)
 {
 	gtt_pte_t *pt_vaddr;
 	unsigned act_pd = first_entry / I915_PPGTT_PT_ENTRIES;
@@ -133,7 +139,7 @@ static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
 		for (j = first_pte; j < I915_PPGTT_PT_ENTRIES; j++) {
 			page_addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
 			pt_vaddr[j] = gen6_pte_encode(ppgtt->dev, page_addr,
-						      cache_level);
+						      cache_level, gfdt);
 
 			/* grab the next page */
 			if (++m == segment_len) {
@@ -279,11 +285,12 @@ void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev)
 
 void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
 			    struct drm_i915_gem_object *obj,
-			    enum i915_cache_level cache_level)
+			    enum i915_cache_level cache_level,
+			    bool gfdt)
 {
 	ppgtt->insert_entries(ppgtt, obj->pages,
 			      obj->gtt_space->start >> PAGE_SHIFT,
-			      cache_level);
+			      cache_level, gfdt);
 }
 
 void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
@@ -401,7 +408,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
 		i915_gem_clflush_object(obj);
-		i915_gem_gtt_bind_object(obj, obj->cache_level);
+		i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
 	}
 
 	i915_gem_chipset_flush(dev);
@@ -429,7 +436,8 @@ int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
 static void gen6_ggtt_insert_entries(struct drm_device *dev,
 				     struct sg_table *st,
 				     unsigned int first_entry,
-				     enum i915_cache_level level)
+				     enum i915_cache_level level,
+				     bool gfdt)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct scatterlist *sg = st->sgl;
@@ -443,7 +451,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
 		len = sg_dma_len(sg) >> PAGE_SHIFT;
 		for (m = 0; m < len; m++) {
 			addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
-			iowrite32(gen6_pte_encode(dev, addr, level),
+			iowrite32(gen6_pte_encode(dev, addr, level, gfdt),
 				  &gtt_entries[i]);
 			i++;
 		}
@@ -457,7 +465,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev,
 	 */
 	if (i != 0)
 		WARN_ON(readl(&gtt_entries[i-1])
-			!= gen6_pte_encode(dev, addr, level));
+			!= gen6_pte_encode(dev, addr, level, gfdt));
 
 	/* This next bit makes the above posting read even more important. We
 	 * want to flush the TLBs only after we're certain all the PTE updates
@@ -483,7 +491,7 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
 		num_entries = max_entries;
 
 	scratch_pte = gen6_pte_encode(dev, dev_priv->gtt.scratch_page_dma,
-				      I915_CACHE_LLC);
+				      I915_CACHE_LLC, false);
 	for (i = 0; i < num_entries; i++)
 		iowrite32(scratch_pte, &gtt_base[i]);
 	readl(gtt_base);
@@ -493,7 +501,8 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
 static void i915_ggtt_insert_entries(struct drm_device *dev,
 				     struct sg_table *st,
 				     unsigned int pg_start,
-				     enum i915_cache_level cache_level)
+				     enum i915_cache_level cache_level,
+				     bool gfdt)
 {
 	unsigned int flags = (cache_level == I915_CACHE_NONE) ?
 		AGP_USER_MEMORY : AGP_USER_CACHED_MEMORY;
@@ -511,14 +520,15 @@ static void i915_ggtt_clear_range(struct drm_device *dev,
 
 
 void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
-			      enum i915_cache_level cache_level)
+			      enum i915_cache_level cache_level,
+			      bool gfdt)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	dev_priv->gtt.gtt_insert_entries(dev, obj->pages,
 					 obj->gtt_space->start >> PAGE_SHIFT,
-					 cache_level);
+					 cache_level, gfdt);
 
 	obj->has_global_gtt_mapping = 1;
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 4cf3ece..6675af2 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -242,6 +242,7 @@
 #define MI_FLUSH_DW		MI_INSTR(0x26, 1) /* for GEN6 */
 #define   MI_FLUSH_DW_STORE_INDEX	(1<<21)
 #define   MI_INVALIDATE_TLB		(1<<18)
+#define   MI_SYNCHRONIZE_GFDT		(1<<17)
 #define   MI_FLUSH_DW_OP_STOREDW	(1<<14)
 #define   MI_INVALIDATE_BSD		(1<<7)
 #define   MI_FLUSH_DW_USE_GTT		(1<<2)
@@ -311,6 +312,7 @@
 #define   PIPE_CONTROL_GLOBAL_GTT_IVB			(1<<24) /* gen7+ */
 #define   PIPE_CONTROL_CS_STALL				(1<<20)
 #define   PIPE_CONTROL_TLB_INVALIDATE			(1<<18)
+#define   PIPE_CONTROL_SYNCHRONIZE_GFDT			(1<<17)
 #define   PIPE_CONTROL_QW_WRITE				(1<<14)
 #define   PIPE_CONTROL_DEPTH_STALL			(1<<13)
 #define   PIPE_CONTROL_WRITE_FLUSH			(1<<12)
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 3db4a68..ae2c7b5 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -252,14 +252,15 @@ TRACE_EVENT(i915_gem_ring_dispatch,
 );
 
 TRACE_EVENT(i915_gem_ring_flush,
-	    TP_PROTO(struct intel_ring_buffer *ring, u32 invalidate, u32 flush),
-	    TP_ARGS(ring, invalidate, flush),
+	    TP_PROTO(struct intel_ring_buffer *ring, u32 invalidate, u32 flush, u32 internal),
+	    TP_ARGS(ring, invalidate, flush, internal),
 
 	    TP_STRUCT__entry(
 			     __field(u32, dev)
 			     __field(u32, ring)
 			     __field(u32, invalidate)
 			     __field(u32, flush)
+			     __field(u32, internal)
 			     ),
 
 	    TP_fast_assign(
@@ -267,11 +268,12 @@ TRACE_EVENT(i915_gem_ring_flush,
 			   __entry->ring = ring->id;
 			   __entry->invalidate = invalidate;
 			   __entry->flush = flush;
+			   __entry->internal = internal;
 			   ),
 
-	    TP_printk("dev=%u, ring=%x, invalidate=%04x, flush=%04x",
+	    TP_printk("dev=%u, ring=%x, invalidate=%04x, flush=%04x, internal=%08x",
 		      __entry->dev, __entry->ring,
-		      __entry->invalidate, __entry->flush)
+		      __entry->invalidate, __entry->flush, __entry->internal)
 );
 
 DECLARE_EVENT_CLASS(i915_gem_request,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 1d5d613..9b1ae4d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -54,7 +54,8 @@ static inline int ring_space(struct intel_ring_buffer *ring)
 static int
 gen2_render_ring_flush(struct intel_ring_buffer *ring,
 		       u32	invalidate_domains,
-		       u32	flush_domains)
+		       u32	flush_domains,
+		       u32	internal)
 {
 	u32 cmd;
 	int ret;
@@ -80,7 +81,8 @@ gen2_render_ring_flush(struct intel_ring_buffer *ring,
 static int
 gen4_render_ring_flush(struct intel_ring_buffer *ring,
 		       u32	invalidate_domains,
-		       u32	flush_domains)
+		       u32	flush_domains,
+		       u32	internal)
 {
 	struct drm_device *dev = ring->dev;
 	u32 cmd;
@@ -210,7 +212,9 @@ intel_emit_post_sync_nonzero_flush(struct intel_ring_buffer *ring)
 
 static int
 gen6_render_ring_flush(struct intel_ring_buffer *ring,
-                         u32 invalidate_domains, u32 flush_domains)
+		       u32 invalidate_domains,
+		       u32 flush_domains,
+		       u32 internal)
 {
 	u32 flags = 0;
 	struct pipe_control *pc = ring->private;
@@ -248,6 +252,12 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
 		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 	}
 
+	/* Flush GFDT out to memory */
+	if (internal & I915_FLUSH_GFDT) {
+		flags |= PIPE_CONTROL_QW_WRITE;
+		flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
+	}
+
 	ret = intel_ring_begin(ring, 4);
 	if (ret)
 		return ret;
@@ -282,7 +292,9 @@ gen7_render_ring_cs_stall_wa(struct intel_ring_buffer *ring)
 
 static int
 gen7_render_ring_flush(struct intel_ring_buffer *ring,
-		       u32 invalidate_domains, u32 flush_domains)
+		       u32 invalidate_domains,
+		       u32 flush_domains,
+		       u32 internal)
 {
 	u32 flags = 0;
 	struct pipe_control *pc = ring->private;
@@ -325,6 +337,12 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
 		 * invalidate bit set. */
 		gen7_render_ring_cs_stall_wa(ring);
 	}
+	/* Flush GFDT out to memory */
+	if (internal & I915_FLUSH_GFDT) {
+		flags |= PIPE_CONTROL_QW_WRITE;
+		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
+		flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
+	}
 
 	ret = intel_ring_begin(ring, 4);
 	if (ret)
@@ -912,8 +930,9 @@ void intel_ring_setup_status_page(struct intel_ring_buffer *ring)
 
 static int
 bsd_ring_flush(struct intel_ring_buffer *ring,
-	       u32     invalidate_domains,
-	       u32     flush_domains)
+	       u32	invalidate_domains,
+	       u32	flush_domains,
+	       u32	internal)
 {
 	int ret;
 
@@ -1547,7 +1566,9 @@ static void gen6_bsd_ring_write_tail(struct intel_ring_buffer *ring,
 }
 
 static int gen6_ring_flush(struct intel_ring_buffer *ring,
-			   u32 invalidate, u32 flush)
+			   u32 invalidate,
+			   u32 flush,
+			   u32 internal)
 {
 	uint32_t cmd;
 	int ret;
@@ -1557,6 +1578,12 @@ static int gen6_ring_flush(struct intel_ring_buffer *ring,
 		return ret;
 
 	cmd = MI_FLUSH_DW;
+
+	/* Flush GFDT out to memory */
+	if (internal & I915_FLUSH_GFDT)
+		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
+			MI_SYNCHRONIZE_GFDT;
+
 	/*
 	 * Bspec vol 1c.5 - video engine command streamer:
 	 * "If ENABLED, all TLBs will be invalidated once the flush
@@ -1619,7 +1646,9 @@ gen6_ring_dispatch_execbuffer(struct intel_ring_buffer *ring,
 /* Blitter support (SandyBridge+) */
 
 static int blt_ring_flush(struct intel_ring_buffer *ring,
-			  u32 invalidate, u32 flush)
+			  u32 invalidate,
+			  u32 flush,
+			  u32 internal)
 {
 	uint32_t cmd;
 	int ret;
@@ -1629,6 +1658,12 @@ static int blt_ring_flush(struct intel_ring_buffer *ring,
 		return ret;
 
 	cmd = MI_FLUSH_DW;
+
+	/* Flush GFDT out to memory */
+	if (internal & I915_FLUSH_GFDT)
+		cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
+			MI_SYNCHRONIZE_GFDT;
+
 	/*
 	 * Bspec vol 1c.3 - blitter engine command streamer:
 	 * "If ENABLED, all TLBs will be invalidated once the flush
@@ -1889,11 +1924,31 @@ intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
 	if (!ring->gpu_caches_dirty)
 		return 0;
 
-	ret = ring->flush(ring, 0, I915_GEM_GPU_DOMAINS);
+	ret = ring->flush(ring, 0, I915_GEM_GPU_DOMAINS, 0);
+	if (ret)
+		return ret;
+
+	trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS, 0);
+
+	ring->gpu_caches_dirty = false;
+	return 0;
+}
+
+int
+intel_ring_flush_internal(struct intel_ring_buffer *ring, u32 internal)
+{
+	uint32_t flush_domains;
+	int ret;
+
+	flush_domains = 0;
+	if (ring->gpu_caches_dirty)
+		flush_domains = I915_GEM_GPU_DOMAINS;
+
+	ret = ring->flush(ring, 0, flush_domains, internal);
 	if (ret)
 		return ret;
 
-	trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
+	trace_i915_gem_ring_flush(ring, 0, flush_domains, internal);
 
 	ring->gpu_caches_dirty = false;
 	return 0;
@@ -1909,11 +1964,11 @@ intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring)
 	if (ring->gpu_caches_dirty)
 		flush_domains = I915_GEM_GPU_DOMAINS;
 
-	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+	ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, flush_domains, 0);
 	if (ret)
 		return ret;
 
-	trace_i915_gem_ring_flush(ring, I915_GEM_GPU_DOMAINS, flush_domains);
+	trace_i915_gem_ring_flush(ring, I915_GEM_GPU_DOMAINS, flush_domains, 0);
 
 	ring->gpu_caches_dirty = false;
 	return 0;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d66208c..b9aa76e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -80,7 +80,8 @@ struct  intel_ring_buffer {
 				      u32 value);
 	int __must_check (*flush)(struct intel_ring_buffer *ring,
 				  u32	invalidate_domains,
-				  u32	flush_domains);
+				  u32	flush_domains,
+				  u32	internal);
 	int		(*add_request)(struct intel_ring_buffer *ring);
 	/* Some chipsets are not quite as coherent as advertised and need
 	 * an expensive kick to force a true read of the up-to-date seqno.
@@ -220,6 +221,8 @@ int __must_check intel_ring_idle(struct intel_ring_buffer *ring);
 void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno);
 int intel_ring_flush_all_caches(struct intel_ring_buffer *ring);
 int intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring);
+int intel_ring_flush_internal(struct intel_ring_buffer *ring, u32 internal);
+#define I915_FLUSH_GFDT 0x1
 
 int intel_init_render_ring_buffer(struct drm_device *dev);
 int intel_init_bsd_ring_buffer(struct drm_device *dev);
-- 
1.7.10.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/i915: GFDT support for SNB/IVB
  2013-03-06 16:28 ` Chris Wilson
@ 2013-04-04  9:01   ` Daniel Vetter
  2013-04-04 11:27     ` Ville Syrjälä
  0 siblings, 1 reply; 10+ messages in thread
From: Daniel Vetter @ 2013-04-04  9:01 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Mar 06, 2013 at 04:28:09PM +0000, Chris Wilson wrote:
> From: Ville Syrjälä <ville.syrjala@linux.intel.com>
> 
> Currently all scanout buffers must be uncached because the
> display controller doesn't snoop the LLC. SNB introduced another
> method to guarantee coherency for the display controller. It's
> called the GFDT or graphics data type.
> 
> Pages that have the GFDT bit enabled in their PTEs get flushed
> all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
> issued with the "synchronize GFDT" bit set.
> 
> So rather than making all scanout buffers uncached, set the GFDT
> bit in their PTEs, and modify the ring flush functions to enable
> the "synchronize GFDT" bit.
> 
> On HSW the GFDT bit was removed from the PTE, and it's only present in
> surface state, so we can't really set it from the kernel. Also the docs
> state that the hardware isn't actually guaranteed to respect the GFDT
> bit. So it looks like GFDT isn't all that useful on HSW.
> 
> So far I've tried this very quickly on an IVB machine, and
> it seems to be working as advertised. No idea if it does any
> good though.
> 
> On an i5-2520m (laptop) running gnome-shell at 1366x768:
>   padman 		140.78 -> 145.98 fps
>   openarena 		183.72 -> 186.87 fps
>   gtkperf ComboBoxEntry	20.27 -> 22.14s
>   gtkperf pixbuf	 1.12 ->  1.47s
>   x11perf -aa10text	13.40 -> 13.20 Mglyphs
> which are well within the throttling noise.
> 
> v2 [ickle]: adapt to comply with existing userspace guarantees
> 
> Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Oops, this one here somehow fell a bit through the cracks.

Two bikesheds and one real issue:
- s/bool gfdt/unsigned caching_flags/ for the gtt pte punchers. I expect
  more fun to come here ;-)
- ring->flush has a pile of arguments, I guess we should coalesce
  invalidate/flush and the new internal into a simple flags array. I don't
  expect that we'll every switch invalidate/flush back to domain arrays
  from the current "it's just a bool, really" usage.

Also, the above should be done in separate prep patches imo.

The real deal is flushing cpu access, since that probably does not set the
gfdt flag. So cpu reads are fine and don't require any flushing, but cpu
writes must be clflushed I think. In a way gfdt works as if the gpu is
doing write-through caching (safe that we have to manually initiate the
write-through with the gfdt flush). But from the pov of cpu access it's
the same, and hence requires the same amount of clflushing.

Hence I'm leaning towards adding a new I915_CACHING_WT cache_level. Ofc
for gfdt we still need to keep track of the gfdt_dirty state, but all the
cpu side flushing business should be much clearer.

Stumbled over this while checking whether sw_finish_ioctl would still do
the right thing, and noticed that the clflush is a noop when gfdt is
treated like the current CACHE_LLC.

Cheers, Daniel

PS: Ben Widawsky noticed that on ivb we set a stupid ppgtt pte caching
override, which results in pte cachelines being tagged with gfdt. Hsw has
an equivalent mode to allow the gpu to write through. I have no idea
what's the point of this and we never write ptes from the gpu. So can you
please throw the relevant patch on top to disable gfdt for ppgtt ptes in
ECOCHK?
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/i915: GFDT support for SNB/IVB
  2013-04-04  9:01   ` Daniel Vetter
@ 2013-04-04 11:27     ` Ville Syrjälä
  2013-04-04 12:17       ` Chris Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Ville Syrjälä @ 2013-04-04 11:27 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Thu, Apr 04, 2013 at 11:01:02AM +0200, Daniel Vetter wrote:
> On Wed, Mar 06, 2013 at 04:28:09PM +0000, Chris Wilson wrote:
> > From: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > 
> > Currently all scanout buffers must be uncached because the
> > display controller doesn't snoop the LLC. SNB introduced another
> > method to guarantee coherency for the display controller. It's
> > called the GFDT or graphics data type.
> > 
> > Pages that have the GFDT bit enabled in their PTEs get flushed
> > all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
> > issued with the "synchronize GFDT" bit set.
> > 
> > So rather than making all scanout buffers uncached, set the GFDT
> > bit in their PTEs, and modify the ring flush functions to enable
> > the "synchronize GFDT" bit.
> > 
> > On HSW the GFDT bit was removed from the PTE, and it's only present in
> > surface state, so we can't really set it from the kernel. Also the docs
> > state that the hardware isn't actually guaranteed to respect the GFDT
> > bit. So it looks like GFDT isn't all that useful on HSW.
> > 
> > So far I've tried this very quickly on an IVB machine, and
> > it seems to be working as advertised. No idea if it does any
> > good though.
> > 
> > On an i5-2520m (laptop) running gnome-shell at 1366x768:
> >   padman 		140.78 -> 145.98 fps
> >   openarena 		183.72 -> 186.87 fps
> >   gtkperf ComboBoxEntry	20.27 -> 22.14s
> >   gtkperf pixbuf	 1.12 ->  1.47s
> >   x11perf -aa10text	13.40 -> 13.20 Mglyphs
> > which are well within the throttling noise.
> > 
> > v2 [ickle]: adapt to comply with existing userspace guarantees
> > 
> > Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Oops, this one here somehow fell a bit through the cracks.
> 
> Two bikesheds and one real issue:
> - s/bool gfdt/unsigned caching_flags/ for the gtt pte punchers. I expect
>   more fun to come here ;-)
> - ring->flush has a pile of arguments, I guess we should coalesce
>   invalidate/flush and the new internal into a simple flags array. I don't
>   expect that we'll every switch invalidate/flush back to domain arrays
>   from the current "it's just a bool, really" usage.
> 
> Also, the above should be done in separate prep patches imo.
> 
> The real deal is flushing cpu access, since that probably does not set the
> gfdt flag. So cpu reads are fine and don't require any flushing, but cpu
> writes must be clflushed I think. In a way gfdt works as if the gpu is
> doing write-through caching (safe that we have to manually initiate the
> write-through with the gfdt flush). But from the pov of cpu access it's
> the same, and hence requires the same amount of clflushing.
> 
> Hence I'm leaning towards adding a new I915_CACHING_WT cache_level. Ofc
> for gfdt we still need to keep track of the gfdt_dirty state, but all the
> cpu side flushing business should be much clearer.
> 
> Stumbled over this while checking whether sw_finish_ioctl would still do
> the right thing, and noticed that the clflush is a noop when gfdt is
> treated like the current CACHE_LLC.

My original patch had a change to clflushing where pinned objects w/ gfdt
were also flushed. I didn't really read v2 well enough to figure out why
that got dropped.

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] drm/i915: GFDT support for SNB/IVB
  2013-04-04 11:27     ` Ville Syrjälä
@ 2013-04-04 12:17       ` Chris Wilson
  0 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2013-04-04 12:17 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: intel-gfx

On Thu, Apr 04, 2013 at 02:27:12PM +0300, Ville Syrjälä wrote:
> On Thu, Apr 04, 2013 at 11:01:02AM +0200, Daniel Vetter wrote:
> > On Wed, Mar 06, 2013 at 04:28:09PM +0000, Chris Wilson wrote:
> > > From: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > > 
> > > Currently all scanout buffers must be uncached because the
> > > display controller doesn't snoop the LLC. SNB introduced another
> > > method to guarantee coherency for the display controller. It's
> > > called the GFDT or graphics data type.
> > > 
> > > Pages that have the GFDT bit enabled in their PTEs get flushed
> > > all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
> > > issued with the "synchronize GFDT" bit set.
> > > 
> > > So rather than making all scanout buffers uncached, set the GFDT
> > > bit in their PTEs, and modify the ring flush functions to enable
> > > the "synchronize GFDT" bit.
> > > 
> > > On HSW the GFDT bit was removed from the PTE, and it's only present in
> > > surface state, so we can't really set it from the kernel. Also the docs
> > > state that the hardware isn't actually guaranteed to respect the GFDT
> > > bit. So it looks like GFDT isn't all that useful on HSW.
> > > 
> > > So far I've tried this very quickly on an IVB machine, and
> > > it seems to be working as advertised. No idea if it does any
> > > good though.
> > > 
> > > On an i5-2520m (laptop) running gnome-shell at 1366x768:
> > >   padman 		140.78 -> 145.98 fps
> > >   openarena 		183.72 -> 186.87 fps
> > >   gtkperf ComboBoxEntry	20.27 -> 22.14s
> > >   gtkperf pixbuf	 1.12 ->  1.47s
> > >   x11perf -aa10text	13.40 -> 13.20 Mglyphs
> > > which are well within the throttling noise.
> > > 
> > > v2 [ickle]: adapt to comply with existing userspace guarantees
> > > 
> > > Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > Oops, this one here somehow fell a bit through the cracks.
> > 
> > Two bikesheds and one real issue:
> > - s/bool gfdt/unsigned caching_flags/ for the gtt pte punchers. I expect
> >   more fun to come here ;-)
> > - ring->flush has a pile of arguments, I guess we should coalesce
> >   invalidate/flush and the new internal into a simple flags array. I don't
> >   expect that we'll every switch invalidate/flush back to domain arrays
> >   from the current "it's just a bool, really" usage.
> > 
> > Also, the above should be done in separate prep patches imo.
> > 
> > The real deal is flushing cpu access, since that probably does not set the
> > gfdt flag. So cpu reads are fine and don't require any flushing, but cpu
> > writes must be clflushed I think. In a way gfdt works as if the gpu is
> > doing write-through caching (safe that we have to manually initiate the
> > write-through with the gfdt flush). But from the pov of cpu access it's
> > the same, and hence requires the same amount of clflushing.
> > 
> > Hence I'm leaning towards adding a new I915_CACHING_WT cache_level. Ofc
> > for gfdt we still need to keep track of the gfdt_dirty state, but all the
> > cpu side flushing business should be much clearer.
> > 
> > Stumbled over this while checking whether sw_finish_ioctl would still do
> > the right thing, and noticed that the clflush is a noop when gfdt is
> > treated like the current CACHE_LLC.
> 
> My original patch had a change to clflushing where pinned objects w/ gfdt
> were also flushed. I didn't really read v2 well enough to figure out why
> that got dropped.

Because it was the wrong approach. The API where the scanout is to be
made coherent with CPU mmaps is sw_finish. The only user of that is early
pre-gen5 DDX. So imo the original patch was abusing a generic flag and
applying the flushes at the wrong points, and I simply didn't care about
badly designed API such as sw_finish.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2013-04-04 12:17 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-03-01 18:32 [PATCH] drm/i915: GFDT support for SNB/IVB ville.syrjala
2013-03-03 16:28 ` Daniel Vetter
2013-03-03 17:39   ` Chris Wilson
2013-03-04 13:51     ` Ville Syrjälä
2013-03-04 14:01       ` Chris Wilson
2013-03-04 14:03   ` Ville Syrjälä
2013-03-06 16:28 ` Chris Wilson
2013-04-04  9:01   ` Daniel Vetter
2013-04-04 11:27     ` Ville Syrjälä
2013-04-04 12:17       ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.