Re: [PATCH 8/8] drm/i915: Refactor execbuffer relocation writing

From: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH 8/8] drm/i915: Refactor execbuffer relocation writing
Date: Thu, 09 Jun 2016 16:31:25 +0300	[thread overview]
Message-ID: <1465479085.9670.40.camel@linux.intel.com> (raw)
In-Reply-To: <1465471779-20765-9-git-send-email-chris@chris-wilson.co.uk>

On to, 2016-06-09 at 12:29 +0100, Chris Wilson wrote:
> With in the introduction of the reloc page cache, we are just one step
> away from refactoring the relocation write functions into one. Not only
> does it tidy the code (slightly), but it greatly simplifies the control
> logic much to gcc's satisfaction.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 289 +++++++++++++++--------------
>  1 file changed, 150 insertions(+), 139 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 318c71b663f4..bda8ec8b02e6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -34,6 +34,8 @@
>  #include 
>  #include 
>  
> +#define DBG_USE_CPU_RELOC 0 /* force relocations to use the CPU write path */

Better connect to the new debug Kconfig menu you introduced?

> +
>  #define  __EXEC_OBJECT_HAS_PIN (1U<<31)
>  #define  __EXEC_OBJECT_HAS_FENCE (1U<<30)
>  #define  __EXEC_OBJECT_NEEDS_MAP (1U<<29)
> @@ -53,6 +55,7 @@ struct i915_execbuffer_params {
>  };
>  
>  struct eb_vmas {
> +	struct drm_i915_private *i915;
>  	struct list_head vmas;
>  	int and;
>  	union {
> @@ -62,7 +65,8 @@ struct eb_vmas {
>  };
>  
>  static struct eb_vmas *
> -eb_create(struct drm_i915_gem_execbuffer2 *args)
> +eb_create(struct drm_i915_private *i915,
> +	  struct drm_i915_gem_execbuffer2 *args)
>  {
>  	struct eb_vmas *eb = NULL;
>  
> @@ -89,6 +93,7 @@ eb_create(struct drm_i915_gem_execbuffer2 *args)
>  	} else
>  		eb->and = -args->buffer_count;
>  
> +	eb->i915 = i915;
>  	INIT_LIST_HEAD(&eb->vmas);
>  	return eb;
>  }
> @@ -272,7 +277,8 @@ static void eb_destroy(struct eb_vmas *eb)
>  
>  static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  {
> -	return (HAS_LLC(obj->base.dev) ||
> +	return (DBG_USE_CPU_RELOC ||
> +		HAS_LLC(obj->base.dev) ||
>  		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
> @@ -296,32 +302,58 @@ static inline uint64_t gen8_noncanonical_addr(uint64_t address)
>  }
>  
>  static inline uint64_t
> -relocation_target(struct drm_i915_gem_relocation_entry *reloc,
> +relocation_target(const struct drm_i915_gem_relocation_entry *reloc,

These const changes could be a bunch instead of sprinkled around.
Unless we want to smuggle them in through the resistance.

>  		  uint64_t target_offset)
>  {
>  	return gen8_canonical_addr((int)reloc->delta + target_offset);
>  }
>  
>  struct reloc_cache {
> -	void *vaddr;
> +	struct drm_i915_private *i915;
> +	unsigned long vaddr;
>  	unsigned page;
> -	enum { KMAP, IOMAP } type;
> +	struct drm_mm_node node;
> +	bool use_64bit_reloc;
>  };
>  
> -static void reloc_cache_init(struct reloc_cache *cache)
> +static void reloc_cache_init(struct reloc_cache *cache,
> +			     struct drm_i915_private *i915)
>  {
>  	cache->page = -1;
> -	cache->vaddr = NULL;
> +	cache->vaddr = 0;
> +	cache->i915 = i915;
> +	cache->use_64bit_reloc = INTEL_GEN(cache->i915) >= 8;
> +}
> +
> +static inline void *unmask_page(unsigned long p)
> +{
> +	return (void *)(uintptr_t)(p & PAGE_MASK);
>  }
>  
> +static inline unsigned unmask_flags(unsigned long p)
> +{
> +	return p & ~PAGE_MASK;
> +}
> +
> +#define KMAP 0x4
> +
>  static void reloc_cache_fini(struct reloc_cache *cache)
>  {
> -	if (cache->vaddr == NULL)
> +	void *vaddr;
> +
> +	if (cache->vaddr == 0)
>  		return;
>  
> -	switch (cache->type) {
> -	case KMAP: kunmap_atomic(cache->vaddr); break;
> -	case IOMAP: io_mapping_unmap_atomic(cache->vaddr); break;
> +	vaddr = unmask_page(cache->vaddr);
> +	if (cache->vaddr & KMAP) {
> +		if (cache->vaddr & CLFLUSH_AFTER)
> +			mb();
> +
> +		kunmap_atomic(vaddr);
> +		i915_gem_object_unpin_pages((struct drm_i915_gem_object *)cache->node.mm);

Rather long line.

> +	} else {
> +		io_mapping_unmap_atomic(vaddr);
> +		i915_vma_unpin((struct i915_vma *)cache->node.mm);
>  	}
>  }
>  
> @@ -329,148 +361,138 @@ static void *reloc_kmap(struct drm_i915_gem_object *obj,
>  			struct reloc_cache *cache,
>  			int page)
>  {
> -	if (cache->page == page)
> -		return cache->vaddr;
> +	void *vaddr;
>  
> -	if (cache->vaddr)
> -		kunmap_atomic(cache->vaddr);
> +	if (cache->vaddr) {
> +		kunmap_atomic(unmask_page(cache->vaddr));
> +	} else {
> +		unsigned flushes;
> +		int ret;
> +
> +		ret = i915_gem_obj_prepare_shmem_write(obj, &flushes);

Yeah, needs_clflush is so bad name earlier in the series, that even you
don't use it. "need_clflushes" or anything is better.

> +		if (ret)
> +			return ERR_PTR(ret);
>  
> +		cache->vaddr = flushes | KMAP;
> +		cache->node.mm = (void *)obj;
> +		if (flushes)
> +			mb();
> +	}
> +
> +	vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
> +	cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr;
>  	cache->page = page;
> -	cache->vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
> -	cache->type = KMAP;
>  
> -	return cache->vaddr;
> +	return vaddr;
>  }
>  
> -static int
> -relocate_entry_cpu(struct drm_i915_gem_object *obj,
> -		   struct drm_i915_gem_relocation_entry *reloc,
> -		   struct reloc_cache *cache,
> -		   uint64_t target_offset)
> +static void *reloc_iomap(struct drm_i915_gem_object *obj,
> +			 struct reloc_cache *cache,
> +			 int page)
>  {
> -	struct drm_device *dev = obj->base.dev;
> -	uint32_t page_offset = offset_in_page(reloc->offset);
> -	uint64_t delta = relocation_target(reloc, target_offset);
> -	char *vaddr;
> -	int ret;
> +	void *vaddr;
>  
> -	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> -	if (ret)
> -		return ret;
> +	if (cache->vaddr) {
> +		io_mapping_unmap_atomic(unmask_page(cache->vaddr));
> +	} else {
> +		struct i915_vma *vma;
> +		int ret;
>  
> -	vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
> -	*(uint32_t *)(vaddr + page_offset) = lower_32_bits(delta);
> +		if (use_cpu_reloc(obj))
> +			return NULL;
>  
> -	if (INTEL_GEN(dev) >= 8) {
> -		page_offset += sizeof(uint32_t);
> -		if (page_offset == PAGE_SIZE) {
> -			vaddr = reloc_kmap(obj, cache, cache->page + 1);
> -			page_offset = 0;
> -		}
> -		*(uint32_t *)(vaddr + page_offset) = upper_32_bits(delta);
> -	}
> +		vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
> +					       PIN_MAPPABLE | PIN_NONBLOCK);
> +		if (IS_ERR(vma))
> +			return NULL;
>  
> -	return 0;
> -}

Ugh, did you simultaneously move and rename functions. This is rather
hard to follow from this point on...

Regards, Joonas

> +		ret = i915_gem_object_set_to_gtt_domain(obj, true);
> +		if (ret)
> +			return ERR_PTR(ret);
>  
> -static void *reloc_iomap(struct drm_i915_private *i915,
> -			 struct reloc_cache *cache,
> -			 uint64_t offset)
> -{
> -	if (cache->page == offset >> PAGE_SHIFT)
> -		return cache->vaddr;
> +		ret = i915_gem_object_put_fence(obj);
> +		if (ret)
> +			return ERR_PTR(ret);
>  
> -	if (cache->vaddr)
> -		io_mapping_unmap_atomic(cache->vaddr);
> +		cache->node.start = vma->node.start;
> +		cache->node.mm = (void *)vma;
> +	}
>  
> -	cache->page = offset >> PAGE_SHIFT;
> -	cache->vaddr =
> -		io_mapping_map_atomic_wc(i915->ggtt.mappable,
> -					 offset & PAGE_MASK);
> -	cache->type = IOMAP;
> +	vaddr = io_mapping_map_atomic_wc(cache->i915->ggtt.mappable,
> +					 cache->node.start + (page << PAGE_SHIFT));
> +	cache->page = page;
> +	cache->vaddr = (unsigned long)vaddr;
>  
> -	return cache->vaddr;
> +	return vaddr;
>  }
>  
> -static int
> -relocate_entry_gtt(struct drm_i915_gem_object *obj,
> -		   struct drm_i915_gem_relocation_entry *reloc,
> -		   struct reloc_cache *cache,
> -		   uint64_t target_offset)
> +static void *reloc_vaddr(struct drm_i915_gem_object *obj,
> +			 struct reloc_cache *cache,
> +			 int page)
>  {
> -	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
> -	struct i915_vma *vma;
> -	uint64_t delta = relocation_target(reloc, target_offset);
> -	uint64_t offset;
> -	void __iomem *reloc_page;
> -	int ret;
> -
> -	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE);
> -	if (IS_ERR(vma))
> -		return PTR_ERR(vma);
> -
> -	ret = i915_gem_object_set_to_gtt_domain(obj, true);
> -	if (ret)
> -		goto unpin;
> -
> -	ret = i915_gem_object_put_fence(obj);
> -	if (ret)
> -		goto unpin;
> -
> -	/* Map the page containing the relocation we're going to perform.  */
> -	offset = vma->node.start;
> -	offset += reloc->offset;
> -	reloc_page = reloc_iomap(dev_priv, cache, offset);
> -	iowrite32(lower_32_bits(delta), reloc_page + offset_in_page(offset));
> +	void *vaddr;
>  
> -	if (INTEL_GEN(dev_priv) >= 8) {
> -		offset += sizeof(uint32_t);
> -		if (offset_in_page(offset) == 0)
> -			reloc_page = reloc_iomap(dev_priv, cache, offset);
> -		iowrite32(upper_32_bits(delta),
> -			  reloc_page + offset_in_page(offset));
> +	if (cache->page == page) {
> +		vaddr = unmask_page(cache->vaddr);
> +	} else {
> +		vaddr = NULL;
> +		if ((cache->vaddr & KMAP) == 0)
> +			vaddr = reloc_iomap(obj, cache, page);
> +		if (vaddr == NULL)
> +			vaddr = reloc_kmap(obj, cache, page);
>  	}
>  
> -unpin:
> -	__i915_vma_unpin(vma);
> -	return ret;
> +	return vaddr;
>  }
>  
>  static void
> -clflush_write32(void *addr, uint32_t value)
> +clflush_write32(uint32_t *addr, uint32_t value, unsigned flushes)
>  {
> -	/* This is not a fast path, so KISS. */
> -	drm_clflush_virt_range(addr, sizeof(uint32_t));
> -	*(uint32_t *)addr = value;
> -	drm_clflush_virt_range(addr, sizeof(uint32_t));
> +	if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
> +		if (flushes & CLFLUSH_BEFORE) {
> +			clflushopt(addr);
> +			mb();
> +		}
> +
> +		*addr = value;
> +
> +		/* Writes to the same cacheline are serialised by the CPU
> +		 * (including clflush). On the write path, we only require
> +		 * that it hits memory in an orderly fashion and place
> +		 * mb barriers at the start and end of the relocation phase
> +		 * to ensure ordering of clflush wrt to the system.
> +		 */
> +		if (flushes & CLFLUSH_AFTER)
> +			clflushopt(addr);
> +	} else
> +		*addr = value;
>  }
>  
>  static int
> -relocate_entry_clflush(struct drm_i915_gem_object *obj,
> -		       struct drm_i915_gem_relocation_entry *reloc,
> -		       struct reloc_cache *cache,
> -		       uint64_t target_offset)
> +relocate_entry(struct drm_i915_gem_object *obj,
> +	       const struct drm_i915_gem_relocation_entry *reloc,
> +	       struct reloc_cache *cache,
> +	       uint64_t target_offset)
>  {
> -	struct drm_device *dev = obj->base.dev;
> -	uint32_t page_offset = offset_in_page(reloc->offset);
> -	uint64_t delta = relocation_target(reloc, target_offset);
> -	char *vaddr;
> -	int ret;
> +	uint64_t offset = reloc->offset;
> +	bool wide = cache->use_64bit_reloc;
> +	void *vaddr;
>  
> -	ret = i915_gem_object_set_to_gtt_domain(obj, true);
> -	if (ret)
> -		return ret;
> +	target_offset = relocation_target(reloc, target_offset);
> +repeat:
> +	vaddr = reloc_vaddr(obj, cache, offset >> PAGE_SHIFT);
> +	if (IS_ERR(vaddr))
> +		return PTR_ERR(vaddr);
>  
> -	vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
> -	clflush_write32(vaddr + page_offset, lower_32_bits(delta));
> +	clflush_write32(vaddr + offset_in_page(offset),
> +			lower_32_bits(target_offset),
> +			cache->vaddr);
>  
> -	if (INTEL_GEN(dev) >= 8) {
> -		page_offset += sizeof(uint32_t);
> -		if (page_offset == PAGE_SIZE) {
> -			vaddr = reloc_kmap(obj, cache, cache->page + 1);
> -			page_offset = 0;
> -		}
> -		clflush_write32(vaddr + page_offset, upper_32_bits(delta));
> +	if (wide) {
> +		offset += sizeof(uint32_t);
> +		target_offset >>= 32;
> +		wide = false;
> +		goto repeat;
>  	}
>  
>  	return 0;
> @@ -557,7 +579,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  
>  	/* Check that the relocation address is valid... */
>  	if (unlikely(reloc->offset >
> -		obj->base.size - (INTEL_INFO(dev)->gen >= 8 ? 8 : 4))) {
> +		     obj->base.size - (cache->use_64bit_reloc ? 8 : 4))) {
>  		DRM_DEBUG("Relocation beyond object bounds: "
>  			  "obj %p target %d offset %d size %d.\n",
>  			  obj, reloc->target_handle,
> @@ -577,23 +599,12 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  	if (pagefault_disabled() && !object_is_idle(obj))
>  		return -EFAULT;
>  
> -	if (use_cpu_reloc(obj))
> -		ret = relocate_entry_cpu(obj, reloc, cache, target_offset);
> -	else if (obj->map_and_fenceable)
> -		ret = relocate_entry_gtt(obj, reloc, cache, target_offset);
> -	else if (static_cpu_has(X86_FEATURE_CLFLUSH))
> -		ret = relocate_entry_clflush(obj, reloc, cache, target_offset);
> -	else {
> -		WARN_ONCE(1, "Impossible case in relocation handling\n");
> -		ret = -ENODEV;
> -	}
> -
> +	ret = relocate_entry(obj, reloc, cache, target_offset);
>  	if (ret)
>  		return ret;
>  
>  	/* and update the user's relocation entry */
>  	reloc->presumed_offset = target_offset;
> -
>  	return 0;
>  }
>  
> @@ -609,7 +620,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
>  	int remain, ret = 0;
>  
>  	user_relocs = u64_to_user_ptr(entry->relocs_ptr);
> -	reloc_cache_init(&cache);
> +	reloc_cache_init(&cache, eb->i915);
>  
>  	remain = entry->relocation_count;
>  	while (remain) {
> @@ -658,7 +669,7 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
>  	struct reloc_cache cache;
>  	int i, ret = 0;
>  
> -	reloc_cache_init(&cache);
> +	reloc_cache_init(&cache, eb->i915);
>  	for (i = 0; i < entry->relocation_count; i++) {
>  		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache);
>  		if (ret)
> @@ -1073,8 +1084,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>  	if (flush_chipset)
>  		i915_gem_chipset_flush(req->engine->i915);
>  
> -	if (flush_domains & I915_GEM_DOMAIN_GTT)
> -		wmb();
> +	/* Make sure (untracked) CPU relocs/parsing are flushed */
> +	wmb();
>  
>  	/* Unconditionally invalidate gpu caches and TLBs. */
>  	return req->engine->emit_flush(req, I915_GEM_GPU_DOMAINS, 0);
> @@ -1606,7 +1617,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  
>  	memset(¶ms_master, 0x00, sizeof(params_master));
>  
> -	eb = eb_create(args);
> +	eb = eb_create(dev_priv, args);
>  	if (eb == NULL) {
>  		i915_gem_context_put(ctx);
>  		mutex_unlock(&dev->struct_mutex);
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx