Re: [PATCH 2/5] drm/i915: Fallback to using CPU relocations for large batch buffers

From: Daniel Vetter <daniel@ffwll.ch>
To: Chris Wilson <chris@chris-wilson.co.uk>
Cc: intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/5] drm/i915: Fallback to using CPU relocations for large batch buffers
Date: Tue, 27 Jan 2015 16:09:37 +0100	[thread overview]
Message-ID: <20150127150937.GD4764@phenom.ffwll.local> (raw)
In-Reply-To: <1421234459-21424-2-git-send-email-chris@chris-wilson.co.uk>

On Wed, Jan 14, 2015 at 11:20:56AM +0000, Chris Wilson wrote:
> If the batch buffer is too large to fit into the aperture and we need a
> GTT mapping for relocations, we currently fail. This only applies to a
> subset of machines for a subset of environments, quite undesirable. We
> can simply check after failing to insert the batch into the GTT as to
> whether we only need a mappable binding for relocation and, if so, we can
> revert to using a non-mappable binding and an alternate relocation
> method. However, using relocate_entry_cpu() is excruciatingly slow for
> large buffers on non-LLC as the entire buffer requires clflushing before
> and after the relocation handling. Alternatively, we can implement a
> third relocation method that only clflushes around the relocation entry.
> This is still slower than updating through the GTT, so we prefer using
> the GTT where possible, but is orders of magnitude faster as we
> typically do not have to then clflush the entire buffer.
> 
> An alternative idea of using a temporary WC mapping of the backing store
> is promising (it should be faster than using the GTT itself), but
> requires fairly extensive arch/x86 support - along the lines of
> kmap_atomic_prof_pfn() (which is not universally implemented even for
> x86).
> 
> Testcase: igt/gem_exec_big #pnv,byt
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=88392
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 84 +++++++++++++++++++++++++-----
>  1 file changed, 72 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index e3ef17783765..06bdf7670584 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -251,7 +251,6 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  {
>  	return (HAS_LLC(obj->base.dev) ||
>  		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> -		!obj->map_and_fenceable ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
>  
> @@ -337,6 +336,51 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
>  	return 0;
>  }
>  
> +static void
> +clflush_write32(void *addr, uint32_t value)
> +{
> +	/* This is not a fast path, so KISS. */
> +	drm_clflush_virt_range(addr, sizeof(uint32_t));
> +	*(uint32_t *)addr = value;
> +	drm_clflush_virt_range(addr, sizeof(uint32_t));
> +}
> +
> +static int
> +relocate_entry_clflush(struct drm_i915_gem_object *obj,
> +		       struct drm_i915_gem_relocation_entry *reloc,
> +		       uint64_t target_offset)
> +{
> +	struct drm_device *dev = obj->base.dev;
> +	uint32_t page_offset = offset_in_page(reloc->offset);
> +	uint64_t delta = (int)reloc->delta + target_offset;
> +	char *vaddr;
> +	int ret;
> +
> +	ret = i915_gem_object_set_to_gtt_domain(obj, true);
> +	if (ret)
> +		return ret;
> +
> +	vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> +				reloc->offset >> PAGE_SHIFT));
> +	clflush_write32(vaddr + page_offset, lower_32_bits(delta));
> +
> +	if (INTEL_INFO(dev)->gen >= 8) {
> +		page_offset = offset_in_page(page_offset + sizeof(uint32_t));
> +
> +		if (page_offset == 0) {
> +			kunmap_atomic(vaddr);
> +			vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> +			    (reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT));
> +		}
> +
> +		clflush_write32(vaddr + page_offset, upper_32_bits(delta));
> +	}
> +
> +	kunmap_atomic(vaddr);
> +
> +	return 0;
> +}
> +
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_vmas *eb,
> @@ -426,9 +470,12 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  
>  	if (use_cpu_reloc(obj))
>  		ret = relocate_entry_cpu(obj, reloc, target_offset);
> -	else
> +	else if (obj->map_and_fenceable)
>  		ret = relocate_entry_gtt(obj, reloc, target_offset);
> -
> +	else if (cpu_has_clflush)
> +		ret = relocate_entry_clflush(obj, reloc, target_offset);
> +	else
> +		ret = -ENODEV;

I think a DRM_ERROR here would be good since this should never happen. And
why don't we have an errno for -EKERNELBUG ...

>  	if (ret)
>  		return ret;
>  
> @@ -525,6 +572,12 @@ i915_gem_execbuffer_relocate(struct eb_vmas *eb)
>  	return ret;
>  }
>  
> +static bool only_mappable_for_reloc(unsigned int flags)
> +{
> +	return (flags & (EXEC_OBJECT_NEEDS_FENCE | __EXEC_OBJECT_NEEDS_MAP)) ==
> +		__EXEC_OBJECT_NEEDS_MAP;
> +}

I'm slightly freaked out by us encoding this here without making it
explicit in the flags. But I couldn't come up with a better idea?

> +
>  static int
>  i915_gem_execbuffer_reserve_vma(struct i915_vma *vma,
>  				struct intel_engine_cs *ring,
> @@ -536,14 +589,21 @@ i915_gem_execbuffer_reserve_vma(struct i915_vma *vma,
>  	int ret;
>  
>  	flags = 0;
> -	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
> -		flags |= PIN_GLOBAL | PIN_MAPPABLE;
> -	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
> -		flags |= PIN_GLOBAL;
> -	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
> -		flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
> +	if (!drm_mm_node_allocated(&vma->node)) {
> +		if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
> +			flags |= PIN_GLOBAL | PIN_MAPPABLE;
> +		if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
> +			flags |= PIN_GLOBAL;
> +		if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
> +			flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
> +	}

Hm, aren't we always calling reserve un buffers we know we've just
unbound? Keeping the flags computation would at least be a good selfcheck
about the consistency of our placing decisions, so I'd like to keep it.

>  
>  	ret = i915_gem_object_pin(obj, vma->vm, entry->alignment, flags);
> +	if ((ret == -ENOSPC  || ret == -E2BIG) &&
> +	    only_mappable_for_reloc(entry->flags))
> +		ret = i915_gem_object_pin(obj, vma->vm,
> +					  entry->alignment,
> +					  flags & ~(PIN_GLOBAL | PIN_MAPPABLE));
>  	if (ret)
>  		return ret;
>  
> @@ -605,13 +665,13 @@ eb_vma_misplaced(struct i915_vma *vma)
>  	    vma->node.start & (entry->alignment - 1))
>  		return true;
>  
> -	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP && !obj->map_and_fenceable)
> -		return true;
> -
>  	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
>  	    vma->node.start < BATCH_OFFSET_BIAS)
>  		return true;
>  
> +	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP && !obj->map_and_fenceable)
> +		return !only_mappable_for_reloc(entry->flags);

Hm, this seems to contradict your commit message a bit since it'll result
in a behavioural change of what we try to push into mappable for relocs.

Shouldn't we instead just clear the NEEDS_MAP flag in reserve_vma when the
mappable pin fails and we fall back?

Besides the nitpicks on integration lgtm.
-Daniel

> +
>  	return false;
>  }
>  
> -- 
> 2.1.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx