All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Ville Syrjälä" <ville.syrjala@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>
Cc: intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH v2 1/6] drm/i915: Eliminate vmap overhead for cmd parser
Date: Fri, 20 Nov 2015 16:41:53 +0200	[thread overview]
Message-ID: <20151120144153.GL4437@intel.com> (raw)
In-Reply-To: <1448016961-25331-2-git-send-email-chris@chris-wilson.co.uk>

On Fri, Nov 20, 2015 at 10:55:56AM +0000, Chris Wilson wrote:
> With a little complexity to handle cmds straddling page boundaries, we
> can completely avoiding having to vmap the batch and the shadow batch
> objects whilst running the command parser.
> 
> On ivb i7-3720MQ:
> 
> x11perf -dot before 54.3M, after 53.2M (max 203M)
> glxgears before 7110 fps, after 7300 fps (max 7860 fps)
> 
> Before:
> Time to blt 16384 bytes x      1:	 12.400µs, 1.2GiB/s
> Time to blt 16384 bytes x   4096:	  3.055µs, 5.0GiB/s
> 
> After:
> Time to blt 16384 bytes x      1:	  8.600µs, 1.8GiB/s
> Time to blt 16384 bytes x   4096:	  2.456µs, 6.2GiB/s
> 
> Removing the vmap is mostly a win, except we lose in a few cases where
> the batch size is greater than a page due to the extra complexity (loss
> of a simple cache efficient large copy, and boundary handling).
> 
> v2: Reorder so that we do check oacontrol remaining set at end-of-batch
> v3: Stage the copy through a temporary page so that we only copy into
> dst commands that have been verified.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c | 305 +++++++++++++++++----------------
>  1 file changed, 153 insertions(+), 152 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index 814d894ed925..db3a04ea036a 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -860,100 +860,6 @@ find_reg(const struct drm_i915_reg_descriptor *table,
>  	return NULL;
>  }
>  
> -static u32 *vmap_batch(struct drm_i915_gem_object *obj,
> -		       unsigned start, unsigned len)
> -{
> -	int i;
> -	void *addr = NULL;
> -	struct sg_page_iter sg_iter;
> -	int first_page = start >> PAGE_SHIFT;
> -	int last_page = (len + start + 4095) >> PAGE_SHIFT;
> -	int npages = last_page - first_page;
> -	struct page **pages;
> -
> -	pages = drm_malloc_ab(npages, sizeof(*pages));
> -	if (pages == NULL) {
> -		DRM_DEBUG_DRIVER("Failed to get space for pages\n");
> -		goto finish;
> -	}
> -
> -	i = 0;
> -	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
> -		pages[i++] = sg_page_iter_page(&sg_iter);
> -		if (i == npages)
> -			break;
> -	}
> -
> -	addr = vmap(pages, i, 0, PAGE_KERNEL);
> -	if (addr == NULL) {
> -		DRM_DEBUG_DRIVER("Failed to vmap pages\n");
> -		goto finish;
> -	}
> -
> -finish:
> -	if (pages)
> -		drm_free_large(pages);
> -	return (u32*)addr;
> -}
> -
> -/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> -static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> -		       struct drm_i915_gem_object *src_obj,
> -		       u32 batch_start_offset,
> -		       u32 batch_len)
> -{
> -	int needs_clflush = 0;
> -	void *src_base, *src;
> -	void *dst = NULL;
> -	int ret;
> -
> -	if (batch_len > dest_obj->base.size ||
> -	    batch_len + batch_start_offset > src_obj->base.size)
> -		return ERR_PTR(-E2BIG);
> -
> -	if (WARN_ON(dest_obj->pages_pin_count == 0))
> -		return ERR_PTR(-ENODEV);
> -
> -	ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> -		return ERR_PTR(ret);
> -	}
> -
> -	src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
> -	if (!src_base) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> -		ret = -ENOMEM;
> -		goto unpin_src;
> -	}
> -
> -	ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> -		goto unmap_src;
> -	}
> -
> -	dst = vmap_batch(dest_obj, 0, batch_len);
> -	if (!dst) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> -		ret = -ENOMEM;
> -		goto unmap_src;
> -	}
> -
> -	src = src_base + offset_in_page(batch_start_offset);
> -	if (needs_clflush)
> -		drm_clflush_virt_range(src, batch_len);
> -
> -	memcpy(dst, src, batch_len);
> -
> -unmap_src:
> -	vunmap(src_base);
> -unpin_src:
> -	i915_gem_object_unpin_pages(src_obj);
> -
> -	return ret ? ERR_PTR(ret) : dst;
> -}
> -
>  /**
>   * i915_needs_cmd_parser() - should a given ring use software command parsing?
>   * @ring: the ring in question
> @@ -1097,6 +1003,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
>  }
>  
>  #define LENGTH_BIAS 2
> +#define MAX_PARTIAL 256
>  
>  /**
>   * i915_parse_cmds() - parse a submitted batch buffer for privilege violations
> @@ -1120,86 +1027,180 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
>  		    u32 batch_len,
>  		    bool is_master)
>  {
> -	u32 *cmd, *batch_base, *batch_end;
> +	const struct drm_i915_cmd_descriptor *desc;
> +	unsigned dst_iter, src_iter;
> +	int needs_clflush = 0;
> +	struct get_page rewind;
> +	void *src, *dst, *tmp;
> +	u32 partial, length;
> +	unsigned in, out;
>  	struct drm_i915_cmd_descriptor default_desc = { 0 };
>  	bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
>  	int ret = 0;
>  
> -	batch_base = copy_batch(shadow_batch_obj, batch_obj,
> -				batch_start_offset, batch_len);
> -	if (IS_ERR(batch_base)) {
> -		DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
> -		return PTR_ERR(batch_base);
> +	if (batch_len > shadow_batch_obj->base.size ||

AFAIK that can't actaully happen since we allocate the shadow batch
based on batch_len. But I see it was already like this in the old code.

> +	    batch_len + batch_start_offset > batch_obj->base.size)

This worries me more. Shouldn't we also have this check somewhere for
the non-cmd_parser cases? Nor can I see any overflow check for
'batch_len+batch_start_offset'.

> +		return -E2BIG;
> +
> +	if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
> +		return -ENODEV;
> +
> +	ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> +		return ret;
> +	}
> +
> +	ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> +		goto unpin;
>  	}
>  
> +	tmp = kmalloc(PAGE_SIZE + MAX_PARTIAL, GFP_KERNEL);

GFP_TEMPORARY perhaps?

> +	if (tmp == NULL)
> +		return -ENOMEM;
> +
>  	/*
>  	 * We use the batch length as size because the shadow object is as
>  	 * large or larger and copy_batch() will write MI_NOPs to the extra

copy_batch() is gone.

>  	 * space. Parsing should be faster in some cases this way.
>  	 */
> -	batch_end = batch_base + (batch_len / sizeof(*batch_end));
> +	ret = -EINVAL;
> +	rewind = batch_obj->get_page;

Shouldn't the get_page work just fine without this rewind trick? As in if
some other user wants one of the previous pages it restarts iterating
from 0?

> +
> +	dst_iter = 0;
> +	dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
> +	out = 0;
> +
> +	in = offset_in_page(batch_start_offset);
> +	partial = 0;
> +	for (src_iter = batch_start_offset >> PAGE_SHIFT;
> +	     src_iter < batch_obj->base.size >> PAGE_SHIFT;
> +	     src_iter++) {
> +		u32 *cmd, *page_end, *batch_end;
> +		u32 this;
> +
> +		this = batch_len;
> +		if (this > PAGE_SIZE - in)
> +			this = PAGE_SIZE - in;
> +
> +		src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
> +		if (needs_clflush)
> +			drm_clflush_virt_range(src + in, this);
> +
> +		if (this == PAGE_SIZE && partial == 0)
> +			copy_page(tmp, src);
> +		else
> +			memcpy(tmp+partial, src+in, this);
> +
> +		cmd = tmp;
> +		page_end = tmp + this + partial;
> +		batch_end = tmp + batch_len + partial;
> +		partial = 0;
> +
> +		do {
> +			if (*cmd == MI_BATCH_BUFFER_END) {
> +				if (oacontrol_set) {
> +					DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> +					goto unmap;
> +				}
>  
> -	cmd = batch_base;
> -	while (cmd < batch_end) {
> -		const struct drm_i915_cmd_descriptor *desc;
> -		u32 length;
> +				cmd++; /* copy this cmd to dst */
> +				batch_len = this; /* no more src */
> +				ret = 0;
> +				break;
> +			}
>  
> -		if (*cmd == MI_BATCH_BUFFER_END)
> -			break;
> +			desc = find_cmd(ring, *cmd, &default_desc);
> +			if (!desc) {
> +				DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> +						 *cmd);
> +				goto unmap;
> +			}
>  
> -		desc = find_cmd(ring, *cmd, &default_desc);
> -		if (!desc) {
> -			DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> -					 *cmd);
> -			ret = -EINVAL;
> -			break;
> -		}

It's quite hard to see what's changed due to the reindent. Any chance
you could do the reindent in a prep patch just help my poor brain a bit?

> +			/*
> +			 * If the batch buffer contains a chained batch, return an
> +			 * error that tells the caller to abort and dispatch the
> +			 * workload as a non-secure batch.
> +			 */
> +			if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> +				ret = -EACCES;
> +				goto unmap;
> +			}
>  
> -		/*
> -		 * If the batch buffer contains a chained batch, return an
> -		 * error that tells the caller to abort and dispatch the
> -		 * workload as a non-secure batch.
> -		 */
> -		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> -			ret = -EACCES;
> -			break;
> -		}
> +			if (desc->flags & CMD_DESC_FIXED)
> +				length = desc->length.fixed;
> +			else
> +				length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
>  
> -		if (desc->flags & CMD_DESC_FIXED)
> -			length = desc->length.fixed;
> -		else
> -			length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
> -
> -		if ((batch_end - cmd) < length) {
> -			DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> -					 *cmd,
> -					 length,
> -					 batch_end - cmd);
> -			ret = -EINVAL;
> -			break;
> -		}
> +			if (unlikely(cmd + length > page_end)) {
> +				if (unlikely(cmd + length > batch_end)) {
> +					DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> +							 *cmd, length, batch_end - cmd);
> +					goto unmap;
> +				}
>  
> -		if (!check_cmd(ring, desc, cmd, length, is_master,
> -			       &oacontrol_set)) {
> -			ret = -EINVAL;
> -			break;
> -		}
> +				if (WARN_ON(length > MAX_PARTIAL)) {
> +					ret = -ENODEV;
> +					goto unmap;
> +				}
>  
> -		cmd += length;
> -	}
> +				partial = 4*(page_end - cmd);
> +				break;
> +			}
>  
> -	if (oacontrol_set) {
> -		DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> -		ret = -EINVAL;
> -	}
> +			if (!check_cmd(ring, desc, cmd, length, is_master,
> +				       &oacontrol_set))
> +				goto unmap;
> +
> +			cmd += length;
> +		} while (cmd < page_end);
> +
> +		/* Copy the validated page to the GPU batch */
> +		{
> +			/* exclude partial cmd, we copy it next time */
> +			unsigned dst_length = (void *)cmd - tmp;
> +			in = 0;
> +			do {
> +				int len;
> +
> +				if (out == PAGE_SIZE) {
> +					kunmap_atomic(dst);
> +					dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
> +					out = 0;
> +				}
>  
> -	if (cmd >= batch_end) {
> -		DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
> -		ret = -EINVAL;
> -	}
> +				len = dst_length;
> +				if (len + out > PAGE_SIZE)
> +					len = PAGE_SIZE - out;
> +				if (len == PAGE_SIZE)
> +					copy_page(dst, tmp + in);
> +				else
> +					memcpy(dst + out, tmp + in, len);
> +				in += len;
> +				out += len;
> +				dst_length -= len;
> +			} while (dst_length);
> +		}
> +
> +		batch_len -= this;
> +		if (batch_len == 0)
> +			break;
>  
> -	vunmap(batch_base);
> +		if (partial)
> +			memmove(tmp, cmd, partial);
>  
> +		kunmap_atomic(src);
> +		in = 0;
> +	}
> +unmap:
> +	kunmap_atomic(src);
> +	kunmap_atomic(dst);
> +	batch_obj->get_page = rewind;
> +	kfree(tmp);
> +unpin:
> +	i915_gem_object_unpin_pages(batch_obj);
>  	return ret;
>  }
>  
> -- 
> 2.6.2

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

  reply	other threads:[~2015-11-20 14:41 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-11-20 10:55 cmdparser overhead reduction Chris Wilson
2015-11-20 10:55 ` [PATCH v2 1/6] drm/i915: Eliminate vmap overhead for cmd parser Chris Wilson
2015-11-20 14:41   ` Ville Syrjälä [this message]
2015-11-20 14:52     ` Chris Wilson
2015-11-20 15:31     ` [PATCH v3] " Chris Wilson
2015-11-25 19:51       ` Ville Syrjälä
2015-11-25 20:13         ` Chris Wilson
2015-11-25 21:15           ` Ville Syrjälä
2015-11-20 10:55 ` [PATCH v2 2/6] drm/i915: Cache last cmd descriptor when parsing Chris Wilson
2015-11-20 15:08   ` Ville Syrjälä
2015-11-20 15:44     ` Chris Wilson
2015-12-01 17:30   ` Ville Syrjälä
2015-11-20 10:55 ` [PATCH v2 3/6] drm/i915: Use WC copies on !llc platforms for the command parser Chris Wilson
2015-11-20 15:05   ` Ville Syrjälä
2015-11-20 15:22     ` Chris Wilson
2015-12-01 17:32       ` Ville Syrjälä
2015-11-20 10:55 ` [PATCH v2 4/6] drm/i915: Reduce arithmetic operations during cmd parser lookup Chris Wilson
2015-11-20 15:02   ` Ville Syrjälä
2015-11-20 10:56 ` [PATCH v2 5/6] drm/i915: Reduce pointer indirection " Chris Wilson
2015-11-20 15:27   ` Ville Syrjälä
2015-11-20 15:34     ` Chris Wilson
2015-11-20 15:47       ` Ville Syrjälä
2015-11-23  8:09         ` Jani Nikula
2015-12-01 17:39     ` Ville Syrjälä
2015-11-20 10:56 ` [PATCH v2 6/6] drm/i915: Improve hash function for the command parser Chris Wilson
2015-11-20 15:13   ` Ville Syrjälä

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20151120144153.GL4437@intel.com \
    --to=ville.syrjala@linux.intel.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.