From: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/8] drm/i915: Cache kmap between relocations
Date: Thu, 09 Jun 2016 15:25:47 +0300 [thread overview]
Message-ID: <1465475147.9670.7.camel@linux.intel.com> (raw)
In-Reply-To: <1465471779-20765-3-git-send-email-chris@chris-wilson.co.uk>
On to, 2016-06-09 at 12:29 +0100, Chris Wilson wrote:
> When doing relocations, we have to obtain a mapping to the page
> containing the target address. This is either a kmap or iomap depending
> on GPU and its cache coherency. Neighbouring relocation entries are
> typically within the same page and so we can cache our kmapping between
> them and avoid those pesky TLB flushes.
>
> Note that there is some sleight-of-hand in how the slow relocate works
> as the reloc_entry_cache implies pagefaults disabled (as we are inside a
> kmap_atomic section). However, the slow relocate code is meant to be the
> fallback from the atomic fast path failing. Fortunately it works as we
> already have performed the copy_from_user for the relocation array (no
> more pagefaults there) and the kmap_atomic cache is enabled after we
> have waited upon an active buffer (so no more sleeping in atomic).
> Magic!
>
You could also mention that you mangle the relocation <-> page logic,
so this is not purely about caching. Or maybe even split it.
Below comments, those addressed;
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 160 +++++++++++++++++++----------
> 1 file changed, 106 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index a29c4b6fea28..318c71b663f4 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -302,9 +302,50 @@ relocation_target(struct drm_i915_gem_relocation_entry *reloc,
> return gen8_canonical_addr((int)reloc->delta + target_offset);
> }
>
> +struct reloc_cache {
> + void *vaddr;
> + unsigned page;
> + enum { KMAP, IOMAP } type;
> +};
> +
> +static void reloc_cache_init(struct reloc_cache *cache)
> +{
> + cache->page = -1;
> + cache->vaddr = NULL;
> +}
> +
> +static void reloc_cache_fini(struct reloc_cache *cache)
> +{
> + if (cache->vaddr == NULL)
> + return;
> +
> + switch (cache->type) {
> + case KMAP: kunmap_atomic(cache->vaddr); break;
> + case IOMAP: io_mapping_unmap_atomic(cache->vaddr); break;
> + }
> +}
> +
> +static void *reloc_kmap(struct drm_i915_gem_object *obj,
> + struct reloc_cache *cache,
> + int page)
> +{
> + if (cache->page == page)
> + return cache->vaddr;
> +
> + if (cache->vaddr)
> + kunmap_atomic(cache->vaddr);
Maybe add some GEM_BUG_ON(cache->type != KMAP) here before running
kunmap_atomic? Because that assumption is made.
> +
> + cache->page = page;
> + cache->vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
> + cache->type = KMAP;
> +
> + return cache->vaddr;
> +}
> +
> static int
> relocate_entry_cpu(struct drm_i915_gem_object *obj,
> struct drm_i915_gem_relocation_entry *reloc,
> + struct reloc_cache *cache,
> uint64_t target_offset)
> {
> struct drm_device *dev = obj->base.dev;
> @@ -317,34 +358,47 @@ relocate_entry_cpu(struct drm_i915_gem_object *obj,
> if (ret)
> return ret;
>
> - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> - reloc->offset >> PAGE_SHIFT));
> + vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
> *(uint32_t *)(vaddr + page_offset) = lower_32_bits(delta);
>
> - if (INTEL_INFO(dev)->gen >= 8) {
> - page_offset = offset_in_page(page_offset + sizeof(uint32_t));
> -
> - if (page_offset == 0) {
> - kunmap_atomic(vaddr);
> - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> - (reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT));
> + if (INTEL_GEN(dev) >= 8) {
> + page_offset += sizeof(uint32_t);
> + if (page_offset == PAGE_SIZE) {
> + vaddr = reloc_kmap(obj, cache, cache->page + 1);
> + page_offset = 0;
> }
> -
> *(uint32_t *)(vaddr + page_offset) = upper_32_bits(delta);
> }
>
> - kunmap_atomic(vaddr);
> -
> return 0;
> }
>
> +static void *reloc_iomap(struct drm_i915_private *i915,
> + struct reloc_cache *cache,
> + uint64_t offset)
> +{
> + if (cache->page == offset >> PAGE_SHIFT)
> + return cache->vaddr;
> +
> + if (cache->vaddr)
> + io_mapping_unmap_atomic(cache->vaddr);
> +
> + cache->page = offset >> PAGE_SHIFT;
> + cache->vaddr =
> + io_mapping_map_atomic_wc(i915->ggtt.mappable,
> + offset & PAGE_MASK);
> + cache->type = IOMAP;
> +
> + return cache->vaddr;
> +}
> +
> static int
> relocate_entry_gtt(struct drm_i915_gem_object *obj,
> struct drm_i915_gem_relocation_entry *reloc,
> + struct reloc_cache *cache,
> uint64_t target_offset)
> {
> struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
> - struct i915_ggtt *ggtt = &dev_priv->ggtt;
> struct i915_vma *vma;
> uint64_t delta = relocation_target(reloc, target_offset);
> uint64_t offset;
> @@ -366,28 +420,19 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
> /* Map the page containing the relocation we're going to perform. */
> offset = vma->node.start;
> offset += reloc->offset;
> - reloc_page = io_mapping_map_atomic_wc(ggtt->mappable,
> - offset & PAGE_MASK);
> + reloc_page = reloc_iomap(dev_priv, cache, offset);
> iowrite32(lower_32_bits(delta), reloc_page + offset_in_page(offset));
>
> if (INTEL_GEN(dev_priv) >= 8) {
> offset += sizeof(uint32_t);
> -
> - if (offset_in_page(offset) == 0) {
> - io_mapping_unmap_atomic(reloc_page);
> - reloc_page =
> - io_mapping_map_atomic_wc(ggtt->mappable,
> - offset);
> - }
> -
> + if (offset_in_page(offset) == 0)
> + reloc_page = reloc_iomap(dev_priv, cache, offset);
> iowrite32(upper_32_bits(delta),
> reloc_page + offset_in_page(offset));
> }
>
> - io_mapping_unmap_atomic(reloc_page);
> -
> unpin:
> - i915_vma_unpin(vma);
> + __i915_vma_unpin(vma);
> return ret;
> }
>
> @@ -403,6 +448,7 @@ clflush_write32(void *addr, uint32_t value)
> static int
> relocate_entry_clflush(struct drm_i915_gem_object *obj,
> struct drm_i915_gem_relocation_entry *reloc,
> + struct reloc_cache *cache,
For consistency with rest of the patch, I would put the cache as last
argument, as it could easily be removed again in future, so it's the
least important.
> uint64_t target_offset)
> {
> struct drm_device *dev = obj->base.dev;
> @@ -415,24 +461,18 @@ relocate_entry_clflush(struct drm_i915_gem_object *obj,
> if (ret)
> return ret;
>
> - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> - reloc->offset >> PAGE_SHIFT));
> + vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT);
> clflush_write32(vaddr + page_offset, lower_32_bits(delta));
>
> - if (INTEL_INFO(dev)->gen >= 8) {
> - page_offset = offset_in_page(page_offset + sizeof(uint32_t));
> -
> - if (page_offset == 0) {
> - kunmap_atomic(vaddr);
> - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj,
> - (reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT));
> + if (INTEL_GEN(dev) >= 8) {
> + page_offset += sizeof(uint32_t);
> + if (page_offset == PAGE_SIZE) {
> + vaddr = reloc_kmap(obj, cache, cache->page + 1);
> + page_offset = 0;
> }
> -
> clflush_write32(vaddr + page_offset, upper_32_bits(delta));
> }
>
> - kunmap_atomic(vaddr);
> -
> return 0;
> }
>
> @@ -453,7 +493,8 @@ static bool object_is_idle(struct drm_i915_gem_object *obj)
> static int
> i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> struct eb_vmas *eb,
> - struct drm_i915_gem_relocation_entry *reloc)
> + struct drm_i915_gem_relocation_entry *reloc,
> + struct reloc_cache *cache)
> {
> struct drm_device *dev = obj->base.dev;
> struct drm_gem_object *target_obj;
> @@ -537,11 +578,11 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> return -EFAULT;
>
> if (use_cpu_reloc(obj))
> - ret = relocate_entry_cpu(obj, reloc, target_offset);
> + ret = relocate_entry_cpu(obj, reloc, cache, target_offset);
> else if (obj->map_and_fenceable)
> - ret = relocate_entry_gtt(obj, reloc, target_offset);
> + ret = relocate_entry_gtt(obj, reloc, cache, target_offset);
> else if (static_cpu_has(X86_FEATURE_CLFLUSH))
> - ret = relocate_entry_clflush(obj, reloc, target_offset);
> + ret = relocate_entry_clflush(obj, reloc, cache, target_offset);
> else {
> WARN_ONCE(1, "Impossible case in relocation handling\n");
> ret = -ENODEV;
> @@ -564,9 +605,11 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
> struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
> struct drm_i915_gem_relocation_entry __user *user_relocs;
> struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> - int remain, ret;
> + struct reloc_cache cache;
> + int remain, ret = 0;
>
> user_relocs = u64_to_user_ptr(entry->relocs_ptr);
> + reloc_cache_init(&cache);
>
> remain = entry->relocation_count;
> while (remain) {
> @@ -576,19 +619,23 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
> count = ARRAY_SIZE(stack_reloc);
> remain -= count;
>
> - if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0])))
> - return -EFAULT;
> + if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]))) {
> + ret = -EFAULT;
> + goto out;
> + }
>
> do {
> u64 offset = r->presumed_offset;
>
> - ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r);
> + ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, &cache);
> if (ret)
> - return ret;
> + goto out;
>
> if (r->presumed_offset != offset &&
> - __put_user(r->presumed_offset, &user_relocs->presumed_offset)) {
> - return -EFAULT;
> + __put_user(r->presumed_offset,
> + &user_relocs->presumed_offset)) {
> + ret = -EFAULT;
> + goto out;
> }
>
> user_relocs++;
> @@ -596,7 +643,9 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
> } while (--count);
> }
>
> - return 0;
> +out:
> + reloc_cache_fini(&cache);
<NL> here to keep consistent between your next change.
Regards, Joonas
> + return ret;
> #undef N_RELOC
> }
>
> @@ -606,15 +655,18 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
> struct drm_i915_gem_relocation_entry *relocs)
> {
> const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> - int i, ret;
> + struct reloc_cache cache;
> + int i, ret = 0;
>
> + reloc_cache_init(&cache);
> for (i = 0; i < entry->relocation_count; i++) {
> - ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i]);
> + ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache);
> if (ret)
> - return ret;
> + break;
> }
> + reloc_cache_fini(&cache);
>
> - return 0;
> + return ret;
> }
>
> static int
--
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
next prev parent reply other threads:[~2016-06-09 12:25 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-06-09 11:29 Start tidying up execbuf relocations Chris Wilson
2016-06-09 11:29 ` [PATCH 1/8] drm/i915: Print the batchbuffer offset next to BBADDR in error state Chris Wilson
2016-06-09 12:29 ` Joonas Lahtinen
2016-06-09 11:29 ` [PATCH 2/8] drm/i915: Cache kmap between relocations Chris Wilson
2016-06-09 12:25 ` Joonas Lahtinen [this message]
2016-06-09 11:29 ` [PATCH 3/8] drm/i915: Extract i915_gem_obj_prepare_shmem_write() Chris Wilson
2016-06-09 11:39 ` Chris Wilson
2016-06-09 12:47 ` Joonas Lahtinen
2016-06-09 11:29 ` [PATCH 4/8] drm/i915: Before accessing an object via the cpu, flush GTT writes Chris Wilson
2016-06-09 12:50 ` Joonas Lahtinen
2016-06-09 11:29 ` [PATCH 5/8] drm/i915: Wait for writes through the GTT to land before reading back Chris Wilson
2016-06-09 11:36 ` Chris Wilson
2016-06-09 12:54 ` Joonas Lahtinen
2016-06-09 11:29 ` [PATCH 6/8] drm/i915: Pin the pages first in shmem prepare read/write Chris Wilson
2016-06-09 13:06 ` Joonas Lahtinen
2016-06-09 13:35 ` Chris Wilson
2016-06-09 13:51 ` Joonas Lahtinen
2016-06-09 14:13 ` Chris Wilson
2016-06-09 11:29 ` [PATCH 7/8] drm/i915: Tidy up flush cpu/gtt write domains Chris Wilson
2016-06-09 13:12 ` Joonas Lahtinen
2016-06-09 11:29 ` [PATCH 8/8] drm/i915: Refactor execbuffer relocation writing Chris Wilson
2016-06-09 13:31 ` Joonas Lahtinen
2016-06-09 14:22 ` Chris Wilson
2016-06-09 11:40 ` ✗ Ro.CI.BAT: failure for series starting with [1/8] drm/i915: Print the batchbuffer offset next to BBADDR in error state Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1465475147.9670.7.camel@linux.intel.com \
--to=joonas.lahtinen@linux.intel.com \
--cc=chris@chris-wilson.co.uk \
--cc=intel-gfx@lists.freedesktop.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.