From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH 3/4] drm/i915: Emit pipelined fence changes
Date: Wed, 5 Jul 2017 11:19:43 +0100 [thread overview]
Message-ID: <99d30dd1-b518-0f9b-a412-dbf365ad690c@linux.intel.com> (raw)
In-Reply-To: <20170703101412.30820-3-chris@chris-wilson.co.uk>
On 03/07/2017 11:14, Chris Wilson wrote:
> Many years ago, long before requests, we tried doing this. We never
> quite got it right, but now with requests we have the tracking to do the
> job properly!
It would be good to add a few words on the benefits seen in specific
use cases/benchmarks.
>
> One of the stall points for gen2/gen3 is the use of fence registers for
> GPU operations. There are only a few available, and currently if we
> exhaust the available fence register we must stall the GPU between
> batches. By pipelining the fence register writes, we can avoid the stall
> and continuously emit the batches. The challenge is remembering to wait
> for those pipelined LRI before accessing the fence with the CPU, and
> that is what our request tracking makes easy.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 1 +
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 15 ++-
> drivers/gpu/drm/i915/i915_gem_fence_reg.c | 193 +++++++++++++++++++++++++----
> drivers/gpu/drm/i915/i915_gem_fence_reg.h | 1 +
> drivers/gpu/drm/i915/i915_vma.c | 1 +
> drivers/gpu/drm/i915/i915_vma.h | 4 +
> 6 files changed, 186 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 939a299260e9..5318e321ce50 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4880,6 +4880,7 @@ i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
> fence->i915 = dev_priv;
> fence->id = i;
> list_add_tail(&fence->link, &dev_priv->mm.fence_list);
> + init_request_active(&fence->pipelined, NULL);
> }
> i915_gem_restore_fences(dev_priv);
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 22a9f5358322..19347ad0e7ac 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -365,11 +365,12 @@ eb_pin_vma(struct i915_execbuffer *eb,
> return;
>
> if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
> - if (unlikely(i915_vma_pin_fence(vma))) {
> + if (unlikely(i915_vma_reserve_fence(vma))) {
> i915_vma_unpin(vma);
> return;
> }
>
> + entry->flags &= ~EXEC_OBJECT_ASYNC;
> if (vma->fence)
> entry->flags |= __EXEC_OBJECT_HAS_FENCE;
> }
> @@ -564,12 +565,13 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
> GEM_BUG_ON(eb_vma_misplaced(entry, vma));
>
> if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
> - err = i915_vma_pin_fence(vma);
> + err = i915_vma_reserve_fence(vma);
> if (unlikely(err)) {
> i915_vma_unpin(vma);
> return err;
> }
>
> + entry->flags &= ~EXEC_OBJECT_ASYNC;
A comment here would be good.
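Something along these lines perhaps, assuming I have understood why
ASYNC has to be cleared:

	/*
	 * The fence update is emitted into this request, so we cannot
	 * let userspace skip the implicit serialisation with ASYNC;
	 * the pipelined LRI must be ordered against prior users of
	 * the fenced region.
	 */
	entry->flags &= ~EXEC_OBJECT_ASYNC;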
> if (vma->fence)
> entry->flags |= __EXEC_OBJECT_HAS_FENCE;
> }
> @@ -1848,6 +1850,12 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
> if (unlikely(obj->cache_dirty && !obj->cache_coherent))
> i915_gem_clflush_object(obj, 0);
>
> + if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
> + err = i915_vma_emit_pipelined_fence(vma, eb->request);
> + if (err)
> + return err;
> + }
> +
> err = i915_gem_request_await_object
> (eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
> if (err)
> @@ -1932,9 +1940,6 @@ void i915_vma_move_to_active(struct i915_vma *vma,
> obj->base.read_domains = 0;
> }
> obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
> -
> - if (flags & EXEC_OBJECT_NEEDS_FENCE)
> - i915_gem_active_set(&vma->last_fence, req);
> }
>
> static int i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> index 55ac7bc14fce..d0cd051c19fd 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> @@ -55,10 +55,9 @@
> * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
> */
>
> -#define pipelined 0
> -
> -static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
> - struct i915_vma *vma)
> +static int i965_write_fence_reg(struct drm_i915_fence_reg *fence,
> + struct i915_vma *vma,
> + struct drm_i915_gem_request *pipelined)
> {
> i915_reg_t fence_reg_lo, fence_reg_hi;
> int fence_pitch_shift;
> @@ -110,11 +109,30 @@ static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
> I915_WRITE(fence_reg_hi, upper_32_bits(val));
> I915_WRITE(fence_reg_lo, lower_32_bits(val));
> POSTING_READ(fence_reg_lo);
> + } else {
> + u32 *cs;
> +
> + cs = intel_ring_begin(pipelined, 8);
Do we need to adjust the ring space reservation amount?
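If I count right, this path is 8 dwords (32 bytes) per fence, so with
all 16 fence registers rewritten ahead of a single batch that is up to
512 bytes of extra ring space per request; the gen2/3 paths below are
4 dwords each, so at most 256 bytes there. Non-trivial on the small
gen2/3 rings either way.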
> + if (IS_ERR(cs))
> + return PTR_ERR(cs);
> +
> + *cs++ = MI_LOAD_REGISTER_IMM(3);
> + *cs++ = i915_mmio_reg_offset(fence_reg_lo);
> + *cs++ = 0;
> + *cs++ = i915_mmio_reg_offset(fence_reg_hi);
> + *cs++ = upper_32_bits(val);
> + *cs++ = i915_mmio_reg_offset(fence_reg_lo);
> + *cs++ = lower_32_bits(val);
> + *cs++ = MI_NOOP;
> + intel_ring_advance(pipelined, cs);
> }
> +
> + return 0;
> }
>
> -static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
> - struct i915_vma *vma)
> +static int i915_write_fence_reg(struct drm_i915_fence_reg *fence,
> + struct i915_vma *vma,
> + struct drm_i915_gem_request *pipelined)
> {
> u32 val;
>
> @@ -150,11 +168,26 @@ static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
>
> I915_WRITE(reg, val);
> POSTING_READ(reg);
> + } else {
> + u32 *cs;
> +
> + cs = intel_ring_begin(pipelined, 4);
> + if (IS_ERR(cs))
> + return PTR_ERR(cs);
> +
> + *cs++ = MI_LOAD_REGISTER_IMM(1);
> + *cs++ = i915_mmio_reg_offset(FENCE_REG(fence->id));
> + *cs++ = val;
> + *cs++ = MI_NOOP;
> + intel_ring_advance(pipelined, cs);
> }
> +
> + return 0;
> }
>
> -static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
> - struct i915_vma *vma)
> +static int i830_write_fence_reg(struct drm_i915_fence_reg *fence,
> + struct i915_vma *vma,
> + struct drm_i915_gem_request *pipelined)
> {
> u32 val;
>
> @@ -182,29 +215,49 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
>
> I915_WRITE(reg, val);
> POSTING_READ(reg);
> + } else {
> + u32 *cs;
> +
> + cs = intel_ring_begin(pipelined, 4);
> + if (IS_ERR(cs))
> + return PTR_ERR(cs);
> +
> + *cs++ = MI_LOAD_REGISTER_IMM(1);
> + *cs++ = i915_mmio_reg_offset(FENCE_REG(fence->id));
> + *cs++ = val;
> + *cs++ = MI_NOOP;
> + intel_ring_advance(pipelined, cs);
> }
> +
> + return 0;
> }
>
> -static void fence_write(struct drm_i915_fence_reg *fence,
> - struct i915_vma *vma)
> +static int fence_write(struct drm_i915_fence_reg *fence,
> + struct i915_vma *vma,
> + struct drm_i915_gem_request *rq)
> {
> + int err;
> +
> /* Previous access through the fence register is marshalled by
> * the mb() inside the fault handlers (i915_gem_release_mmaps)
> * and explicitly managed for internal users.
> */
>
> if (IS_GEN2(fence->i915))
> - i830_write_fence_reg(fence, vma);
> + err = i830_write_fence_reg(fence, vma, rq);
> else if (IS_GEN3(fence->i915))
> - i915_write_fence_reg(fence, vma);
> + err = i915_write_fence_reg(fence, vma, rq);
> else
> - i965_write_fence_reg(fence, vma);
> + err = i965_write_fence_reg(fence, vma, rq);
> + if (err)
> + return err;
>
> /* Access through the fenced region afterwards is
> * ordered by the posting reads whilst writing the registers.
> */
>
> fence->dirty = false;
> + return 0;
> }
>
> static int fence_update(struct drm_i915_fence_reg *fence,
> @@ -212,17 +265,15 @@ static int fence_update(struct drm_i915_fence_reg *fence,
> {
> int ret;
>
> + ret = i915_gem_active_retire(&fence->pipelined,
> + &fence->i915->drm.struct_mutex);
> + if (ret)
> + return ret;
> +
> if (vma) {
> if (!i915_vma_is_map_and_fenceable(vma))
> return -EINVAL;
>
> - if (WARN(!i915_gem_object_get_stride(vma->obj) ||
> - !i915_gem_object_get_tiling(vma->obj),
> - "bogus fence setup with stride: 0x%x, tiling mode: %i\n",
> - i915_gem_object_get_stride(vma->obj),
> - i915_gem_object_get_tiling(vma->obj)))
> - return -EINVAL;
> -
> ret = i915_gem_active_retire(&vma->last_fence,
> &vma->obj->base.dev->struct_mutex);
> if (ret)
> @@ -253,7 +304,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
> * to the runtime resume, see i915_gem_restore_fences().
> */
> if (intel_runtime_pm_get_if_in_use(fence->i915)) {
> - fence_write(fence, vma);
> + fence_write(fence, vma, NULL);
> intel_runtime_pm_put(fence->i915);
> }
>
> @@ -287,6 +338,8 @@ int i915_vma_put_fence(struct i915_vma *vma)
> if (!fence)
> return 0;
>
> + GEM_BUG_ON(fence->vma != vma);
> +
> if (fence->pin_count)
> return -EBUSY;
>
> @@ -344,11 +397,16 @@ i915_vma_pin_fence(struct i915_vma *vma)
> assert_rpm_wakelock_held(vma->vm->i915);
>
> /* Just update our place in the LRU if our fence is getting reused. */
> - if (vma->fence) {
> - fence = vma->fence;
> + fence = vma->fence;
> + if (fence) {
> GEM_BUG_ON(fence->vma != vma);
> fence->pin_count++;
> if (!fence->dirty) {
> + err = i915_gem_active_retire(&fence->pipelined,
> + &fence->i915->drm.struct_mutex);
> + if (err)
> + goto err_unpin;
A comment explaining the need for this block would be good; I couldn't
easily figure it out, given there is already a CPU wait on the
fence_update path. And why is the wait needed specifically in the
!dirty case?
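If the wait is indeed required, maybe something like this, which is
just my guess at the reason:

	/*
	 * The fence register may still be the target of a pipelined
	 * LRI in flight; wait for that write to land before allowing
	 * CPU access through the fence.
	 */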
> +
> list_move_tail(&fence->link,
> &fence->i915->mm.fence_list);
> return 0;
> @@ -372,6 +430,93 @@ i915_vma_pin_fence(struct i915_vma *vma)
> return err;
> }
>
> +int i915_vma_reserve_fence(struct i915_vma *vma)
> +{
> + struct drm_i915_fence_reg *fence;
> +
> + lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
> + GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
> + GEM_BUG_ON(!i915_vma_is_pinned(vma));
> +
> + fence = vma->fence;
> + if (!fence) {
> + if (!i915_gem_object_is_tiled(vma->obj))
> + return 0;
> +
> + if (!i915_vma_is_map_and_fenceable(vma))
> + return -EINVAL;
> +
> + fence = fence_find(vma->vm->i915);
> + if (IS_ERR(fence))
> + return PTR_ERR(fence);
If all fences are pinned via i915_gem_fault, which I thought could
happen with the previous patch, then here we error out instead of
waiting?
> +
> + vma->fence = fence;
> +
> + if (fence->vma) {
> + i915_gem_release_mmap(fence->vma->obj);
> + fence->vma->fence = NULL;
> + }
> + fence->vma = vma;
> + fence->dirty = true;
> + }
> + fence->pin_count++;
> + list_move_tail(&fence->link, &fence->i915->mm.fence_list);
> +
> + GEM_BUG_ON(!i915_gem_object_is_tiled(vma->obj));
> + GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
> + GEM_BUG_ON(vma->node.size != vma->fence_size);
> + GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_alignment));
> +
> + return 0;
> +}
> +
> +int i915_vma_emit_pipelined_fence(struct i915_vma *vma,
> + struct drm_i915_gem_request *rq)
> +{
> + struct drm_i915_fence_reg *fence = vma->fence;
> + struct drm_i915_gem_request *prev;
> + int err;
> +
> + lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
> + GEM_BUG_ON(fence && !fence->pin_count);
> +
> + if (!fence)
> + goto out;
> +
> + prev = i915_gem_active_raw(&fence->pipelined,
> + &fence->i915->drm.struct_mutex);
> + if (prev) {
> + err = i915_gem_request_await_dma_fence(rq, &prev->fence);
> + if (err)
> + return err;
> + }
> +
> + if (!fence->dirty)
> + goto out;
Hm, so unless "dirty" is set, no actual fence management will happen.
What is the meaning of dirty? Has it changed with this patch? Does it
need a write-up in the header file?
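Perhaps the existing comment above it could be extended, along the
lines of (my guess at the new semantics):

	/*
	 * With pipelined updates, dirty means the register contents
	 * no longer match the vma and a fence write (LRI) must be
	 * emitted before the next fenced GPU access.
	 */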
Looks like in any case I will need to apply this to follow the flow.
Regards,
Tvrtko
> +
> + GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
> +
> + if (fence->vma) {
> + prev = i915_gem_active_raw(&fence->vma->last_fence,
> + &fence->i915->drm.struct_mutex);
> + if (prev) {
> + err = i915_gem_request_await_dma_fence(rq,
> + &prev->fence);
> + if (err)
> + return err;
> + }
> + }
> +
> + err = fence_write(fence, vma, rq);
> + if (err)
> + return err;
> +
> + i915_gem_active_set(&fence->pipelined, rq);
> +out:
> + i915_gem_active_set(&vma->last_fence, rq);
> + return 0;
> +}
> +
> /**
> * i915_gem_revoke_fences - revoke fence state
> * @dev_priv: i915 device private
> @@ -429,7 +574,7 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
> vma = NULL;
> }
>
> - fence_write(reg, vma);
> + fence_write(reg, vma, NULL);
> reg->vma = vma;
> }
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> index 99a31ded4dfd..ce45972fc5c6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> @@ -47,6 +47,7 @@ struct drm_i915_fence_reg {
> * command (such as BLT on gen2/3), as a "fence".
> */
> bool dirty;
> + struct i915_gem_active pipelined;
> };
>
> #endif
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index efbfee8eac99..0c489090d4ab 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -728,6 +728,7 @@ int i915_vma_unbind(struct i915_vma *vma)
> __i915_vma_iounmap(vma);
> vma->flags &= ~I915_VMA_CAN_FENCE;
> }
> + GEM_BUG_ON(vma->fence);
>
> if (likely(!vma->vm->closed)) {
> trace_i915_vma_unbind(vma);
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 19f58af4f1bf..f0dc6eaebeab 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -367,5 +367,9 @@ i915_vma_unpin_fence(struct i915_vma *vma)
> __i915_vma_unpin_fence(vma);
> }
>
> +int __must_check i915_vma_reserve_fence(struct i915_vma *vma);
> +int i915_vma_emit_pipelined_fence(struct i915_vma *vma,
> + struct drm_i915_gem_request *rq);
> +
> #endif
>
>