Re: [RFC 5/9] drm/i915: Add per context timelines to fence object

From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: John.C.Harrison@Intel.com, Intel-GFX@Lists.FreeDesktop.Org
Subject: Re: [RFC 5/9] drm/i915: Add per context timelines to fence object
Date: Thu, 23 Jul 2015 14:50:37 +0100	[thread overview]
Message-ID: <55B0F12D.7090204@linux.intel.com> (raw)
In-Reply-To: <1437143483-6234-6-git-send-email-John.C.Harrison@Intel.com>

Hi,

On 07/17/2015 03:31 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> The fence object used inside the request structure requires a sequence number.
> Although this is not used by the i915 driver itself, it could potentially be
> used by non-i915 code if the fence is passed outside of the driver. This is the
> intention as it allows external kernel drivers and user applications to wait on
> batch buffer completion asynchronously via the dma-buff fence API.
>
> To ensure that such external users are not confused by strange things happening
> with the seqno, this patch adds in a per context timeline that can provide a
> guaranteed in-order seqno value for the fence. This is safe because the
> scheduler will not re-order batch buffers within a context - they are considered
> to be mutually dependent.
>
> [new patch in series]
>
> For: VIZ-5190
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h         | 25 ++++++++----
>   drivers/gpu/drm/i915/i915_gem.c         | 69 ++++++++++++++++++++++++++++++---
>   drivers/gpu/drm/i915/i915_gem_context.c | 15 ++++++-
>   drivers/gpu/drm/i915/intel_lrc.c        |  8 ++++
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
>   5 files changed, 103 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 0c7df46..88a4746 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -840,6 +840,15 @@ struct i915_ctx_hang_stats {
>   	bool banned;
>   };
>
> +struct i915_fence_timeline {
> +	unsigned    fence_context;
> +	uint32_t    context;

Unused field?

> +	uint32_t    next;

fence.h defines seqnos as 'unsigned', which matches this in practice, 
but maybe it would be nicer to use the same type name.

> +
> +	struct intel_context *ctx;
> +	struct intel_engine_cs *ring;
> +};
> +
>   /* This must match up with the value previously used for execbuf2.rsvd1. */
>   #define DEFAULT_CONTEXT_HANDLE 0
>
> @@ -885,6 +894,7 @@ struct intel_context {
>   		struct drm_i915_gem_object *state;
>   		struct intel_ringbuffer *ringbuf;
>   		int pin_count;
> +		struct i915_fence_timeline fence_timeline;
>   	} engine[I915_NUM_RINGS];
>
>   	struct list_head link;
> @@ -2153,13 +2163,10 @@ void i915_gem_track_fb(struct drm_i915_gem_object *old,
>   struct drm_i915_gem_request {
>   	/**
>   	 * Underlying object for implementing the signal/wait stuff.
> -	 * NB: Never call fence_later() or return this fence object to user
> -	 * land! Due to lazy allocation, scheduler re-ordering, pre-emption,
> -	 * etc., there is no guarantee at all about the validity or
> -	 * sequentiality of the fence's seqno! It is also unsafe to let
> -	 * anything outside of the i915 driver get hold of the fence object
> -	 * as the clean up when decrementing the reference count requires
> -	 * holding the driver mutex lock.
> +	 * NB: Never return this fence object to user land! It is unsafe to
> +	 * let anything outside of the i915 driver get hold of the fence
> +	 * object as the clean up when decrementing the reference count
> +	 * requires holding the driver mutex lock.
>   	 */
>   	struct fence fence;
>
> @@ -2239,6 +2246,10 @@ int i915_gem_request_alloc(struct intel_engine_cs *ring,
>   			   struct drm_i915_gem_request **req_out);
>   void i915_gem_request_cancel(struct drm_i915_gem_request *req);
>
> +int i915_create_fence_timeline(struct drm_device *dev,
> +			       struct intel_context *ctx,
> +			       struct intel_engine_cs *ring);
> +
>   static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req)
>   {
>   	return fence_is_signaled(&req->fence);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 3970250..af79716 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2671,6 +2671,25 @@ static bool i915_gem_request_is_completed(struct fence *req_fence)
>   	return i915_seqno_passed(seqno, req->seqno);
>   }
>
> +static void i915_fence_timeline_value_str(struct fence *fence, char *str, int size)
> +{
> +	struct drm_i915_gem_request *req;
> +
> +	req = container_of(fence, typeof(*req), fence);
> +
> +	/* Last signalled timeline value ??? */
> +	snprintf(str, size, "? [%d]"/*, tl->value*/, req->ring->get_seqno(req->ring, true));
> +}

If timelines are per context now maybe we should update 
i915_gem_request_get_timeline_name to be per context instead of per 
engine as well? Like this we have a name space overlap / seqno 
collisions from userspace point of view.

> +static void i915_fence_value_str(struct fence *fence, char *str, int size)
> +{
> +	struct drm_i915_gem_request *req;
> +
> +	req = container_of(fence, typeof(*req), fence);
> +
> +	snprintf(str, size, "%d [%d]", req->fence.seqno, req->seqno);
> +}
> +
>   static const struct fence_ops i915_gem_request_fops = {
>   	.get_driver_name	= i915_gem_request_get_driver_name,
>   	.get_timeline_name	= i915_gem_request_get_timeline_name,
> @@ -2678,8 +2697,48 @@ static const struct fence_ops i915_gem_request_fops = {
>   	.signaled		= i915_gem_request_is_completed,
>   	.wait			= fence_default_wait,
>   	.release		= i915_gem_request_free,
> +	.fence_value_str	= i915_fence_value_str,
> +	.timeline_value_str	= i915_fence_timeline_value_str,
>   };
>
> +int i915_create_fence_timeline(struct drm_device *dev,
> +			       struct intel_context *ctx,
> +			       struct intel_engine_cs *ring)
> +{
> +	struct i915_fence_timeline *timeline;
> +
> +	timeline = &ctx->engine[ring->id].fence_timeline;
> +
> +	if (timeline->ring)
> +		return 0;
> +
> +	timeline->fence_context = fence_context_alloc(1);
> +
> +	/*
> +	 * Start the timeline from seqno 0 as this is a special value
> +	 * that is reserved for invalid sync points.
> +	 */
> +	timeline->next       = 1;
> +	timeline->ctx        = ctx;
> +	timeline->ring       = ring;
> +
> +	return 0;
> +}
> +
> +static uint32_t i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
> +{
> +	uint32_t seqno;
> +
> +	seqno = timeline->next;
> +
> +	/* Reserve zero for invalid */
> +	if (++timeline->next == 0 ) {
> +		timeline->next = 1;
> +	}
> +
> +	return seqno;
> +}
> +
>   int i915_gem_request_alloc(struct intel_engine_cs *ring,
>   			   struct intel_context *ctx,
>   			   struct drm_i915_gem_request **req_out)
> @@ -2715,7 +2774,9 @@ int i915_gem_request_alloc(struct intel_engine_cs *ring,
>   		goto err;
>   	}
>
> -	fence_init(&req->fence, &i915_gem_request_fops, &ring->fence_lock, ring->fence_context, req->seqno);
> +	fence_init(&req->fence, &i915_gem_request_fops, &ring->fence_lock,
> +		   ctx->engine[ring->id].fence_timeline.fence_context,
> +		   i915_fence_timeline_get_next_seqno(&ctx->engine[ring->id].fence_timeline));

I suppose for debugging it could be useful to add this new seqno in 
i915_gem_request_info to have visibility at both sides. To map userspace 
seqnos to driver state.

>   	/*
>   	 * Reserve space in the ring buffer for all the commands required to
> @@ -5065,7 +5126,7 @@ i915_gem_init_hw(struct drm_device *dev)
>   {
>   	struct drm_i915_private *dev_priv = dev->dev_private;
>   	struct intel_engine_cs *ring;
> -	int ret, i, j, fence_base;
> +	int ret, i, j;
>
>   	if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt())
>   		return -EIO;
> @@ -5117,16 +5178,12 @@ i915_gem_init_hw(struct drm_device *dev)
>   			goto out;
>   	}
>
> -	fence_base = fence_context_alloc(I915_NUM_RINGS);
> -
>   	/* Now it is safe to go back round and do everything else: */
>   	for_each_ring(ring, dev_priv, i) {
>   		struct drm_i915_gem_request *req;
>
>   		WARN_ON(!ring->default_context);
>
> -		ring->fence_context = fence_base + i;
> -
>   		ret = i915_gem_request_alloc(ring, ring->default_context, &req);
>   		if (ret) {
>   			i915_gem_cleanup_ringbuffer(dev);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index b77a8f7..7eb8694 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -242,7 +242,7 @@ i915_gem_create_context(struct drm_device *dev,
>   {
>   	const bool is_global_default_ctx = file_priv == NULL;
>   	struct intel_context *ctx;
> -	int ret = 0;
> +	int i, ret = 0;
>
>   	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
>
> @@ -250,6 +250,19 @@ i915_gem_create_context(struct drm_device *dev,
>   	if (IS_ERR(ctx))
>   		return ctx;
>
> +	if (!i915.enable_execlists) {
> +		struct intel_engine_cs *ring;
> +
> +		/* Create a per context timeline for fences */
> +		for_each_ring(ring, to_i915(dev), i) {
> +			ret = i915_create_fence_timeline(dev, ctx, ring);
> +			if (ret) {
> +				DRM_ERROR("Fence timeline creation failed for legacy %s: %p\n", ring->name, ctx);
> +				goto err_destroy;
> +			}
> +		}
> +	}
> +
>   	if (is_global_default_ctx && ctx->legacy_hw_ctx.rcs_state) {
>   		/* We may need to do things with the shrinker which
>   		 * require us to immediately switch back to the default
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ee4aecd..8f255de 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2376,6 +2376,14 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>   		goto error;
>   	}
>
> +	/* Create a per context timeline for fences */
> +	ret = i915_create_fence_timeline(dev, ctx, ring);
> +	if (ret) {
> +		DRM_ERROR("Fence timeline creation failed for ring %s, ctx %p\n",
> +			  ring->name, ctx);
> +		goto error;
> +	}
> +

We must be 100% sure userspace cannot provoke context creation failure 
by accident or deliberately. Otherwise we would leak fence contexts 
until overflow which would be bad.

Perhaps matching fence_context_release for existing fence_context_alloc 
should be added?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx