On Thu, Sep 18, 2014 at 11:11:57PM -0400, Jerome Glisse wrote:
> On Thu, Sep 18, 2014 at 08:42:16PM -0400, Jerome Glisse wrote:
> > On Thu, Sep 18, 2014 at 05:30:12PM +0200, Christian König wrote:
> > > From: Christian König <christian.koenig@amd.com>
> > > 
> > > This patch adds a new 64bit ID as a result to each command submission.
> > 
> > NAK for design reasons.
> > 
> > First and foremost we DO NOT WANT TO ALLOCATE ANYTHING for submission id.
> > I know that hooking up to fence make life easy but this is not how it
> > should be done.
> > 
> > What you want to expose is the per ring 32bit seq value and report to user
> > space the value and the ring id. To handle the wrap around you add a uniq
> > 32bit wrap counter which is incremented every 1 << 30 or some big number
> > seq value number (but must be smaller than 1 << 31). Then you use arithmetic
> > to determine if a cs seq number is done or not (ie the seq wrap around
> > counter diff should not be bigger then 2 or 3) and the seq number should
> > obviously be after again using arithmetic diff like jiffies code.
> > 
> > All this being hidden in the kernel all the user space have to know is :
> > 32 bit seq value per ring
> > 32 bit ring uniq id
> > 32 wrap counter per ring
> > 
> > With such scheme you never allocate anything or worry about any fence.
> > Way simpler and less code.
> > 
> > Cheers,
> > Jérôme
> 
> Because code is clearer than words attached is a patch of what i am thinking
> about. The arithmetic should be right as well as the coherency and proper
> memory barrier. Thought it is completely untested and will require couple
> fixes for properly checking userspace arguments and other aspect (see FIXME
> in patches).
> 
> There is no allocation, it is a pretty small patch, and it provide a fast
> ligthweight solution. While the new ioctl argument and return value can be
> improved (like reporting more information to userspace like for instance
> warning userspace when it is querying very old fences). I think the overall
> design is sound.
> 
> Let me know.

Ok actually after a beer (given that planet express manual for Bender, which
also apply to me, clearly stipulate to expect malfunction if not properly
inebriate) it became clear that by reversing the problem i could make it a
lot simpler. So attached is an even simpler patch that handle gracefully user
space querying for very old fence (ie that could have wrap around).

Code explains it all.

> 
> Cheers,
> Jérôme
> 
> > 
> > 
> > > 
> > > Signed-off-by: Christian König <christian.koenig@amd.com>
> > > ---
> > >  drivers/gpu/drm/radeon/Makefile     |   2 +-
> > >  drivers/gpu/drm/radeon/radeon.h     |  13 +-
> > >  drivers/gpu/drm/radeon/radeon_cs.c  |  13 ++
> > >  drivers/gpu/drm/radeon/radeon_kms.c |  41 +++----
> > >  drivers/gpu/drm/radeon/radeon_seq.c | 229 ++++++++++++++++++++++++++++++++++++
> > >  include/uapi/drm/radeon_drm.h       |   1 +
> > >  6 files changed, 277 insertions(+), 22 deletions(-)
> > >  create mode 100644 drivers/gpu/drm/radeon/radeon_seq.c
> > > 
> > > diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile
> > > index 8cc9f68..21ee928 100644
> > > --- a/drivers/gpu/drm/radeon/Makefile
> > > +++ b/drivers/gpu/drm/radeon/Makefile
> > > @@ -81,7 +81,7 @@ radeon-y += radeon_device.o radeon_asic.o radeon_kms.o \
> > >  	rv770_smc.o cypress_dpm.o btc_dpm.o sumo_dpm.o sumo_smc.o trinity_dpm.o \
> > >  	trinity_smc.o ni_dpm.o si_smc.o si_dpm.o kv_smc.o kv_dpm.o ci_smc.o \
> > >  	ci_dpm.o dce6_afmt.o radeon_vm.o radeon_ucode.o radeon_ib.o radeon_mn.o \
> > > -	radeon_sync.o
> > > +	radeon_sync.o radeon_seq.o
> > >  
> > >  # add async DMA block
> > >  radeon-y += \
> > > diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> > > index b3b4e96..dbfd346 100644
> > > --- a/drivers/gpu/drm/radeon/radeon.h
> > > +++ b/drivers/gpu/drm/radeon/radeon.h
> > > @@ -423,6 +423,15 @@ static inline bool radeon_fence_is_earlier(struct radeon_fence *a,
> > >  }
> > >  
> > >  /*
> > > + * Userspace command submission identifier generation
> > > + */
> > > +struct radeon_seq;
> > > +
> > > +uint64_t radeon_seq_push(struct radeon_seq **seq, struct radeon_fence *fence);
> > > +struct radeon_fence *radeon_seq_query(struct radeon_seq *seq, uint64_t id);
> > > +void radeon_seq_destroy(struct radeon_seq **seq);
> > > +
> > > +/*
> > >   * Tiling registers
> > >   */
> > >  struct radeon_surface_reg {
> > > @@ -946,7 +955,9 @@ struct radeon_vm_manager {
> > >   * file private structure
> > >   */
> > >  struct radeon_fpriv {
> > > -	struct radeon_vm		vm;
> > > +	struct radeon_vm	vm;
> > > +	struct mutex		seq_lock;
> > > +	struct radeon_seq	*seq;
> > >  };
> > >  
> > >  /*
> > > diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
> > > index b8f20a5..0c0f0b3 100644
> > > --- a/drivers/gpu/drm/radeon/radeon_cs.c
> > > +++ b/drivers/gpu/drm/radeon/radeon_cs.c
> > > @@ -413,6 +413,19 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo
> > >  	unsigned i;
> > >  
> > >  	if (!error) {
> > > +		if (parser->chunk_flags &&
> > > +		    parser->chunk_flags->length_dw > 4) {
> > > +			struct radeon_fpriv *fpriv = parser->filp->driver_priv;
> > > +			uint32_t __user *to = parser->chunk_flags->user_ptr;
> > > +			uint64_t id;
> > > +
> > > +			mutex_lock(&fpriv->seq_lock);
> > > +			id = radeon_seq_push(&fpriv->seq, parser->ib.fence);
> > > +			mutex_unlock(&fpriv->seq_lock);
> > > +
> > > +			copy_to_user(&to[3], &id, sizeof(uint64_t));
> > > +		}
> > > +
> > >  		/* Sort the buffer list from the smallest to largest buffer,
> > >  		 * which affects the order of buffers in the LRU list.
> > >  		 * This assures that the smallest buffers are added first
> > > diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
> > > index 85ee6f7..38880af 100644
> > > --- a/drivers/gpu/drm/radeon/radeon_kms.c
> > > +++ b/drivers/gpu/drm/radeon/radeon_kms.c
> > > @@ -578,39 +578,34 @@ void radeon_driver_lastclose_kms(struct drm_device *dev)
> > >   */
> > >  int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
> > >  {
> > > +	struct radeon_fpriv *fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
> > >  	struct radeon_device *rdev = dev->dev_private;
> > >  	int r;
> > >  
> > > -	file_priv->driver_priv = NULL;
> > > +	if (unlikely(!fpriv))
> > > +		return -ENOMEM;
> > > +
> > > +	file_priv->driver_priv = fpriv;
> > >  
> > >  	r = pm_runtime_get_sync(dev->dev);
> > >  	if (r < 0)
> > > -		return r;
> > > +		goto error;
> > >  
> > >  	/* new gpu have virtual address space support */
> > >  	if (rdev->family >= CHIP_CAYMAN) {
> > > -		struct radeon_fpriv *fpriv;
> > >  		struct radeon_vm *vm;
> > >  		int r;
> > >  
> > > -		fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
> > > -		if (unlikely(!fpriv)) {
> > > -			return -ENOMEM;
> > > -		}
> > > -
> > >  		vm = &fpriv->vm;
> > >  		r = radeon_vm_init(rdev, vm);
> > > -		if (r) {
> > > -			kfree(fpriv);
> > > -			return r;
> > > -		}
> > > +		if (r)
> > > +			goto error;
> > >  
> > >  		if (rdev->accel_working) {
> > >  			r = radeon_bo_reserve(rdev->ring_tmp_bo.bo, false);
> > >  			if (r) {
> > >  				radeon_vm_fini(rdev, vm);
> > > -				kfree(fpriv);
> > > -				return r;
> > > +				goto error;
> > >  			}
> > >  
> > >  			/* map the ib pool buffer read only into
> > > @@ -623,16 +618,20 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
> > >  						  RADEON_VM_PAGE_SNOOPED);
> > >  			if (r) {
> > >  				radeon_vm_fini(rdev, vm);
> > > -				kfree(fpriv);
> > > -				return r;
> > > +				goto error;
> > >  			}
> > >  		}
> > > -		file_priv->driver_priv = fpriv;
> > >  	}
> > >  
> > > +	mutex_init(&fpriv->seq_lock);
> > > +
> > >  	pm_runtime_mark_last_busy(dev->dev);
> > >  	pm_runtime_put_autosuspend(dev->dev);
> > >  	return 0;
> > > +
> > > +error:
> > > +	kfree(fpriv);
> > > +	return r;
> > >  }
> > >  
> > >  /**
> > > @@ -646,11 +645,13 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
> > >  void radeon_driver_postclose_kms(struct drm_device *dev,
> > >  				 struct drm_file *file_priv)
> > >  {
> > > +	struct radeon_fpriv *fpriv = file_priv->driver_priv;
> > >  	struct radeon_device *rdev = dev->dev_private;
> > >  
> > > +	radeon_seq_destroy(&fpriv->seq);
> > > +
> > >  	/* new gpu have virtual address space support */
> > >  	if (rdev->family >= CHIP_CAYMAN && file_priv->driver_priv) {
> > > -		struct radeon_fpriv *fpriv = file_priv->driver_priv;
> > >  		struct radeon_vm *vm = &fpriv->vm;
> > >  		int r;
> > >  
> > > @@ -664,9 +665,9 @@ void radeon_driver_postclose_kms(struct drm_device *dev,
> > >  		}
> > >  
> > >  		radeon_vm_fini(rdev, vm);
> > > -		kfree(fpriv);
> > > -		file_priv->driver_priv = NULL;
> > >  	}
> > > +	kfree(fpriv);
> > > +	file_priv->driver_priv = NULL;
> > >  }
> > >  
> > >  /**
> > > diff --git a/drivers/gpu/drm/radeon/radeon_seq.c b/drivers/gpu/drm/radeon/radeon_seq.c
> > > new file mode 100644
> > > index 0000000..d8857f1
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/radeon/radeon_seq.c
> > > @@ -0,0 +1,229 @@
> > > +/*
> > > + * Copyright 2014 Advanced Micro Devices, Inc.
> > > + * All Rights Reserved.
> > > + *
> > > + * Permission is hereby granted, free of charge, to any person obtaining a
> > > + * copy of this software and associated documentation files (the
> > > + * "Software"), to deal in the Software without restriction, including
> > > + * without limitation the rights to use, copy, modify, merge, publish,
> > > + * distribute, sub license, and/or sell copies of the Software, and to
> > > + * permit persons to whom the Software is furnished to do so, subject to
> > > + * the following conditions:
> > > + *
> > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > > + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
> > > + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
> > > + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> > > + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> > > + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> > > + *
> > > + * The above copyright notice and this permission notice (including the
> > > + * next paragraph) shall be included in all copies or substantial portions
> > > + * of the Software.
> > > + *
> > > + */
> > > +/*
> > > + * Authors:
> > > + *    Christian König <christian.koenig@amd.com>
> > > + */
> > > +
> > > +#include <drm/drmP.h>
> > > +#include "radeon.h"
> > > +
> > > +/*
> > > + * ID sequences
> > > + * This code generates a 64bit identifier for a command submission.
> > > + * It works by adding the fence of the command submission to a automatically
> > > + * resizing ring buffer.
> > > + */
> > > +
> > > +struct radeon_seq {
> > > +	uint64_t		start;
> > > +	uint64_t		end;
> > > +	uint64_t		mask;
> > > +	struct radeon_seq	*replacement;
> > > +};
> > > +
> > > +/**
> > > + * radeon_seq_create - create a new sequence object
> > > + *
> > > + * @start: start value for this sequence
> > > + * @size: size of the ring buffer, must be power of two
> > > + *
> > > + * Allocate and initialize a new ring buffer and header.
> > > + * Returns NULL if allocation fails, new object otherwise.
> > > + */
> > > +static struct radeon_seq *radeon_seq_create(uint64_t start, unsigned size)
> > > +{
> > > +	unsigned bytes = sizeof(struct radeon_seq) +
> > > +		size * sizeof(struct radeon_fence *);
> > > +
> > > +	struct radeon_seq *seq;
> > > +
> > > +	seq = kmalloc(bytes, GFP_KERNEL);
> > > +	if (!seq)
> > > +		return NULL;
> > > +
> > > +	seq->start = start;
> > > +	seq->end = start;
> > > +	seq->mask = size - 1;
> > > +	seq->replacement = NULL;
> > > +
> > > +	return seq;
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_ring - get pointer to ring buffer
> > > + *
> > > + * @seq: sequence object
> > > + *
> > > + * Calculate the address of the ring buffer.
> > > + */
> > > +static struct radeon_fence **radeon_seq_ring(struct radeon_seq *seq)
> > > +{
> > > +	return (struct radeon_fence **)&seq[1];
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_try_free - try to free fences from the ring buffer
> > > + *
> > > + * @seq: sequence object
> > > + *
> > > + * Try to free fences from the start of the ring buffer.
> > > + */
> > > +static void radeon_seq_try_free(struct radeon_seq *seq)
> > > +{
> > > +	struct radeon_fence **ring = radeon_seq_ring(seq);
> > > +
> > > +	while (seq->start != seq->end) {
> > > +		unsigned idx = seq->start & seq->mask;
> > > +		struct radeon_fence *fence = ring[idx];
> > > +
> > > +		if (!radeon_fence_signaled(fence))
> > > +			break;
> > > +
> > > +		radeon_fence_unref(&fence);
> > > +		++seq->start;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_add - add new fence to the end of the ring buffer
> > > + *
> > > + * @seq: sequence object
> > > + * @f: the fence object
> > > + *
> > > + * Add the fence and return the generated ID.
> > > + */
> > > +static uint64_t radeon_seq_add(struct radeon_seq *seq, struct radeon_fence *f)
> > > +{
> > > +	struct radeon_fence **ring = radeon_seq_ring(seq);
> > > +
> > > +	ring[seq->end & seq->mask] = radeon_fence_ref(f);
> > > +	return seq->end++;
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_push - check for room and add the fence
> > > + *
> > > + * @seq: sequence object
> > > + * @fence: the fence object
> > > + *
> > > + * Check for room on the ring buffer, if there isn't enough
> > > + * reallocate the sequence object and add the fence.
> > > + * Returns the generated ID.
> > > + */
> > > +uint64_t radeon_seq_push(struct radeon_seq **seq, struct radeon_fence *fence)
> > > +{
> > > +	unsigned size_for_new_seq = 4;
> > > +	uint64_t start_for_new_seq = 1;
> > > +
> > > +	if (*seq) {
> > > +		/* try to release old replacements */
> > > +		while ((*seq)->replacement) {
> > > +			radeon_seq_try_free(*seq);
> > > +			if ((*seq)->start == (*seq)->end) {
> > > +				struct radeon_seq *repl = (*seq)->replacement;
> > > +
> > > +				kfree(*seq);
> > > +				*seq = repl;
> > > +			} else {
> > > +				/* move on to the current container */
> > > +				seq = &(*seq)->replacement;
> > > +			}
> > > +		}
> > > +
> > > +		/* check if we have enough room for one more fence */
> > > +		radeon_seq_try_free(*seq);
> > > +		if (((*seq)->end - (*seq)->start) <= (*seq)->mask)
> > > +			return radeon_seq_add(*seq, fence);
> > > +
> > > +		/* not enough room, let's allocate a replacement */
> > > +		size_for_new_seq = ((*seq)->mask + 1) * 2;
> > > +		start_for_new_seq = (*seq)->end + 1;
> > > +		seq = &(*seq)->replacement;
> > > +	}
> > > +
> > > +	*seq = radeon_seq_create(start_for_new_seq, size_for_new_seq);
> > > +	if (!*seq) {
> > > +		/* not enough memory for a new sequence object, but failing
> > > +		   here isn't a good idea either cause the commands are already
> > > +		   submitted to the hardware. So just block on the fence. */
> > > +		int r = radeon_fence_wait(fence, false);
> > > +		if (r)
> > > +			DRM_ERROR("Error waiting for fence (%d)\n", r);
> > > +		return 0;
> > > +	}
> > > +	return radeon_seq_add(*seq, fence);
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_query - lockup fence by it's ID
> > > + *
> > > + * @seq: sequence object
> > > + * @id: the generated ID
> > > + *
> > > + * Lockup the associated fence by it's ID.
> > > + * Returns fence object or NULL if it couldn't be found.
> > > + */
> > > +struct radeon_fence *radeon_seq_query(struct radeon_seq *seq, uint64_t id)
> > > +{
> > > +	struct radeon_fence **ring;
> > > +
> > > +	while (seq && id > seq->end)
> > > +		seq = seq->replacement;
> > > +
> > > +	if (!seq || id < seq->start)
> > > +		return NULL;
> > > +
> > > +	ring = radeon_seq_ring(seq);
> > > +	return ring[id & seq->mask];
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_destroy - destroy the sequence object
> > > + *
> > > + * @seq_ptr: pointer to sequence object
> > > + *
> > > + * Destroy the sequence objects and release all fence references taken.
> > > + */
> > > +void radeon_seq_destroy(struct radeon_seq **seq_ptr)
> > > +{
> > > +	struct radeon_seq *seq = *seq_ptr;
> > > +	while (seq) {
> > > +		struct radeon_seq *repl = seq->replacement;
> > > +		unsigned start = seq->start & seq->mask;
> > > +		unsigned end = seq->end & seq->mask;
> > > +		struct radeon_fence **ring;
> > > +		unsigned i;
> > > +
> > > +		ring = radeon_seq_ring(seq);
> > > +		for (i = start; i < end; ++i)
> > > +			radeon_fence_unref(&ring[i]);
> > > +
> > > +		kfree(seq);
> > > +		seq = repl;
> > > +	}
> > > +	*seq_ptr = NULL;
> > > +}
> > > diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h
> > > index 50d0fb4..6b2b2e7 100644
> > > --- a/include/uapi/drm/radeon_drm.h
> > > +++ b/include/uapi/drm/radeon_drm.h
> > > @@ -959,6 +959,7 @@ struct drm_radeon_gem_va {
> > >  #define RADEON_CS_RING_VCE          4
> > >  /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */
> > >  /* 0 = normal, + = higher priority, - = lower priority */
> > > +/* The fourth and fives dword are a 64bit fence ID generated for this CS */
> > >  
> > >  struct drm_radeon_cs_chunk {
> > >  	uint32_t		chunk_id;
> > > -- 
> > > 1.9.1
> > > 
> > > _______________________________________________
> > > dri-devel mailing list
> > > dri-devel@lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/dri-devel

> From b95fe503dad89875da9db2d11d05f93eca41232d Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
> Date: Thu, 18 Sep 2014 22:51:21 -0400
> Subject: [PATCH] drm/radeon: cs sequence id and cs completion query.
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> This report back to userspace ring id and sequence number that can
> be use by userspace to query for the completetion of the cs on the
> hardware. This also add a new ioctl to perform such query.
> 
> There is an extra value supplied to userspace the wrap counter that
> is use to detect wrap around and to gracefuly handle the case where
> userspace might be querying about a very, very, very old cs executed
> long time ago in a far far away universe.
> 
> This patch is aimed to introduce the necessary ground work for user
> space explicit synchronization. By allowing userspace to query about
> cs completetion on hardware, user space can perform operation and
> synchronization on buffer by itself without having the cs ioctl to
> implicitly wait for older cs completetion before scheduling new cs.
> This part is however left to a follow-up patch.
> 
> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> ---
>  drivers/gpu/drm/radeon/radeon.h       |  2 ++
>  drivers/gpu/drm/radeon/radeon_cs.c    | 60 +++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/radeon/radeon_fence.c |  4 +++
>  drivers/gpu/drm/radeon/radeon_kms.c   |  1 +
>  include/uapi/drm/radeon_drm.h         |  9 ++++++
>  5 files changed, 76 insertions(+)
> 
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 5f05b4c..1f1dd1f 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -118,6 +118,7 @@ extern int radeon_bapm;
>  #define RADEON_DEBUGFS_MAX_COMPONENTS		32
>  #define RADEONFB_CONN_LIMIT			4
>  #define RADEON_BIOS_NUM_SCRATCH			8
> +#define RADEON_SEQ_WRAP_VALUE			(1 << 30)
>  
>  /* fence seq are set to this number when signaled */
>  #define RADEON_FENCE_SIGNALED_SEQ		0LL
> @@ -355,6 +356,7 @@ struct radeon_fence_driver {
>  	/* sync_seq is protected by ring emission lock */
>  	uint64_t			sync_seq[RADEON_NUM_RINGS];
>  	atomic64_t			last_seq;
> +	int32_t				wrap_seq;
>  	bool				initialized;
>  };
>  
> diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
> index 83f382e..be4ae25 100644
> --- a/drivers/gpu/drm/radeon/radeon_cs.c
> +++ b/drivers/gpu/drm/radeon/radeon_cs.c
> @@ -404,6 +404,17 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo
>  		ttm_eu_fence_buffer_objects(&parser->ticket,
>  					    &parser->validated,
>  					    parser->ib.fence);
> +		if (parser->chunk_flags && parser->chunk_flags->length_dw > 4) {
> +			uint32_t __user *to = parser->chunk_flags->user_ptr;
> +			uint32_t tmp;
> +
> +			tmp = lower_32_bits(parser->ib.fence->seq);
> +			copy_to_user(&to[3], &tmp, sizeof(uint32_t));
> +			tmp = parser->ib.fence->ring;
> +			copy_to_user(&to[4], &tmp, sizeof(uint32_t));
> +			tmp = rdev->fence_drv[tmp]->wrap_seq;
> +			copy_to_user(&to[5], &tmp, sizeof(uint32_t));
> +		}
>  	} else if (backoff) {
>  		ttm_eu_backoff_reservation(&parser->ticket,
>  					   &parser->validated);
> @@ -823,3 +834,52 @@ int radeon_cs_packet_next_reloc(struct radeon_cs_parser *p,
>  		*cs_reloc = p->relocs_ptr[(idx / 4)];
>  	return 0;
>  }
> +
> +int radeon_cs_done_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> +{
> +	struct radeon_device *rdev = dev->dev_private;
> +	struct drm_radeon_cs_done *args = data;
> +	unsigned i = args->ring;
> +	int32_t last_seq, sync_seq, wrap_seq;
> +
> +	/* FIXME check args->ring value is ok. */
> +
> +	/*
> +	 * The memory barrier is match with the one in radeon_fence_emit() and
> +	 * it insure us that we get the right matching wrap_seq and sync_seq.
> +	 *
> +	 * Note that no need to protect the fence_drv.sync_seq here as barrier
> +	 * insure us we will get the coherency we need.
> +	 */
> +	wrap_seq = ACCESS_ONCE(rdev->fence_drv[i].wrap_seq);
> +	smp_rmb();
> +	sync_seq = lower_32_bits(ACCESS_ONCE(rdev->fence_drv[i].sync_seq[i]));
> +
> +	/*
> +	 * So if current wrap_seq and one we are queried with are differ by
> +	 * more than one this means that we are queried about a very old fence
> +	 * seq value and we can assume it is long done.
> +	 *
> +	 * Well this is not entirely true, for it to be true we would need to
> +	 * stall when we increment the wrap counter if cs in previous wrap were
> +	 * not completed but this is highly unlikely. So live with the trill of
> +	 * it going wrong !
> +	 */
> +	if (abs((unsigned)wrap_seq - (unsigned)args->wrap) > 1)
> +		return 1;
> +	/* Now check if currently reported fence seq is done or not. */
> +	/* FIXME call fence func to update last_seq just in case. */
> +	last_seq = lower_32_bits(atomic64_read(&rdev->fence_drv[i].last_seq));
> +	if ((last_seq - arg->seq) >= 0)
> +		return 1;
> +	/*
> +	 * Last failsafe to handle the horrible case were userspace holded on
> +	 * a wrap and seq value for so long without querying that it we wrapped
> +	 * around. This is here to avoid userspace waiting for a fence that was
> +	 * emited a long time ago but the current sync_seq[ring] value migth be
> +	 * stuck and thus we might go bigger than this very old seq value.
> +	 */
> +	if (((sync_seq - args->seq) < 0) && args->wrap == wrap_seq)
> +		return 1;
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index 9137870..a6adcf6 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -119,6 +119,10 @@ int radeon_fence_emit(struct radeon_device *rdev,
>  	kref_init(&((*fence)->kref));
>  	(*fence)->rdev = rdev;
>  	(*fence)->seq = ++rdev->fence_drv[ring].sync_seq[ring];
> +	/* Barrier is important for radeon_fence_cs_done_ioctl. */
> +	smp_wmb();
> +	if (rdev->fence_drv[ring].sync_seq[ring] == RADEON_SEQ_WRAP_VALUE)
> +		rdev->fence_drv[ring].wrap_seq++;
>  	(*fence)->ring = ring;
>  	radeon_fence_ring_emit(rdev, ring, *fence);
>  	trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
> diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
> index eb7164d..c9cfcf5 100644
> --- a/drivers/gpu/drm/radeon/radeon_kms.c
> +++ b/drivers/gpu/drm/radeon/radeon_kms.c
> @@ -885,5 +885,6 @@ const struct drm_ioctl_desc radeon_ioctls_kms[] = {
>  	DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(RADEON_GEM_OP, radeon_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(RADEON_CS_DONE, radeon_cs_done_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
>  };
>  int radeon_max_kms_ioctl = ARRAY_SIZE(radeon_ioctls_kms);
> diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h
> index fea6099..a0c215d 100644
> --- a/include/uapi/drm/radeon_drm.h
> +++ b/include/uapi/drm/radeon_drm.h
> @@ -554,6 +554,7 @@ typedef struct {
>  #define DRM_IOCTL_RADEON_GEM_BUSY	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy)
>  #define DRM_IOCTL_RADEON_GEM_VA		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va)
>  #define DRM_IOCTL_RADEON_GEM_OP		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_OP, struct drm_radeon_gem_op)
> +#define DRM_IOCTL_RADEON_CS_DONE	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_CS_DONE, struct drm_radeon_cs_done)
>  
>  typedef struct drm_radeon_init {
>  	enum {
> @@ -936,6 +937,7 @@ struct drm_radeon_gem_va {
>  #define RADEON_CS_RING_VCE          4
>  /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */
>  /* 0 = normal, + = higher priority, - = lower priority */
> +/* The fifth, sixth, seventh  dword are a 32bit fence ID, ring id and wrap id of this CS */
>  
>  struct drm_radeon_cs_chunk {
>  	uint32_t		chunk_id;
> @@ -1038,4 +1040,11 @@ struct drm_radeon_info {
>  
>  #define CIK_TILE_MODE_DEPTH_STENCIL_1D		5
>  
> +struct drm_radeon_cs_done {
> +	int32_t			seq;
> +	int32_t			ring;
> +	int32_t			wrap;
> +	int32_t			pad;
> +};
> +
>  #endif
> -- 
> 1.9.3
>