All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
@ 2011-08-07 20:39 Marek Olšák
  2011-08-07 20:39 ` [PATCH 2/2] drm/radeon/kms: add a new gem_wait ioctl with read/write flags Marek Olšák
                   ` (3 more replies)
  0 siblings, 4 replies; 31+ messages in thread
From: Marek Olšák @ 2011-08-07 20:39 UTC (permalink / raw)
  To: dri-devel

Sometimes we want to know whether a buffer is busy and wait for it (bo_wait).
However, sometimes it would be more useful to be able to query whether
a buffer is busy and being either read or written, and wait until it's stopped
being either read or written. The point of this is to be able to avoid
unnecessary waiting, e.g. if a GPU has written something to a buffer and is now
reading that buffer, and a CPU wants to map that buffer for read, it needs to
only wait for the last write. If there were no write, there wouldn't be any
waiting needed.

This, or course, requires user space drivers to send read/write flags
with each relocation (like we have read/write domains in radeon, so we can
actually use those for something useful now).

Now how this patch works:

The read/write flags should passed to ttm_validate_buffer. TTM maintains
separate sync objects of the last read and write for each buffer, in addition
to the sync object of the last use of a buffer. ttm_bo_wait then operates
with one the sync objects.

Signed-off-by: Marek Olšák <maraeo@gmail.com>
---
 drivers/gpu/drm/nouveau/nouveau_bo.c    |    3 +-
 drivers/gpu/drm/nouveau/nouveau_gem.c   |    5 +-
 drivers/gpu/drm/radeon/radeon_cs.c      |    1 +
 drivers/gpu/drm/radeon/radeon_object.h  |    2 +-
 drivers/gpu/drm/ttm/ttm_bo.c            |   97 ++++++++++++++++++++++--------
 drivers/gpu/drm/ttm/ttm_bo_util.c       |   26 +++++++--
 drivers/gpu/drm/ttm/ttm_bo_vm.c         |    2 +-
 drivers/gpu/drm/ttm/ttm_execbuf_util.c  |   17 +++++-
 drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c |    1 +
 include/drm/ttm/ttm_bo_api.h            |   16 +++++-
 include/drm/ttm/ttm_execbuf_util.h      |    6 ++
 11 files changed, 137 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 890d50e..e87e24b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1104,7 +1104,8 @@ nouveau_bo_vma_del(struct nouveau_bo *nvbo, struct nouveau_vma *vma)
 	if (vma->node) {
 		if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) {
 			spin_lock(&nvbo->bo.bdev->fence_lock);
-			ttm_bo_wait(&nvbo->bo, false, false, false);
+			ttm_bo_wait(&nvbo->bo, false, false, false,
+				    TTM_USAGE_READWRITE);
 			spin_unlock(&nvbo->bo.bdev->fence_lock);
 			nouveau_vm_unmap(vma);
 		}
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 5f0bc57..322bf62 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -589,7 +589,8 @@ nouveau_gem_pushbuf_reloc_apply(struct drm_device *dev,
 		}
 
 		spin_lock(&nvbo->bo.bdev->fence_lock);
-		ret = ttm_bo_wait(&nvbo->bo, false, false, false);
+		ret = ttm_bo_wait(&nvbo->bo, false, false, false,
+				  TTM_USAGE_READWRITE);
 		spin_unlock(&nvbo->bo.bdev->fence_lock);
 		if (ret) {
 			NV_ERROR(dev, "reloc wait_idle failed: %d\n", ret);
@@ -825,7 +826,7 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
 	nvbo = nouveau_gem_object(gem);
 
 	spin_lock(&nvbo->bo.bdev->fence_lock);
-	ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
+	ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait, TTM_USAGE_READWRITE);
 	spin_unlock(&nvbo->bo.bdev->fence_lock);
 	drm_gem_object_unreference_unlocked(gem);
 	return ret;
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index fae00c0..14e8531 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -80,6 +80,7 @@ int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
 			p->relocs[i].lobj.wdomain = r->write_domain;
 			p->relocs[i].lobj.rdomain = r->read_domains;
 			p->relocs[i].lobj.tv.bo = &p->relocs[i].robj->tbo;
+			p->relocs[i].lobj.tv.usage = TTM_USAGE_READWRITE;
 			p->relocs[i].handle = r->handle;
 			p->relocs[i].flags = r->flags;
 			radeon_bo_list_add_object(&p->relocs[i].lobj,
diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h
index ede6c13..e9dc8b2 100644
--- a/drivers/gpu/drm/radeon/radeon_object.h
+++ b/drivers/gpu/drm/radeon/radeon_object.h
@@ -130,7 +130,7 @@ static inline int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type,
 	if (mem_type)
 		*mem_type = bo->tbo.mem.mem_type;
 	if (bo->tbo.sync_obj)
-		r = ttm_bo_wait(&bo->tbo, true, true, no_wait);
+		r = ttm_bo_wait(&bo->tbo, true, true, no_wait, TTM_USAGE_READWRITE);
 	spin_unlock(&bo->tbo.bdev->fence_lock);
 	ttm_bo_unreserve(&bo->tbo);
 	return r;
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 56619f6..bb8c86d 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -494,7 +494,7 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 	int ret;
 
 	spin_lock(&bdev->fence_lock);
-	(void) ttm_bo_wait(bo, false, false, true);
+	(void) ttm_bo_wait(bo, false, false, true, TTM_USAGE_READWRITE);
 	if (!bo->sync_obj) {
 
 		spin_lock(&glob->lru_lock);
@@ -562,7 +562,8 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
 
 retry:
 	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
+	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu,
+			  TTM_USAGE_READWRITE);
 	spin_unlock(&bdev->fence_lock);
 
 	if (unlikely(ret != 0))
@@ -721,7 +722,8 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible,
 	int ret = 0;
 
 	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
+	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu,
+			  TTM_USAGE_READWRITE);
 	spin_unlock(&bdev->fence_lock);
 
 	if (unlikely(ret != 0)) {
@@ -1068,7 +1070,8 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
 	 * instead of doing it here.
 	 */
 	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
+	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu,
+			  TTM_USAGE_READWRITE);
 	spin_unlock(&bdev->fence_lock);
 	if (ret)
 		return ret;
@@ -1688,34 +1691,83 @@ out_unlock:
 	return ret;
 }
 
+static void ttm_bo_unref_sync_obj_locked(struct ttm_buffer_object *bo,
+					 void *sync_obj,
+					 void **extra_sync_obj)
+{
+	struct ttm_bo_device *bdev = bo->bdev;
+	struct ttm_bo_driver *driver = bdev->driver;
+	void *tmp_obj = NULL, *tmp_obj_read = NULL, *tmp_obj_write = NULL;
+
+	/* We must unref the sync obj wherever it's ref'd.
+	 * Note that if we unref bo->sync_obj, we can unref both the read
+	 * and write sync objs too, because they can't be newer than
+	 * bo->sync_obj, so they are no longer relevant. */
+	if (sync_obj == bo->sync_obj ||
+	    sync_obj == bo->sync_obj_read) {
+		tmp_obj_read = bo->sync_obj_read;
+		bo->sync_obj_read = NULL;
+	}
+	if (sync_obj == bo->sync_obj ||
+	    sync_obj == bo->sync_obj_write) {
+		tmp_obj_write = bo->sync_obj_write;
+		bo->sync_obj_write = NULL;
+	}
+	if (sync_obj == bo->sync_obj) {
+		tmp_obj = bo->sync_obj;
+		bo->sync_obj = NULL;
+	}
+
+	clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
+	spin_unlock(&bdev->fence_lock);
+	if (tmp_obj)
+		driver->sync_obj_unref(&tmp_obj);
+	if (tmp_obj_read)
+		driver->sync_obj_unref(&tmp_obj_read);
+	if (tmp_obj_write)
+		driver->sync_obj_unref(&tmp_obj_write);
+	if (extra_sync_obj)
+		driver->sync_obj_unref(extra_sync_obj);
+	spin_lock(&bdev->fence_lock);
+}
+
 int ttm_bo_wait(struct ttm_buffer_object *bo,
-		bool lazy, bool interruptible, bool no_wait)
+		bool lazy, bool interruptible, bool no_wait,
+		enum ttm_buffer_usage usage)
 {
 	struct ttm_bo_driver *driver = bo->bdev->driver;
 	struct ttm_bo_device *bdev = bo->bdev;
 	void *sync_obj;
 	void *sync_obj_arg;
 	int ret = 0;
+	void **bo_sync_obj;
 
-	if (likely(bo->sync_obj == NULL))
+	switch (usage) {
+	case TTM_USAGE_READ:
+		bo_sync_obj = &bo->sync_obj_read;
+		break;
+	case TTM_USAGE_WRITE:
+		bo_sync_obj = &bo->sync_obj_write;
+		break;
+	case TTM_USAGE_READWRITE:
+	default:
+		bo_sync_obj = &bo->sync_obj;
+	}
+
+	if (likely(*bo_sync_obj == NULL))
 		return 0;
 
-	while (bo->sync_obj) {
+	while (*bo_sync_obj) {
 
-		if (driver->sync_obj_signaled(bo->sync_obj, bo->sync_obj_arg)) {
-			void *tmp_obj = bo->sync_obj;
-			bo->sync_obj = NULL;
-			clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
-			spin_unlock(&bdev->fence_lock);
-			driver->sync_obj_unref(&tmp_obj);
-			spin_lock(&bdev->fence_lock);
+		if (driver->sync_obj_signaled(*bo_sync_obj, bo->sync_obj_arg)) {
+			ttm_bo_unref_sync_obj_locked(bo, *bo_sync_obj, NULL);
 			continue;
 		}
 
 		if (no_wait)
 			return -EBUSY;
 
-		sync_obj = driver->sync_obj_ref(bo->sync_obj);
+		sync_obj = driver->sync_obj_ref(*bo_sync_obj);
 		sync_obj_arg = bo->sync_obj_arg;
 		spin_unlock(&bdev->fence_lock);
 		ret = driver->sync_obj_wait(sync_obj, sync_obj_arg,
@@ -1726,16 +1778,9 @@ int ttm_bo_wait(struct ttm_buffer_object *bo,
 			return ret;
 		}
 		spin_lock(&bdev->fence_lock);
-		if (likely(bo->sync_obj == sync_obj &&
+		if (likely(*bo_sync_obj == sync_obj &&
 			   bo->sync_obj_arg == sync_obj_arg)) {
-			void *tmp_obj = bo->sync_obj;
-			bo->sync_obj = NULL;
-			clear_bit(TTM_BO_PRIV_FLAG_MOVING,
-				  &bo->priv_flags);
-			spin_unlock(&bdev->fence_lock);
-			driver->sync_obj_unref(&sync_obj);
-			driver->sync_obj_unref(&tmp_obj);
-			spin_lock(&bdev->fence_lock);
+			ttm_bo_unref_sync_obj_locked(bo, *bo_sync_obj, &sync_obj);
 		} else {
 			spin_unlock(&bdev->fence_lock);
 			driver->sync_obj_unref(&sync_obj);
@@ -1759,7 +1804,7 @@ int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait)
 	if (unlikely(ret != 0))
 		return ret;
 	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, true, no_wait);
+	ret = ttm_bo_wait(bo, false, true, no_wait, TTM_USAGE_READWRITE);
 	spin_unlock(&bdev->fence_lock);
 	if (likely(ret == 0))
 		atomic_inc(&bo->cpu_writers);
@@ -1833,7 +1878,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
 	 */
 
 	spin_lock(&bo->bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, false, false);
+	ret = ttm_bo_wait(bo, false, false, false, TTM_USAGE_READWRITE);
 	spin_unlock(&bo->bdev->fence_lock);
 
 	if (unlikely(ret != 0))
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index 77dbf40..0399573 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -436,6 +436,8 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo,
 	atomic_set(&fbo->cpu_writers, 0);
 
 	fbo->sync_obj = driver->sync_obj_ref(bo->sync_obj);
+	fbo->sync_obj_read = driver->sync_obj_ref(bo->sync_obj_read);
+	fbo->sync_obj_write = driver->sync_obj_ref(bo->sync_obj_write);
 	kref_init(&fbo->list_kref);
 	kref_init(&fbo->kref);
 	fbo->destroy = &ttm_transfered_destroy;
@@ -618,20 +620,30 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
 	struct ttm_mem_reg *old_mem = &bo->mem;
 	int ret;
 	struct ttm_buffer_object *ghost_obj;
-	void *tmp_obj = NULL;
+	void *tmp_obj = NULL, *tmp_obj_read = NULL, *tmp_obj_write = NULL;
 
 	spin_lock(&bdev->fence_lock);
-	if (bo->sync_obj) {
+	if (bo->sync_obj)
 		tmp_obj = bo->sync_obj;
-		bo->sync_obj = NULL;
-	}
+	if (bo->sync_obj_read)
+		tmp_obj_read = bo->sync_obj_read;
+	if (bo->sync_obj_write)
+		tmp_obj_write = bo->sync_obj_write;
+
 	bo->sync_obj = driver->sync_obj_ref(sync_obj);
+	bo->sync_obj_read = driver->sync_obj_ref(sync_obj);
+	bo->sync_obj_write = driver->sync_obj_ref(sync_obj);
 	bo->sync_obj_arg = sync_obj_arg;
 	if (evict) {
-		ret = ttm_bo_wait(bo, false, false, false);
+		ret = ttm_bo_wait(bo, false, false, false,
+				  TTM_USAGE_READWRITE);
 		spin_unlock(&bdev->fence_lock);
 		if (tmp_obj)
 			driver->sync_obj_unref(&tmp_obj);
+		if (tmp_obj_read)
+			driver->sync_obj_unref(&tmp_obj_read);
+		if (tmp_obj_write)
+			driver->sync_obj_unref(&tmp_obj_write);
 		if (ret)
 			return ret;
 
@@ -655,6 +667,10 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
 		spin_unlock(&bdev->fence_lock);
 		if (tmp_obj)
 			driver->sync_obj_unref(&tmp_obj);
+		if (tmp_obj_read)
+			driver->sync_obj_unref(&tmp_obj_read);
+		if (tmp_obj_write)
+			driver->sync_obj_unref(&tmp_obj_write);
 
 		ret = ttm_buffer_object_transfer(bo, &ghost_obj);
 		if (ret)
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 221b924..ff1e26f 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -122,7 +122,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	spin_lock(&bdev->fence_lock);
 	if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
-		ret = ttm_bo_wait(bo, false, true, false);
+		ret = ttm_bo_wait(bo, false, true, false, TTM_USAGE_READWRITE);
 		spin_unlock(&bdev->fence_lock);
 		if (unlikely(ret != 0)) {
 			retval = (ret != -ERESTARTSYS) ?
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index 3832fe1..0438296 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -223,6 +223,14 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
 		bo = entry->bo;
 		entry->old_sync_obj = bo->sync_obj;
 		bo->sync_obj = driver->sync_obj_ref(sync_obj);
+		if (entry->usage & TTM_USAGE_READ) {
+			entry->old_sync_obj_read = bo->sync_obj_read;
+			bo->sync_obj_read = driver->sync_obj_ref(sync_obj);
+		}
+		if (entry->usage & TTM_USAGE_WRITE) {
+			entry->old_sync_obj_write = bo->sync_obj_write;
+			bo->sync_obj_write = driver->sync_obj_ref(sync_obj);
+		}
 		bo->sync_obj_arg = entry->new_sync_obj_arg;
 		ttm_bo_unreserve_locked(bo);
 		entry->reserved = false;
@@ -231,8 +239,15 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
 	spin_unlock(&bdev->fence_lock);
 
 	list_for_each_entry(entry, list, head) {
-		if (entry->old_sync_obj)
+		if (entry->old_sync_obj) {
 			driver->sync_obj_unref(&entry->old_sync_obj);
+		}
+		if (entry->old_sync_obj_read) {
+			driver->sync_obj_unref(&entry->old_sync_obj_read);
+		}
+		if (entry->old_sync_obj_write) {
+			driver->sync_obj_unref(&entry->old_sync_obj_write);
+		}
 	}
 }
 EXPORT_SYMBOL(ttm_eu_fence_buffer_objects);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index 41b95ed..8ca3ddb 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -224,6 +224,7 @@ static int vmw_translate_guest_ptr(struct vmw_private *dev_priv,
 	if (unlikely(cur_validate_node == sw_context->cur_val_buf)) {
 		val_buf = &sw_context->val_bufs[cur_validate_node];
 		val_buf->bo = ttm_bo_reference(bo);
+		val_buf->usage = TTM_USAGE_READWRITE;
 		val_buf->new_sync_obj_arg = (void *) dev_priv;
 		list_add_tail(&val_buf->head, &sw_context->validate_nodes);
 		++sw_context->cur_val_buf;
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 42e3469..da957bf 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -44,6 +44,11 @@ struct ttm_bo_device;
 
 struct drm_mm_node;
 
+enum ttm_buffer_usage {
+    TTM_USAGE_READ = 1,
+    TTM_USAGE_WRITE = 2,
+    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
+};
 
 /**
  * struct ttm_placement
@@ -174,7 +179,10 @@ struct ttm_tt;
  * the bo_device::lru_lock.
  * @reserved: Deadlock-free lock used for synchronization state transitions.
  * @sync_obj_arg: Opaque argument to synchronization object function.
- * @sync_obj: Pointer to a synchronization object.
+ * @sync_obj: Pointer to a synchronization object of a last read or write,
+ * whichever is later.
+ * @sync_obj_read: Pointer to a synchronization object of a last read.
+ * @sync_obj_write: Pointer to a synchronization object of a last write.
  * @priv_flags: Flags describing buffer object internal state.
  * @vm_rb: Rb node for the vm rb tree.
  * @vm_node: Address space manager node.
@@ -258,6 +266,8 @@ struct ttm_buffer_object {
 
 	void *sync_obj_arg;
 	void *sync_obj;
+	void *sync_obj_read;
+	void *sync_obj_write;
 	unsigned long priv_flags;
 
 	/**
@@ -325,6 +335,7 @@ ttm_bo_reference(struct ttm_buffer_object *bo)
  * @bo:  The buffer object.
  * @interruptible:  Use interruptible wait.
  * @no_wait:  Return immediately if buffer is busy.
+ * @usage:  Whether to wait for the last read and/or the last write.
  *
  * This function must be called with the bo::mutex held, and makes
  * sure any previous rendering to the buffer is completed.
@@ -334,7 +345,8 @@ ttm_bo_reference(struct ttm_buffer_object *bo)
  * Returns -ERESTARTSYS if interrupted by a signal.
  */
 extern int ttm_bo_wait(struct ttm_buffer_object *bo, bool lazy,
-		       bool interruptible, bool no_wait);
+		       bool interruptible, bool no_wait,
+		       enum ttm_buffer_usage usage);
 /**
  * ttm_bo_validate
  *
diff --git a/include/drm/ttm/ttm_execbuf_util.h b/include/drm/ttm/ttm_execbuf_util.h
index 26cc7f9..375f299 100644
--- a/include/drm/ttm/ttm_execbuf_util.h
+++ b/include/drm/ttm/ttm_execbuf_util.h
@@ -41,20 +41,26 @@
  * @bo:             refcounted buffer object pointer.
  * @new_sync_obj_arg: New sync_obj_arg for @bo, to be used once
  * adding a new sync object.
+ * @usage           Indicates how @bo is used by the device.
  * @reserved:       Indicates whether @bo has been reserved for validation.
  * @removed:        Indicates whether @bo has been removed from lru lists.
  * @put_count:      Number of outstanding references on bo::list_kref.
  * @old_sync_obj:   Pointer to a sync object about to be unreferenced
+ * @old_sync_obj_read: Pointer to a read sync object about to be unreferenced.
+ * @old_sync_obj_write: Pointer to a write sync object about to be unreferenced.
  */
 
 struct ttm_validate_buffer {
 	struct list_head head;
 	struct ttm_buffer_object *bo;
 	void *new_sync_obj_arg;
+	enum ttm_buffer_usage usage;
 	bool reserved;
 	bool removed;
 	int put_count;
 	void *old_sync_obj;
+	void *old_sync_obj_read;
+	void *old_sync_obj_write;
 };
 
 /**
-- 
1.7.4.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 2/2] drm/radeon/kms: add a new gem_wait ioctl with read/write flags
  2011-08-07 20:39 [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write Marek Olšák
@ 2011-08-07 20:39 ` Marek Olšák
  2011-08-12 17:22   ` Jerome Glisse
  2011-08-12 17:21 ` [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write Jerome Glisse
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 31+ messages in thread
From: Marek Olšák @ 2011-08-07 20:39 UTC (permalink / raw)
  To: dri-devel

The new DRM_RADEON_GEM_WAIT ioctl combines GEM_WAIT_IDLE and GEM_BUSY (there
is a NO_WAIT flag to get the latter) with USAGE_READ and USAGE_WRITE flags
to take advantage of the new ttm_bo_wait changes.

Also bump the DRM version.

Signed-off-by: Marek Olšák <maraeo@gmail.com>
---
 drivers/gpu/drm/radeon/radeon.h        |    2 +
 drivers/gpu/drm/radeon/radeon_cs.c     |    5 +++-
 drivers/gpu/drm/radeon/radeon_drv.c    |    3 +-
 drivers/gpu/drm/radeon/radeon_gem.c    |   36 +++++++++++++++++++++++++++++--
 drivers/gpu/drm/radeon/radeon_kms.c    |    1 +
 drivers/gpu/drm/radeon/radeon_object.h |    4 +-
 include/drm/radeon_drm.h               |   11 +++++++++
 7 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 32807ba..0040d28 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -1143,6 +1143,8 @@ int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *filp);
 int radeon_gem_get_tiling_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *filp);
+int radeon_gem_wait_ioctl(struct drm_device *dev, void *data,
+			  struct drm_file *filp);
 
 /* VRAM scratch page for HDP bug */
 struct r700_vram_scratch {
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 14e8531..f0b9066 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -80,7 +80,10 @@ int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
 			p->relocs[i].lobj.wdomain = r->write_domain;
 			p->relocs[i].lobj.rdomain = r->read_domains;
 			p->relocs[i].lobj.tv.bo = &p->relocs[i].robj->tbo;
-			p->relocs[i].lobj.tv.usage = TTM_USAGE_READWRITE;
+			if (r->read_domains)
+				p->relocs[i].lobj.tv.usage |= TTM_USAGE_READ;
+			if (r->write_domain)
+				p->relocs[i].lobj.tv.usage |= TTM_USAGE_WRITE;
 			p->relocs[i].handle = r->handle;
 			p->relocs[i].flags = r->flags;
 			radeon_bo_list_add_object(&p->relocs[i].lobj,
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index e71d2ed..bd187e0 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -52,9 +52,10 @@
  *   2.9.0 - r600 tiling (s3tc,rgtc) working, SET_PREDICATION packet 3 on r600 + eg, backend query
  *   2.10.0 - fusion 2D tiling
  *   2.11.0 - backend map, initial compute support for the CS checker
+ *   2.12.0 - DRM_RADEON_GEM_WAIT ioctl
  */
 #define KMS_DRIVER_MAJOR	2
-#define KMS_DRIVER_MINOR	11
+#define KMS_DRIVER_MINOR	12
 #define KMS_DRIVER_PATCHLEVEL	0
 int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags);
 int radeon_driver_unload_kms(struct drm_device *dev);
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index aa1ca2d..2edc2a4 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -122,7 +122,7 @@ int radeon_gem_set_domain(struct drm_gem_object *gobj,
 	}
 	if (domain == RADEON_GEM_DOMAIN_CPU) {
 		/* Asking for cpu access wait for object idle */
-		r = radeon_bo_wait(robj, NULL, false);
+		r = radeon_bo_wait(robj, NULL, false, TTM_USAGE_READWRITE);
 		if (r) {
 			printk(KERN_ERR "Failed to wait for object !\n");
 			return r;
@@ -273,7 +273,7 @@ int radeon_gem_busy_ioctl(struct drm_device *dev, void *data,
 		return -ENOENT;
 	}
 	robj = gem_to_radeon_bo(gobj);
-	r = radeon_bo_wait(robj, &cur_placement, true);
+	r = radeon_bo_wait(robj, &cur_placement, true, TTM_USAGE_READWRITE);
 	switch (cur_placement) {
 	case TTM_PL_VRAM:
 		args->domain = RADEON_GEM_DOMAIN_VRAM;
@@ -303,7 +303,7 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data,
 		return -ENOENT;
 	}
 	robj = gem_to_radeon_bo(gobj);
-	r = radeon_bo_wait(robj, NULL, false);
+	r = radeon_bo_wait(robj, NULL, false, TTM_USAGE_READWRITE);
 	/* callback hw specific functions if any */
 	if (robj->rdev->asic->ioctl_wait_idle)
 		robj->rdev->asic->ioctl_wait_idle(robj->rdev, robj);
@@ -311,6 +311,36 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data,
 	return r;
 }
 
+int radeon_gem_wait_ioctl(struct drm_device *dev, void *data,
+			  struct drm_file *filp)
+{
+	struct drm_radeon_gem_wait *args = data;
+	struct drm_gem_object *gobj;
+	struct radeon_bo *robj;
+	bool no_wait = (args->flags & RADEON_GEM_NO_WAIT) != 0;
+	enum ttm_buffer_usage usage = 0;
+	int r;
+
+	if (args->flags & RADEON_GEM_USAGE_READ)
+		usage |= TTM_USAGE_READ;
+	if (args->flags & RADEON_GEM_USAGE_WRITE)
+		usage |= TTM_USAGE_WRITE;
+	if (!usage)
+		usage = TTM_USAGE_READWRITE;
+
+	gobj = drm_gem_object_lookup(dev, filp, args->handle);
+	if (gobj == NULL) {
+		return -ENOENT;
+	}
+	robj = gem_to_radeon_bo(gobj);
+	r = radeon_bo_wait(robj, NULL, no_wait, usage);
+	/* callback hw specific functions if any */
+	if (!no_wait && robj->rdev->asic->ioctl_wait_idle)
+		robj->rdev->asic->ioctl_wait_idle(robj->rdev, robj);
+	drm_gem_object_unreference_unlocked(gobj);
+	return r;
+}
+
 int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *filp)
 {
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
index be2c122..a749c26 100644
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -451,5 +451,6 @@ struct drm_ioctl_desc radeon_ioctls_kms[] = {
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_SET_TILING, radeon_gem_set_tiling_ioctl, DRM_AUTH|DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_GET_TILING, radeon_gem_get_tiling_ioctl, DRM_AUTH|DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(RADEON_GEM_WAIT, radeon_gem_wait_ioctl, DRM_AUTH|DRM_UNLOCKED),
 };
 int radeon_max_kms_ioctl = DRM_ARRAY_SIZE(radeon_ioctls_kms);
diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h
index e9dc8b2..a057a8e 100644
--- a/drivers/gpu/drm/radeon/radeon_object.h
+++ b/drivers/gpu/drm/radeon/radeon_object.h
@@ -119,7 +119,7 @@ static inline u64 radeon_bo_mmap_offset(struct radeon_bo *bo)
 }
 
 static inline int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type,
-					bool no_wait)
+				 bool no_wait, enum ttm_buffer_usage usage)
 {
 	int r;
 
@@ -130,7 +130,7 @@ static inline int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type,
 	if (mem_type)
 		*mem_type = bo->tbo.mem.mem_type;
 	if (bo->tbo.sync_obj)
-		r = ttm_bo_wait(&bo->tbo, true, true, no_wait, TTM_USAGE_READWRITE);
+		r = ttm_bo_wait(&bo->tbo, true, true, no_wait, usage);
 	spin_unlock(&bo->tbo.bdev->fence_lock);
 	ttm_bo_unreserve(&bo->tbo);
 	return r;
diff --git a/include/drm/radeon_drm.h b/include/drm/radeon_drm.h
index b65be60..939b854 100644
--- a/include/drm/radeon_drm.h
+++ b/include/drm/radeon_drm.h
@@ -509,6 +509,7 @@ typedef struct {
 #define DRM_RADEON_GEM_SET_TILING	0x28
 #define DRM_RADEON_GEM_GET_TILING	0x29
 #define DRM_RADEON_GEM_BUSY		0x2a
+#define DRM_RADEON_GEM_WAIT		0x2b
 
 #define DRM_IOCTL_RADEON_CP_INIT    DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_INIT, drm_radeon_init_t)
 #define DRM_IOCTL_RADEON_CP_START   DRM_IO(  DRM_COMMAND_BASE + DRM_RADEON_CP_START)
@@ -550,6 +551,7 @@ typedef struct {
 #define DRM_IOCTL_RADEON_GEM_SET_TILING	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_SET_TILING, struct drm_radeon_gem_set_tiling)
 #define DRM_IOCTL_RADEON_GEM_GET_TILING	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_GET_TILING, struct drm_radeon_gem_get_tiling)
 #define DRM_IOCTL_RADEON_GEM_BUSY	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy)
+#define DRM_IOCTL_RADEON_GEM_WAIT	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_WAIT, struct drm_radeon_gem_wait)
 
 typedef struct drm_radeon_init {
 	enum {
@@ -846,6 +848,15 @@ struct drm_radeon_gem_busy {
 	uint32_t        domain;
 };
 
+#define RADEON_GEM_NO_WAIT	0x1
+#define RADEON_GEM_USAGE_READ	0x2
+#define RADEON_GEM_USAGE_WRITE	0x4
+
+struct drm_radeon_gem_wait {
+	uint32_t	handle;
+	uint32_t        flags;  /* one of RADEON_GEM_* */
+};
+
 struct drm_radeon_gem_pread {
 	/** Handle for the object being read. */
 	uint32_t handle;
-- 
1.7.4.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-08-07 20:39 [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write Marek Olšák
  2011-08-07 20:39 ` [PATCH 2/2] drm/radeon/kms: add a new gem_wait ioctl with read/write flags Marek Olšák
@ 2011-08-12 17:21 ` Jerome Glisse
  2011-08-13 20:32   ` Marek Olšák
  2011-10-04 11:48 ` Thomas Hellstrom
  2011-10-07  8:58 ` Thomas Hellstrom
  3 siblings, 1 reply; 31+ messages in thread
From: Jerome Glisse @ 2011-08-12 17:21 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

On Sun, Aug 7, 2011 at 4:39 PM, Marek Olšák <maraeo@gmail.com> wrote:
> Sometimes we want to know whether a buffer is busy and wait for it (bo_wait).
> However, sometimes it would be more useful to be able to query whether
> a buffer is busy and being either read or written, and wait until it's stopped
> being either read or written. The point of this is to be able to avoid
> unnecessary waiting, e.g. if a GPU has written something to a buffer and is now
> reading that buffer, and a CPU wants to map that buffer for read, it needs to
> only wait for the last write. If there were no write, there wouldn't be any
> waiting needed.
>
> This, or course, requires user space drivers to send read/write flags
> with each relocation (like we have read/write domains in radeon, so we can
> actually use those for something useful now).
>
> Now how this patch works:
>
> The read/write flags should passed to ttm_validate_buffer. TTM maintains
> separate sync objects of the last read and write for each buffer, in addition
> to the sync object of the last use of a buffer. ttm_bo_wait then operates
> with one the sync objects.

Just minor comment for extra safety see below, otherwise:
Reviewed-by: Jerome Glisse <jglisse@redhat.com>

> Signed-off-by: Marek Olšák <maraeo@gmail.com>
> ---
>  drivers/gpu/drm/nouveau/nouveau_bo.c    |    3 +-
>  drivers/gpu/drm/nouveau/nouveau_gem.c   |    5 +-
>  drivers/gpu/drm/radeon/radeon_cs.c      |    1 +
>  drivers/gpu/drm/radeon/radeon_object.h  |    2 +-
>  drivers/gpu/drm/ttm/ttm_bo.c            |   97 ++++++++++++++++++++++--------
>  drivers/gpu/drm/ttm/ttm_bo_util.c       |   26 +++++++--
>  drivers/gpu/drm/ttm/ttm_bo_vm.c         |    2 +-
>  drivers/gpu/drm/ttm/ttm_execbuf_util.c  |   17 +++++-
>  drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c |    1 +
>  include/drm/ttm/ttm_bo_api.h            |   16 +++++-
>  include/drm/ttm/ttm_execbuf_util.h      |    6 ++
>  11 files changed, 137 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index 890d50e..e87e24b 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -1104,7 +1104,8 @@ nouveau_bo_vma_del(struct nouveau_bo *nvbo, struct nouveau_vma *vma)
>        if (vma->node) {
>                if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) {
>                        spin_lock(&nvbo->bo.bdev->fence_lock);
> -                       ttm_bo_wait(&nvbo->bo, false, false, false);
> +                       ttm_bo_wait(&nvbo->bo, false, false, false,
> +                                   TTM_USAGE_READWRITE);
>                        spin_unlock(&nvbo->bo.bdev->fence_lock);
>                        nouveau_vm_unmap(vma);
>                }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index 5f0bc57..322bf62 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -589,7 +589,8 @@ nouveau_gem_pushbuf_reloc_apply(struct drm_device *dev,
>                }
>
>                spin_lock(&nvbo->bo.bdev->fence_lock);
> -               ret = ttm_bo_wait(&nvbo->bo, false, false, false);
> +               ret = ttm_bo_wait(&nvbo->bo, false, false, false,
> +                                 TTM_USAGE_READWRITE);
>                spin_unlock(&nvbo->bo.bdev->fence_lock);
>                if (ret) {
>                        NV_ERROR(dev, "reloc wait_idle failed: %d\n", ret);
> @@ -825,7 +826,7 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
>        nvbo = nouveau_gem_object(gem);
>
>        spin_lock(&nvbo->bo.bdev->fence_lock);
> -       ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
> +       ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait, TTM_USAGE_READWRITE);
>        spin_unlock(&nvbo->bo.bdev->fence_lock);
>        drm_gem_object_unreference_unlocked(gem);
>        return ret;
> diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
> index fae00c0..14e8531 100644
> --- a/drivers/gpu/drm/radeon/radeon_cs.c
> +++ b/drivers/gpu/drm/radeon/radeon_cs.c
> @@ -80,6 +80,7 @@ int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
>                        p->relocs[i].lobj.wdomain = r->write_domain;
>                        p->relocs[i].lobj.rdomain = r->read_domains;
>                        p->relocs[i].lobj.tv.bo = &p->relocs[i].robj->tbo;
> +                       p->relocs[i].lobj.tv.usage = TTM_USAGE_READWRITE;
>                        p->relocs[i].handle = r->handle;
>                        p->relocs[i].flags = r->flags;
>                        radeon_bo_list_add_object(&p->relocs[i].lobj,
> diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h
> index ede6c13..e9dc8b2 100644
> --- a/drivers/gpu/drm/radeon/radeon_object.h
> +++ b/drivers/gpu/drm/radeon/radeon_object.h
> @@ -130,7 +130,7 @@ static inline int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type,
>        if (mem_type)
>                *mem_type = bo->tbo.mem.mem_type;
>        if (bo->tbo.sync_obj)
> -               r = ttm_bo_wait(&bo->tbo, true, true, no_wait);
> +               r = ttm_bo_wait(&bo->tbo, true, true, no_wait, TTM_USAGE_READWRITE);
>        spin_unlock(&bo->tbo.bdev->fence_lock);
>        ttm_bo_unreserve(&bo->tbo);
>        return r;
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 56619f6..bb8c86d 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -494,7 +494,7 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
>        int ret;
>
>        spin_lock(&bdev->fence_lock);
> -       (void) ttm_bo_wait(bo, false, false, true);
> +       (void) ttm_bo_wait(bo, false, false, true, TTM_USAGE_READWRITE);
>        if (!bo->sync_obj) {
>
>                spin_lock(&glob->lru_lock);
> @@ -562,7 +562,8 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
>
>  retry:
>        spin_lock(&bdev->fence_lock);
> -       ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> +       ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu,
> +                         TTM_USAGE_READWRITE);
>        spin_unlock(&bdev->fence_lock);
>
>        if (unlikely(ret != 0))
> @@ -721,7 +722,8 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible,
>        int ret = 0;
>
>        spin_lock(&bdev->fence_lock);
> -       ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> +       ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu,
> +                         TTM_USAGE_READWRITE);
>        spin_unlock(&bdev->fence_lock);
>
>        if (unlikely(ret != 0)) {
> @@ -1068,7 +1070,8 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
>         * instead of doing it here.
>         */
>        spin_lock(&bdev->fence_lock);
> -       ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> +       ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu,
> +                         TTM_USAGE_READWRITE);
>        spin_unlock(&bdev->fence_lock);
>        if (ret)
>                return ret;
> @@ -1688,34 +1691,83 @@ out_unlock:
>        return ret;
>  }
>
> +static void ttm_bo_unref_sync_obj_locked(struct ttm_buffer_object *bo,
> +                                        void *sync_obj,
> +                                        void **extra_sync_obj)
> +{
> +       struct ttm_bo_device *bdev = bo->bdev;
> +       struct ttm_bo_driver *driver = bdev->driver;
> +       void *tmp_obj = NULL, *tmp_obj_read = NULL, *tmp_obj_write = NULL;
> +
> +       /* We must unref the sync obj wherever it's ref'd.
> +        * Note that if we unref bo->sync_obj, we can unref both the read
> +        * and write sync objs too, because they can't be newer than
> +        * bo->sync_obj, so they are no longer relevant. */
> +       if (sync_obj == bo->sync_obj ||
> +           sync_obj == bo->sync_obj_read) {
> +               tmp_obj_read = bo->sync_obj_read;
> +               bo->sync_obj_read = NULL;
> +       }
> +       if (sync_obj == bo->sync_obj ||
> +           sync_obj == bo->sync_obj_write) {
> +               tmp_obj_write = bo->sync_obj_write;
> +               bo->sync_obj_write = NULL;
> +       }
> +       if (sync_obj == bo->sync_obj) {
> +               tmp_obj = bo->sync_obj;
> +               bo->sync_obj = NULL;
> +       }
> +
> +       clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
> +       spin_unlock(&bdev->fence_lock);
> +       if (tmp_obj)
> +               driver->sync_obj_unref(&tmp_obj);
> +       if (tmp_obj_read)
> +               driver->sync_obj_unref(&tmp_obj_read);
> +       if (tmp_obj_write)
> +               driver->sync_obj_unref(&tmp_obj_write);
> +       if (extra_sync_obj)
> +               driver->sync_obj_unref(extra_sync_obj);
> +       spin_lock(&bdev->fence_lock);
> +}
> +
>  int ttm_bo_wait(struct ttm_buffer_object *bo,
> -               bool lazy, bool interruptible, bool no_wait)
> +               bool lazy, bool interruptible, bool no_wait,
> +               enum ttm_buffer_usage usage)
>  {
>        struct ttm_bo_driver *driver = bo->bdev->driver;
>        struct ttm_bo_device *bdev = bo->bdev;
>        void *sync_obj;
>        void *sync_obj_arg;
>        int ret = 0;
> +       void **bo_sync_obj;
>
> -       if (likely(bo->sync_obj == NULL))
> +       switch (usage) {
> +       case TTM_USAGE_READ:
> +               bo_sync_obj = &bo->sync_obj_read;
> +               break;
> +       case TTM_USAGE_WRITE:
> +               bo_sync_obj = &bo->sync_obj_write;
> +               break;
> +       case TTM_USAGE_READWRITE:
> +       default:
> +               bo_sync_obj = &bo->sync_obj;
> +       }
> +
> +       if (likely(*bo_sync_obj == NULL))
>                return 0;
>
> -       while (bo->sync_obj) {
> +       while (*bo_sync_obj) {
>
> -               if (driver->sync_obj_signaled(bo->sync_obj, bo->sync_obj_arg)) {
> -                       void *tmp_obj = bo->sync_obj;
> -                       bo->sync_obj = NULL;
> -                       clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
> -                       spin_unlock(&bdev->fence_lock);
> -                       driver->sync_obj_unref(&tmp_obj);
> -                       spin_lock(&bdev->fence_lock);
> +               if (driver->sync_obj_signaled(*bo_sync_obj, bo->sync_obj_arg)) {
> +                       ttm_bo_unref_sync_obj_locked(bo, *bo_sync_obj, NULL);
>                        continue;
>                }
>
>                if (no_wait)
>                        return -EBUSY;
>
> -               sync_obj = driver->sync_obj_ref(bo->sync_obj);
> +               sync_obj = driver->sync_obj_ref(*bo_sync_obj);
>                sync_obj_arg = bo->sync_obj_arg;
>                spin_unlock(&bdev->fence_lock);
>                ret = driver->sync_obj_wait(sync_obj, sync_obj_arg,
> @@ -1726,16 +1778,9 @@ int ttm_bo_wait(struct ttm_buffer_object *bo,
>                        return ret;
>                }
>                spin_lock(&bdev->fence_lock);
> -               if (likely(bo->sync_obj == sync_obj &&
> +               if (likely(*bo_sync_obj == sync_obj &&
>                           bo->sync_obj_arg == sync_obj_arg)) {
> -                       void *tmp_obj = bo->sync_obj;
> -                       bo->sync_obj = NULL;
> -                       clear_bit(TTM_BO_PRIV_FLAG_MOVING,
> -                                 &bo->priv_flags);
> -                       spin_unlock(&bdev->fence_lock);
> -                       driver->sync_obj_unref(&sync_obj);
> -                       driver->sync_obj_unref(&tmp_obj);
> -                       spin_lock(&bdev->fence_lock);
> +                       ttm_bo_unref_sync_obj_locked(bo, *bo_sync_obj, &sync_obj);
>                } else {
>                        spin_unlock(&bdev->fence_lock);
>                        driver->sync_obj_unref(&sync_obj);
> @@ -1759,7 +1804,7 @@ int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait)
>        if (unlikely(ret != 0))
>                return ret;
>        spin_lock(&bdev->fence_lock);
> -       ret = ttm_bo_wait(bo, false, true, no_wait);
> +       ret = ttm_bo_wait(bo, false, true, no_wait, TTM_USAGE_READWRITE);
>        spin_unlock(&bdev->fence_lock);
>        if (likely(ret == 0))
>                atomic_inc(&bo->cpu_writers);
> @@ -1833,7 +1878,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
>         */
>
>        spin_lock(&bo->bdev->fence_lock);
> -       ret = ttm_bo_wait(bo, false, false, false);
> +       ret = ttm_bo_wait(bo, false, false, false, TTM_USAGE_READWRITE);
>        spin_unlock(&bo->bdev->fence_lock);
>
>        if (unlikely(ret != 0))
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
> index 77dbf40..0399573 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> @@ -436,6 +436,8 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo,
>        atomic_set(&fbo->cpu_writers, 0);
>
>        fbo->sync_obj = driver->sync_obj_ref(bo->sync_obj);
> +       fbo->sync_obj_read = driver->sync_obj_ref(bo->sync_obj_read);
> +       fbo->sync_obj_write = driver->sync_obj_ref(bo->sync_obj_write);
>        kref_init(&fbo->list_kref);
>        kref_init(&fbo->kref);
>        fbo->destroy = &ttm_transfered_destroy;
> @@ -618,20 +620,30 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
>        struct ttm_mem_reg *old_mem = &bo->mem;
>        int ret;
>        struct ttm_buffer_object *ghost_obj;
> -       void *tmp_obj = NULL;
> +       void *tmp_obj = NULL, *tmp_obj_read = NULL, *tmp_obj_write = NULL;
>
>        spin_lock(&bdev->fence_lock);
> -       if (bo->sync_obj) {
> +       if (bo->sync_obj)
>                tmp_obj = bo->sync_obj;
> -               bo->sync_obj = NULL;
> -       }
> +       if (bo->sync_obj_read)
> +               tmp_obj_read = bo->sync_obj_read;
> +       if (bo->sync_obj_write)
> +               tmp_obj_write = bo->sync_obj_write;
> +
>        bo->sync_obj = driver->sync_obj_ref(sync_obj);
> +       bo->sync_obj_read = driver->sync_obj_ref(sync_obj);
> +       bo->sync_obj_write = driver->sync_obj_ref(sync_obj);
>        bo->sync_obj_arg = sync_obj_arg;
>        if (evict) {
> -               ret = ttm_bo_wait(bo, false, false, false);
> +               ret = ttm_bo_wait(bo, false, false, false,
> +                                 TTM_USAGE_READWRITE);
>                spin_unlock(&bdev->fence_lock);
>                if (tmp_obj)
>                        driver->sync_obj_unref(&tmp_obj);
> +               if (tmp_obj_read)
> +                       driver->sync_obj_unref(&tmp_obj_read);
> +               if (tmp_obj_write)
> +                       driver->sync_obj_unref(&tmp_obj_write);
>                if (ret)
>                        return ret;
>
> @@ -655,6 +667,10 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
>                spin_unlock(&bdev->fence_lock);
>                if (tmp_obj)
>                        driver->sync_obj_unref(&tmp_obj);
> +               if (tmp_obj_read)
> +                       driver->sync_obj_unref(&tmp_obj_read);
> +               if (tmp_obj_write)
> +                       driver->sync_obj_unref(&tmp_obj_write);
>
>                ret = ttm_buffer_object_transfer(bo, &ghost_obj);
>                if (ret)
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index 221b924..ff1e26f 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -122,7 +122,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>
>        spin_lock(&bdev->fence_lock);
>        if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
> -               ret = ttm_bo_wait(bo, false, true, false);
> +               ret = ttm_bo_wait(bo, false, true, false, TTM_USAGE_READWRITE);
>                spin_unlock(&bdev->fence_lock);
>                if (unlikely(ret != 0)) {
>                        retval = (ret != -ERESTARTSYS) ?
> diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> index 3832fe1..0438296 100644
> --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> @@ -223,6 +223,14 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
>                bo = entry->bo;
>                entry->old_sync_obj = bo->sync_obj;

Here we should set entry->old_sync_obj_read &
entry->old_sync_obj_write to NULL so
we don't get bite by uninitialized memory (if caller fail or forget to do so)


>                bo->sync_obj = driver->sync_obj_ref(sync_obj);
> +               if (entry->usage & TTM_USAGE_READ) {
> +                       entry->old_sync_obj_read = bo->sync_obj_read;
> +                       bo->sync_obj_read = driver->sync_obj_ref(sync_obj);
> +               }
> +               if (entry->usage & TTM_USAGE_WRITE) {
> +                       entry->old_sync_obj_write = bo->sync_obj_write;
> +                       bo->sync_obj_write = driver->sync_obj_ref(sync_obj);
> +               }
>                bo->sync_obj_arg = entry->new_sync_obj_arg;
>                ttm_bo_unreserve_locked(bo);
>                entry->reserved = false;
> @@ -231,8 +239,15 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
>        spin_unlock(&bdev->fence_lock);
>
>        list_for_each_entry(entry, list, head) {
> -               if (entry->old_sync_obj)
> +               if (entry->old_sync_obj) {
>                        driver->sync_obj_unref(&entry->old_sync_obj);
> +               }
> +               if (entry->old_sync_obj_read) {
> +                       driver->sync_obj_unref(&entry->old_sync_obj_read);
> +               }
> +               if (entry->old_sync_obj_write) {
> +                       driver->sync_obj_unref(&entry->old_sync_obj_write);
> +               }
>        }
>  }
>  EXPORT_SYMBOL(ttm_eu_fence_buffer_objects);
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> index 41b95ed..8ca3ddb 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> @@ -224,6 +224,7 @@ static int vmw_translate_guest_ptr(struct vmw_private *dev_priv,
>        if (unlikely(cur_validate_node == sw_context->cur_val_buf)) {
>                val_buf = &sw_context->val_bufs[cur_validate_node];
>                val_buf->bo = ttm_bo_reference(bo);
> +               val_buf->usage = TTM_USAGE_READWRITE;
>                val_buf->new_sync_obj_arg = (void *) dev_priv;
>                list_add_tail(&val_buf->head, &sw_context->validate_nodes);
>                ++sw_context->cur_val_buf;
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> index 42e3469..da957bf 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -44,6 +44,11 @@ struct ttm_bo_device;
>
>  struct drm_mm_node;
>
> +enum ttm_buffer_usage {
> +    TTM_USAGE_READ = 1,
> +    TTM_USAGE_WRITE = 2,
> +    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
> +};
>
>  /**
>  * struct ttm_placement
> @@ -174,7 +179,10 @@ struct ttm_tt;
>  * the bo_device::lru_lock.
>  * @reserved: Deadlock-free lock used for synchronization state transitions.
>  * @sync_obj_arg: Opaque argument to synchronization object function.
> - * @sync_obj: Pointer to a synchronization object.
> + * @sync_obj: Pointer to a synchronization object of a last read or write,
> + * whichever is later.
> + * @sync_obj_read: Pointer to a synchronization object of a last read.
> + * @sync_obj_write: Pointer to a synchronization object of a last write.
>  * @priv_flags: Flags describing buffer object internal state.
>  * @vm_rb: Rb node for the vm rb tree.
>  * @vm_node: Address space manager node.
> @@ -258,6 +266,8 @@ struct ttm_buffer_object {
>
>        void *sync_obj_arg;
>        void *sync_obj;
> +       void *sync_obj_read;
> +       void *sync_obj_write;
>        unsigned long priv_flags;
>
>        /**
> @@ -325,6 +335,7 @@ ttm_bo_reference(struct ttm_buffer_object *bo)
>  * @bo:  The buffer object.
>  * @interruptible:  Use interruptible wait.
>  * @no_wait:  Return immediately if buffer is busy.
> + * @usage:  Whether to wait for the last read and/or the last write.
>  *
>  * This function must be called with the bo::mutex held, and makes
>  * sure any previous rendering to the buffer is completed.
> @@ -334,7 +345,8 @@ ttm_bo_reference(struct ttm_buffer_object *bo)
>  * Returns -ERESTARTSYS if interrupted by a signal.
>  */
>  extern int ttm_bo_wait(struct ttm_buffer_object *bo, bool lazy,
> -                      bool interruptible, bool no_wait);
> +                      bool interruptible, bool no_wait,
> +                      enum ttm_buffer_usage usage);
>  /**
>  * ttm_bo_validate
>  *
> diff --git a/include/drm/ttm/ttm_execbuf_util.h b/include/drm/ttm/ttm_execbuf_util.h
> index 26cc7f9..375f299 100644
> --- a/include/drm/ttm/ttm_execbuf_util.h
> +++ b/include/drm/ttm/ttm_execbuf_util.h
> @@ -41,20 +41,26 @@
>  * @bo:             refcounted buffer object pointer.
>  * @new_sync_obj_arg: New sync_obj_arg for @bo, to be used once
>  * adding a new sync object.
> + * @usage           Indicates how @bo is used by the device.
>  * @reserved:       Indicates whether @bo has been reserved for validation.
>  * @removed:        Indicates whether @bo has been removed from lru lists.
>  * @put_count:      Number of outstanding references on bo::list_kref.
>  * @old_sync_obj:   Pointer to a sync object about to be unreferenced
> + * @old_sync_obj_read: Pointer to a read sync object about to be unreferenced.
> + * @old_sync_obj_write: Pointer to a write sync object about to be unreferenced.
>  */
>
>  struct ttm_validate_buffer {
>        struct list_head head;
>        struct ttm_buffer_object *bo;
>        void *new_sync_obj_arg;
> +       enum ttm_buffer_usage usage;
>        bool reserved;
>        bool removed;
>        int put_count;
>        void *old_sync_obj;
> +       void *old_sync_obj_read;
> +       void *old_sync_obj_write;
>  };
>
>  /**
> --
> 1.7.4.1
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/2] drm/radeon/kms: add a new gem_wait ioctl with read/write flags
  2011-08-07 20:39 ` [PATCH 2/2] drm/radeon/kms: add a new gem_wait ioctl with read/write flags Marek Olšák
@ 2011-08-12 17:22   ` Jerome Glisse
  0 siblings, 0 replies; 31+ messages in thread
From: Jerome Glisse @ 2011-08-12 17:22 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

On Sun, Aug 7, 2011 at 4:39 PM, Marek Olšák <maraeo@gmail.com> wrote:
> The new DRM_RADEON_GEM_WAIT ioctl combines GEM_WAIT_IDLE and GEM_BUSY (there
> is a NO_WAIT flag to get the latter) with USAGE_READ and USAGE_WRITE flags
> to take advantage of the new ttm_bo_wait changes.
>
> Also bump the DRM version.
>
> Signed-off-by: Marek Olšák <maraeo@gmail.com>
Reviewed-by: Jerome Glisse <jglisse@redhat.com>
> ---
>  drivers/gpu/drm/radeon/radeon.h        |    2 +
>  drivers/gpu/drm/radeon/radeon_cs.c     |    5 +++-
>  drivers/gpu/drm/radeon/radeon_drv.c    |    3 +-
>  drivers/gpu/drm/radeon/radeon_gem.c    |   36 +++++++++++++++++++++++++++++--
>  drivers/gpu/drm/radeon/radeon_kms.c    |    1 +
>  drivers/gpu/drm/radeon/radeon_object.h |    4 +-
>  include/drm/radeon_drm.h               |   11 +++++++++
>  7 files changed, 55 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 32807ba..0040d28 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -1143,6 +1143,8 @@ int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data,
>                                struct drm_file *filp);
>  int radeon_gem_get_tiling_ioctl(struct drm_device *dev, void *data,
>                                struct drm_file *filp);
> +int radeon_gem_wait_ioctl(struct drm_device *dev, void *data,
> +                         struct drm_file *filp);
>
>  /* VRAM scratch page for HDP bug */
>  struct r700_vram_scratch {
> diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
> index 14e8531..f0b9066 100644
> --- a/drivers/gpu/drm/radeon/radeon_cs.c
> +++ b/drivers/gpu/drm/radeon/radeon_cs.c
> @@ -80,7 +80,10 @@ int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
>                        p->relocs[i].lobj.wdomain = r->write_domain;
>                        p->relocs[i].lobj.rdomain = r->read_domains;
>                        p->relocs[i].lobj.tv.bo = &p->relocs[i].robj->tbo;
> -                       p->relocs[i].lobj.tv.usage = TTM_USAGE_READWRITE;
> +                       if (r->read_domains)
> +                               p->relocs[i].lobj.tv.usage |= TTM_USAGE_READ;
> +                       if (r->write_domain)
> +                               p->relocs[i].lobj.tv.usage |= TTM_USAGE_WRITE;
>                        p->relocs[i].handle = r->handle;
>                        p->relocs[i].flags = r->flags;
>                        radeon_bo_list_add_object(&p->relocs[i].lobj,
> diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
> index e71d2ed..bd187e0 100644
> --- a/drivers/gpu/drm/radeon/radeon_drv.c
> +++ b/drivers/gpu/drm/radeon/radeon_drv.c
> @@ -52,9 +52,10 @@
>  *   2.9.0 - r600 tiling (s3tc,rgtc) working, SET_PREDICATION packet 3 on r600 + eg, backend query
>  *   2.10.0 - fusion 2D tiling
>  *   2.11.0 - backend map, initial compute support for the CS checker
> + *   2.12.0 - DRM_RADEON_GEM_WAIT ioctl
>  */
>  #define KMS_DRIVER_MAJOR       2
> -#define KMS_DRIVER_MINOR       11
> +#define KMS_DRIVER_MINOR       12
>  #define KMS_DRIVER_PATCHLEVEL  0
>  int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags);
>  int radeon_driver_unload_kms(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
> index aa1ca2d..2edc2a4 100644
> --- a/drivers/gpu/drm/radeon/radeon_gem.c
> +++ b/drivers/gpu/drm/radeon/radeon_gem.c
> @@ -122,7 +122,7 @@ int radeon_gem_set_domain(struct drm_gem_object *gobj,
>        }
>        if (domain == RADEON_GEM_DOMAIN_CPU) {
>                /* Asking for cpu access wait for object idle */
> -               r = radeon_bo_wait(robj, NULL, false);
> +               r = radeon_bo_wait(robj, NULL, false, TTM_USAGE_READWRITE);
>                if (r) {
>                        printk(KERN_ERR "Failed to wait for object !\n");
>                        return r;
> @@ -273,7 +273,7 @@ int radeon_gem_busy_ioctl(struct drm_device *dev, void *data,
>                return -ENOENT;
>        }
>        robj = gem_to_radeon_bo(gobj);
> -       r = radeon_bo_wait(robj, &cur_placement, true);
> +       r = radeon_bo_wait(robj, &cur_placement, true, TTM_USAGE_READWRITE);
>        switch (cur_placement) {
>        case TTM_PL_VRAM:
>                args->domain = RADEON_GEM_DOMAIN_VRAM;
> @@ -303,7 +303,7 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data,
>                return -ENOENT;
>        }
>        robj = gem_to_radeon_bo(gobj);
> -       r = radeon_bo_wait(robj, NULL, false);
> +       r = radeon_bo_wait(robj, NULL, false, TTM_USAGE_READWRITE);
>        /* callback hw specific functions if any */
>        if (robj->rdev->asic->ioctl_wait_idle)
>                robj->rdev->asic->ioctl_wait_idle(robj->rdev, robj);
> @@ -311,6 +311,36 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data,
>        return r;
>  }
>
> +int radeon_gem_wait_ioctl(struct drm_device *dev, void *data,
> +                         struct drm_file *filp)
> +{
> +       struct drm_radeon_gem_wait *args = data;
> +       struct drm_gem_object *gobj;
> +       struct radeon_bo *robj;
> +       bool no_wait = (args->flags & RADEON_GEM_NO_WAIT) != 0;
> +       enum ttm_buffer_usage usage = 0;
> +       int r;
> +
> +       if (args->flags & RADEON_GEM_USAGE_READ)
> +               usage |= TTM_USAGE_READ;
> +       if (args->flags & RADEON_GEM_USAGE_WRITE)
> +               usage |= TTM_USAGE_WRITE;
> +       if (!usage)
> +               usage = TTM_USAGE_READWRITE;
> +
> +       gobj = drm_gem_object_lookup(dev, filp, args->handle);
> +       if (gobj == NULL) {
> +               return -ENOENT;
> +       }
> +       robj = gem_to_radeon_bo(gobj);
> +       r = radeon_bo_wait(robj, NULL, no_wait, usage);
> +       /* callback hw specific functions if any */
> +       if (!no_wait && robj->rdev->asic->ioctl_wait_idle)
> +               robj->rdev->asic->ioctl_wait_idle(robj->rdev, robj);
> +       drm_gem_object_unreference_unlocked(gobj);
> +       return r;
> +}
> +
>  int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data,
>                                struct drm_file *filp)
>  {
> diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
> index be2c122..a749c26 100644
> --- a/drivers/gpu/drm/radeon/radeon_kms.c
> +++ b/drivers/gpu/drm/radeon/radeon_kms.c
> @@ -451,5 +451,6 @@ struct drm_ioctl_desc radeon_ioctls_kms[] = {
>        DRM_IOCTL_DEF_DRV(RADEON_GEM_SET_TILING, radeon_gem_set_tiling_ioctl, DRM_AUTH|DRM_UNLOCKED),
>        DRM_IOCTL_DEF_DRV(RADEON_GEM_GET_TILING, radeon_gem_get_tiling_ioctl, DRM_AUTH|DRM_UNLOCKED),
>        DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED),
> +       DRM_IOCTL_DEF_DRV(RADEON_GEM_WAIT, radeon_gem_wait_ioctl, DRM_AUTH|DRM_UNLOCKED),
>  };
>  int radeon_max_kms_ioctl = DRM_ARRAY_SIZE(radeon_ioctls_kms);
> diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h
> index e9dc8b2..a057a8e 100644
> --- a/drivers/gpu/drm/radeon/radeon_object.h
> +++ b/drivers/gpu/drm/radeon/radeon_object.h
> @@ -119,7 +119,7 @@ static inline u64 radeon_bo_mmap_offset(struct radeon_bo *bo)
>  }
>
>  static inline int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type,
> -                                       bool no_wait)
> +                                bool no_wait, enum ttm_buffer_usage usage)
>  {
>        int r;
>
> @@ -130,7 +130,7 @@ static inline int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type,
>        if (mem_type)
>                *mem_type = bo->tbo.mem.mem_type;
>        if (bo->tbo.sync_obj)
> -               r = ttm_bo_wait(&bo->tbo, true, true, no_wait, TTM_USAGE_READWRITE);
> +               r = ttm_bo_wait(&bo->tbo, true, true, no_wait, usage);
>        spin_unlock(&bo->tbo.bdev->fence_lock);
>        ttm_bo_unreserve(&bo->tbo);
>        return r;
> diff --git a/include/drm/radeon_drm.h b/include/drm/radeon_drm.h
> index b65be60..939b854 100644
> --- a/include/drm/radeon_drm.h
> +++ b/include/drm/radeon_drm.h
> @@ -509,6 +509,7 @@ typedef struct {
>  #define DRM_RADEON_GEM_SET_TILING      0x28
>  #define DRM_RADEON_GEM_GET_TILING      0x29
>  #define DRM_RADEON_GEM_BUSY            0x2a
> +#define DRM_RADEON_GEM_WAIT            0x2b
>
>  #define DRM_IOCTL_RADEON_CP_INIT    DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_INIT, drm_radeon_init_t)
>  #define DRM_IOCTL_RADEON_CP_START   DRM_IO(  DRM_COMMAND_BASE + DRM_RADEON_CP_START)
> @@ -550,6 +551,7 @@ typedef struct {
>  #define DRM_IOCTL_RADEON_GEM_SET_TILING        DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_SET_TILING, struct drm_radeon_gem_set_tiling)
>  #define DRM_IOCTL_RADEON_GEM_GET_TILING        DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_GET_TILING, struct drm_radeon_gem_get_tiling)
>  #define DRM_IOCTL_RADEON_GEM_BUSY      DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy)
> +#define DRM_IOCTL_RADEON_GEM_WAIT      DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_WAIT, struct drm_radeon_gem_wait)
>
>  typedef struct drm_radeon_init {
>        enum {
> @@ -846,6 +848,15 @@ struct drm_radeon_gem_busy {
>        uint32_t        domain;
>  };
>
> +#define RADEON_GEM_NO_WAIT     0x1
> +#define RADEON_GEM_USAGE_READ  0x2
> +#define RADEON_GEM_USAGE_WRITE 0x4
> +
> +struct drm_radeon_gem_wait {
> +       uint32_t        handle;
> +       uint32_t        flags;  /* one of RADEON_GEM_* */
> +};
> +
>  struct drm_radeon_gem_pread {
>        /** Handle for the object being read. */
>        uint32_t handle;
> --
> 1.7.4.1
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-08-12 17:21 ` [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write Jerome Glisse
@ 2011-08-13 20:32   ` Marek Olšák
  0 siblings, 0 replies; 31+ messages in thread
From: Marek Olšák @ 2011-08-13 20:32 UTC (permalink / raw)
  To: Jerome Glisse, dri-devel

[-- Attachment #1: Type: text/plain, Size: 768 bytes --]

On Fri, Aug 12, 2011 at 7:21 PM, Jerome Glisse <j.glisse@gmail.com> wrote:
>> diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
>> index 3832fe1..0438296 100644
>> --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
>> +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
>> @@ -223,6 +223,14 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
>>                bo = entry->bo;
>>                entry->old_sync_obj = bo->sync_obj;
>
> Here we should set entry->old_sync_obj_read &
> entry->old_sync_obj_write to NULL so
> we don't get bite by uninitialized memory (if caller fail or forget to do so)

OK, thanks. It's fixed in the attached patch. There are no other changes.

Marek

[-- Attachment #2: 0001-drm-ttm-add-a-way-to-bo_wait-for-either-the-last-rea.patch --]
[-- Type: text/x-diff, Size: 18480 bytes --]

From 1abe40ba3cffb8fd4d593cdbe060c04dbdc0687c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <maraeo@gmail.com>
Date: Sun, 7 Aug 2011 05:36:50 +0200
Subject: [PATCH] drm/ttm: add a way to bo_wait for either the last read or last write (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sometimes we want to know whether a buffer is busy and wait for it (bo_wait).
However, sometimes it would be more useful to be able to query whether
a buffer is busy and being either read or written, and wait until it's stopped
being either read or written. The point of this is to be able to avoid
unnecessary waiting, e.g. if a GPU has written something to a buffer and is now
reading that buffer, and a CPU wants to map that buffer for read, it needs to
only wait for the last write. If there were no write, there wouldn't be any
waiting needed.

This, or course, requires user space drivers to send read/write flags
with each relocation (like we have read/write domains in radeon, so we can
actually use those for something useful now).

Now how this patch works:

The read/write flags should passed to ttm_validate_buffer. TTM maintains
separate sync objects of the last read and write for each buffer, in addition
to the sync object of the last use of a buffer. ttm_bo_wait then operates
with one the sync objects.

Signed-off-by: Marek Olšák <maraeo@gmail.com>
Reviewed-by: Jerome Glisse <jglisse@redhat.com>
---
 drivers/gpu/drm/nouveau/nouveau_bo.c    |    3 +-
 drivers/gpu/drm/nouveau/nouveau_gem.c   |    5 +-
 drivers/gpu/drm/radeon/radeon_cs.c      |    1 +
 drivers/gpu/drm/radeon/radeon_object.h  |    2 +-
 drivers/gpu/drm/ttm/ttm_bo.c            |   97 ++++++++++++++++++++++--------
 drivers/gpu/drm/ttm/ttm_bo_util.c       |   26 +++++++--
 drivers/gpu/drm/ttm/ttm_bo_vm.c         |    2 +-
 drivers/gpu/drm/ttm/ttm_execbuf_util.c  |   19 ++++++-
 drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c |    1 +
 include/drm/ttm/ttm_bo_api.h            |   16 +++++-
 include/drm/ttm/ttm_execbuf_util.h      |    6 ++
 11 files changed, 139 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 890d50e..e87e24b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1104,7 +1104,8 @@ nouveau_bo_vma_del(struct nouveau_bo *nvbo, struct nouveau_vma *vma)
 	if (vma->node) {
 		if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) {
 			spin_lock(&nvbo->bo.bdev->fence_lock);
-			ttm_bo_wait(&nvbo->bo, false, false, false);
+			ttm_bo_wait(&nvbo->bo, false, false, false,
+				    TTM_USAGE_READWRITE);
 			spin_unlock(&nvbo->bo.bdev->fence_lock);
 			nouveau_vm_unmap(vma);
 		}
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 5f0bc57..322bf62 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -589,7 +589,8 @@ nouveau_gem_pushbuf_reloc_apply(struct drm_device *dev,
 		}
 
 		spin_lock(&nvbo->bo.bdev->fence_lock);
-		ret = ttm_bo_wait(&nvbo->bo, false, false, false);
+		ret = ttm_bo_wait(&nvbo->bo, false, false, false,
+				  TTM_USAGE_READWRITE);
 		spin_unlock(&nvbo->bo.bdev->fence_lock);
 		if (ret) {
 			NV_ERROR(dev, "reloc wait_idle failed: %d\n", ret);
@@ -825,7 +826,7 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
 	nvbo = nouveau_gem_object(gem);
 
 	spin_lock(&nvbo->bo.bdev->fence_lock);
-	ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
+	ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait, TTM_USAGE_READWRITE);
 	spin_unlock(&nvbo->bo.bdev->fence_lock);
 	drm_gem_object_unreference_unlocked(gem);
 	return ret;
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index fae00c0..14e8531 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -80,6 +80,7 @@ int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
 			p->relocs[i].lobj.wdomain = r->write_domain;
 			p->relocs[i].lobj.rdomain = r->read_domains;
 			p->relocs[i].lobj.tv.bo = &p->relocs[i].robj->tbo;
+			p->relocs[i].lobj.tv.usage = TTM_USAGE_READWRITE;
 			p->relocs[i].handle = r->handle;
 			p->relocs[i].flags = r->flags;
 			radeon_bo_list_add_object(&p->relocs[i].lobj,
diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h
index ede6c13..e9dc8b2 100644
--- a/drivers/gpu/drm/radeon/radeon_object.h
+++ b/drivers/gpu/drm/radeon/radeon_object.h
@@ -130,7 +130,7 @@ static inline int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type,
 	if (mem_type)
 		*mem_type = bo->tbo.mem.mem_type;
 	if (bo->tbo.sync_obj)
-		r = ttm_bo_wait(&bo->tbo, true, true, no_wait);
+		r = ttm_bo_wait(&bo->tbo, true, true, no_wait, TTM_USAGE_READWRITE);
 	spin_unlock(&bo->tbo.bdev->fence_lock);
 	ttm_bo_unreserve(&bo->tbo);
 	return r;
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 56619f6..bb8c86d 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -494,7 +494,7 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 	int ret;
 
 	spin_lock(&bdev->fence_lock);
-	(void) ttm_bo_wait(bo, false, false, true);
+	(void) ttm_bo_wait(bo, false, false, true, TTM_USAGE_READWRITE);
 	if (!bo->sync_obj) {
 
 		spin_lock(&glob->lru_lock);
@@ -562,7 +562,8 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
 
 retry:
 	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
+	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu,
+			  TTM_USAGE_READWRITE);
 	spin_unlock(&bdev->fence_lock);
 
 	if (unlikely(ret != 0))
@@ -721,7 +722,8 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible,
 	int ret = 0;
 
 	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
+	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu,
+			  TTM_USAGE_READWRITE);
 	spin_unlock(&bdev->fence_lock);
 
 	if (unlikely(ret != 0)) {
@@ -1068,7 +1070,8 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
 	 * instead of doing it here.
 	 */
 	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
+	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu,
+			  TTM_USAGE_READWRITE);
 	spin_unlock(&bdev->fence_lock);
 	if (ret)
 		return ret;
@@ -1688,34 +1691,83 @@ out_unlock:
 	return ret;
 }
 
+static void ttm_bo_unref_sync_obj_locked(struct ttm_buffer_object *bo,
+					 void *sync_obj,
+					 void **extra_sync_obj)
+{
+	struct ttm_bo_device *bdev = bo->bdev;
+	struct ttm_bo_driver *driver = bdev->driver;
+	void *tmp_obj = NULL, *tmp_obj_read = NULL, *tmp_obj_write = NULL;
+
+	/* We must unref the sync obj wherever it's ref'd.
+	 * Note that if we unref bo->sync_obj, we can unref both the read
+	 * and write sync objs too, because they can't be newer than
+	 * bo->sync_obj, so they are no longer relevant. */
+	if (sync_obj == bo->sync_obj ||
+	    sync_obj == bo->sync_obj_read) {
+		tmp_obj_read = bo->sync_obj_read;
+		bo->sync_obj_read = NULL;
+	}
+	if (sync_obj == bo->sync_obj ||
+	    sync_obj == bo->sync_obj_write) {
+		tmp_obj_write = bo->sync_obj_write;
+		bo->sync_obj_write = NULL;
+	}
+	if (sync_obj == bo->sync_obj) {
+		tmp_obj = bo->sync_obj;
+		bo->sync_obj = NULL;
+	}
+
+	clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
+	spin_unlock(&bdev->fence_lock);
+	if (tmp_obj)
+		driver->sync_obj_unref(&tmp_obj);
+	if (tmp_obj_read)
+		driver->sync_obj_unref(&tmp_obj_read);
+	if (tmp_obj_write)
+		driver->sync_obj_unref(&tmp_obj_write);
+	if (extra_sync_obj)
+		driver->sync_obj_unref(extra_sync_obj);
+	spin_lock(&bdev->fence_lock);
+}
+
 int ttm_bo_wait(struct ttm_buffer_object *bo,
-		bool lazy, bool interruptible, bool no_wait)
+		bool lazy, bool interruptible, bool no_wait,
+		enum ttm_buffer_usage usage)
 {
 	struct ttm_bo_driver *driver = bo->bdev->driver;
 	struct ttm_bo_device *bdev = bo->bdev;
 	void *sync_obj;
 	void *sync_obj_arg;
 	int ret = 0;
+	void **bo_sync_obj;
 
-	if (likely(bo->sync_obj == NULL))
+	switch (usage) {
+	case TTM_USAGE_READ:
+		bo_sync_obj = &bo->sync_obj_read;
+		break;
+	case TTM_USAGE_WRITE:
+		bo_sync_obj = &bo->sync_obj_write;
+		break;
+	case TTM_USAGE_READWRITE:
+	default:
+		bo_sync_obj = &bo->sync_obj;
+	}
+
+	if (likely(*bo_sync_obj == NULL))
 		return 0;
 
-	while (bo->sync_obj) {
+	while (*bo_sync_obj) {
 
-		if (driver->sync_obj_signaled(bo->sync_obj, bo->sync_obj_arg)) {
-			void *tmp_obj = bo->sync_obj;
-			bo->sync_obj = NULL;
-			clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
-			spin_unlock(&bdev->fence_lock);
-			driver->sync_obj_unref(&tmp_obj);
-			spin_lock(&bdev->fence_lock);
+		if (driver->sync_obj_signaled(*bo_sync_obj, bo->sync_obj_arg)) {
+			ttm_bo_unref_sync_obj_locked(bo, *bo_sync_obj, NULL);
 			continue;
 		}
 
 		if (no_wait)
 			return -EBUSY;
 
-		sync_obj = driver->sync_obj_ref(bo->sync_obj);
+		sync_obj = driver->sync_obj_ref(*bo_sync_obj);
 		sync_obj_arg = bo->sync_obj_arg;
 		spin_unlock(&bdev->fence_lock);
 		ret = driver->sync_obj_wait(sync_obj, sync_obj_arg,
@@ -1726,16 +1778,9 @@ int ttm_bo_wait(struct ttm_buffer_object *bo,
 			return ret;
 		}
 		spin_lock(&bdev->fence_lock);
-		if (likely(bo->sync_obj == sync_obj &&
+		if (likely(*bo_sync_obj == sync_obj &&
 			   bo->sync_obj_arg == sync_obj_arg)) {
-			void *tmp_obj = bo->sync_obj;
-			bo->sync_obj = NULL;
-			clear_bit(TTM_BO_PRIV_FLAG_MOVING,
-				  &bo->priv_flags);
-			spin_unlock(&bdev->fence_lock);
-			driver->sync_obj_unref(&sync_obj);
-			driver->sync_obj_unref(&tmp_obj);
-			spin_lock(&bdev->fence_lock);
+			ttm_bo_unref_sync_obj_locked(bo, *bo_sync_obj, &sync_obj);
 		} else {
 			spin_unlock(&bdev->fence_lock);
 			driver->sync_obj_unref(&sync_obj);
@@ -1759,7 +1804,7 @@ int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait)
 	if (unlikely(ret != 0))
 		return ret;
 	spin_lock(&bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, true, no_wait);
+	ret = ttm_bo_wait(bo, false, true, no_wait, TTM_USAGE_READWRITE);
 	spin_unlock(&bdev->fence_lock);
 	if (likely(ret == 0))
 		atomic_inc(&bo->cpu_writers);
@@ -1833,7 +1878,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
 	 */
 
 	spin_lock(&bo->bdev->fence_lock);
-	ret = ttm_bo_wait(bo, false, false, false);
+	ret = ttm_bo_wait(bo, false, false, false, TTM_USAGE_READWRITE);
 	spin_unlock(&bo->bdev->fence_lock);
 
 	if (unlikely(ret != 0))
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index 77dbf40..0399573 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -436,6 +436,8 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo,
 	atomic_set(&fbo->cpu_writers, 0);
 
 	fbo->sync_obj = driver->sync_obj_ref(bo->sync_obj);
+	fbo->sync_obj_read = driver->sync_obj_ref(bo->sync_obj_read);
+	fbo->sync_obj_write = driver->sync_obj_ref(bo->sync_obj_write);
 	kref_init(&fbo->list_kref);
 	kref_init(&fbo->kref);
 	fbo->destroy = &ttm_transfered_destroy;
@@ -618,20 +620,30 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
 	struct ttm_mem_reg *old_mem = &bo->mem;
 	int ret;
 	struct ttm_buffer_object *ghost_obj;
-	void *tmp_obj = NULL;
+	void *tmp_obj = NULL, *tmp_obj_read = NULL, *tmp_obj_write = NULL;
 
 	spin_lock(&bdev->fence_lock);
-	if (bo->sync_obj) {
+	if (bo->sync_obj)
 		tmp_obj = bo->sync_obj;
-		bo->sync_obj = NULL;
-	}
+	if (bo->sync_obj_read)
+		tmp_obj_read = bo->sync_obj_read;
+	if (bo->sync_obj_write)
+		tmp_obj_write = bo->sync_obj_write;
+
 	bo->sync_obj = driver->sync_obj_ref(sync_obj);
+	bo->sync_obj_read = driver->sync_obj_ref(sync_obj);
+	bo->sync_obj_write = driver->sync_obj_ref(sync_obj);
 	bo->sync_obj_arg = sync_obj_arg;
 	if (evict) {
-		ret = ttm_bo_wait(bo, false, false, false);
+		ret = ttm_bo_wait(bo, false, false, false,
+				  TTM_USAGE_READWRITE);
 		spin_unlock(&bdev->fence_lock);
 		if (tmp_obj)
 			driver->sync_obj_unref(&tmp_obj);
+		if (tmp_obj_read)
+			driver->sync_obj_unref(&tmp_obj_read);
+		if (tmp_obj_write)
+			driver->sync_obj_unref(&tmp_obj_write);
 		if (ret)
 			return ret;
 
@@ -655,6 +667,10 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
 		spin_unlock(&bdev->fence_lock);
 		if (tmp_obj)
 			driver->sync_obj_unref(&tmp_obj);
+		if (tmp_obj_read)
+			driver->sync_obj_unref(&tmp_obj_read);
+		if (tmp_obj_write)
+			driver->sync_obj_unref(&tmp_obj_write);
 
 		ret = ttm_buffer_object_transfer(bo, &ghost_obj);
 		if (ret)
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 221b924..ff1e26f 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -122,7 +122,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	spin_lock(&bdev->fence_lock);
 	if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
-		ret = ttm_bo_wait(bo, false, true, false);
+		ret = ttm_bo_wait(bo, false, true, false, TTM_USAGE_READWRITE);
 		spin_unlock(&bdev->fence_lock);
 		if (unlikely(ret != 0)) {
 			retval = (ret != -ERESTARTSYS) ?
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index 3832fe1..36d111a 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -221,8 +221,18 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
 
 	list_for_each_entry(entry, list, head) {
 		bo = entry->bo;
+		entry->old_sync_obj_read = NULL;
+		entry->old_sync_obj_write = NULL;
 		entry->old_sync_obj = bo->sync_obj;
 		bo->sync_obj = driver->sync_obj_ref(sync_obj);
+		if (entry->usage & TTM_USAGE_READ) {
+			entry->old_sync_obj_read = bo->sync_obj_read;
+			bo->sync_obj_read = driver->sync_obj_ref(sync_obj);
+		}
+		if (entry->usage & TTM_USAGE_WRITE) {
+			entry->old_sync_obj_write = bo->sync_obj_write;
+			bo->sync_obj_write = driver->sync_obj_ref(sync_obj);
+		}
 		bo->sync_obj_arg = entry->new_sync_obj_arg;
 		ttm_bo_unreserve_locked(bo);
 		entry->reserved = false;
@@ -231,8 +241,15 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
 	spin_unlock(&bdev->fence_lock);
 
 	list_for_each_entry(entry, list, head) {
-		if (entry->old_sync_obj)
+		if (entry->old_sync_obj) {
 			driver->sync_obj_unref(&entry->old_sync_obj);
+		}
+		if (entry->old_sync_obj_read) {
+			driver->sync_obj_unref(&entry->old_sync_obj_read);
+		}
+		if (entry->old_sync_obj_write) {
+			driver->sync_obj_unref(&entry->old_sync_obj_write);
+		}
 	}
 }
 EXPORT_SYMBOL(ttm_eu_fence_buffer_objects);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index 41b95ed..8ca3ddb 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -224,6 +224,7 @@ static int vmw_translate_guest_ptr(struct vmw_private *dev_priv,
 	if (unlikely(cur_validate_node == sw_context->cur_val_buf)) {
 		val_buf = &sw_context->val_bufs[cur_validate_node];
 		val_buf->bo = ttm_bo_reference(bo);
+		val_buf->usage = TTM_USAGE_READWRITE;
 		val_buf->new_sync_obj_arg = (void *) dev_priv;
 		list_add_tail(&val_buf->head, &sw_context->validate_nodes);
 		++sw_context->cur_val_buf;
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 42e3469..da957bf 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -44,6 +44,11 @@ struct ttm_bo_device;
 
 struct drm_mm_node;
 
+enum ttm_buffer_usage {
+    TTM_USAGE_READ = 1,
+    TTM_USAGE_WRITE = 2,
+    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
+};
 
 /**
  * struct ttm_placement
@@ -174,7 +179,10 @@ struct ttm_tt;
  * the bo_device::lru_lock.
  * @reserved: Deadlock-free lock used for synchronization state transitions.
  * @sync_obj_arg: Opaque argument to synchronization object function.
- * @sync_obj: Pointer to a synchronization object.
+ * @sync_obj: Pointer to a synchronization object of a last read or write,
+ * whichever is later.
+ * @sync_obj_read: Pointer to a synchronization object of a last read.
+ * @sync_obj_write: Pointer to a synchronization object of a last write.
  * @priv_flags: Flags describing buffer object internal state.
  * @vm_rb: Rb node for the vm rb tree.
  * @vm_node: Address space manager node.
@@ -258,6 +266,8 @@ struct ttm_buffer_object {
 
 	void *sync_obj_arg;
 	void *sync_obj;
+	void *sync_obj_read;
+	void *sync_obj_write;
 	unsigned long priv_flags;
 
 	/**
@@ -325,6 +335,7 @@ ttm_bo_reference(struct ttm_buffer_object *bo)
  * @bo:  The buffer object.
  * @interruptible:  Use interruptible wait.
  * @no_wait:  Return immediately if buffer is busy.
+ * @usage:  Whether to wait for the last read and/or the last write.
  *
  * This function must be called with the bo::mutex held, and makes
  * sure any previous rendering to the buffer is completed.
@@ -334,7 +345,8 @@ ttm_bo_reference(struct ttm_buffer_object *bo)
  * Returns -ERESTARTSYS if interrupted by a signal.
  */
 extern int ttm_bo_wait(struct ttm_buffer_object *bo, bool lazy,
-		       bool interruptible, bool no_wait);
+		       bool interruptible, bool no_wait,
+		       enum ttm_buffer_usage usage);
 /**
  * ttm_bo_validate
  *
diff --git a/include/drm/ttm/ttm_execbuf_util.h b/include/drm/ttm/ttm_execbuf_util.h
index 26cc7f9..375f299 100644
--- a/include/drm/ttm/ttm_execbuf_util.h
+++ b/include/drm/ttm/ttm_execbuf_util.h
@@ -41,20 +41,26 @@
  * @bo:             refcounted buffer object pointer.
  * @new_sync_obj_arg: New sync_obj_arg for @bo, to be used once
  * adding a new sync object.
+ * @usage           Indicates how @bo is used by the device.
  * @reserved:       Indicates whether @bo has been reserved for validation.
  * @removed:        Indicates whether @bo has been removed from lru lists.
  * @put_count:      Number of outstanding references on bo::list_kref.
  * @old_sync_obj:   Pointer to a sync object about to be unreferenced
+ * @old_sync_obj_read: Pointer to a read sync object about to be unreferenced.
+ * @old_sync_obj_write: Pointer to a write sync object about to be unreferenced.
  */
 
 struct ttm_validate_buffer {
 	struct list_head head;
 	struct ttm_buffer_object *bo;
 	void *new_sync_obj_arg;
+	enum ttm_buffer_usage usage;
 	bool reserved;
 	bool removed;
 	int put_count;
 	void *old_sync_obj;
+	void *old_sync_obj_read;
+	void *old_sync_obj_write;
 };
 
 /**
-- 
1.7.4.1


[-- Attachment #3: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-08-07 20:39 [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write Marek Olšák
  2011-08-07 20:39 ` [PATCH 2/2] drm/radeon/kms: add a new gem_wait ioctl with read/write flags Marek Olšák
  2011-08-12 17:21 ` [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write Jerome Glisse
@ 2011-10-04 11:48 ` Thomas Hellstrom
  2011-10-05  2:08   ` Marek Olšák
  2011-10-07  8:58 ` Thomas Hellstrom
  3 siblings, 1 reply; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-04 11:48 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

On 08/07/2011 10:39 PM, Marek Olšák wrote:
> Sometimes we want to know whether a buffer is busy and wait for it (bo_wait).
> However, sometimes it would be more useful to be able to query whether
> a buffer is busy and being either read or written, and wait until it's stopped
> being either read or written. The point of this is to be able to avoid
> unnecessary waiting, e.g. if a GPU has written something to a buffer and is now
> reading that buffer, and a CPU wants to map that buffer for read, it needs to
> only wait for the last write. If there were no write, there wouldn't be any
> waiting needed.
>
> This, or course, requires user space drivers to send read/write flags
> with each relocation (like we have read/write domains in radeon, so we can
> actually use those for something useful now).
>
> Now how this patch works:
>
> The read/write flags should passed to ttm_validate_buffer. TTM maintains
> separate sync objects of the last read and write for each buffer, in addition
> to the sync object of the last use of a buffer. ttm_bo_wait then operates
> with one the sync objects.
>
> Signed-off-by: Marek Olšák<maraeo@gmail.com>
>    

Bah, I totally missed this patch and thus never reviewed it :( Probably 
on vacation.

There are a couple of things I'd like to point out.

1) The bo subsystem may never assume that fence objects are ordered, so 
that when we unref
bo::sync_obj, we may never assume that previously attached fence objects 
are signaled and can be unref'd
Think for example fence objects submitted to different command streams. 
This is a bug and must be fixed.
We can detach fence objects from buffers in the driver validation code, 
because that code knows whether fences are implicitly ordered, or can 
order them either by inserting a barrier (semaphore in NV languange) or 
waiting for the fence to expire. (For example if the new validation is 
READ and the fence currently attached is WRITE, we might need to 
schedule a gpu cache flush before detaching the write fence).

2) Can't we say that a write_sync_obj is simply a sync_obj? What's the 
difference between those two? I think we should remove the 
write_sync_obj bo member.

3) Ideally we should have a linked list of read sync objects, with their 
own sync_obj_arg, but since there apparently aren't any consumers yet, 
we could wait with that.

/Thomas



_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-04 11:48 ` Thomas Hellstrom
@ 2011-10-05  2:08   ` Marek Olšák
  2011-10-05  5:54     ` Thomas Hellstrom
  0 siblings, 1 reply; 31+ messages in thread
From: Marek Olšák @ 2011-10-05  2:08 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Tue, Oct 4, 2011 at 1:48 PM, Thomas Hellstrom <thomas@shipmail.org> wrote:
> Bah, I totally missed this patch and thus never reviewed it :( Probably on
> vacation.
>
> There are a couple of things I'd like to point out.
>
> 1) The bo subsystem may never assume that fence objects are ordered, so that
> when we unref
> bo::sync_obj, we may never assume that previously attached fence objects are
> signaled and can be unref'd
> Think for example fence objects submitted to different command streams. This
> is a bug and must be fixed.

If what you say is true, then even the original sync_obj can't be
trusted. What if I overwrite sync_obj with a new one and the new one
is signalled sooner than the old one?


> We can detach fence objects from buffers in the driver validation code,
> because that code knows whether fences are implicitly ordered, or can order
> them either by inserting a barrier (semaphore in NV languange) or waiting

I am not sure I follow you here. ttm_bo_wait needs the fences...
unless we want to move the fences out of TTM into drivers.


> for the fence to expire. (For example if the new validation is READ and the
> fence currently attached is WRITE, we might need to schedule a gpu cache
> flush before detaching the write fence).

I am not sure what fences have to do with flushing. Write caches
should be flushed automatically when resources are unbound. When a
resource is used for write and read at the same time, it's not our
problem: the user is responsible for flushing (e.g. through memory and
texture barriers in OpenGL), not the driver.


>
> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's the
> difference between those two? I think we should remove the write_sync_obj bo
> member.

Okay, but I think we should remove sync_obj instead, and keep write
and read sync objs. In the case of READWRITE usage, read_sync_obj
would be equal to write_sync_obj.

Marek

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-05  2:08   ` Marek Olšák
@ 2011-10-05  5:54     ` Thomas Hellstrom
  2011-10-06 22:42       ` Marek Olšák
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-05  5:54 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

On 10/05/2011 04:08 AM, Marek Olšák wrote:
> On Tue, Oct 4, 2011 at 1:48 PM, Thomas Hellstrom<thomas@shipmail.org>  wrote:
>    
>> Bah, I totally missed this patch and thus never reviewed it :( Probably on
>> vacation.
>>
>> There are a couple of things I'd like to point out.
>>
>> 1) The bo subsystem may never assume that fence objects are ordered, so that
>> when we unref
>> bo::sync_obj, we may never assume that previously attached fence objects are
>> signaled and can be unref'd
>> Think for example fence objects submitted to different command streams. This
>> is a bug and must be fixed.
>>      
> If what you say is true, then even the original sync_obj can't be
> trusted. What if I overwrite sync_obj with a new one and the new one
> is signalled sooner than the old one?
>    

The driver validation code will in effect overwrite the old with a new 
one, because the driver validation code knows what sync objects are 
ordered. If, during validation of a buffer object, the driver validation 
code detects that the buffer is already fenced with a sync object that 
will signal out-of-order, the driver validation code needs to *wait* for 
that sync object to signal before proceeding, or insert a sync object 
barrier in the command stream.

The TTM bo code doesn't know how to order fences, and never assumes that 
they are ordered.

>
>    
>> We can detach fence objects from buffers in the driver validation code,
>> because that code knows whether fences are implicitly ordered, or can order
>> them either by inserting a barrier (semaphore in NV languange) or waiting
>>      
> I am not sure I follow you here. ttm_bo_wait needs the fences...
> unless we want to move the fences out of TTM into drivers.
>    

Please see the above explanation.

>
>    
>> for the fence to expire. (For example if the new validation is READ and the
>> fence currently attached is WRITE, we might need to schedule a gpu cache
>> flush before detaching the write fence).
>>      
> I am not sure what fences have to do with flushing. Write caches
> should be flushed automatically when resources are unbound. When a
> resource is used for write and read at the same time, it's not our
> problem: the user is responsible for flushing (e.g. through memory and
> texture barriers in OpenGL), not the driver.
>    

How flushing is done is up to the driver writer, (fences is an excellent 
tool to do it in an efficient way), but barriers like the write-read 
barrier example above may need to be inserted for various reasons. Let's 
say you use render-to-texture, unbind the texture from the fbo, and then 
want to texture from it. At some point you *need* to flush if you have a 
write cache, and that flush needs to happen when you remove the write 
fence from the buffer, in order to replace it with a read fence, since 
after that the information that the buffer has been written to is gone. 
IIRC nouveau uses barriers like this to order fences from different 
command streams, Unichrome used it to order fences from different 
hardware engines.

In any case, I'm not saying fences is the best way to flush but since 
the bo code assumes that signaling a sync object means "make the buffer 
contents available for CPU read / write", it's usually a good way to do 
it; there's even a sync_obj_flush() method that gets called when a 
potential flush needs to happen.

>
>    
>> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's the
>> difference between those two? I think we should remove the write_sync_obj bo
>> member.
>>      
> Okay, but I think we should remove sync_obj instead, and keep write
> and read sync objs. In the case of READWRITE usage, read_sync_obj
> would be equal to write_sync_obj.
>
>    

Sure, I'm fine with that.

One other thing, though, that makes me a little puzzled:

Let's assume you don't allow readers and writers at the same time, which 
is my perception of how read- and write fences should work; you either 
have a list of read fences or a single write fence (in the same way a 
read-write lock works).

Now, if you only allow a single read fence, like in this patch. That 
implies that you can only have either a single read fence or a single 
write fence at any one time. We'd only need a single fence pointer on 
the bo, and sync_obj_arg would tell us whether to signal the fence for 
read or for write (assuming that sync_obj_arg was set up to indicate 
read / write at validation time), then the patch really isn't necessary 
at all, as it only allows a single read fence?

Or is it that you want to allow read- and write fences co-existing? In 
that case, what's the use case?


Thanks,
Thomas



> Marek
>    



_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-05  5:54     ` Thomas Hellstrom
@ 2011-10-06 22:42       ` Marek Olšák
  2011-10-07  8:00         ` Thomas Hellstrom
  0 siblings, 1 reply; 31+ messages in thread
From: Marek Olšák @ 2011-10-06 22:42 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Wed, Oct 5, 2011 at 7:54 AM, Thomas Hellstrom <thomas@shipmail.org> wrote:
> In any case, I'm not saying fences is the best way to flush but since the bo
> code assumes that signaling a sync object means "make the buffer contents
> available for CPU read / write", it's usually a good way to do it; there's
> even a sync_obj_flush() method that gets called when a potential flush needs
> to happen.

I don't think we use it like that. To my knowledge, the purpose of the
sync obj (to Radeon Gallium drivers at least) is to be able to wait
for the last use of a buffer. Whether the contents can or cannot be
available to the CPU is totally irrelevant.

Currently (and it's a very important performance optimization),
buffers stay mapped and available for CPU read/write during their
first map_buffer call. Unmap_buffer is a no-op. The unmapping happens
on buffer destruction. We only call bo_wait when we want to wait for
the GPU until it's done with the buffer (we don't always want that,
sometimes we want to use the unsychronized flag). Otherwise the
contents of buffers are available at *any time*.

We could probably implement bo_wait privately in the kernel driver and
not use ttm_bo_wait. I preferred code sharing though.

Textures (especially the tiled ones) are never mapped directly and a
temporary staging resource is used instead, so we don't actually
pollute address space that much. (in case you would have such a
remark) We will use staging resources for buffers too, but it's really
the last resort to avoid waiting when direct access can't be used.


>>> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's the
>>> difference between those two? I think we should remove the write_sync_obj
>>> bo
>>> member.
>>>
>>
>> Okay, but I think we should remove sync_obj instead, and keep write
>> and read sync objs. In the case of READWRITE usage, read_sync_obj
>> would be equal to write_sync_obj.
>>
>>
>
> Sure, I'm fine with that.
>
> One other thing, though, that makes me a little puzzled:
>
> Let's assume you don't allow readers and writers at the same time, which is
> my perception of how read- and write fences should work; you either have a
> list of read fences or a single write fence (in the same way a read-write
> lock works).
>
> Now, if you only allow a single read fence, like in this patch. That implies
> that you can only have either a single read fence or a single write fence at
> any one time. We'd only need a single fence pointer on the bo, and
> sync_obj_arg would tell us whether to signal the fence for read or for write
> (assuming that sync_obj_arg was set up to indicate read / write at
> validation time), then the patch really isn't necessary at all, as it only
> allows a single read fence?
>
> Or is it that you want to allow read- and write fences co-existing? In that
> case, what's the use case?

There are lots of read-write use cases which don't need any barriers
or flushing. The obvious ones are color blending and depth-stencil
buffering. The OpenGL application is also allowed to use a subrange of
a buffer as a vertex buffer (read-only) and another disjoint subrange
of the same buffer for transform feedback (write-only), which kinda
makes me think about whether we should track subranges instead of
treating a whole buffer as "busy". It gets even more funky with
ARB_shader_image_load_store, which supports atomic read-modify-write
operations on textures, not to mention atomic memory operations in
compute shaders (wait, isn't that also exposed in GL as
GL_ARB_shader_atomic_counters?).

I was thinking whether the two sync objs should be "read" and
"readwrite", or "read" and "write". I chose the latter, because it's
more fine-grained and we have to keep at least two of them around
anyway.

So now that you know what we use sync objs for, what are your ideas on
re-implementing that patch in a way that is okay with you? Besides
removing the third sync objs of course.

Marek

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-06 22:42       ` Marek Olšák
@ 2011-10-07  8:00         ` Thomas Hellstrom
  2011-10-07 13:24           ` Alex Deucher
                             ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-07  8:00 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

On 10/07/2011 12:42 AM, Marek Olšák wrote:
> On Wed, Oct 5, 2011 at 7:54 AM, Thomas Hellstrom<thomas@shipmail.org>  wrote:
>    
>> In any case, I'm not saying fences is the best way to flush but since the bo
>> code assumes that signaling a sync object means "make the buffer contents
>> available for CPU read / write", it's usually a good way to do it; there's
>> even a sync_obj_flush() method that gets called when a potential flush needs
>> to happen.
>>      
> I don't think we use it like that. To my knowledge, the purpose of the
> sync obj (to Radeon Gallium drivers at least) is to be able to wait
> for the last use of a buffer. Whether the contents can or cannot be
> available to the CPU is totally irrelevant.
>
> Currently (and it's a very important performance optimization),
> buffers stay mapped and available for CPU read/write during their
> first map_buffer call. Unmap_buffer is a no-op. The unmapping happens
> on buffer destruction. We only call bo_wait when we want to wait for
> the GPU until it's done with the buffer (we don't always want that,
> sometimes we want to use the unsychronized flag). Otherwise the
> contents of buffers are available at *any time*.
>
> We could probably implement bo_wait privately in the kernel driver and
> not use ttm_bo_wait. I preferred code sharing though.
>
> Textures (especially the tiled ones) are never mapped directly and a
> temporary staging resource is used instead, so we don't actually
> pollute address space that much. (in case you would have such a
> remark) We will use staging resources for buffers too, but it's really
> the last resort to avoid waiting when direct access can't be used.
>
>
>    
>>>> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's the
>>>> difference between those two? I think we should remove the write_sync_obj
>>>> bo
>>>> member.
>>>>
>>>>          
>>> Okay, but I think we should remove sync_obj instead, and keep write
>>> and read sync objs. In the case of READWRITE usage, read_sync_obj
>>> would be equal to write_sync_obj.
>>>
>>>
>>>        
>> Sure, I'm fine with that.
>>
>> One other thing, though, that makes me a little puzzled:
>>
>> Let's assume you don't allow readers and writers at the same time, which is
>> my perception of how read- and write fences should work; you either have a
>> list of read fences or a single write fence (in the same way a read-write
>> lock works).
>>
>> Now, if you only allow a single read fence, like in this patch. That implies
>> that you can only have either a single read fence or a single write fence at
>> any one time. We'd only need a single fence pointer on the bo, and
>> sync_obj_arg would tell us whether to signal the fence for read or for write
>> (assuming that sync_obj_arg was set up to indicate read / write at
>> validation time), then the patch really isn't necessary at all, as it only
>> allows a single read fence?
>>
>> Or is it that you want to allow read- and write fences co-existing? In that
>> case, what's the use case?
>>      
> There are lots of read-write use cases which don't need any barriers
> or flushing. The obvious ones are color blending and depth-stencil
> buffering. The OpenGL application is also allowed to use a subrange of
> a buffer as a vertex buffer (read-only) and another disjoint subrange
> of the same buffer for transform feedback (write-only), which kinda
> makes me think about whether we should track subranges instead of
> treating a whole buffer as "busy". It gets even more funky with
> ARB_shader_image_load_store, which supports atomic read-modify-write
> operations on textures, not to mention atomic memory operations in
> compute shaders (wait, isn't that also exposed in GL as
> GL_ARB_shader_atomic_counters?).
>
> I was thinking whether the two sync objs should be "read" and
> "readwrite", or "read" and "write". I chose the latter, because it's
> more fine-grained and we have to keep at least two of them around
> anyway.
>
> So now that you know what we use sync objs for, what are your ideas on
> re-implementing that patch in a way that is okay with you? Besides
> removing the third sync objs of course.
>
> Marek
>    
OK. First I think we need to make a distinction: bo sync objects vs 
driver fences. The bo sync obj api is there to strictly provide 
functionality that the ttm bo subsystem is using, and that follows a 
simple set of rules:

1) the bo subsystem does never assume sync objects are ordered. That 
means the bo subsystem needs to wait on a sync object before removing it 
from a buffer. Any other assumption is buggy and must be fixed. BUT, if 
that assumption takes place in the driver unknowingly from the ttm bo 
subsystem (which is usually the case), it's OK.

2) When the sync object(s) attached to the bo are signaled the ttm bo 
subsystem is free to copy the bo contents and to unbind the bo.

3) The ttm bo system allows sync objects to be signaled in different 
ways opaque to the subsystem using sync_obj_arg. The driver is 
responsible for setting up that argument.

4) Driver fences may be used for or expose other functionality or 
adaptions to APIs as long as the sync obj api exported to the bo 
sybsystem follows the above rules.

This means the following w r t the patch.

A) it violates 1). This is a bug that must be fixed. Assumptions that if 
one sync object is singnaled, another sync object is also signaled must 
be done in the driver and not in the bo subsystem. Hence we need to 
explicitly wait for a fence to remove it from the bo.

B) the sync_obj_arg carries *per-sync-obj* information on how it should 
be signaled. If we need to attach multiple sync objects to a buffer 
object, we also need multiple sync_obj_args. This is a bug and needs to 
be fixed.

C) There is really only one reason that the ttm bo subsystem should care 
about multiple sync objects, and that is because the driver can't order 
them efficiently. A such example would be hardware with multiple pipes 
reading simultaneously from the same texture buffer. Currently we don't 
support this so only the *last* sync object needs to be know by the bo 
subsystem. Keeping track of multiple fences generates a lot of 
completely unnecessary code in the ttm_bo_util file, the ttm_bo_vm file, 
and will be a nightmare if / when we truly support pipelined moves.

As I understand it from your patches, you want to keep multiple fences 
around only to track rendering history. If we want to do that 
generically, i suggest doing it in the execbuf util code in the 
following way:

struct ttm_eu_rendering_history {
     void *last_read_sync_obj;
     void *last_read_sync_obj_arg;
     void *last_write_sync_obj;
     void *last_write_sync_obj_arg;
}

Embed this structure in the radeon_bo, and build a small api around it, 
including *optionally* passing it to the existing execbuf utilities, and 
you should be done. The bo_util code and bo_vm code doesn't care about 
the rendering history. Only that the bo is completely idle.

Note also that when an accelerated bo move is scheduled, the driver 
needs to update this struct

/Thomas




















_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-08-07 20:39 [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write Marek Olšák
                   ` (2 preceding siblings ...)
  2011-10-04 11:48 ` Thomas Hellstrom
@ 2011-10-07  8:58 ` Thomas Hellstrom
  2011-10-08 10:26   ` Ville Syrjälä
  3 siblings, 1 reply; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-07  8:58 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

Oh, and one more style comment below:

On 08/07/2011 10:39 PM, Marek Olšák wrote:
>
> +enum ttm_buffer_usage {
> +    TTM_USAGE_READ = 1,
> +    TTM_USAGE_WRITE = 2,
> +    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
> +};
>
>    

Please don't use enums for bit operations.

#define TTM_USAGE_FLAG_READ   (1 << 0)
#define TTM_USAGE_FLAG_WRITE  (1 << 1)

/Thomas



_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07  8:00         ` Thomas Hellstrom
@ 2011-10-07 13:24           ` Alex Deucher
  2011-10-07 14:05             ` Thomas Hellstrom
  2011-10-07 13:38           ` Jerome Glisse
  2011-10-07 22:03           ` Marek Olšák
  2 siblings, 1 reply; 31+ messages in thread
From: Alex Deucher @ 2011-10-07 13:24 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Fri, Oct 7, 2011 at 4:00 AM, Thomas Hellstrom <thomas@shipmail.org> wrote:
> On 10/07/2011 12:42 AM, Marek Olšák wrote:
>>
>> On Wed, Oct 5, 2011 at 7:54 AM, Thomas Hellstrom<thomas@shipmail.org>
>>  wrote:
>>
>>>
>>> In any case, I'm not saying fences is the best way to flush but since the
>>> bo
>>> code assumes that signaling a sync object means "make the buffer contents
>>> available for CPU read / write", it's usually a good way to do it;
>>> there's
>>> even a sync_obj_flush() method that gets called when a potential flush
>>> needs
>>> to happen.
>>>
>>
>> I don't think we use it like that. To my knowledge, the purpose of the
>> sync obj (to Radeon Gallium drivers at least) is to be able to wait
>> for the last use of a buffer. Whether the contents can or cannot be
>> available to the CPU is totally irrelevant.
>>
>> Currently (and it's a very important performance optimization),
>> buffers stay mapped and available for CPU read/write during their
>> first map_buffer call. Unmap_buffer is a no-op. The unmapping happens
>> on buffer destruction. We only call bo_wait when we want to wait for
>> the GPU until it's done with the buffer (we don't always want that,
>> sometimes we want to use the unsychronized flag). Otherwise the
>> contents of buffers are available at *any time*.
>>
>> We could probably implement bo_wait privately in the kernel driver and
>> not use ttm_bo_wait. I preferred code sharing though.
>>
>> Textures (especially the tiled ones) are never mapped directly and a
>> temporary staging resource is used instead, so we don't actually
>> pollute address space that much. (in case you would have such a
>> remark) We will use staging resources for buffers too, but it's really
>> the last resort to avoid waiting when direct access can't be used.
>>
>>
>>
>>>>>
>>>>> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's the
>>>>> difference between those two? I think we should remove the
>>>>> write_sync_obj
>>>>> bo
>>>>> member.
>>>>>
>>>>>
>>>>
>>>> Okay, but I think we should remove sync_obj instead, and keep write
>>>> and read sync objs. In the case of READWRITE usage, read_sync_obj
>>>> would be equal to write_sync_obj.
>>>>
>>>>
>>>>
>>>
>>> Sure, I'm fine with that.
>>>
>>> One other thing, though, that makes me a little puzzled:
>>>
>>> Let's assume you don't allow readers and writers at the same time, which
>>> is
>>> my perception of how read- and write fences should work; you either have
>>> a
>>> list of read fences or a single write fence (in the same way a read-write
>>> lock works).
>>>
>>> Now, if you only allow a single read fence, like in this patch. That
>>> implies
>>> that you can only have either a single read fence or a single write fence
>>> at
>>> any one time. We'd only need a single fence pointer on the bo, and
>>> sync_obj_arg would tell us whether to signal the fence for read or for
>>> write
>>> (assuming that sync_obj_arg was set up to indicate read / write at
>>> validation time), then the patch really isn't necessary at all, as it
>>> only
>>> allows a single read fence?
>>>
>>> Or is it that you want to allow read- and write fences co-existing? In
>>> that
>>> case, what's the use case?
>>>
>>
>> There are lots of read-write use cases which don't need any barriers
>> or flushing. The obvious ones are color blending and depth-stencil
>> buffering. The OpenGL application is also allowed to use a subrange of
>> a buffer as a vertex buffer (read-only) and another disjoint subrange
>> of the same buffer for transform feedback (write-only), which kinda
>> makes me think about whether we should track subranges instead of
>> treating a whole buffer as "busy". It gets even more funky with
>> ARB_shader_image_load_store, which supports atomic read-modify-write
>> operations on textures, not to mention atomic memory operations in
>> compute shaders (wait, isn't that also exposed in GL as
>> GL_ARB_shader_atomic_counters?).
>>
>> I was thinking whether the two sync objs should be "read" and
>> "readwrite", or "read" and "write". I chose the latter, because it's
>> more fine-grained and we have to keep at least two of them around
>> anyway.
>>
>> So now that you know what we use sync objs for, what are your ideas on
>> re-implementing that patch in a way that is okay with you? Besides
>> removing the third sync objs of course.
>>
>> Marek
>>
>
> OK. First I think we need to make a distinction: bo sync objects vs driver
> fences. The bo sync obj api is there to strictly provide functionality that
> the ttm bo subsystem is using, and that follows a simple set of rules:
>
> 1) the bo subsystem does never assume sync objects are ordered. That means
> the bo subsystem needs to wait on a sync object before removing it from a
> buffer. Any other assumption is buggy and must be fixed. BUT, if that
> assumption takes place in the driver unknowingly from the ttm bo subsystem
> (which is usually the case), it's OK.
>
> 2) When the sync object(s) attached to the bo are signaled the ttm bo
> subsystem is free to copy the bo contents and to unbind the bo.
>
> 3) The ttm bo system allows sync objects to be signaled in different ways
> opaque to the subsystem using sync_obj_arg. The driver is responsible for
> setting up that argument.
>
> 4) Driver fences may be used for or expose other functionality or adaptions
> to APIs as long as the sync obj api exported to the bo sybsystem follows the
> above rules.
>
> This means the following w r t the patch.
>
> A) it violates 1). This is a bug that must be fixed. Assumptions that if one
> sync object is singnaled, another sync object is also signaled must be done
> in the driver and not in the bo subsystem. Hence we need to explicitly wait
> for a fence to remove it from the bo.
>
> B) the sync_obj_arg carries *per-sync-obj* information on how it should be
> signaled. If we need to attach multiple sync objects to a buffer object, we
> also need multiple sync_obj_args. This is a bug and needs to be fixed.
>
> C) There is really only one reason that the ttm bo subsystem should care
> about multiple sync objects, and that is because the driver can't order them
> efficiently. A such example would be hardware with multiple pipes reading
> simultaneously from the same texture buffer. Currently we don't support this
> so only the *last* sync object needs to be know by the bo subsystem. Keeping
> track of multiple fences generates a lot of completely unnecessary code in
> the ttm_bo_util file, the ttm_bo_vm file, and will be a nightmare if / when
> we truly support pipelined moves.

Just an FYI, cayman GPUs have multiple rings and we will be working to
support them in the coming months.

Alex

>
> As I understand it from your patches, you want to keep multiple fences
> around only to track rendering history. If we want to do that generically, i
> suggest doing it in the execbuf util code in the following way:
>
> struct ttm_eu_rendering_history {
>    void *last_read_sync_obj;
>    void *last_read_sync_obj_arg;
>    void *last_write_sync_obj;
>    void *last_write_sync_obj_arg;
> }
>
> Embed this structure in the radeon_bo, and build a small api around it,
> including *optionally* passing it to the existing execbuf utilities, and you
> should be done. The bo_util code and bo_vm code doesn't care about the
> rendering history. Only that the bo is completely idle.
>
> Note also that when an accelerated bo move is scheduled, the driver needs to
> update this struct
>
> /Thomas
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07  8:00         ` Thomas Hellstrom
  2011-10-07 13:24           ` Alex Deucher
@ 2011-10-07 13:38           ` Jerome Glisse
  2011-10-07 13:42             ` Jerome Glisse
                               ` (2 more replies)
  2011-10-07 22:03           ` Marek Olšák
  2 siblings, 3 replies; 31+ messages in thread
From: Jerome Glisse @ 2011-10-07 13:38 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Fri, Oct 7, 2011 at 4:00 AM, Thomas Hellstrom <thomas@shipmail.org> wrote:
> On 10/07/2011 12:42 AM, Marek Olšák wrote:
>>
>> On Wed, Oct 5, 2011 at 7:54 AM, Thomas Hellstrom<thomas@shipmail.org>
>>  wrote:
>>
>>>
>>> In any case, I'm not saying fences is the best way to flush but since the
>>> bo
>>> code assumes that signaling a sync object means "make the buffer contents
>>> available for CPU read / write", it's usually a good way to do it;
>>> there's
>>> even a sync_obj_flush() method that gets called when a potential flush
>>> needs
>>> to happen.
>>>
>>
>> I don't think we use it like that. To my knowledge, the purpose of the
>> sync obj (to Radeon Gallium drivers at least) is to be able to wait
>> for the last use of a buffer. Whether the contents can or cannot be
>> available to the CPU is totally irrelevant.
>>
>> Currently (and it's a very important performance optimization),
>> buffers stay mapped and available for CPU read/write during their
>> first map_buffer call. Unmap_buffer is a no-op. The unmapping happens
>> on buffer destruction. We only call bo_wait when we want to wait for
>> the GPU until it's done with the buffer (we don't always want that,
>> sometimes we want to use the unsychronized flag). Otherwise the
>> contents of buffers are available at *any time*.
>>
>> We could probably implement bo_wait privately in the kernel driver and
>> not use ttm_bo_wait. I preferred code sharing though.
>>
>> Textures (especially the tiled ones) are never mapped directly and a
>> temporary staging resource is used instead, so we don't actually
>> pollute address space that much. (in case you would have such a
>> remark) We will use staging resources for buffers too, but it's really
>> the last resort to avoid waiting when direct access can't be used.
>>
>>
>>
>>>>>
>>>>> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's the
>>>>> difference between those two? I think we should remove the
>>>>> write_sync_obj
>>>>> bo
>>>>> member.
>>>>>
>>>>>
>>>>
>>>> Okay, but I think we should remove sync_obj instead, and keep write
>>>> and read sync objs. In the case of READWRITE usage, read_sync_obj
>>>> would be equal to write_sync_obj.
>>>>
>>>>
>>>>
>>>
>>> Sure, I'm fine with that.
>>>
>>> One other thing, though, that makes me a little puzzled:
>>>
>>> Let's assume you don't allow readers and writers at the same time, which
>>> is
>>> my perception of how read- and write fences should work; you either have
>>> a
>>> list of read fences or a single write fence (in the same way a read-write
>>> lock works).
>>>
>>> Now, if you only allow a single read fence, like in this patch. That
>>> implies
>>> that you can only have either a single read fence or a single write fence
>>> at
>>> any one time. We'd only need a single fence pointer on the bo, and
>>> sync_obj_arg would tell us whether to signal the fence for read or for
>>> write
>>> (assuming that sync_obj_arg was set up to indicate read / write at
>>> validation time), then the patch really isn't necessary at all, as it
>>> only
>>> allows a single read fence?
>>>
>>> Or is it that you want to allow read- and write fences co-existing? In
>>> that
>>> case, what's the use case?
>>>
>>
>> There are lots of read-write use cases which don't need any barriers
>> or flushing. The obvious ones are color blending and depth-stencil
>> buffering. The OpenGL application is also allowed to use a subrange of
>> a buffer as a vertex buffer (read-only) and another disjoint subrange
>> of the same buffer for transform feedback (write-only), which kinda
>> makes me think about whether we should track subranges instead of
>> treating a whole buffer as "busy". It gets even more funky with
>> ARB_shader_image_load_store, which supports atomic read-modify-write
>> operations on textures, not to mention atomic memory operations in
>> compute shaders (wait, isn't that also exposed in GL as
>> GL_ARB_shader_atomic_counters?).
>>
>> I was thinking whether the two sync objs should be "read" and
>> "readwrite", or "read" and "write". I chose the latter, because it's
>> more fine-grained and we have to keep at least two of them around
>> anyway.
>>
>> So now that you know what we use sync objs for, what are your ideas on
>> re-implementing that patch in a way that is okay with you? Besides
>> removing the third sync objs of course.
>>
>> Marek
>>
>
> OK. First I think we need to make a distinction: bo sync objects vs driver
> fences. The bo sync obj api is there to strictly provide functionality that
> the ttm bo subsystem is using, and that follows a simple set of rules:
>
> 1) the bo subsystem does never assume sync objects are ordered. That means
> the bo subsystem needs to wait on a sync object before removing it from a
> buffer. Any other assumption is buggy and must be fixed. BUT, if that
> assumption takes place in the driver unknowingly from the ttm bo subsystem
> (which is usually the case), it's OK.
>
> 2) When the sync object(s) attached to the bo are signaled the ttm bo
> subsystem is free to copy the bo contents and to unbind the bo.
>
> 3) The ttm bo system allows sync objects to be signaled in different ways
> opaque to the subsystem using sync_obj_arg. The driver is responsible for
> setting up that argument.
>
> 4) Driver fences may be used for or expose other functionality or adaptions
> to APIs as long as the sync obj api exported to the bo sybsystem follows the
> above rules.
>
> This means the following w r t the patch.
>
> A) it violates 1). This is a bug that must be fixed. Assumptions that if one
> sync object is singnaled, another sync object is also signaled must be done
> in the driver and not in the bo subsystem. Hence we need to explicitly wait
> for a fence to remove it from the bo.
>
> B) the sync_obj_arg carries *per-sync-obj* information on how it should be
> signaled. If we need to attach multiple sync objects to a buffer object, we
> also need multiple sync_obj_args. This is a bug and needs to be fixed.
>
> C) There is really only one reason that the ttm bo subsystem should care
> about multiple sync objects, and that is because the driver can't order them
> efficiently. A such example would be hardware with multiple pipes reading
> simultaneously from the same texture buffer. Currently we don't support this
> so only the *last* sync object needs to be know by the bo subsystem. Keeping
> track of multiple fences generates a lot of completely unnecessary code in
> the ttm_bo_util file, the ttm_bo_vm file, and will be a nightmare if / when
> we truly support pipelined moves.
>
> As I understand it from your patches, you want to keep multiple fences
> around only to track rendering history. If we want to do that generically, i
> suggest doing it in the execbuf util code in the following way:
>
> struct ttm_eu_rendering_history {
>    void *last_read_sync_obj;
>    void *last_read_sync_obj_arg;
>    void *last_write_sync_obj;
>    void *last_write_sync_obj_arg;
> }
>
> Embed this structure in the radeon_bo, and build a small api around it,
> including *optionally* passing it to the existing execbuf utilities, and you
> should be done. The bo_util code and bo_vm code doesn't care about the
> rendering history. Only that the bo is completely idle.
>
> Note also that when an accelerated bo move is scheduled, the driver needs to
> update this struct
>
> /Thomas

I should have look at the patch long ago ... anyway i think a better
approach would be to expose fence id as 64bits unsigned to each
userspace client. I was thinking of mapping a page readonly (same page
as the one gpu write back) at somespecific offset in drm file (bit
like sarea but readonly so no lock). Each time userspace submit a
command stream it would get the fence id associated with the command
stream. It would then be up to userspace to track btw read or write
and do appropriate things. The kernel code would be simple (biggest
issue is finding an offset we can use for that), it would be fast as
no round trip to kernel to know the last fence.

Each fence seq would be valid only for a specific ring (only future
gpu impacted here, maybe cayman).

So no change to ttm, just change to radeon to return fence seq and to
move to an unsigned 64. Issue would be when gpu write back is
disabled, then we would either need userspace to call somethings like
bo wait or to other ioctl to get the kernel to update the copy, copy
would be updated in the irq handler too so at least it get updated
anytime something enable irq.

Cheers,
Jerome
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07 13:38           ` Jerome Glisse
@ 2011-10-07 13:42             ` Jerome Glisse
  2011-10-07 21:07               ` Marek Olšák
  2011-10-07 14:09             ` Thomas Hellstrom
  2011-10-07 21:30             ` Marek Olšák
  2 siblings, 1 reply; 31+ messages in thread
From: Jerome Glisse @ 2011-10-07 13:42 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Fri, Oct 7, 2011 at 9:38 AM, Jerome Glisse <j.glisse@gmail.com> wrote:
> On Fri, Oct 7, 2011 at 4:00 AM, Thomas Hellstrom <thomas@shipmail.org> wrote:
>> On 10/07/2011 12:42 AM, Marek Olšák wrote:
>>>
>>> On Wed, Oct 5, 2011 at 7:54 AM, Thomas Hellstrom<thomas@shipmail.org>
>>>  wrote:
>>>
>>>>
>>>> In any case, I'm not saying fences is the best way to flush but since the
>>>> bo
>>>> code assumes that signaling a sync object means "make the buffer contents
>>>> available for CPU read / write", it's usually a good way to do it;
>>>> there's
>>>> even a sync_obj_flush() method that gets called when a potential flush
>>>> needs
>>>> to happen.
>>>>
>>>
>>> I don't think we use it like that. To my knowledge, the purpose of the
>>> sync obj (to Radeon Gallium drivers at least) is to be able to wait
>>> for the last use of a buffer. Whether the contents can or cannot be
>>> available to the CPU is totally irrelevant.
>>>
>>> Currently (and it's a very important performance optimization),
>>> buffers stay mapped and available for CPU read/write during their
>>> first map_buffer call. Unmap_buffer is a no-op. The unmapping happens
>>> on buffer destruction. We only call bo_wait when we want to wait for
>>> the GPU until it's done with the buffer (we don't always want that,
>>> sometimes we want to use the unsychronized flag). Otherwise the
>>> contents of buffers are available at *any time*.
>>>
>>> We could probably implement bo_wait privately in the kernel driver and
>>> not use ttm_bo_wait. I preferred code sharing though.
>>>
>>> Textures (especially the tiled ones) are never mapped directly and a
>>> temporary staging resource is used instead, so we don't actually
>>> pollute address space that much. (in case you would have such a
>>> remark) We will use staging resources for buffers too, but it's really
>>> the last resort to avoid waiting when direct access can't be used.
>>>
>>>
>>>
>>>>>>
>>>>>> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's the
>>>>>> difference between those two? I think we should remove the
>>>>>> write_sync_obj
>>>>>> bo
>>>>>> member.
>>>>>>
>>>>>>
>>>>>
>>>>> Okay, but I think we should remove sync_obj instead, and keep write
>>>>> and read sync objs. In the case of READWRITE usage, read_sync_obj
>>>>> would be equal to write_sync_obj.
>>>>>
>>>>>
>>>>>
>>>>
>>>> Sure, I'm fine with that.
>>>>
>>>> One other thing, though, that makes me a little puzzled:
>>>>
>>>> Let's assume you don't allow readers and writers at the same time, which
>>>> is
>>>> my perception of how read- and write fences should work; you either have
>>>> a
>>>> list of read fences or a single write fence (in the same way a read-write
>>>> lock works).
>>>>
>>>> Now, if you only allow a single read fence, like in this patch. That
>>>> implies
>>>> that you can only have either a single read fence or a single write fence
>>>> at
>>>> any one time. We'd only need a single fence pointer on the bo, and
>>>> sync_obj_arg would tell us whether to signal the fence for read or for
>>>> write
>>>> (assuming that sync_obj_arg was set up to indicate read / write at
>>>> validation time), then the patch really isn't necessary at all, as it
>>>> only
>>>> allows a single read fence?
>>>>
>>>> Or is it that you want to allow read- and write fences co-existing? In
>>>> that
>>>> case, what's the use case?
>>>>
>>>
>>> There are lots of read-write use cases which don't need any barriers
>>> or flushing. The obvious ones are color blending and depth-stencil
>>> buffering. The OpenGL application is also allowed to use a subrange of
>>> a buffer as a vertex buffer (read-only) and another disjoint subrange
>>> of the same buffer for transform feedback (write-only), which kinda
>>> makes me think about whether we should track subranges instead of
>>> treating a whole buffer as "busy". It gets even more funky with
>>> ARB_shader_image_load_store, which supports atomic read-modify-write
>>> operations on textures, not to mention atomic memory operations in
>>> compute shaders (wait, isn't that also exposed in GL as
>>> GL_ARB_shader_atomic_counters?).
>>>
>>> I was thinking whether the two sync objs should be "read" and
>>> "readwrite", or "read" and "write". I chose the latter, because it's
>>> more fine-grained and we have to keep at least two of them around
>>> anyway.
>>>
>>> So now that you know what we use sync objs for, what are your ideas on
>>> re-implementing that patch in a way that is okay with you? Besides
>>> removing the third sync objs of course.
>>>
>>> Marek
>>>
>>
>> OK. First I think we need to make a distinction: bo sync objects vs driver
>> fences. The bo sync obj api is there to strictly provide functionality that
>> the ttm bo subsystem is using, and that follows a simple set of rules:
>>
>> 1) the bo subsystem does never assume sync objects are ordered. That means
>> the bo subsystem needs to wait on a sync object before removing it from a
>> buffer. Any other assumption is buggy and must be fixed. BUT, if that
>> assumption takes place in the driver unknowingly from the ttm bo subsystem
>> (which is usually the case), it's OK.
>>
>> 2) When the sync object(s) attached to the bo are signaled the ttm bo
>> subsystem is free to copy the bo contents and to unbind the bo.
>>
>> 3) The ttm bo system allows sync objects to be signaled in different ways
>> opaque to the subsystem using sync_obj_arg. The driver is responsible for
>> setting up that argument.
>>
>> 4) Driver fences may be used for or expose other functionality or adaptions
>> to APIs as long as the sync obj api exported to the bo sybsystem follows the
>> above rules.
>>
>> This means the following w r t the patch.
>>
>> A) it violates 1). This is a bug that must be fixed. Assumptions that if one
>> sync object is singnaled, another sync object is also signaled must be done
>> in the driver and not in the bo subsystem. Hence we need to explicitly wait
>> for a fence to remove it from the bo.
>>
>> B) the sync_obj_arg carries *per-sync-obj* information on how it should be
>> signaled. If we need to attach multiple sync objects to a buffer object, we
>> also need multiple sync_obj_args. This is a bug and needs to be fixed.
>>
>> C) There is really only one reason that the ttm bo subsystem should care
>> about multiple sync objects, and that is because the driver can't order them
>> efficiently. A such example would be hardware with multiple pipes reading
>> simultaneously from the same texture buffer. Currently we don't support this
>> so only the *last* sync object needs to be know by the bo subsystem. Keeping
>> track of multiple fences generates a lot of completely unnecessary code in
>> the ttm_bo_util file, the ttm_bo_vm file, and will be a nightmare if / when
>> we truly support pipelined moves.
>>
>> As I understand it from your patches, you want to keep multiple fences
>> around only to track rendering history. If we want to do that generically, i
>> suggest doing it in the execbuf util code in the following way:
>>
>> struct ttm_eu_rendering_history {
>>    void *last_read_sync_obj;
>>    void *last_read_sync_obj_arg;
>>    void *last_write_sync_obj;
>>    void *last_write_sync_obj_arg;
>> }
>>
>> Embed this structure in the radeon_bo, and build a small api around it,
>> including *optionally* passing it to the existing execbuf utilities, and you
>> should be done. The bo_util code and bo_vm code doesn't care about the
>> rendering history. Only that the bo is completely idle.
>>
>> Note also that when an accelerated bo move is scheduled, the driver needs to
>> update this struct
>>
>> /Thomas
>
> I should have look at the patch long ago ... anyway i think a better
> approach would be to expose fence id as 64bits unsigned to each
> userspace client. I was thinking of mapping a page readonly (same page
> as the one gpu write back) at somespecific offset in drm file (bit
> like sarea but readonly so no lock). Each time userspace submit a
> command stream it would get the fence id associated with the command
> stream. It would then be up to userspace to track btw read or write
> and do appropriate things. The kernel code would be simple (biggest
> issue is finding an offset we can use for that), it would be fast as
> no round trip to kernel to know the last fence.
>
> Each fence seq would be valid only for a specific ring (only future
> gpu impacted here, maybe cayman).
>
> So no change to ttm, just change to radeon to return fence seq and to
> move to an unsigned 64. Issue would be when gpu write back is
> disabled, then we would either need userspace to call somethings like
> bo wait or to other ioctl to get the kernel to update the copy, copy
> would be updated in the irq handler too so at least it get updated
> anytime something enable irq.
>
> Cheers,
> Jerome
>

Forget to mention that we would need a new wait_fence ioctl to wait
for a specific fence seq on a specific ring

Cheers,
Jerome
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07 13:24           ` Alex Deucher
@ 2011-10-07 14:05             ` Thomas Hellstrom
  2011-10-07 14:14               ` Alex Deucher
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-07 14:05 UTC (permalink / raw)
  To: Alex Deucher; +Cc: dri-devel

On 10/07/2011 03:24 PM, Alex Deucher wrote:
> On Fri, Oct 7, 2011 at 4:00 AM, Thomas Hellstrom<thomas@shipmail.org>  wrote:
>    
>> On 10/07/2011 12:42 AM, Marek Olšák wrote:
>>      
>>> On Wed, Oct 5, 2011 at 7:54 AM, Thomas Hellstrom<thomas@shipmail.org>
>>>   wrote:
>>>
>>>        
>>>> In any case, I'm not saying fences is the best way to flush but since the
>>>> bo
>>>> code assumes that signaling a sync object means "make the buffer contents
>>>> available for CPU read / write", it's usually a good way to do it;
>>>> there's
>>>> even a sync_obj_flush() method that gets called when a potential flush
>>>> needs
>>>> to happen.
>>>>
>>>>          
>>> I don't think we use it like that. To my knowledge, the purpose of the
>>> sync obj (to Radeon Gallium drivers at least) is to be able to wait
>>> for the last use of a buffer. Whether the contents can or cannot be
>>> available to the CPU is totally irrelevant.
>>>
>>> Currently (and it's a very important performance optimization),
>>> buffers stay mapped and available for CPU read/write during their
>>> first map_buffer call. Unmap_buffer is a no-op. The unmapping happens
>>> on buffer destruction. We only call bo_wait when we want to wait for
>>> the GPU until it's done with the buffer (we don't always want that,
>>> sometimes we want to use the unsychronized flag). Otherwise the
>>> contents of buffers are available at *any time*.
>>>
>>> We could probably implement bo_wait privately in the kernel driver and
>>> not use ttm_bo_wait. I preferred code sharing though.
>>>
>>> Textures (especially the tiled ones) are never mapped directly and a
>>> temporary staging resource is used instead, so we don't actually
>>> pollute address space that much. (in case you would have such a
>>> remark) We will use staging resources for buffers too, but it's really
>>> the last resort to avoid waiting when direct access can't be used.
>>>
>>>
>>>
>>>        
>>>>>> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's the
>>>>>> difference between those two? I think we should remove the
>>>>>> write_sync_obj
>>>>>> bo
>>>>>> member.
>>>>>>
>>>>>>
>>>>>>              
>>>>> Okay, but I think we should remove sync_obj instead, and keep write
>>>>> and read sync objs. In the case of READWRITE usage, read_sync_obj
>>>>> would be equal to write_sync_obj.
>>>>>
>>>>>
>>>>>
>>>>>            
>>>> Sure, I'm fine with that.
>>>>
>>>> One other thing, though, that makes me a little puzzled:
>>>>
>>>> Let's assume you don't allow readers and writers at the same time, which
>>>> is
>>>> my perception of how read- and write fences should work; you either have
>>>> a
>>>> list of read fences or a single write fence (in the same way a read-write
>>>> lock works).
>>>>
>>>> Now, if you only allow a single read fence, like in this patch. That
>>>> implies
>>>> that you can only have either a single read fence or a single write fence
>>>> at
>>>> any one time. We'd only need a single fence pointer on the bo, and
>>>> sync_obj_arg would tell us whether to signal the fence for read or for
>>>> write
>>>> (assuming that sync_obj_arg was set up to indicate read / write at
>>>> validation time), then the patch really isn't necessary at all, as it
>>>> only
>>>> allows a single read fence?
>>>>
>>>> Or is it that you want to allow read- and write fences co-existing? In
>>>> that
>>>> case, what's the use case?
>>>>
>>>>          
>>> There are lots of read-write use cases which don't need any barriers
>>> or flushing. The obvious ones are color blending and depth-stencil
>>> buffering. The OpenGL application is also allowed to use a subrange of
>>> a buffer as a vertex buffer (read-only) and another disjoint subrange
>>> of the same buffer for transform feedback (write-only), which kinda
>>> makes me think about whether we should track subranges instead of
>>> treating a whole buffer as "busy". It gets even more funky with
>>> ARB_shader_image_load_store, which supports atomic read-modify-write
>>> operations on textures, not to mention atomic memory operations in
>>> compute shaders (wait, isn't that also exposed in GL as
>>> GL_ARB_shader_atomic_counters?).
>>>
>>> I was thinking whether the two sync objs should be "read" and
>>> "readwrite", or "read" and "write". I chose the latter, because it's
>>> more fine-grained and we have to keep at least two of them around
>>> anyway.
>>>
>>> So now that you know what we use sync objs for, what are your ideas on
>>> re-implementing that patch in a way that is okay with you? Besides
>>> removing the third sync objs of course.
>>>
>>> Marek
>>>
>>>        
>> OK. First I think we need to make a distinction: bo sync objects vs driver
>> fences. The bo sync obj api is there to strictly provide functionality that
>> the ttm bo subsystem is using, and that follows a simple set of rules:
>>
>> 1) the bo subsystem does never assume sync objects are ordered. That means
>> the bo subsystem needs to wait on a sync object before removing it from a
>> buffer. Any other assumption is buggy and must be fixed. BUT, if that
>> assumption takes place in the driver unknowingly from the ttm bo subsystem
>> (which is usually the case), it's OK.
>>
>> 2) When the sync object(s) attached to the bo are signaled the ttm bo
>> subsystem is free to copy the bo contents and to unbind the bo.
>>
>> 3) The ttm bo system allows sync objects to be signaled in different ways
>> opaque to the subsystem using sync_obj_arg. The driver is responsible for
>> setting up that argument.
>>
>> 4) Driver fences may be used for or expose other functionality or adaptions
>> to APIs as long as the sync obj api exported to the bo sybsystem follows the
>> above rules.
>>
>> This means the following w r t the patch.
>>
>> A) it violates 1). This is a bug that must be fixed. Assumptions that if one
>> sync object is singnaled, another sync object is also signaled must be done
>> in the driver and not in the bo subsystem. Hence we need to explicitly wait
>> for a fence to remove it from the bo.
>>
>> B) the sync_obj_arg carries *per-sync-obj* information on how it should be
>> signaled. If we need to attach multiple sync objects to a buffer object, we
>> also need multiple sync_obj_args. This is a bug and needs to be fixed.
>>
>> C) There is really only one reason that the ttm bo subsystem should care
>> about multiple sync objects, and that is because the driver can't order them
>> efficiently. A such example would be hardware with multiple pipes reading
>> simultaneously from the same texture buffer. Currently we don't support this
>> so only the *last* sync object needs to be know by the bo subsystem. Keeping
>> track of multiple fences generates a lot of completely unnecessary code in
>> the ttm_bo_util file, the ttm_bo_vm file, and will be a nightmare if / when
>> we truly support pipelined moves.
>>      
> Just an FYI, cayman GPUs have multiple rings and we will be working to
> support them in the coming months.
>
> Alex
>
>    

Right. Nvidia has had it for a long time, but I think they use barriers 
("Semaphores") to order
sync objects when needed, so that only the last sync object is attached 
to the buffer.

If you need to / want to support simultaneous ring access to a buffer 
without such barriers,
we should change the single buffer object fence pointer to a list of 
pointers to fence objects.

/Thomas




_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07 13:38           ` Jerome Glisse
  2011-10-07 13:42             ` Jerome Glisse
@ 2011-10-07 14:09             ` Thomas Hellstrom
  2011-10-07 21:30             ` Marek Olšák
  2 siblings, 0 replies; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-07 14:09 UTC (permalink / raw)
  To: Jerome Glisse; +Cc: dri-devel

On 10/07/2011 03:38 PM, Jerome Glisse wrote:
> On Fri, Oct 7, 2011 at 4:00 AM, Thomas Hellstrom<thomas@shipmail.org>  wrote:
>    
>> On 10/07/2011 12:42 AM, Marek Olšák wrote:
>>      
>>> On Wed, Oct 5, 2011 at 7:54 AM, Thomas Hellstrom<thomas@shipmail.org>
>>>   wrote:
>>>
>>>        
>>>> In any case, I'm not saying fences is the best way to flush but since the
>>>> bo
>>>> code assumes that signaling a sync object means "make the buffer contents
>>>> available for CPU read / write", it's usually a good way to do it;
>>>> there's
>>>> even a sync_obj_flush() method that gets called when a potential flush
>>>> needs
>>>> to happen.
>>>>
>>>>          
>>> I don't think we use it like that. To my knowledge, the purpose of the
>>> sync obj (to Radeon Gallium drivers at least) is to be able to wait
>>> for the last use of a buffer. Whether the contents can or cannot be
>>> available to the CPU is totally irrelevant.
>>>
>>> Currently (and it's a very important performance optimization),
>>> buffers stay mapped and available for CPU read/write during their
>>> first map_buffer call. Unmap_buffer is a no-op. The unmapping happens
>>> on buffer destruction. We only call bo_wait when we want to wait for
>>> the GPU until it's done with the buffer (we don't always want that,
>>> sometimes we want to use the unsychronized flag). Otherwise the
>>> contents of buffers are available at *any time*.
>>>
>>> We could probably implement bo_wait privately in the kernel driver and
>>> not use ttm_bo_wait. I preferred code sharing though.
>>>
>>> Textures (especially the tiled ones) are never mapped directly and a
>>> temporary staging resource is used instead, so we don't actually
>>> pollute address space that much. (in case you would have such a
>>> remark) We will use staging resources for buffers too, but it's really
>>> the last resort to avoid waiting when direct access can't be used.
>>>
>>>
>>>
>>>        
>>>>>> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's the
>>>>>> difference between those two? I think we should remove the
>>>>>> write_sync_obj
>>>>>> bo
>>>>>> member.
>>>>>>
>>>>>>
>>>>>>              
>>>>> Okay, but I think we should remove sync_obj instead, and keep write
>>>>> and read sync objs. In the case of READWRITE usage, read_sync_obj
>>>>> would be equal to write_sync_obj.
>>>>>
>>>>>
>>>>>
>>>>>            
>>>> Sure, I'm fine with that.
>>>>
>>>> One other thing, though, that makes me a little puzzled:
>>>>
>>>> Let's assume you don't allow readers and writers at the same time, which
>>>> is
>>>> my perception of how read- and write fences should work; you either have
>>>> a
>>>> list of read fences or a single write fence (in the same way a read-write
>>>> lock works).
>>>>
>>>> Now, if you only allow a single read fence, like in this patch. That
>>>> implies
>>>> that you can only have either a single read fence or a single write fence
>>>> at
>>>> any one time. We'd only need a single fence pointer on the bo, and
>>>> sync_obj_arg would tell us whether to signal the fence for read or for
>>>> write
>>>> (assuming that sync_obj_arg was set up to indicate read / write at
>>>> validation time), then the patch really isn't necessary at all, as it
>>>> only
>>>> allows a single read fence?
>>>>
>>>> Or is it that you want to allow read- and write fences co-existing? In
>>>> that
>>>> case, what's the use case?
>>>>
>>>>          
>>> There are lots of read-write use cases which don't need any barriers
>>> or flushing. The obvious ones are color blending and depth-stencil
>>> buffering. The OpenGL application is also allowed to use a subrange of
>>> a buffer as a vertex buffer (read-only) and another disjoint subrange
>>> of the same buffer for transform feedback (write-only), which kinda
>>> makes me think about whether we should track subranges instead of
>>> treating a whole buffer as "busy". It gets even more funky with
>>> ARB_shader_image_load_store, which supports atomic read-modify-write
>>> operations on textures, not to mention atomic memory operations in
>>> compute shaders (wait, isn't that also exposed in GL as
>>> GL_ARB_shader_atomic_counters?).
>>>
>>> I was thinking whether the two sync objs should be "read" and
>>> "readwrite", or "read" and "write". I chose the latter, because it's
>>> more fine-grained and we have to keep at least two of them around
>>> anyway.
>>>
>>> So now that you know what we use sync objs for, what are your ideas on
>>> re-implementing that patch in a way that is okay with you? Besides
>>> removing the third sync objs of course.
>>>
>>> Marek
>>>
>>>        
>> OK. First I think we need to make a distinction: bo sync objects vs driver
>> fences. The bo sync obj api is there to strictly provide functionality that
>> the ttm bo subsystem is using, and that follows a simple set of rules:
>>
>> 1) the bo subsystem does never assume sync objects are ordered. That means
>> the bo subsystem needs to wait on a sync object before removing it from a
>> buffer. Any other assumption is buggy and must be fixed. BUT, if that
>> assumption takes place in the driver unknowingly from the ttm bo subsystem
>> (which is usually the case), it's OK.
>>
>> 2) When the sync object(s) attached to the bo are signaled the ttm bo
>> subsystem is free to copy the bo contents and to unbind the bo.
>>
>> 3) The ttm bo system allows sync objects to be signaled in different ways
>> opaque to the subsystem using sync_obj_arg. The driver is responsible for
>> setting up that argument.
>>
>> 4) Driver fences may be used for or expose other functionality or adaptions
>> to APIs as long as the sync obj api exported to the bo sybsystem follows the
>> above rules.
>>
>> This means the following w r t the patch.
>>
>> A) it violates 1). This is a bug that must be fixed. Assumptions that if one
>> sync object is singnaled, another sync object is also signaled must be done
>> in the driver and not in the bo subsystem. Hence we need to explicitly wait
>> for a fence to remove it from the bo.
>>
>> B) the sync_obj_arg carries *per-sync-obj* information on how it should be
>> signaled. If we need to attach multiple sync objects to a buffer object, we
>> also need multiple sync_obj_args. This is a bug and needs to be fixed.
>>
>> C) There is really only one reason that the ttm bo subsystem should care
>> about multiple sync objects, and that is because the driver can't order them
>> efficiently. A such example would be hardware with multiple pipes reading
>> simultaneously from the same texture buffer. Currently we don't support this
>> so only the *last* sync object needs to be know by the bo subsystem. Keeping
>> track of multiple fences generates a lot of completely unnecessary code in
>> the ttm_bo_util file, the ttm_bo_vm file, and will be a nightmare if / when
>> we truly support pipelined moves.
>>
>> As I understand it from your patches, you want to keep multiple fences
>> around only to track rendering history. If we want to do that generically, i
>> suggest doing it in the execbuf util code in the following way:
>>
>> struct ttm_eu_rendering_history {
>>     void *last_read_sync_obj;
>>     void *last_read_sync_obj_arg;
>>     void *last_write_sync_obj;
>>     void *last_write_sync_obj_arg;
>> }
>>
>> Embed this structure in the radeon_bo, and build a small api around it,
>> including *optionally* passing it to the existing execbuf utilities, and you
>> should be done. The bo_util code and bo_vm code doesn't care about the
>> rendering history. Only that the bo is completely idle.
>>
>> Note also that when an accelerated bo move is scheduled, the driver needs to
>> update this struct
>>
>> /Thomas
>>      
> I should have look at the patch long ago ... anyway i think a better
> approach would be to expose fence id as 64bits unsigned to each
> userspace client. I was thinking of mapping a page readonly (same page
> as the one gpu write back) at somespecific offset in drm file (bit
> like sarea but readonly so no lock). Each time userspace submit a
> command stream it would get the fence id associated with the command
> stream. It would then be up to userspace to track btw read or write
> and do appropriate things. The kernel code would be simple (biggest
> issue is finding an offset we can use for that), it would be fast as
> no round trip to kernel to know the last fence.
>
> Each fence seq would be valid only for a specific ring (only future
> gpu impacted here, maybe cayman).
>
> So no change to ttm, just change to radeon to return fence seq and to
> move to an unsigned 64. Issue would be when gpu write back is
> disabled, then we would either need userspace to call somethings like
> bo wait or to other ioctl to get the kernel to update the copy, copy
> would be updated in the irq handler too so at least it get updated
> anytime something enable irq.
>
> Cheers,
> Jerome
>    

That also sounds like a feasible / good way.
Note that the driver determines the mapping offset of TTM address space. 
You can
just adjust that somewhat upwards and you'll have free space for an ro page.

/Thomas



_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07 14:05             ` Thomas Hellstrom
@ 2011-10-07 14:14               ` Alex Deucher
  0 siblings, 0 replies; 31+ messages in thread
From: Alex Deucher @ 2011-10-07 14:14 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Fri, Oct 7, 2011 at 10:05 AM, Thomas Hellstrom <thomas@shipmail.org> wrote:
> On 10/07/2011 03:24 PM, Alex Deucher wrote:
>>
>> On Fri, Oct 7, 2011 at 4:00 AM, Thomas Hellstrom<thomas@shipmail.org>
>>  wrote:
>>
>>>
>>> On 10/07/2011 12:42 AM, Marek Olšák wrote:
>>>
>>>>
>>>> On Wed, Oct 5, 2011 at 7:54 AM, Thomas Hellstrom<thomas@shipmail.org>
>>>>  wrote:
>>>>
>>>>
>>>>>
>>>>> In any case, I'm not saying fences is the best way to flush but since
>>>>> the
>>>>> bo
>>>>> code assumes that signaling a sync object means "make the buffer
>>>>> contents
>>>>> available for CPU read / write", it's usually a good way to do it;
>>>>> there's
>>>>> even a sync_obj_flush() method that gets called when a potential flush
>>>>> needs
>>>>> to happen.
>>>>>
>>>>>
>>>>
>>>> I don't think we use it like that. To my knowledge, the purpose of the
>>>> sync obj (to Radeon Gallium drivers at least) is to be able to wait
>>>> for the last use of a buffer. Whether the contents can or cannot be
>>>> available to the CPU is totally irrelevant.
>>>>
>>>> Currently (and it's a very important performance optimization),
>>>> buffers stay mapped and available for CPU read/write during their
>>>> first map_buffer call. Unmap_buffer is a no-op. The unmapping happens
>>>> on buffer destruction. We only call bo_wait when we want to wait for
>>>> the GPU until it's done with the buffer (we don't always want that,
>>>> sometimes we want to use the unsychronized flag). Otherwise the
>>>> contents of buffers are available at *any time*.
>>>>
>>>> We could probably implement bo_wait privately in the kernel driver and
>>>> not use ttm_bo_wait. I preferred code sharing though.
>>>>
>>>> Textures (especially the tiled ones) are never mapped directly and a
>>>> temporary staging resource is used instead, so we don't actually
>>>> pollute address space that much. (in case you would have such a
>>>> remark) We will use staging resources for buffers too, but it's really
>>>> the last resort to avoid waiting when direct access can't be used.
>>>>
>>>>
>>>>
>>>>
>>>>>>>
>>>>>>> 2) Can't we say that a write_sync_obj is simply a sync_obj? What's
>>>>>>> the
>>>>>>> difference between those two? I think we should remove the
>>>>>>> write_sync_obj
>>>>>>> bo
>>>>>>> member.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> Okay, but I think we should remove sync_obj instead, and keep write
>>>>>> and read sync objs. In the case of READWRITE usage, read_sync_obj
>>>>>> would be equal to write_sync_obj.
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>
>>>>> Sure, I'm fine with that.
>>>>>
>>>>> One other thing, though, that makes me a little puzzled:
>>>>>
>>>>> Let's assume you don't allow readers and writers at the same time,
>>>>> which
>>>>> is
>>>>> my perception of how read- and write fences should work; you either
>>>>> have
>>>>> a
>>>>> list of read fences or a single write fence (in the same way a
>>>>> read-write
>>>>> lock works).
>>>>>
>>>>> Now, if you only allow a single read fence, like in this patch. That
>>>>> implies
>>>>> that you can only have either a single read fence or a single write
>>>>> fence
>>>>> at
>>>>> any one time. We'd only need a single fence pointer on the bo, and
>>>>> sync_obj_arg would tell us whether to signal the fence for read or for
>>>>> write
>>>>> (assuming that sync_obj_arg was set up to indicate read / write at
>>>>> validation time), then the patch really isn't necessary at all, as it
>>>>> only
>>>>> allows a single read fence?
>>>>>
>>>>> Or is it that you want to allow read- and write fences co-existing? In
>>>>> that
>>>>> case, what's the use case?
>>>>>
>>>>>
>>>>
>>>> There are lots of read-write use cases which don't need any barriers
>>>> or flushing. The obvious ones are color blending and depth-stencil
>>>> buffering. The OpenGL application is also allowed to use a subrange of
>>>> a buffer as a vertex buffer (read-only) and another disjoint subrange
>>>> of the same buffer for transform feedback (write-only), which kinda
>>>> makes me think about whether we should track subranges instead of
>>>> treating a whole buffer as "busy". It gets even more funky with
>>>> ARB_shader_image_load_store, which supports atomic read-modify-write
>>>> operations on textures, not to mention atomic memory operations in
>>>> compute shaders (wait, isn't that also exposed in GL as
>>>> GL_ARB_shader_atomic_counters?).
>>>>
>>>> I was thinking whether the two sync objs should be "read" and
>>>> "readwrite", or "read" and "write". I chose the latter, because it's
>>>> more fine-grained and we have to keep at least two of them around
>>>> anyway.
>>>>
>>>> So now that you know what we use sync objs for, what are your ideas on
>>>> re-implementing that patch in a way that is okay with you? Besides
>>>> removing the third sync objs of course.
>>>>
>>>> Marek
>>>>
>>>>
>>>
>>> OK. First I think we need to make a distinction: bo sync objects vs
>>> driver
>>> fences. The bo sync obj api is there to strictly provide functionality
>>> that
>>> the ttm bo subsystem is using, and that follows a simple set of rules:
>>>
>>> 1) the bo subsystem does never assume sync objects are ordered. That
>>> means
>>> the bo subsystem needs to wait on a sync object before removing it from a
>>> buffer. Any other assumption is buggy and must be fixed. BUT, if that
>>> assumption takes place in the driver unknowingly from the ttm bo
>>> subsystem
>>> (which is usually the case), it's OK.
>>>
>>> 2) When the sync object(s) attached to the bo are signaled the ttm bo
>>> subsystem is free to copy the bo contents and to unbind the bo.
>>>
>>> 3) The ttm bo system allows sync objects to be signaled in different ways
>>> opaque to the subsystem using sync_obj_arg. The driver is responsible for
>>> setting up that argument.
>>>
>>> 4) Driver fences may be used for or expose other functionality or
>>> adaptions
>>> to APIs as long as the sync obj api exported to the bo sybsystem follows
>>> the
>>> above rules.
>>>
>>> This means the following w r t the patch.
>>>
>>> A) it violates 1). This is a bug that must be fixed. Assumptions that if
>>> one
>>> sync object is singnaled, another sync object is also signaled must be
>>> done
>>> in the driver and not in the bo subsystem. Hence we need to explicitly
>>> wait
>>> for a fence to remove it from the bo.
>>>
>>> B) the sync_obj_arg carries *per-sync-obj* information on how it should
>>> be
>>> signaled. If we need to attach multiple sync objects to a buffer object,
>>> we
>>> also need multiple sync_obj_args. This is a bug and needs to be fixed.
>>>
>>> C) There is really only one reason that the ttm bo subsystem should care
>>> about multiple sync objects, and that is because the driver can't order
>>> them
>>> efficiently. A such example would be hardware with multiple pipes reading
>>> simultaneously from the same texture buffer. Currently we don't support
>>> this
>>> so only the *last* sync object needs to be know by the bo subsystem.
>>> Keeping
>>> track of multiple fences generates a lot of completely unnecessary code
>>> in
>>> the ttm_bo_util file, the ttm_bo_vm file, and will be a nightmare if /
>>> when
>>> we truly support pipelined moves.
>>>
>>
>> Just an FYI, cayman GPUs have multiple rings and we will be working to
>> support them in the coming months.
>>
>> Alex
>>
>>
>
> Right. Nvidia has had it for a long time, but I think they use barriers
> ("Semaphores") to order
> sync objects when needed, so that only the last sync object is attached to
> the buffer.
>
> If you need to / want to support simultaneous ring access to a buffer
> without such barriers,
> we should change the single buffer object fence pointer to a list of
> pointers to fence objects.

We were planning to use semaphores as well.

Alex

>
> /Thomas
>
>
>
>
>
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07 13:42             ` Jerome Glisse
@ 2011-10-07 21:07               ` Marek Olšák
  0 siblings, 0 replies; 31+ messages in thread
From: Marek Olšák @ 2011-10-07 21:07 UTC (permalink / raw)
  To: Jerome Glisse; +Cc: dri-devel

On Fri, Oct 7, 2011 at 3:42 PM, Jerome Glisse <j.glisse@gmail.com> wrote:
> Forget to mention that we would need a new wait_fence ioctl to wait
> for a specific fence seq on a specific ring

We need a wait_fence ioctl anyway and even with a timeout parameter.
It's a mandatory OpenGL feature. r600g uses a loop with sched_yield
right now, which is like the worst implementation of wait_fence we can
have. r300g uses bo_wait for timeout == infinite, and a loop with
usleep for timeout != infinite.

Marek

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07 13:38           ` Jerome Glisse
  2011-10-07 13:42             ` Jerome Glisse
  2011-10-07 14:09             ` Thomas Hellstrom
@ 2011-10-07 21:30             ` Marek Olšák
  2011-10-08  8:14               ` Thomas Hellstrom
  2 siblings, 1 reply; 31+ messages in thread
From: Marek Olšák @ 2011-10-07 21:30 UTC (permalink / raw)
  To: Jerome Glisse; +Cc: dri-devel

On Fri, Oct 7, 2011 at 3:38 PM, Jerome Glisse <j.glisse@gmail.com> wrote:
> I should have look at the patch long ago ... anyway i think a better
> approach would be to expose fence id as 64bits unsigned to each
> userspace client. I was thinking of mapping a page readonly (same page
> as the one gpu write back) at somespecific offset in drm file (bit
> like sarea but readonly so no lock). Each time userspace submit a
> command stream it would get the fence id associated with the command
> stream. It would then be up to userspace to track btw read or write
> and do appropriate things. The kernel code would be simple (biggest
> issue is finding an offset we can use for that), it would be fast as
> no round trip to kernel to know the last fence.
>
> Each fence seq would be valid only for a specific ring (only future
> gpu impacted here, maybe cayman).
>
> So no change to ttm, just change to radeon to return fence seq and to
> move to an unsigned 64. Issue would be when gpu write back is
> disabled, then we would either need userspace to call somethings like
> bo wait or to other ioctl to get the kernel to update the copy, copy
> would be updated in the irq handler too so at least it get updated
> anytime something enable irq.

I am having a hard time understanding what you are saying.

Anyway, I had some read and write usage tracking in the radeon winsys.
That worked well for driver-private resources, but it was a total fail
for the ones shared with the DDX. I needed this bo_wait optimization
to work with multiple processes. That's the whole point why I am doing
this in the kernel.

Marek

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07  8:00         ` Thomas Hellstrom
  2011-10-07 13:24           ` Alex Deucher
  2011-10-07 13:38           ` Jerome Glisse
@ 2011-10-07 22:03           ` Marek Olšák
  2011-10-24 14:48             ` Thomas Hellstrom
  2 siblings, 1 reply; 31+ messages in thread
From: Marek Olšák @ 2011-10-07 22:03 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Fri, Oct 7, 2011 at 10:00 AM, Thomas Hellstrom <thomas@shipmail.org> wrote:
> OK. First I think we need to make a distinction: bo sync objects vs driver
> fences. The bo sync obj api is there to strictly provide functionality that
> the ttm bo subsystem is using, and that follows a simple set of rules:
>
> 1) the bo subsystem does never assume sync objects are ordered. That means
> the bo subsystem needs to wait on a sync object before removing it from a
> buffer. Any other assumption is buggy and must be fixed. BUT, if that
> assumption takes place in the driver unknowingly from the ttm bo subsystem
> (which is usually the case), it's OK.
>
> 2) When the sync object(s) attached to the bo are signaled the ttm bo
> subsystem is free to copy the bo contents and to unbind the bo.
>
> 3) The ttm bo system allows sync objects to be signaled in different ways
> opaque to the subsystem using sync_obj_arg. The driver is responsible for
> setting up that argument.
>
> 4) Driver fences may be used for or expose other functionality or adaptions
> to APIs as long as the sync obj api exported to the bo sybsystem follows the
> above rules.
>
> This means the following w r t the patch.
>
> A) it violates 1). This is a bug that must be fixed. Assumptions that if one
> sync object is singnaled, another sync object is also signaled must be done
> in the driver and not in the bo subsystem. Hence we need to explicitly wait
> for a fence to remove it from the bo.
>
> B) the sync_obj_arg carries *per-sync-obj* information on how it should be
> signaled. If we need to attach multiple sync objects to a buffer object, we
> also need multiple sync_obj_args. This is a bug and needs to be fixed.
>
> C) There is really only one reason that the ttm bo subsystem should care
> about multiple sync objects, and that is because the driver can't order them
> efficiently. A such example would be hardware with multiple pipes reading
> simultaneously from the same texture buffer. Currently we don't support this
> so only the *last* sync object needs to be know by the bo subsystem. Keeping
> track of multiple fences generates a lot of completely unnecessary code in
> the ttm_bo_util file, the ttm_bo_vm file, and will be a nightmare if / when
> we truly support pipelined moves.
>
> As I understand it from your patches, you want to keep multiple fences
> around only to track rendering history. If we want to do that generically, i
> suggest doing it in the execbuf util code in the following way:
>
> struct ttm_eu_rendering_history {
>    void *last_read_sync_obj;
>    void *last_read_sync_obj_arg;
>    void *last_write_sync_obj;
>    void *last_write_sync_obj_arg;
> }
>
> Embed this structure in the radeon_bo, and build a small api around it,
> including *optionally* passing it to the existing execbuf utilities, and you
> should be done. The bo_util code and bo_vm code doesn't care about the
> rendering history. Only that the bo is completely idle.
>
> Note also that when an accelerated bo move is scheduled, the driver needs to
> update this struct

OK, sounds good. I'll fix what should be fixed and send a patch when
it's ready. I am now not so sure whether doing this generically is a
good idea. :)

Marek
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07 21:30             ` Marek Olšák
@ 2011-10-08  8:14               ` Thomas Hellstrom
  0 siblings, 0 replies; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-08  8:14 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

On 10/07/2011 11:30 PM, Marek Olšák wrote:
> On Fri, Oct 7, 2011 at 3:38 PM, Jerome Glisse<j.glisse@gmail.com>  wrote:
>    
>> I should have look at the patch long ago ... anyway i think a better
>> approach would be to expose fence id as 64bits unsigned to each
>> userspace client. I was thinking of mapping a page readonly (same page
>> as the one gpu write back) at somespecific offset in drm file (bit
>> like sarea but readonly so no lock). Each time userspace submit a
>> command stream it would get the fence id associated with the command
>> stream. It would then be up to userspace to track btw read or write
>> and do appropriate things. The kernel code would be simple (biggest
>> issue is finding an offset we can use for that), it would be fast as
>> no round trip to kernel to know the last fence.
>>
>> Each fence seq would be valid only for a specific ring (only future
>> gpu impacted here, maybe cayman).
>>
>> So no change to ttm, just change to radeon to return fence seq and to
>> move to an unsigned 64. Issue would be when gpu write back is
>> disabled, then we would either need userspace to call somethings like
>> bo wait or to other ioctl to get the kernel to update the copy, copy
>> would be updated in the irq handler too so at least it get updated
>> anytime something enable irq.
>>      
> I am having a hard time understanding what you are saying.
>
> Anyway, I had some read and write usage tracking in the radeon winsys.
> That worked well for driver-private resources, but it was a total fail
> for the ones shared with the DDX. I needed this bo_wait optimization
> to work with multiple processes. That's the whole point why I am doing
> this in the kernel.
>
> Marek
>    
At one XDS meeting in Cambridge an IMHO questionable decision was taken to
try to keep synchronization operations like this in user-space, 
communicating necessary
info among involved components. In this case you'd then need to send 
fence sequences
down the DRI2 protocol to the winsys.

However, if you at one point want to do user-space suballocation of 
kernel buffers,
that's something you need to do anyway, because the kernel is not aware 
that user-space fences
the suballocations separately.

/Thomas



_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07  8:58 ` Thomas Hellstrom
@ 2011-10-08 10:26   ` Ville Syrjälä
  2011-10-08 11:10     ` Thomas Hellstrom
  0 siblings, 1 reply; 31+ messages in thread
From: Ville Syrjälä @ 2011-10-08 10:26 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Fri, Oct 07, 2011 at 10:58:13AM +0200, Thomas Hellstrom wrote:
> Oh, and one more style comment below:
> 
> On 08/07/2011 10:39 PM, Marek Olšák wrote:
> >
> > +enum ttm_buffer_usage {
> > +    TTM_USAGE_READ = 1,
> > +    TTM_USAGE_WRITE = 2,
> > +    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
> > +};
> >
> >    
> 
> Please don't use enums for bit operations.

Now I'm curious. Why not?

-- 
Ville Syrjälä
syrjala@sci.fi
http://www.sci.fi/~syrjala/
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-08 10:26   ` Ville Syrjälä
@ 2011-10-08 11:10     ` Thomas Hellstrom
  2011-10-08 11:27       ` Ville Syrjälä
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-08 11:10 UTC (permalink / raw)
  To: Marek Olšák, dri-devel

On 10/08/2011 12:26 PM, Ville Syrjälä wrote:
> On Fri, Oct 07, 2011 at 10:58:13AM +0200, Thomas Hellstrom wrote:
>    
>> Oh, and one more style comment below:
>>
>> On 08/07/2011 10:39 PM, Marek Olšák wrote:
>>      
>>> +enum ttm_buffer_usage {
>>> +    TTM_USAGE_READ = 1,
>>> +    TTM_USAGE_WRITE = 2,
>>> +    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
>>> +};
>>>
>>>
>>>        
>> Please don't use enums for bit operations.
>>      
> Now I'm curious. Why not?
>
>    
Because it's inconsistent with how flags are defined in the rest of the 
TTM module.

/Thomas


_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-08 11:10     ` Thomas Hellstrom
@ 2011-10-08 11:27       ` Ville Syrjälä
  2011-10-08 11:32         ` Thomas Hellstrom
  0 siblings, 1 reply; 31+ messages in thread
From: Ville Syrjälä @ 2011-10-08 11:27 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Sat, Oct 08, 2011 at 01:10:13PM +0200, Thomas Hellstrom wrote:
> On 10/08/2011 12:26 PM, Ville Syrjälä wrote:
> > On Fri, Oct 07, 2011 at 10:58:13AM +0200, Thomas Hellstrom wrote:
> >    
> >> Oh, and one more style comment below:
> >>
> >> On 08/07/2011 10:39 PM, Marek Olšák wrote:
> >>      
> >>> +enum ttm_buffer_usage {
> >>> +    TTM_USAGE_READ = 1,
> >>> +    TTM_USAGE_WRITE = 2,
> >>> +    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
> >>> +};
> >>>
> >>>
> >>>        
> >> Please don't use enums for bit operations.
> >>      
> > Now I'm curious. Why not?
> >
> >    
> Because it's inconsistent with how flags are defined in the rest of the 
> TTM module.

Ah OK. I was wondering if there's some subtle technical issue involved.
I've recently gotten to the habit of using enums for pretty much all
constants. Just easier on the eye IMHO, and avoids cpp output from
looking like number soup.

-- 
Ville Syrjälä
syrjala@sci.fi
http://www.sci.fi/~syrjala/
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-08 11:27       ` Ville Syrjälä
@ 2011-10-08 11:32         ` Thomas Hellstrom
  2011-10-24 16:42           ` Marek Olšák
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-08 11:32 UTC (permalink / raw)
  To: Marek Olšák, dri-devel

On 10/08/2011 01:27 PM, Ville Syrjälä wrote:
> On Sat, Oct 08, 2011 at 01:10:13PM +0200, Thomas Hellstrom wrote:
>    
>> On 10/08/2011 12:26 PM, Ville Syrjälä wrote:
>>      
>>> On Fri, Oct 07, 2011 at 10:58:13AM +0200, Thomas Hellstrom wrote:
>>>
>>>        
>>>> Oh, and one more style comment below:
>>>>
>>>> On 08/07/2011 10:39 PM, Marek Olšák wrote:
>>>>
>>>>          
>>>>> +enum ttm_buffer_usage {
>>>>> +    TTM_USAGE_READ = 1,
>>>>> +    TTM_USAGE_WRITE = 2,
>>>>> +    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
>>>>> +};
>>>>>
>>>>>
>>>>>
>>>>>            
>>>> Please don't use enums for bit operations.
>>>>
>>>>          
>>> Now I'm curious. Why not?
>>>
>>>
>>>        
>> Because it's inconsistent with how flags are defined in the rest of the
>> TTM module.
>>      
> Ah OK. I was wondering if there's some subtle technical issue involved.
> I've recently gotten to the habit of using enums for pretty much all
> constants. Just easier on the eye IMHO, and avoids cpp output from
> looking like number soup.
>
>    

Yes, there are a number of advantages, including symbolic debugger output.
If we had flag enums that enumerated 1, 2, 4, 8 etc. I'd feel motivated 
to move
all TTM definitions over.

/Thomas



_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-07 22:03           ` Marek Olšák
@ 2011-10-24 14:48             ` Thomas Hellstrom
  2011-10-24 17:10               ` Marek Olšák
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-24 14:48 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

On 10/08/2011 12:03 AM, Marek Olšák wrote:
> On Fri, Oct 7, 2011 at 10:00 AM, Thomas Hellstrom<thomas@shipmail.org>  wrote:
>    
>> OK. First I think we need to make a distinction: bo sync objects vs driver
>> fences. The bo sync obj api is there to strictly provide functionality that
>> the ttm bo subsystem is using, and that follows a simple set of rules:
>>
>> 1) the bo subsystem does never assume sync objects are ordered. That means
>> the bo subsystem needs to wait on a sync object before removing it from a
>> buffer. Any other assumption is buggy and must be fixed. BUT, if that
>> assumption takes place in the driver unknowingly from the ttm bo subsystem
>> (which is usually the case), it's OK.
>>
>> 2) When the sync object(s) attached to the bo are signaled the ttm bo
>> subsystem is free to copy the bo contents and to unbind the bo.
>>
>> 3) The ttm bo system allows sync objects to be signaled in different ways
>> opaque to the subsystem using sync_obj_arg. The driver is responsible for
>> setting up that argument.
>>
>> 4) Driver fences may be used for or expose other functionality or adaptions
>> to APIs as long as the sync obj api exported to the bo sybsystem follows the
>> above rules.
>>
>> This means the following w r t the patch.
>>
>> A) it violates 1). This is a bug that must be fixed. Assumptions that if one
>> sync object is singnaled, another sync object is also signaled must be done
>> in the driver and not in the bo subsystem. Hence we need to explicitly wait
>> for a fence to remove it from the bo.
>>
>> B) the sync_obj_arg carries *per-sync-obj* information on how it should be
>> signaled. If we need to attach multiple sync objects to a buffer object, we
>> also need multiple sync_obj_args. This is a bug and needs to be fixed.
>>
>> C) There is really only one reason that the ttm bo subsystem should care
>> about multiple sync objects, and that is because the driver can't order them
>> efficiently. A such example would be hardware with multiple pipes reading
>> simultaneously from the same texture buffer. Currently we don't support this
>> so only the *last* sync object needs to be know by the bo subsystem. Keeping
>> track of multiple fences generates a lot of completely unnecessary code in
>> the ttm_bo_util file, the ttm_bo_vm file, and will be a nightmare if / when
>> we truly support pipelined moves.
>>
>> As I understand it from your patches, you want to keep multiple fences
>> around only to track rendering history. If we want to do that generically, i
>> suggest doing it in the execbuf util code in the following way:
>>
>> struct ttm_eu_rendering_history {
>>     void *last_read_sync_obj;
>>     void *last_read_sync_obj_arg;
>>     void *last_write_sync_obj;
>>     void *last_write_sync_obj_arg;
>> }
>>
>> Embed this structure in the radeon_bo, and build a small api around it,
>> including *optionally* passing it to the existing execbuf utilities, and you
>> should be done. The bo_util code and bo_vm code doesn't care about the
>> rendering history. Only that the bo is completely idle.
>>
>> Note also that when an accelerated bo move is scheduled, the driver needs to
>> update this struct
>>      
> OK, sounds good. I'll fix what should be fixed and send a patch when
> it's ready. I am now not so sure whether doing this generically is a
> good idea. :)
>
> Marek
>    

Marek,
Any progress on this. The merge window is about to open soon I guess and 
we need a fix by then.

/Thomas



_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-08 11:32         ` Thomas Hellstrom
@ 2011-10-24 16:42           ` Marek Olšák
  2011-10-24 16:47             ` Thomas Hellstrom
  0 siblings, 1 reply; 31+ messages in thread
From: Marek Olšák @ 2011-10-24 16:42 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

On Sat, Oct 8, 2011 at 1:32 PM, Thomas Hellstrom <thomas@shipmail.org> wrote:
> On 10/08/2011 01:27 PM, Ville Syrjälä wrote:
>>
>> On Sat, Oct 08, 2011 at 01:10:13PM +0200, Thomas Hellstrom wrote:
>>
>>>
>>> On 10/08/2011 12:26 PM, Ville Syrjälä wrote:
>>>
>>>>
>>>> On Fri, Oct 07, 2011 at 10:58:13AM +0200, Thomas Hellstrom wrote:
>>>>
>>>>
>>>>>
>>>>> Oh, and one more style comment below:
>>>>>
>>>>> On 08/07/2011 10:39 PM, Marek Olšák wrote:
>>>>>
>>>>>
>>>>>>
>>>>>> +enum ttm_buffer_usage {
>>>>>> +    TTM_USAGE_READ = 1,
>>>>>> +    TTM_USAGE_WRITE = 2,
>>>>>> +    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
>>>>>> +};
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>
>>>>> Please don't use enums for bit operations.
>>>>>
>>>>>
>>>>
>>>> Now I'm curious. Why not?
>>>>
>>>>
>>>>
>>>
>>> Because it's inconsistent with how flags are defined in the rest of the
>>> TTM module.
>>>
>>
>> Ah OK. I was wondering if there's some subtle technical issue involved.
>> I've recently gotten to the habit of using enums for pretty much all
>> constants. Just easier on the eye IMHO, and avoids cpp output from
>> looking like number soup.
>>
>>
>
> Yes, there are a number of advantages, including symbolic debugger output.
> If we had flag enums that enumerated 1, 2, 4, 8 etc. I'd feel motivated to
> move
> all TTM definitions over.

I don't think that how it is enumerated matters in any way. What I
like about enums, besides what has already been mentioned, is that it
adds a self-documentation in the code. Compare:

void ttm_set_bo_flags(unsigned flags);

And:

void ttm_set_bo_flags(enum ttm_bo_flags flags);

The latter is way easier to understand for somebody who doesn't know
the code and wants to implement his first patch. With the latter, it's
clear at first glance what are the valid values for "flags", because
you can just search for "enum ttm_bo_flags".

I will change the enum to defines for the sake of following your code
style convention, but it's an unreasonable convention to say the
least.

Marek
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-24 16:42           ` Marek Olšák
@ 2011-10-24 16:47             ` Thomas Hellstrom
  0 siblings, 0 replies; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-24 16:47 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

On 10/24/2011 06:42 PM, Marek Olšák wrote:
> On Sat, Oct 8, 2011 at 1:32 PM, Thomas Hellstrom<thomas@shipmail.org>  wrote:
>    
>> On 10/08/2011 01:27 PM, Ville Syrjälä wrote:
>>      
>>> On Sat, Oct 08, 2011 at 01:10:13PM +0200, Thomas Hellstrom wrote:
>>>
>>>        
>>>> On 10/08/2011 12:26 PM, Ville Syrjälä wrote:
>>>>
>>>>          
>>>>> On Fri, Oct 07, 2011 at 10:58:13AM +0200, Thomas Hellstrom wrote:
>>>>>
>>>>>
>>>>>            
>>>>>> Oh, and one more style comment below:
>>>>>>
>>>>>> On 08/07/2011 10:39 PM, Marek Olšák wrote:
>>>>>>
>>>>>>
>>>>>>              
>>>>>>> +enum ttm_buffer_usage {
>>>>>>> +    TTM_USAGE_READ = 1,
>>>>>>> +    TTM_USAGE_WRITE = 2,
>>>>>>> +    TTM_USAGE_READWRITE = TTM_USAGE_READ | TTM_USAGE_WRITE
>>>>>>> +};
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>                
>>>>>> Please don't use enums for bit operations.
>>>>>>
>>>>>>
>>>>>>              
>>>>> Now I'm curious. Why not?
>>>>>
>>>>>
>>>>>
>>>>>            
>>>> Because it's inconsistent with how flags are defined in the rest of the
>>>> TTM module.
>>>>
>>>>          
>>> Ah OK. I was wondering if there's some subtle technical issue involved.
>>> I've recently gotten to the habit of using enums for pretty much all
>>> constants. Just easier on the eye IMHO, and avoids cpp output from
>>> looking like number soup.
>>>
>>>
>>>        
>> Yes, there are a number of advantages, including symbolic debugger output.
>> If we had flag enums that enumerated 1, 2, 4, 8 etc. I'd feel motivated to
>> move
>> all TTM definitions over.
>>      
> I don't think that how it is enumerated matters in any way. What I
> like about enums, besides what has already been mentioned, is that it
> adds a self-documentation in the code. Compare:
>
> void ttm_set_bo_flags(unsigned flags);
>
> And:
>
> void ttm_set_bo_flags(enum ttm_bo_flags flags);
>
> The latter is way easier to understand for somebody who doesn't know
> the code and wants to implement his first patch. With the latter, it's
> clear at first glance what are the valid values for "flags", because
> you can just search for "enum ttm_bo_flags".
>
> I will change the enum to defines for the sake of following your code
> style convention, but it's an unreasonable convention to say the
> least.
>
> Marek
>    

I'm not going to argue against this, because you're probably right.

The important thing is that we get the fix in with or without enums.

Thanks,
Thomas



_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-24 14:48             ` Thomas Hellstrom
@ 2011-10-24 17:10               ` Marek Olšák
  2011-10-24 17:28                 ` Thomas Hellstrom
  0 siblings, 1 reply; 31+ messages in thread
From: Marek Olšák @ 2011-10-24 17:10 UTC (permalink / raw)
  To: Thomas Hellstrom; +Cc: dri-devel

Hi Thomas,

I have made no progress so far due to lack of time.

Would you mind if I fixed the most important things first, which are:
- sync objects are not ordered, (A)
- every sync object must have its corresponding sync_obj_arg, (B)
and if I fixed (C) some time later.

I planned on moving the two sync objects from ttm into radeon and not
using ttm_bo_wait from radeon (i.e. pretty much reimplementing what it
does), but it looks more complicated to me than I had originally
thought.

Marek

On Mon, Oct 24, 2011 at 4:48 PM, Thomas Hellstrom <thomas@shipmail.org> wrote:
>
> Marek,
> Any progress on this. The merge window is about to open soon I guess and we
> need a fix by then.
>
> /Thomas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-24 17:10               ` Marek Olšák
@ 2011-10-24 17:28                 ` Thomas Hellstrom
  2011-10-24 17:39                   ` Marek Olšák
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Hellstrom @ 2011-10-24 17:28 UTC (permalink / raw)
  To: Marek Olšák; +Cc: dri-devel

Marek,

The problem is that the patch adds a lot of complicated code where it's 
not needed, and I don't want to end up reverting that code and 
re-implementing the new Radeon gem ioctl by myself.

Having a list of two fence objects and waiting for either of them 
shouldn't be that complicated to implement, in particular when it's done 
in a driver-specific way and you have the benefit of assuming that they 
are ordered.

Since the new functionality is a performance improvement, If time is an 
issue, I suggest we back this change out and go for the next merge window.

/Thomas


On 10/24/2011 07:10 PM, Marek Olšák wrote:
> Hi Thomas,
>
> I have made no progress so far due to lack of time.
>
> Would you mind if I fixed the most important things first, which are:
> - sync objects are not ordered, (A)
> - every sync object must have its corresponding sync_obj_arg, (B)
> and if I fixed (C) some time later.
>
> I planned on moving the two sync objects from ttm into radeon and not
> using ttm_bo_wait from radeon (i.e. pretty much reimplementing what it
> does), but it looks more complicated to me than I had originally
> thought.
>
> Marek
>
> On Mon, Oct 24, 2011 at 4:48 PM, Thomas Hellstrom<thomas@shipmail.org>  wrote:
>    
>> Marek,
>> Any progress on this. The merge window is about to open soon I guess and we
>> need a fix by then.
>>
>> /Thomas
>>      



_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write
  2011-10-24 17:28                 ` Thomas Hellstrom
@ 2011-10-24 17:39                   ` Marek Olšák
  0 siblings, 0 replies; 31+ messages in thread
From: Marek Olšák @ 2011-10-24 17:39 UTC (permalink / raw)
  To: Thomas Hellstrom, Dave Airlie; +Cc: dri-devel

Alright then.

Dave, if you are reading this, feel free not to include the two
patches I sent you in the next pull request.

Marek

On Mon, Oct 24, 2011 at 7:28 PM, Thomas Hellstrom <thomas@shipmail.org> wrote:
> Marek,
>
> The problem is that the patch adds a lot of complicated code where it's not
> needed, and I don't want to end up reverting that code and re-implementing
> the new Radeon gem ioctl by myself.
>
> Having a list of two fence objects and waiting for either of them shouldn't
> be that complicated to implement, in particular when it's done in a
> driver-specific way and you have the benefit of assuming that they are
> ordered.
>
> Since the new functionality is a performance improvement, If time is an
> issue, I suggest we back this change out and go for the next merge window.
>
> /Thomas
>
>
> On 10/24/2011 07:10 PM, Marek Olšák wrote:
>>
>> Hi Thomas,
>>
>> I have made no progress so far due to lack of time.
>>
>> Would you mind if I fixed the most important things first, which are:
>> - sync objects are not ordered, (A)
>> - every sync object must have its corresponding sync_obj_arg, (B)
>> and if I fixed (C) some time later.
>>
>> I planned on moving the two sync objects from ttm into radeon and not
>> using ttm_bo_wait from radeon (i.e. pretty much reimplementing what it
>> does), but it looks more complicated to me than I had originally
>> thought.
>>
>> Marek
>>
>> On Mon, Oct 24, 2011 at 4:48 PM, Thomas Hellstrom<thomas@shipmail.org>
>>  wrote:
>>
>>>
>>> Marek,
>>> Any progress on this. The merge window is about to open soon I guess and
>>> we
>>> need a fix by then.
>>>
>>> /Thomas
>>>
>
>
>
>
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2011-10-24 17:40 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-08-07 20:39 [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write Marek Olšák
2011-08-07 20:39 ` [PATCH 2/2] drm/radeon/kms: add a new gem_wait ioctl with read/write flags Marek Olšák
2011-08-12 17:22   ` Jerome Glisse
2011-08-12 17:21 ` [PATCH 1/2] drm/ttm: add a way to bo_wait for either the last read or last write Jerome Glisse
2011-08-13 20:32   ` Marek Olšák
2011-10-04 11:48 ` Thomas Hellstrom
2011-10-05  2:08   ` Marek Olšák
2011-10-05  5:54     ` Thomas Hellstrom
2011-10-06 22:42       ` Marek Olšák
2011-10-07  8:00         ` Thomas Hellstrom
2011-10-07 13:24           ` Alex Deucher
2011-10-07 14:05             ` Thomas Hellstrom
2011-10-07 14:14               ` Alex Deucher
2011-10-07 13:38           ` Jerome Glisse
2011-10-07 13:42             ` Jerome Glisse
2011-10-07 21:07               ` Marek Olšák
2011-10-07 14:09             ` Thomas Hellstrom
2011-10-07 21:30             ` Marek Olšák
2011-10-08  8:14               ` Thomas Hellstrom
2011-10-07 22:03           ` Marek Olšák
2011-10-24 14:48             ` Thomas Hellstrom
2011-10-24 17:10               ` Marek Olšák
2011-10-24 17:28                 ` Thomas Hellstrom
2011-10-24 17:39                   ` Marek Olšák
2011-10-07  8:58 ` Thomas Hellstrom
2011-10-08 10:26   ` Ville Syrjälä
2011-10-08 11:10     ` Thomas Hellstrom
2011-10-08 11:27       ` Ville Syrjälä
2011-10-08 11:32         ` Thomas Hellstrom
2011-10-24 16:42           ` Marek Olšák
2011-10-24 16:47             ` Thomas Hellstrom

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.