* [PATCH] drm/ttm: remove fence_lock
@ 2012-10-12 15:23 Maarten Lankhorst
2012-10-18 7:28 ` Thomas Hellstrom
0 siblings, 1 reply; 15+ messages in thread
From: Maarten Lankhorst @ 2012-10-12 15:23 UTC (permalink / raw)
To: dri-devel
With the nouveau calls fixed there were only 2 places left that used
fence_lock without a reservation, those are fixed in this patch:
ttm_bo_cleanup_refs_or_queue is fixed by simply doing things the other
way around.
ttm_bo_cleanup_refs is fixed by taking a reservation first, then a pointer
to the fence object and backs off from the reservation if waiting has to
be performed.
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
---
This patch should be applied only after all the previous patches I sent are applied,
since it depends on sync_obj_arg removal (kinda, probably fails to apply otherwise),
uses ttm_bo_is_reserved, and depends on nouveau wait behavior being fixed.
drivers/gpu/drm/nouveau/nouveau_bo.c | 4 -
drivers/gpu/drm/nouveau/nouveau_gem.c | 15 +---
drivers/gpu/drm/radeon/radeon_object.c | 2 -
drivers/gpu/drm/ttm/ttm_bo.c | 124 ++++++++++++--------------------
drivers/gpu/drm/ttm/ttm_bo_util.c | 3 -
drivers/gpu/drm/ttm/ttm_bo_vm.c | 5 -
drivers/gpu/drm/ttm/ttm_execbuf_util.c | 2 -
drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 3 -
include/drm/ttm/ttm_bo_api.h | 12 +--
include/drm/ttm/ttm_bo_driver.h | 3 -
10 files changed, 52 insertions(+), 121 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index cee7448..9749c45 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1449,10 +1449,8 @@ nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence)
if (likely(fence))
nouveau_fence_ref(fence);
- spin_lock(&nvbo->bo.bdev->fence_lock);
old_fence = nvbo->bo.sync_obj;
nvbo->bo.sync_obj = fence;
- spin_unlock(&nvbo->bo.bdev->fence_lock);
nouveau_fence_unref(&old_fence);
}
@@ -1552,9 +1550,7 @@ nouveau_bo_vma_del(struct nouveau_bo *nvbo, struct nouveau_vma *vma)
if (vma->node) {
if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) {
ttm_bo_reserve(&nvbo->bo, false, false, false, 0);
- spin_lock(&nvbo->bo.bdev->fence_lock);
ttm_bo_wait(&nvbo->bo, false, false, false);
- spin_unlock(&nvbo->bo.bdev->fence_lock);
ttm_bo_unreserve(&nvbo->bo);
nouveau_vm_unmap(vma);
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 6d8391d..eaa700a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -391,18 +391,11 @@ retry:
static int
validate_sync(struct nouveau_channel *chan, struct nouveau_bo *nvbo)
{
- struct nouveau_fence *fence = NULL;
+ struct nouveau_fence *fence = nvbo->bo.sync_obj;
int ret = 0;
- spin_lock(&nvbo->bo.bdev->fence_lock);
- if (nvbo->bo.sync_obj)
- fence = nouveau_fence_ref(nvbo->bo.sync_obj);
- spin_unlock(&nvbo->bo.bdev->fence_lock);
-
- if (fence) {
+ if (fence)
ret = nouveau_fence_sync(fence, chan);
- nouveau_fence_unref(&fence);
- }
return ret;
}
@@ -614,9 +607,7 @@ nouveau_gem_pushbuf_reloc_apply(struct drm_device *dev,
data |= r->vor;
}
- spin_lock(&nvbo->bo.bdev->fence_lock);
ret = ttm_bo_wait(&nvbo->bo, false, false, false);
- spin_unlock(&nvbo->bo.bdev->fence_lock);
if (ret) {
NV_ERROR(drm, "reloc wait_idle failed: %d\n", ret);
break;
@@ -848,11 +839,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0);
if (!ret) {
- spin_lock(&nvbo->bo.bdev->fence_lock);
ret = ttm_bo_wait(&nvbo->bo, true, true, true);
if (!no_wait && ret)
fence = nouveau_fence_ref(nvbo->bo.sync_obj);
- spin_unlock(&nvbo->bo.bdev->fence_lock);
ttm_bo_unreserve(&nvbo->bo);
}
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index 808c444..df430e4 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -613,12 +613,10 @@ int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type, bool no_wait)
r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0);
if (unlikely(r != 0))
return r;
- spin_lock(&bo->tbo.bdev->fence_lock);
if (mem_type)
*mem_type = bo->tbo.mem.mem_type;
if (bo->tbo.sync_obj)
r = ttm_bo_wait(&bo->tbo, true, true, no_wait);
- spin_unlock(&bo->tbo.bdev->fence_lock);
ttm_bo_unreserve(&bo->tbo);
return r;
}
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index f6d7026..69fc29b 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -498,28 +498,23 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
{
struct ttm_bo_device *bdev = bo->bdev;
struct ttm_bo_global *glob = bo->glob;
- struct ttm_bo_driver *driver;
+ struct ttm_bo_driver *driver = bdev->driver;
void *sync_obj = NULL;
int put_count;
int ret;
- spin_lock(&bdev->fence_lock);
- (void) ttm_bo_wait(bo, false, false, true);
- if (!bo->sync_obj) {
-
- spin_lock(&glob->lru_lock);
-
- /**
- * Lock inversion between bo:reserve and bdev::fence_lock here,
- * but that's OK, since we're only trylocking.
- */
-
- ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
+ spin_lock(&glob->lru_lock);
+ ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
+ if (!ret) {
+ ret = ttm_bo_wait(bo, false, false, true);
- if (unlikely(ret == -EBUSY))
+ if (unlikely(ret == -EBUSY)) {
+ sync_obj = driver->sync_obj_ref(bo->sync_obj);
+ atomic_set(&bo->reserved, 0);
+ wake_up_all(&bo->event_queue);
goto queue;
+ }
- spin_unlock(&bdev->fence_lock);
put_count = ttm_bo_del_from_lru(bo);
spin_unlock(&glob->lru_lock);
@@ -528,18 +523,11 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
ttm_bo_list_ref_sub(bo, put_count, true);
return;
- } else {
- spin_lock(&glob->lru_lock);
}
queue:
- driver = bdev->driver;
- if (bo->sync_obj)
- sync_obj = driver->sync_obj_ref(bo->sync_obj);
-
kref_get(&bo->list_kref);
list_add_tail(&bo->ddestroy, &bdev->ddestroy);
spin_unlock(&glob->lru_lock);
- spin_unlock(&bdev->fence_lock);
if (sync_obj) {
driver->sync_obj_flush(sync_obj);
@@ -565,25 +553,15 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
bool no_wait_gpu)
{
struct ttm_bo_device *bdev = bo->bdev;
+ struct ttm_bo_driver *driver = bdev->driver;
struct ttm_bo_global *glob = bo->glob;
int put_count;
int ret = 0;
+ void *sync_obj;
retry:
- spin_lock(&bdev->fence_lock);
- ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
- spin_unlock(&bdev->fence_lock);
-
- if (unlikely(ret != 0))
- return ret;
-
spin_lock(&glob->lru_lock);
- if (unlikely(list_empty(&bo->ddestroy))) {
- spin_unlock(&glob->lru_lock);
- return 0;
- }
-
ret = ttm_bo_reserve_locked(bo, interruptible,
no_wait_reserve, false, 0);
@@ -592,18 +570,37 @@ retry:
return ret;
}
- /**
- * We can re-check for sync object without taking
- * the bo::lock since setting the sync object requires
- * also bo::reserved. A busy object at this point may
- * be caused by another thread recently starting an accelerated
- * eviction.
- */
+ if (unlikely(list_empty(&bo->ddestroy))) {
+ atomic_set(&bo->reserved, 0);
+ wake_up_all(&bo->event_queue);
+ spin_unlock(&glob->lru_lock);
+ return 0;
+ }
+ ret = ttm_bo_wait(bo, false, false, true);
+
+ if (ret) {
+ if (no_wait_gpu) {
+ atomic_set(&bo->reserved, 0);
+ wake_up_all(&bo->event_queue);
+ spin_unlock(&glob->lru_lock);
+ return ret;
+ }
- if (unlikely(bo->sync_obj)) {
+ /**
+ * Take a reference to the fence and unreserve, if the wait
+ * was succesful and no new sync_obj was attached,
+ * ttm_bo_wait in retry will return ret = 0, and end the loop.
+ */
+
+ sync_obj = driver->sync_obj_ref(&bo->sync_obj);
atomic_set(&bo->reserved, 0);
wake_up_all(&bo->event_queue);
spin_unlock(&glob->lru_lock);
+
+ ret = driver->sync_obj_wait(bo->sync_obj, false, interruptible);
+ driver->sync_obj_unref(&sync_obj);
+ if (ret)
+ return ret;
goto retry;
}
@@ -735,9 +732,7 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible,
struct ttm_placement placement;
int ret = 0;
- spin_lock(&bdev->fence_lock);
ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
- spin_unlock(&bdev->fence_lock);
if (unlikely(ret != 0)) {
if (ret != -ERESTARTSYS) {
@@ -1054,7 +1049,6 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
{
int ret = 0;
struct ttm_mem_reg mem;
- struct ttm_bo_device *bdev = bo->bdev;
BUG_ON(!ttm_bo_is_reserved(bo));
@@ -1063,9 +1057,7 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
* Have the driver move function wait for idle when necessary,
* instead of doing it here.
*/
- spin_lock(&bdev->fence_lock);
ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
- spin_unlock(&bdev->fence_lock);
if (ret)
return ret;
mem.num_pages = bo->num_pages;
@@ -1554,7 +1546,6 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
bdev->glob = glob;
bdev->need_dma32 = need_dma32;
bdev->val_seq = 0;
- spin_lock_init(&bdev->fence_lock);
mutex_lock(&glob->device_list_mutex);
list_add_tail(&bdev->device_list, &glob->device_list);
mutex_unlock(&glob->device_list_mutex);
@@ -1690,52 +1681,33 @@ int ttm_bo_wait(struct ttm_buffer_object *bo,
bool lazy, bool interruptible, bool no_wait)
{
struct ttm_bo_driver *driver = bo->bdev->driver;
- struct ttm_bo_device *bdev = bo->bdev;
- void *sync_obj;
int ret = 0;
+ WARN_ON_ONCE(!ttm_bo_is_reserved(bo));
+
if (likely(bo->sync_obj == NULL))
return 0;
- while (bo->sync_obj) {
-
+ if (bo->sync_obj) {
if (driver->sync_obj_signaled(bo->sync_obj)) {
void *tmp_obj = bo->sync_obj;
bo->sync_obj = NULL;
clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
- spin_unlock(&bdev->fence_lock);
driver->sync_obj_unref(&tmp_obj);
- spin_lock(&bdev->fence_lock);
- continue;
+ return 0;
}
if (no_wait)
return -EBUSY;
- sync_obj = driver->sync_obj_ref(bo->sync_obj);
- spin_unlock(&bdev->fence_lock);
- ret = driver->sync_obj_wait(sync_obj,
+ ret = driver->sync_obj_wait(bo->sync_obj,
lazy, interruptible);
if (unlikely(ret != 0)) {
- driver->sync_obj_unref(&sync_obj);
- spin_lock(&bdev->fence_lock);
return ret;
}
- spin_lock(&bdev->fence_lock);
- if (likely(bo->sync_obj == sync_obj)) {
- void *tmp_obj = bo->sync_obj;
- bo->sync_obj = NULL;
- clear_bit(TTM_BO_PRIV_FLAG_MOVING,
- &bo->priv_flags);
- spin_unlock(&bdev->fence_lock);
- driver->sync_obj_unref(&sync_obj);
- driver->sync_obj_unref(&tmp_obj);
- spin_lock(&bdev->fence_lock);
- } else {
- spin_unlock(&bdev->fence_lock);
- driver->sync_obj_unref(&sync_obj);
- spin_lock(&bdev->fence_lock);
- }
+ driver->sync_obj_unref(&bo->sync_obj);
+ clear_bit(TTM_BO_PRIV_FLAG_MOVING,
+ &bo->priv_flags);
}
return 0;
}
@@ -1799,9 +1771,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
* Wait for GPU, then move to system cached.
*/
- spin_lock(&bo->bdev->fence_lock);
ret = ttm_bo_wait(bo, false, false, false);
- spin_unlock(&bo->bdev->fence_lock);
if (unlikely(ret != 0))
goto out;
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index d80d1e8..84d6e01 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -622,7 +622,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
struct ttm_buffer_object *ghost_obj;
void *tmp_obj = NULL;
- spin_lock(&bdev->fence_lock);
if (bo->sync_obj) {
tmp_obj = bo->sync_obj;
bo->sync_obj = NULL;
@@ -630,7 +629,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
bo->sync_obj = driver->sync_obj_ref(sync_obj);
if (evict) {
ret = ttm_bo_wait(bo, false, false, false);
- spin_unlock(&bdev->fence_lock);
if (tmp_obj)
driver->sync_obj_unref(&tmp_obj);
if (ret)
@@ -653,7 +651,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
*/
set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
- spin_unlock(&bdev->fence_lock);
if (tmp_obj)
driver->sync_obj_unref(&tmp_obj);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index a877813..55718c2 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -122,17 +122,14 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* move.
*/
- spin_lock(&bdev->fence_lock);
if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
ret = ttm_bo_wait(bo, false, true, false);
- spin_unlock(&bdev->fence_lock);
if (unlikely(ret != 0)) {
retval = (ret != -ERESTARTSYS) ?
VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
goto out_unlock;
}
- } else
- spin_unlock(&bdev->fence_lock);
+ }
ret = ttm_mem_io_lock(man, true);
if (unlikely(ret != 0)) {
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index 721886e..51b5e97 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -207,7 +207,6 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
driver = bdev->driver;
glob = bo->glob;
- spin_lock(&bdev->fence_lock);
spin_lock(&glob->lru_lock);
list_for_each_entry(entry, list, head) {
@@ -218,7 +217,6 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
entry->reserved = false;
}
spin_unlock(&glob->lru_lock);
- spin_unlock(&bdev->fence_lock);
list_for_each_entry(entry, list, head) {
if (entry->old_sync_obj)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index ed3c1e7..e038c9a 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -251,13 +251,10 @@ static void vmw_dummy_query_bo_prepare(struct vmw_private *dev_priv)
volatile SVGA3dQueryResult *result;
bool dummy;
int ret;
- struct ttm_bo_device *bdev = &dev_priv->bdev;
struct ttm_buffer_object *bo = dev_priv->dummy_query_bo;
ttm_bo_reserve(bo, false, false, false, 0);
- spin_lock(&bdev->fence_lock);
ret = ttm_bo_wait(bo, false, false, false);
- spin_unlock(&bdev->fence_lock);
if (unlikely(ret != 0))
(void) vmw_fallback_wait(dev_priv, false, true, 0, false,
10*HZ);
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 816d9b1..6c69224 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -222,6 +222,8 @@ struct ttm_buffer_object {
struct file *persistent_swap_storage;
struct ttm_tt *ttm;
bool evicted;
+ void *sync_obj;
+ unsigned long priv_flags;
/**
* Members protected by the bdev::lru_lock.
@@ -242,16 +244,6 @@ struct ttm_buffer_object {
atomic_t reserved;
/**
- * Members protected by struct buffer_object_device::fence_lock
- * In addition, setting sync_obj to anything else
- * than NULL requires bo::reserved to be held. This allows for
- * checking NULL while reserved but not holding the mentioned lock.
- */
-
- void *sync_obj;
- unsigned long priv_flags;
-
- /**
* Members protected by the bdev::vm_lock
*/
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 0c8c3b5..aac101b 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -515,8 +515,6 @@ struct ttm_bo_global {
*
* @driver: Pointer to a struct ttm_bo_driver struct setup by the driver.
* @man: An array of mem_type_managers.
- * @fence_lock: Protects the synchronizing members on *all* bos belonging
- * to this device.
* @addr_space_mm: Range manager for the device address space.
* lru_lock: Spinlock that protects the buffer+device lru lists and
* ddestroy lists.
@@ -539,7 +537,6 @@ struct ttm_bo_device {
struct ttm_bo_driver *driver;
rwlock_t vm_lock;
struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
- spinlock_t fence_lock;
/*
* Protected by the vm lock.
*/
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-12 15:23 [PATCH] drm/ttm: remove fence_lock Maarten Lankhorst
@ 2012-10-18 7:28 ` Thomas Hellstrom
2012-10-18 7:59 ` Thomas Hellstrom
0 siblings, 1 reply; 15+ messages in thread
From: Thomas Hellstrom @ 2012-10-18 7:28 UTC (permalink / raw)
To: Maarten Lankhorst; +Cc: dri-devel
Hi, Maarten,
As you know I have been having my doubts about this change.
To me it seems insane to be forced to read the fence pointer under a
reserved lock, simply because when you take the reserve lock, another
process may have it and there is a substantial chance that that process
will also be waiting for idle while holding the reserve lock.
In essence, to read the fence pointer, there is a large chance you will
be waiting for idle to be able to access it. That's why it's protected by
a separate spinlock in the first place.
So even if this change might not affect current driver much it's a change
to the TTM api that leads to an IMHO very poor design.
/Thomas
On 10/12/2012 05:23 PM, Maarten Lankhorst wrote:
> With the nouveau calls fixed there were only 2 places left that used
> fence_lock without a reservation, those are fixed in this patch:
>
> ttm_bo_cleanup_refs_or_queue is fixed by simply doing things the other
> way around.
>
> ttm_bo_cleanup_refs is fixed by taking a reservation first, then a pointer
> to the fence object and backs off from the reservation if waiting has to
> be performed.
>
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
> ---
> This patch should be applied only after all the previous patches I sent are applied,
> since it depends on sync_obj_arg removal (kinda, probably fails to apply otherwise),
> uses ttm_bo_is_reserved, and depends on nouveau wait behavior being fixed.
>
> drivers/gpu/drm/nouveau/nouveau_bo.c | 4 -
> drivers/gpu/drm/nouveau/nouveau_gem.c | 15 +---
> drivers/gpu/drm/radeon/radeon_object.c | 2 -
> drivers/gpu/drm/ttm/ttm_bo.c | 124 ++++++++++++--------------------
> drivers/gpu/drm/ttm/ttm_bo_util.c | 3 -
> drivers/gpu/drm/ttm/ttm_bo_vm.c | 5 -
> drivers/gpu/drm/ttm/ttm_execbuf_util.c | 2 -
> drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 3 -
> include/drm/ttm/ttm_bo_api.h | 12 +--
> include/drm/ttm/ttm_bo_driver.h | 3 -
> 10 files changed, 52 insertions(+), 121 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index cee7448..9749c45 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -1449,10 +1449,8 @@ nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence)
> if (likely(fence))
> nouveau_fence_ref(fence);
>
> - spin_lock(&nvbo->bo.bdev->fence_lock);
> old_fence = nvbo->bo.sync_obj;
> nvbo->bo.sync_obj = fence;
> - spin_unlock(&nvbo->bo.bdev->fence_lock);
>
> nouveau_fence_unref(&old_fence);
> }
> @@ -1552,9 +1550,7 @@ nouveau_bo_vma_del(struct nouveau_bo *nvbo, struct nouveau_vma *vma)
> if (vma->node) {
> if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) {
> ttm_bo_reserve(&nvbo->bo, false, false, false, 0);
> - spin_lock(&nvbo->bo.bdev->fence_lock);
> ttm_bo_wait(&nvbo->bo, false, false, false);
> - spin_unlock(&nvbo->bo.bdev->fence_lock);
> ttm_bo_unreserve(&nvbo->bo);
> nouveau_vm_unmap(vma);
> }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index 6d8391d..eaa700a 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -391,18 +391,11 @@ retry:
> static int
> validate_sync(struct nouveau_channel *chan, struct nouveau_bo *nvbo)
> {
> - struct nouveau_fence *fence = NULL;
> + struct nouveau_fence *fence = nvbo->bo.sync_obj;
> int ret = 0;
>
> - spin_lock(&nvbo->bo.bdev->fence_lock);
> - if (nvbo->bo.sync_obj)
> - fence = nouveau_fence_ref(nvbo->bo.sync_obj);
> - spin_unlock(&nvbo->bo.bdev->fence_lock);
> -
> - if (fence) {
> + if (fence)
> ret = nouveau_fence_sync(fence, chan);
> - nouveau_fence_unref(&fence);
> - }
>
> return ret;
> }
> @@ -614,9 +607,7 @@ nouveau_gem_pushbuf_reloc_apply(struct drm_device *dev,
> data |= r->vor;
> }
>
> - spin_lock(&nvbo->bo.bdev->fence_lock);
> ret = ttm_bo_wait(&nvbo->bo, false, false, false);
> - spin_unlock(&nvbo->bo.bdev->fence_lock);
> if (ret) {
> NV_ERROR(drm, "reloc wait_idle failed: %d\n", ret);
> break;
> @@ -848,11 +839,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
>
> ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0);
> if (!ret) {
> - spin_lock(&nvbo->bo.bdev->fence_lock);
> ret = ttm_bo_wait(&nvbo->bo, true, true, true);
> if (!no_wait && ret)
> fence = nouveau_fence_ref(nvbo->bo.sync_obj);
> - spin_unlock(&nvbo->bo.bdev->fence_lock);
>
> ttm_bo_unreserve(&nvbo->bo);
> }
> diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
> index 808c444..df430e4 100644
> --- a/drivers/gpu/drm/radeon/radeon_object.c
> +++ b/drivers/gpu/drm/radeon/radeon_object.c
> @@ -613,12 +613,10 @@ int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type, bool no_wait)
> r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0);
> if (unlikely(r != 0))
> return r;
> - spin_lock(&bo->tbo.bdev->fence_lock);
> if (mem_type)
> *mem_type = bo->tbo.mem.mem_type;
> if (bo->tbo.sync_obj)
> r = ttm_bo_wait(&bo->tbo, true, true, no_wait);
> - spin_unlock(&bo->tbo.bdev->fence_lock);
> ttm_bo_unreserve(&bo->tbo);
> return r;
> }
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index f6d7026..69fc29b 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -498,28 +498,23 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
> {
> struct ttm_bo_device *bdev = bo->bdev;
> struct ttm_bo_global *glob = bo->glob;
> - struct ttm_bo_driver *driver;
> + struct ttm_bo_driver *driver = bdev->driver;
> void *sync_obj = NULL;
> int put_count;
> int ret;
>
> - spin_lock(&bdev->fence_lock);
> - (void) ttm_bo_wait(bo, false, false, true);
> - if (!bo->sync_obj) {
> -
> - spin_lock(&glob->lru_lock);
> -
> - /**
> - * Lock inversion between bo:reserve and bdev::fence_lock here,
> - * but that's OK, since we're only trylocking.
> - */
> -
> - ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
> + spin_lock(&glob->lru_lock);
> + ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
> + if (!ret) {
> + ret = ttm_bo_wait(bo, false, false, true);
>
> - if (unlikely(ret == -EBUSY))
> + if (unlikely(ret == -EBUSY)) {
> + sync_obj = driver->sync_obj_ref(bo->sync_obj);
> + atomic_set(&bo->reserved, 0);
> + wake_up_all(&bo->event_queue);
> goto queue;
> + }
>
> - spin_unlock(&bdev->fence_lock);
> put_count = ttm_bo_del_from_lru(bo);
>
> spin_unlock(&glob->lru_lock);
> @@ -528,18 +523,11 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
> ttm_bo_list_ref_sub(bo, put_count, true);
>
> return;
> - } else {
> - spin_lock(&glob->lru_lock);
> }
> queue:
> - driver = bdev->driver;
> - if (bo->sync_obj)
> - sync_obj = driver->sync_obj_ref(bo->sync_obj);
> -
> kref_get(&bo->list_kref);
> list_add_tail(&bo->ddestroy, &bdev->ddestroy);
> spin_unlock(&glob->lru_lock);
> - spin_unlock(&bdev->fence_lock);
>
> if (sync_obj) {
> driver->sync_obj_flush(sync_obj);
> @@ -565,25 +553,15 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
> bool no_wait_gpu)
> {
> struct ttm_bo_device *bdev = bo->bdev;
> + struct ttm_bo_driver *driver = bdev->driver;
> struct ttm_bo_global *glob = bo->glob;
> int put_count;
> int ret = 0;
> + void *sync_obj;
>
> retry:
> - spin_lock(&bdev->fence_lock);
> - ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> - spin_unlock(&bdev->fence_lock);
> -
> - if (unlikely(ret != 0))
> - return ret;
> -
> spin_lock(&glob->lru_lock);
>
> - if (unlikely(list_empty(&bo->ddestroy))) {
> - spin_unlock(&glob->lru_lock);
> - return 0;
> - }
> -
> ret = ttm_bo_reserve_locked(bo, interruptible,
> no_wait_reserve, false, 0);
>
> @@ -592,18 +570,37 @@ retry:
> return ret;
> }
>
> - /**
> - * We can re-check for sync object without taking
> - * the bo::lock since setting the sync object requires
> - * also bo::reserved. A busy object at this point may
> - * be caused by another thread recently starting an accelerated
> - * eviction.
> - */
> + if (unlikely(list_empty(&bo->ddestroy))) {
> + atomic_set(&bo->reserved, 0);
> + wake_up_all(&bo->event_queue);
> + spin_unlock(&glob->lru_lock);
> + return 0;
> + }
> + ret = ttm_bo_wait(bo, false, false, true);
> +
> + if (ret) {
> + if (no_wait_gpu) {
> + atomic_set(&bo->reserved, 0);
> + wake_up_all(&bo->event_queue);
> + spin_unlock(&glob->lru_lock);
> + return ret;
> + }
>
> - if (unlikely(bo->sync_obj)) {
> + /**
> + * Take a reference to the fence and unreserve, if the wait
> + * was succesful and no new sync_obj was attached,
> + * ttm_bo_wait in retry will return ret = 0, and end the loop.
> + */
> +
> + sync_obj = driver->sync_obj_ref(&bo->sync_obj);
> atomic_set(&bo->reserved, 0);
> wake_up_all(&bo->event_queue);
> spin_unlock(&glob->lru_lock);
> +
> + ret = driver->sync_obj_wait(bo->sync_obj, false, interruptible);
> + driver->sync_obj_unref(&sync_obj);
> + if (ret)
> + return ret;
> goto retry;
> }
>
> @@ -735,9 +732,7 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible,
> struct ttm_placement placement;
> int ret = 0;
>
> - spin_lock(&bdev->fence_lock);
> ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> - spin_unlock(&bdev->fence_lock);
>
> if (unlikely(ret != 0)) {
> if (ret != -ERESTARTSYS) {
> @@ -1054,7 +1049,6 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
> {
> int ret = 0;
> struct ttm_mem_reg mem;
> - struct ttm_bo_device *bdev = bo->bdev;
>
> BUG_ON(!ttm_bo_is_reserved(bo));
>
> @@ -1063,9 +1057,7 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
> * Have the driver move function wait for idle when necessary,
> * instead of doing it here.
> */
> - spin_lock(&bdev->fence_lock);
> ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
> - spin_unlock(&bdev->fence_lock);
> if (ret)
> return ret;
> mem.num_pages = bo->num_pages;
> @@ -1554,7 +1546,6 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
> bdev->glob = glob;
> bdev->need_dma32 = need_dma32;
> bdev->val_seq = 0;
> - spin_lock_init(&bdev->fence_lock);
> mutex_lock(&glob->device_list_mutex);
> list_add_tail(&bdev->device_list, &glob->device_list);
> mutex_unlock(&glob->device_list_mutex);
> @@ -1690,52 +1681,33 @@ int ttm_bo_wait(struct ttm_buffer_object *bo,
> bool lazy, bool interruptible, bool no_wait)
> {
> struct ttm_bo_driver *driver = bo->bdev->driver;
> - struct ttm_bo_device *bdev = bo->bdev;
> - void *sync_obj;
> int ret = 0;
>
> + WARN_ON_ONCE(!ttm_bo_is_reserved(bo));
> +
> if (likely(bo->sync_obj == NULL))
> return 0;
>
> - while (bo->sync_obj) {
> -
> + if (bo->sync_obj) {
> if (driver->sync_obj_signaled(bo->sync_obj)) {
> void *tmp_obj = bo->sync_obj;
> bo->sync_obj = NULL;
> clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
> - spin_unlock(&bdev->fence_lock);
> driver->sync_obj_unref(&tmp_obj);
> - spin_lock(&bdev->fence_lock);
> - continue;
> + return 0;
> }
>
> if (no_wait)
> return -EBUSY;
>
> - sync_obj = driver->sync_obj_ref(bo->sync_obj);
> - spin_unlock(&bdev->fence_lock);
> - ret = driver->sync_obj_wait(sync_obj,
> + ret = driver->sync_obj_wait(bo->sync_obj,
> lazy, interruptible);
> if (unlikely(ret != 0)) {
> - driver->sync_obj_unref(&sync_obj);
> - spin_lock(&bdev->fence_lock);
> return ret;
> }
> - spin_lock(&bdev->fence_lock);
> - if (likely(bo->sync_obj == sync_obj)) {
> - void *tmp_obj = bo->sync_obj;
> - bo->sync_obj = NULL;
> - clear_bit(TTM_BO_PRIV_FLAG_MOVING,
> - &bo->priv_flags);
> - spin_unlock(&bdev->fence_lock);
> - driver->sync_obj_unref(&sync_obj);
> - driver->sync_obj_unref(&tmp_obj);
> - spin_lock(&bdev->fence_lock);
> - } else {
> - spin_unlock(&bdev->fence_lock);
> - driver->sync_obj_unref(&sync_obj);
> - spin_lock(&bdev->fence_lock);
> - }
> + driver->sync_obj_unref(&bo->sync_obj);
> + clear_bit(TTM_BO_PRIV_FLAG_MOVING,
> + &bo->priv_flags);
> }
> return 0;
> }
> @@ -1799,9 +1771,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink)
> * Wait for GPU, then move to system cached.
> */
>
> - spin_lock(&bo->bdev->fence_lock);
> ret = ttm_bo_wait(bo, false, false, false);
> - spin_unlock(&bo->bdev->fence_lock);
>
> if (unlikely(ret != 0))
> goto out;
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
> index d80d1e8..84d6e01 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> @@ -622,7 +622,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
> struct ttm_buffer_object *ghost_obj;
> void *tmp_obj = NULL;
>
> - spin_lock(&bdev->fence_lock);
> if (bo->sync_obj) {
> tmp_obj = bo->sync_obj;
> bo->sync_obj = NULL;
> @@ -630,7 +629,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
> bo->sync_obj = driver->sync_obj_ref(sync_obj);
> if (evict) {
> ret = ttm_bo_wait(bo, false, false, false);
> - spin_unlock(&bdev->fence_lock);
> if (tmp_obj)
> driver->sync_obj_unref(&tmp_obj);
> if (ret)
> @@ -653,7 +651,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
> */
>
> set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
> - spin_unlock(&bdev->fence_lock);
> if (tmp_obj)
> driver->sync_obj_unref(&tmp_obj);
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index a877813..55718c2 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -122,17 +122,14 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> * move.
> */
>
> - spin_lock(&bdev->fence_lock);
> if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
> ret = ttm_bo_wait(bo, false, true, false);
> - spin_unlock(&bdev->fence_lock);
> if (unlikely(ret != 0)) {
> retval = (ret != -ERESTARTSYS) ?
> VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
> goto out_unlock;
> }
> - } else
> - spin_unlock(&bdev->fence_lock);
> + }
>
> ret = ttm_mem_io_lock(man, true);
> if (unlikely(ret != 0)) {
> diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> index 721886e..51b5e97 100644
> --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
> @@ -207,7 +207,6 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
> driver = bdev->driver;
> glob = bo->glob;
>
> - spin_lock(&bdev->fence_lock);
> spin_lock(&glob->lru_lock);
>
> list_for_each_entry(entry, list, head) {
> @@ -218,7 +217,6 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj)
> entry->reserved = false;
> }
> spin_unlock(&glob->lru_lock);
> - spin_unlock(&bdev->fence_lock);
>
> list_for_each_entry(entry, list, head) {
> if (entry->old_sync_obj)
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> index ed3c1e7..e038c9a 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> @@ -251,13 +251,10 @@ static void vmw_dummy_query_bo_prepare(struct vmw_private *dev_priv)
> volatile SVGA3dQueryResult *result;
> bool dummy;
> int ret;
> - struct ttm_bo_device *bdev = &dev_priv->bdev;
> struct ttm_buffer_object *bo = dev_priv->dummy_query_bo;
>
> ttm_bo_reserve(bo, false, false, false, 0);
> - spin_lock(&bdev->fence_lock);
> ret = ttm_bo_wait(bo, false, false, false);
> - spin_unlock(&bdev->fence_lock);
> if (unlikely(ret != 0))
> (void) vmw_fallback_wait(dev_priv, false, true, 0, false,
> 10*HZ);
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> index 816d9b1..6c69224 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -222,6 +222,8 @@ struct ttm_buffer_object {
> struct file *persistent_swap_storage;
> struct ttm_tt *ttm;
> bool evicted;
> + void *sync_obj;
> + unsigned long priv_flags;
>
> /**
> * Members protected by the bdev::lru_lock.
> @@ -242,16 +244,6 @@ struct ttm_buffer_object {
> atomic_t reserved;
>
> /**
> - * Members protected by struct buffer_object_device::fence_lock
> - * In addition, setting sync_obj to anything else
> - * than NULL requires bo::reserved to be held. This allows for
> - * checking NULL while reserved but not holding the mentioned lock.
> - */
> -
> - void *sync_obj;
> - unsigned long priv_flags;
> -
> - /**
> * Members protected by the bdev::vm_lock
> */
>
> diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
> index 0c8c3b5..aac101b 100644
> --- a/include/drm/ttm/ttm_bo_driver.h
> +++ b/include/drm/ttm/ttm_bo_driver.h
> @@ -515,8 +515,6 @@ struct ttm_bo_global {
> *
> * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver.
> * @man: An array of mem_type_managers.
> - * @fence_lock: Protects the synchronizing members on *all* bos belonging
> - * to this device.
> * @addr_space_mm: Range manager for the device address space.
> * lru_lock: Spinlock that protects the buffer+device lru lists and
> * ddestroy lists.
> @@ -539,7 +537,6 @@ struct ttm_bo_device {
> struct ttm_bo_driver *driver;
> rwlock_t vm_lock;
> struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
> - spinlock_t fence_lock;
> /*
> * Protected by the vm lock.
> */
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-18 7:28 ` Thomas Hellstrom
@ 2012-10-18 7:59 ` Thomas Hellstrom
2012-10-18 8:37 ` Maarten Lankhorst
0 siblings, 1 reply; 15+ messages in thread
From: Thomas Hellstrom @ 2012-10-18 7:59 UTC (permalink / raw)
To: Maarten Lankhorst; +Cc: dri-devel
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
> Hi, Maarten,
>
> As you know I have been having my doubts about this change.
> To me it seems insane to be forced to read the fence pointer under a
> reserved lock, simply because when you take the reserve lock, another
> process may have it and there is a substantial chance that that process
> will also be waiting for idle while holding the reserve lock.
>
> In essence, to read the fence pointer, there is a large chance you will
> be waiting for idle to be able to access it. That's why it's protected by
> a separate spinlock in the first place.
>
> So even if this change might not affect current driver much it's a change
> to the TTM api that leads to an IMHO very poor design.
>
> /Thomas
>
One way we could perhaps improve on this, if you think this is
necessary, is to
build a bit on the initial rcu suggestion, still require reservation
when the fence pointer is modified,
but to also use rcu_assign_pointer() for the fence pointer.
Anyone that wants quick access to the fence pointer would then use
rcu_dereference_x(), but
if the fence is indeed signaled, that caller needs to avoid setting the
bo fence pointer to NULL.
A check for bo idle without taking any (bo) locks would then mean imply
reading the fence pointer
in this fashion, and if it's non-NULL check whether the fence is signaled.
/Thomas
>
> On 10/12/2012 05:23 PM, Maarten Lankhorst wrote:
>> With the nouveau calls fixed there were only 2 places left that used
>> fence_lock without a reservation, those are fixed in this patch:
>>
>> ttm_bo_cleanup_refs_or_queue is fixed by simply doing things the other
>> way around.
>>
>> ttm_bo_cleanup_refs is fixed by taking a reservation first, then a
>> pointer
>> to the fence object and backs off from the reservation if waiting has to
>> be performed.
>>
>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
>> ---
>> This patch should be applied only after all the previous patches I
>> sent are applied,
>> since it depends on sync_obj_arg removal (kinda, probably fails to
>> apply otherwise),
>> uses ttm_bo_is_reserved, and depends on nouveau wait behavior being
>> fixed.
>>
>> drivers/gpu/drm/nouveau/nouveau_bo.c | 4 -
>> drivers/gpu/drm/nouveau/nouveau_gem.c | 15 +---
>> drivers/gpu/drm/radeon/radeon_object.c | 2 -
>> drivers/gpu/drm/ttm/ttm_bo.c | 124
>> ++++++++++++--------------------
>> drivers/gpu/drm/ttm/ttm_bo_util.c | 3 -
>> drivers/gpu/drm/ttm/ttm_bo_vm.c | 5 -
>> drivers/gpu/drm/ttm/ttm_execbuf_util.c | 2 -
>> drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 3 -
>> include/drm/ttm/ttm_bo_api.h | 12 +--
>> include/drm/ttm/ttm_bo_driver.h | 3 -
>> 10 files changed, 52 insertions(+), 121 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
>> b/drivers/gpu/drm/nouveau/nouveau_bo.c
>> index cee7448..9749c45 100644
>> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
>> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
>> @@ -1449,10 +1449,8 @@ nouveau_bo_fence(struct nouveau_bo *nvbo,
>> struct nouveau_fence *fence)
>> if (likely(fence))
>> nouveau_fence_ref(fence);
>> - spin_lock(&nvbo->bo.bdev->fence_lock);
>> old_fence = nvbo->bo.sync_obj;
>> nvbo->bo.sync_obj = fence;
>> - spin_unlock(&nvbo->bo.bdev->fence_lock);
>> nouveau_fence_unref(&old_fence);
>> }
>> @@ -1552,9 +1550,7 @@ nouveau_bo_vma_del(struct nouveau_bo *nvbo,
>> struct nouveau_vma *vma)
>> if (vma->node) {
>> if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) {
>> ttm_bo_reserve(&nvbo->bo, false, false, false, 0);
>> - spin_lock(&nvbo->bo.bdev->fence_lock);
>> ttm_bo_wait(&nvbo->bo, false, false, false);
>> - spin_unlock(&nvbo->bo.bdev->fence_lock);
>> ttm_bo_unreserve(&nvbo->bo);
>> nouveau_vm_unmap(vma);
>> }
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c
>> b/drivers/gpu/drm/nouveau/nouveau_gem.c
>> index 6d8391d..eaa700a 100644
>> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
>> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
>> @@ -391,18 +391,11 @@ retry:
>> static int
>> validate_sync(struct nouveau_channel *chan, struct nouveau_bo *nvbo)
>> {
>> - struct nouveau_fence *fence = NULL;
>> + struct nouveau_fence *fence = nvbo->bo.sync_obj;
>> int ret = 0;
>> - spin_lock(&nvbo->bo.bdev->fence_lock);
>> - if (nvbo->bo.sync_obj)
>> - fence = nouveau_fence_ref(nvbo->bo.sync_obj);
>> - spin_unlock(&nvbo->bo.bdev->fence_lock);
>> -
>> - if (fence) {
>> + if (fence)
>> ret = nouveau_fence_sync(fence, chan);
>> - nouveau_fence_unref(&fence);
>> - }
>> return ret;
>> }
>> @@ -614,9 +607,7 @@ nouveau_gem_pushbuf_reloc_apply(struct drm_device
>> *dev,
>> data |= r->vor;
>> }
>> - spin_lock(&nvbo->bo.bdev->fence_lock);
>> ret = ttm_bo_wait(&nvbo->bo, false, false, false);
>> - spin_unlock(&nvbo->bo.bdev->fence_lock);
>> if (ret) {
>> NV_ERROR(drm, "reloc wait_idle failed: %d\n", ret);
>> break;
>> @@ -848,11 +839,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device
>> *dev, void *data,
>> ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0);
>> if (!ret) {
>> - spin_lock(&nvbo->bo.bdev->fence_lock);
>> ret = ttm_bo_wait(&nvbo->bo, true, true, true);
>> if (!no_wait && ret)
>> fence = nouveau_fence_ref(nvbo->bo.sync_obj);
>> - spin_unlock(&nvbo->bo.bdev->fence_lock);
>> ttm_bo_unreserve(&nvbo->bo);
>> }
>> diff --git a/drivers/gpu/drm/radeon/radeon_object.c
>> b/drivers/gpu/drm/radeon/radeon_object.c
>> index 808c444..df430e4 100644
>> --- a/drivers/gpu/drm/radeon/radeon_object.c
>> +++ b/drivers/gpu/drm/radeon/radeon_object.c
>> @@ -613,12 +613,10 @@ int radeon_bo_wait(struct radeon_bo *bo, u32
>> *mem_type, bool no_wait)
>> r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0);
>> if (unlikely(r != 0))
>> return r;
>> - spin_lock(&bo->tbo.bdev->fence_lock);
>> if (mem_type)
>> *mem_type = bo->tbo.mem.mem_type;
>> if (bo->tbo.sync_obj)
>> r = ttm_bo_wait(&bo->tbo, true, true, no_wait);
>> - spin_unlock(&bo->tbo.bdev->fence_lock);
>> ttm_bo_unreserve(&bo->tbo);
>> return r;
>> }
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> index f6d7026..69fc29b 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -498,28 +498,23 @@ static void ttm_bo_cleanup_refs_or_queue(struct
>> ttm_buffer_object *bo)
>> {
>> struct ttm_bo_device *bdev = bo->bdev;
>> struct ttm_bo_global *glob = bo->glob;
>> - struct ttm_bo_driver *driver;
>> + struct ttm_bo_driver *driver = bdev->driver;
>> void *sync_obj = NULL;
>> int put_count;
>> int ret;
>> - spin_lock(&bdev->fence_lock);
>> - (void) ttm_bo_wait(bo, false, false, true);
>> - if (!bo->sync_obj) {
>> -
>> - spin_lock(&glob->lru_lock);
>> -
>> - /**
>> - * Lock inversion between bo:reserve and bdev::fence_lock here,
>> - * but that's OK, since we're only trylocking.
>> - */
>> -
>> - ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
>> + spin_lock(&glob->lru_lock);
>> + ret = ttm_bo_reserve_locked(bo, false, true, false, 0);
>> + if (!ret) {
>> + ret = ttm_bo_wait(bo, false, false, true);
>> - if (unlikely(ret == -EBUSY))
>> + if (unlikely(ret == -EBUSY)) {
>> + sync_obj = driver->sync_obj_ref(bo->sync_obj);
>> + atomic_set(&bo->reserved, 0);
>> + wake_up_all(&bo->event_queue);
>> goto queue;
>> + }
>> - spin_unlock(&bdev->fence_lock);
>> put_count = ttm_bo_del_from_lru(bo);
>> spin_unlock(&glob->lru_lock);
>> @@ -528,18 +523,11 @@ static void ttm_bo_cleanup_refs_or_queue(struct
>> ttm_buffer_object *bo)
>> ttm_bo_list_ref_sub(bo, put_count, true);
>> return;
>> - } else {
>> - spin_lock(&glob->lru_lock);
>> }
>> queue:
>> - driver = bdev->driver;
>> - if (bo->sync_obj)
>> - sync_obj = driver->sync_obj_ref(bo->sync_obj);
>> -
>> kref_get(&bo->list_kref);
>> list_add_tail(&bo->ddestroy, &bdev->ddestroy);
>> spin_unlock(&glob->lru_lock);
>> - spin_unlock(&bdev->fence_lock);
>> if (sync_obj) {
>> driver->sync_obj_flush(sync_obj);
>> @@ -565,25 +553,15 @@ static int ttm_bo_cleanup_refs(struct
>> ttm_buffer_object *bo,
>> bool no_wait_gpu)
>> {
>> struct ttm_bo_device *bdev = bo->bdev;
>> + struct ttm_bo_driver *driver = bdev->driver;
>> struct ttm_bo_global *glob = bo->glob;
>> int put_count;
>> int ret = 0;
>> + void *sync_obj;
>> retry:
>> - spin_lock(&bdev->fence_lock);
>> - ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
>> - spin_unlock(&bdev->fence_lock);
>> -
>> - if (unlikely(ret != 0))
>> - return ret;
>> -
>> spin_lock(&glob->lru_lock);
>> - if (unlikely(list_empty(&bo->ddestroy))) {
>> - spin_unlock(&glob->lru_lock);
>> - return 0;
>> - }
>> -
>> ret = ttm_bo_reserve_locked(bo, interruptible,
>> no_wait_reserve, false, 0);
>> @@ -592,18 +570,37 @@ retry:
>> return ret;
>> }
>> - /**
>> - * We can re-check for sync object without taking
>> - * the bo::lock since setting the sync object requires
>> - * also bo::reserved. A busy object at this point may
>> - * be caused by another thread recently starting an accelerated
>> - * eviction.
>> - */
>> + if (unlikely(list_empty(&bo->ddestroy))) {
>> + atomic_set(&bo->reserved, 0);
>> + wake_up_all(&bo->event_queue);
>> + spin_unlock(&glob->lru_lock);
>> + return 0;
>> + }
>> + ret = ttm_bo_wait(bo, false, false, true);
>> +
>> + if (ret) {
>> + if (no_wait_gpu) {
>> + atomic_set(&bo->reserved, 0);
>> + wake_up_all(&bo->event_queue);
>> + spin_unlock(&glob->lru_lock);
>> + return ret;
>> + }
>> - if (unlikely(bo->sync_obj)) {
>> + /**
>> + * Take a reference to the fence and unreserve, if the wait
>> + * was succesful and no new sync_obj was attached,
>> + * ttm_bo_wait in retry will return ret = 0, and end the loop.
>> + */
>> +
>> + sync_obj = driver->sync_obj_ref(&bo->sync_obj);
>> atomic_set(&bo->reserved, 0);
>> wake_up_all(&bo->event_queue);
>> spin_unlock(&glob->lru_lock);
>> +
>> + ret = driver->sync_obj_wait(bo->sync_obj, false,
>> interruptible);
>> + driver->sync_obj_unref(&sync_obj);
>> + if (ret)
>> + return ret;
>> goto retry;
>> }
>> @@ -735,9 +732,7 @@ static int ttm_bo_evict(struct
>> ttm_buffer_object *bo, bool interruptible,
>> struct ttm_placement placement;
>> int ret = 0;
>> - spin_lock(&bdev->fence_lock);
>> ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
>> - spin_unlock(&bdev->fence_lock);
>> if (unlikely(ret != 0)) {
>> if (ret != -ERESTARTSYS) {
>> @@ -1054,7 +1049,6 @@ int ttm_bo_move_buffer(struct ttm_buffer_object
>> *bo,
>> {
>> int ret = 0;
>> struct ttm_mem_reg mem;
>> - struct ttm_bo_device *bdev = bo->bdev;
>> BUG_ON(!ttm_bo_is_reserved(bo));
>> @@ -1063,9 +1057,7 @@ int ttm_bo_move_buffer(struct
>> ttm_buffer_object *bo,
>> * Have the driver move function wait for idle when necessary,
>> * instead of doing it here.
>> */
>> - spin_lock(&bdev->fence_lock);
>> ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
>> - spin_unlock(&bdev->fence_lock);
>> if (ret)
>> return ret;
>> mem.num_pages = bo->num_pages;
>> @@ -1554,7 +1546,6 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
>> bdev->glob = glob;
>> bdev->need_dma32 = need_dma32;
>> bdev->val_seq = 0;
>> - spin_lock_init(&bdev->fence_lock);
>> mutex_lock(&glob->device_list_mutex);
>> list_add_tail(&bdev->device_list, &glob->device_list);
>> mutex_unlock(&glob->device_list_mutex);
>> @@ -1690,52 +1681,33 @@ int ttm_bo_wait(struct ttm_buffer_object *bo,
>> bool lazy, bool interruptible, bool no_wait)
>> {
>> struct ttm_bo_driver *driver = bo->bdev->driver;
>> - struct ttm_bo_device *bdev = bo->bdev;
>> - void *sync_obj;
>> int ret = 0;
>> + WARN_ON_ONCE(!ttm_bo_is_reserved(bo));
>> +
>> if (likely(bo->sync_obj == NULL))
>> return 0;
>> - while (bo->sync_obj) {
>> -
>> + if (bo->sync_obj) {
>> if (driver->sync_obj_signaled(bo->sync_obj)) {
>> void *tmp_obj = bo->sync_obj;
>> bo->sync_obj = NULL;
>> clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
>> - spin_unlock(&bdev->fence_lock);
>> driver->sync_obj_unref(&tmp_obj);
>> - spin_lock(&bdev->fence_lock);
>> - continue;
>> + return 0;
>> }
>> if (no_wait)
>> return -EBUSY;
>> - sync_obj = driver->sync_obj_ref(bo->sync_obj);
>> - spin_unlock(&bdev->fence_lock);
>> - ret = driver->sync_obj_wait(sync_obj,
>> + ret = driver->sync_obj_wait(bo->sync_obj,
>> lazy, interruptible);
>> if (unlikely(ret != 0)) {
>> - driver->sync_obj_unref(&sync_obj);
>> - spin_lock(&bdev->fence_lock);
>> return ret;
>> }
>> - spin_lock(&bdev->fence_lock);
>> - if (likely(bo->sync_obj == sync_obj)) {
>> - void *tmp_obj = bo->sync_obj;
>> - bo->sync_obj = NULL;
>> - clear_bit(TTM_BO_PRIV_FLAG_MOVING,
>> - &bo->priv_flags);
>> - spin_unlock(&bdev->fence_lock);
>> - driver->sync_obj_unref(&sync_obj);
>> - driver->sync_obj_unref(&tmp_obj);
>> - spin_lock(&bdev->fence_lock);
>> - } else {
>> - spin_unlock(&bdev->fence_lock);
>> - driver->sync_obj_unref(&sync_obj);
>> - spin_lock(&bdev->fence_lock);
>> - }
>> + driver->sync_obj_unref(&bo->sync_obj);
>> + clear_bit(TTM_BO_PRIV_FLAG_MOVING,
>> + &bo->priv_flags);
>> }
>> return 0;
>> }
>> @@ -1799,9 +1771,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink
>> *shrink)
>> * Wait for GPU, then move to system cached.
>> */
>> - spin_lock(&bo->bdev->fence_lock);
>> ret = ttm_bo_wait(bo, false, false, false);
>> - spin_unlock(&bo->bdev->fence_lock);
>> if (unlikely(ret != 0))
>> goto out;
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c
>> b/drivers/gpu/drm/ttm/ttm_bo_util.c
>> index d80d1e8..84d6e01 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
>> @@ -622,7 +622,6 @@ int ttm_bo_move_accel_cleanup(struct
>> ttm_buffer_object *bo,
>> struct ttm_buffer_object *ghost_obj;
>> void *tmp_obj = NULL;
>> - spin_lock(&bdev->fence_lock);
>> if (bo->sync_obj) {
>> tmp_obj = bo->sync_obj;
>> bo->sync_obj = NULL;
>> @@ -630,7 +629,6 @@ int ttm_bo_move_accel_cleanup(struct
>> ttm_buffer_object *bo,
>> bo->sync_obj = driver->sync_obj_ref(sync_obj);
>> if (evict) {
>> ret = ttm_bo_wait(bo, false, false, false);
>> - spin_unlock(&bdev->fence_lock);
>> if (tmp_obj)
>> driver->sync_obj_unref(&tmp_obj);
>> if (ret)
>> @@ -653,7 +651,6 @@ int ttm_bo_move_accel_cleanup(struct
>> ttm_buffer_object *bo,
>> */
>> set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
>> - spin_unlock(&bdev->fence_lock);
>> if (tmp_obj)
>> driver->sync_obj_unref(&tmp_obj);
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c
>> b/drivers/gpu/drm/ttm/ttm_bo_vm.c
>> index a877813..55718c2 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
>> @@ -122,17 +122,14 @@ static int ttm_bo_vm_fault(struct
>> vm_area_struct *vma, struct vm_fault *vmf)
>> * move.
>> */
>> - spin_lock(&bdev->fence_lock);
>> if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
>> ret = ttm_bo_wait(bo, false, true, false);
>> - spin_unlock(&bdev->fence_lock);
>> if (unlikely(ret != 0)) {
>> retval = (ret != -ERESTARTSYS) ?
>> VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
>> goto out_unlock;
>> }
>> - } else
>> - spin_unlock(&bdev->fence_lock);
>> + }
>> ret = ttm_mem_io_lock(man, true);
>> if (unlikely(ret != 0)) {
>> diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
>> b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
>> index 721886e..51b5e97 100644
>> --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
>> +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
>> @@ -207,7 +207,6 @@ void ttm_eu_fence_buffer_objects(struct list_head
>> *list, void *sync_obj)
>> driver = bdev->driver;
>> glob = bo->glob;
>> - spin_lock(&bdev->fence_lock);
>> spin_lock(&glob->lru_lock);
>> list_for_each_entry(entry, list, head) {
>> @@ -218,7 +217,6 @@ void ttm_eu_fence_buffer_objects(struct list_head
>> *list, void *sync_obj)
>> entry->reserved = false;
>> }
>> spin_unlock(&glob->lru_lock);
>> - spin_unlock(&bdev->fence_lock);
>> list_for_each_entry(entry, list, head) {
>> if (entry->old_sync_obj)
>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
>> b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
>> index ed3c1e7..e038c9a 100644
>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
>> @@ -251,13 +251,10 @@ static void vmw_dummy_query_bo_prepare(struct
>> vmw_private *dev_priv)
>> volatile SVGA3dQueryResult *result;
>> bool dummy;
>> int ret;
>> - struct ttm_bo_device *bdev = &dev_priv->bdev;
>> struct ttm_buffer_object *bo = dev_priv->dummy_query_bo;
>> ttm_bo_reserve(bo, false, false, false, 0);
>> - spin_lock(&bdev->fence_lock);
>> ret = ttm_bo_wait(bo, false, false, false);
>> - spin_unlock(&bdev->fence_lock);
>> if (unlikely(ret != 0))
>> (void) vmw_fallback_wait(dev_priv, false, true, 0, false,
>> 10*HZ);
>> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
>> index 816d9b1..6c69224 100644
>> --- a/include/drm/ttm/ttm_bo_api.h
>> +++ b/include/drm/ttm/ttm_bo_api.h
>> @@ -222,6 +222,8 @@ struct ttm_buffer_object {
>> struct file *persistent_swap_storage;
>> struct ttm_tt *ttm;
>> bool evicted;
>> + void *sync_obj;
>> + unsigned long priv_flags;
>> /**
>> * Members protected by the bdev::lru_lock.
>> @@ -242,16 +244,6 @@ struct ttm_buffer_object {
>> atomic_t reserved;
>> /**
>> - * Members protected by struct buffer_object_device::fence_lock
>> - * In addition, setting sync_obj to anything else
>> - * than NULL requires bo::reserved to be held. This allows for
>> - * checking NULL while reserved but not holding the mentioned lock.
>> - */
>> -
>> - void *sync_obj;
>> - unsigned long priv_flags;
>> -
>> - /**
>> * Members protected by the bdev::vm_lock
>> */
>> diff --git a/include/drm/ttm/ttm_bo_driver.h
>> b/include/drm/ttm/ttm_bo_driver.h
>> index 0c8c3b5..aac101b 100644
>> --- a/include/drm/ttm/ttm_bo_driver.h
>> +++ b/include/drm/ttm/ttm_bo_driver.h
>> @@ -515,8 +515,6 @@ struct ttm_bo_global {
>> *
>> * @driver: Pointer to a struct ttm_bo_driver struct setup by the
>> driver.
>> * @man: An array of mem_type_managers.
>> - * @fence_lock: Protects the synchronizing members on *all* bos
>> belonging
>> - * to this device.
>> * @addr_space_mm: Range manager for the device address space.
>> * lru_lock: Spinlock that protects the buffer+device lru lists and
>> * ddestroy lists.
>> @@ -539,7 +537,6 @@ struct ttm_bo_device {
>> struct ttm_bo_driver *driver;
>> rwlock_t vm_lock;
>> struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
>> - spinlock_t fence_lock;
>> /*
>> * Protected by the vm lock.
>> */
>>
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-18 7:59 ` Thomas Hellstrom
@ 2012-10-18 8:37 ` Maarten Lankhorst
2012-10-18 11:02 ` Thomas Hellstrom
0 siblings, 1 reply; 15+ messages in thread
From: Maarten Lankhorst @ 2012-10-18 8:37 UTC (permalink / raw)
To: Thomas Hellstrom; +Cc: dri-devel
Hey,
Op 18-10-12 09:59, Thomas Hellstrom schreef:
>
>
>
> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>> Hi, Maarten,
>>
>> As you know I have been having my doubts about this change.
>> To me it seems insane to be forced to read the fence pointer under a
>> reserved lock, simply because when you take the reserve lock, another
>> process may have it and there is a substantial chance that that process
>> will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense, the only times you want to read the fence
is when you want to change the members protected by the reservation.
>> In essence, to read the fence pointer, there is a large chance you will
>> be waiting for idle to be able to access it. That's why it's protected by
>> a separate spinlock in the first place.
>>
>> So even if this change might not affect current driver much it's a change
>> to the TTM api that leads to an IMHO very poor design.
I would argue the opposite, no longer having a separate lock, with clear
semantics when fencing is allowed, gives you a way to clean up the core
of ttm immensely.
There were only 2 functions affected by fence lock removal and they were
on buffer destruction, ttm_bo_cleanup_refs (and *_or_queue). But since
the *_or_queue can simply change order around, you only have to worry about
ttm_bo_cleanup_refs. This function is already ugly for other reasons, and
the followup patch I was suggesting cleaned up the ugliness there too.
The only thing done differently is backing off from the reservation early.
With the cleanup it won't even try to get the reservation again, since
nobody should set a new fence on the bo when it's dead. Instead all
destruction is moved until list refcount drops to 0.
> One way we could perhaps improve on this, if you think this is necessary, is to
> build a bit on the initial rcu suggestion, still require reservation when the fence pointer is modified,
> but to also use rcu_assign_pointer() for the fence pointer.
This is a massive overkill when the only time you read the fence pointer
without reservation is during buffer destruction. RCU is only good if
there's ~10x more reads than writes, and for us it's simply 50% reads 50%
writes..
> Anyone that wants quick access to the fence pointer would then use rcu_dereference_x(), but
> if the fence is indeed signaled, that caller needs to avoid setting the bo fence pointer to NULL.
>
> A check for bo idle without taking any (bo) locks would then mean imply reading the fence pointer
> in this fashion, and if it's non-NULL check whether the fence is signaled.
Sure it may look easier, but you add more overhead and complexity. I thought
you wanted to avoid overhead in the reservation path? RCU won't be the way
to do that.
~Maarten
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-18 8:37 ` Maarten Lankhorst
@ 2012-10-18 11:02 ` Thomas Hellstrom
2012-10-18 11:38 ` Maarten Lankhorst
0 siblings, 1 reply; 15+ messages in thread
From: Thomas Hellstrom @ 2012-10-18 11:02 UTC (permalink / raw)
To: Maarten Lankhorst; +Cc: dri-devel
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
> Hey,
>
> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>
>>
>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>> Hi, Maarten,
>>>
>>> As you know I have been having my doubts about this change.
>>> To me it seems insane to be forced to read the fence pointer under a
>>> reserved lock, simply because when you take the reserve lock, another
>>> process may have it and there is a substantial chance that that process
>>> will also be waiting for idle while holding the reserve lock.
> I think it makes perfect sense, the only times you want to read the fence
> is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case)
is where you want to do a map with no_wait semantics. You will want
to be able to access a buffer's results even if the eviction code
is about to schedule an unbind from the GPU, and have the buffer
reserved?
>
>>> In essence, to read the fence pointer, there is a large chance you will
>>> be waiting for idle to be able to access it. That's why it's protected by
>>> a separate spinlock in the first place.
>>>
>>> So even if this change might not affect current driver much it's a change
>>> to the TTM api that leads to an IMHO very poor design.
> I would argue the opposite, no longer having a separate lock, with clear
> semantics when fencing is allowed, gives you a way to clean up the core
> of ttm immensely.
>
> There were only 2 functions affected by fence lock removal and they were
> on buffer destruction, ttm_bo_cleanup_refs (and *_or_queue).
The actual code and the number of users is irrelevant here, since
we're discussing the implications of changing the API.
>> One way we could perhaps improve on this, if you think this is necessary, is to
>> build a bit on the initial rcu suggestion, still require reservation when the fence pointer is modified,
>> but to also use rcu_assign_pointer() for the fence pointer.
> This is a massive overkill when the only time you read the fence pointer
> without reservation is during buffer destruction. RCU is only good if
> there's ~10x more reads than writes, and for us it's simply 50% reads 50%
> writes..
>
I think you misunderstand me. I'm not suggesting going down the full RCU
path, I'm merely making
sure that reads and writes to the bo's fence pointer are atomic, using
the RCU functions
for this. I'm not suggesting any RCU syncs. This means your patch can be
kept largely as
it is, just make sure you do atomic_writes to the fence pointers.
/Thomas
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-18 11:02 ` Thomas Hellstrom
@ 2012-10-18 11:38 ` Maarten Lankhorst
2012-10-18 11:55 ` Thomas Hellstrom
0 siblings, 1 reply; 15+ messages in thread
From: Maarten Lankhorst @ 2012-10-18 11:38 UTC (permalink / raw)
To: Thomas Hellstrom; +Cc: dri-devel
Op 18-10-12 13:02, Thomas Hellstrom schreef:
> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>> Hey,
>>
>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>
>>>
>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>> Hi, Maarten,
>>>>
>>>> As you know I have been having my doubts about this change.
>>>> To me it seems insane to be forced to read the fence pointer under a
>>>> reserved lock, simply because when you take the reserve lock, another
>>>> process may have it and there is a substantial chance that that process
>>>> will also be waiting for idle while holding the reserve lock.
>> I think it makes perfect sense, the only times you want to read the fence
>> is when you want to change the members protected by the reservation.
>
> No, that's not true. A typical case (or the only case)
> is where you want to do a map with no_wait semantics. You will want
> to be able to access a buffer's results even if the eviction code
> is about to schedule an unbind from the GPU, and have the buffer
> reserved?
Well either block on reserve or return -EBUSY if reserved, presumably the
former..
ttm_bo_vm_fault does the latter already, anyway.
You don't need to hold the reservation while performing the wait itself though,
you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
take a ref to the sync_obj member and then wait after unreserving. You won't
reset sync_obj member to NULL in that case, but that should be harmless.
This will allow you to keep the reservations fast and short. Maybe a helper
would be appropriate for this since radeon and nouveau both seem to do this.
The next time someone wants to do a wait it will go through the fastpath and
unset the sync_obj member, since it's already signaled, or it's removed when
ttm_execbuffer_util is used.
~Maarten
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-18 11:38 ` Maarten Lankhorst
@ 2012-10-18 11:55 ` Thomas Hellstrom
2012-10-18 14:45 ` Maarten Lankhorst
0 siblings, 1 reply; 15+ messages in thread
From: Thomas Hellstrom @ 2012-10-18 11:55 UTC (permalink / raw)
To: Maarten Lankhorst; +Cc: dri-devel
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>> Hey,
>>>
>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>
>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>> Hi, Maarten,
>>>>>
>>>>> As you know I have been having my doubts about this change.
>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>> process may have it and there is a substantial chance that that process
>>>>> will also be waiting for idle while holding the reserve lock.
>>> I think it makes perfect sense, the only times you want to read the fence
>>> is when you want to change the members protected by the reservation.
>> No, that's not true. A typical case (or the only case)
>> is where you want to do a map with no_wait semantics. You will want
>> to be able to access a buffer's results even if the eviction code
>> is about to schedule an unbind from the GPU, and have the buffer
>> reserved?
> Well either block on reserve or return -EBUSY if reserved, presumably the
> former..
>
> ttm_bo_vm_fault does the latter already, anyway
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
a waiting reserve but for different reasons. Typically a user-space app
will keep
asynchronous maps to TTM during a buffer lifetime, and the buffer
lifetime may
be long as user-space apps keep caches.
That's not the same as accessing a buffer after the GPU is done with it.
>
> You don't need to hold the reservation while performing the wait itself though,
> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
> take a ref to the sync_obj member and then wait after unreserving. You won't
> reset sync_obj member to NULL in that case, but that should be harmless.
> This will allow you to keep the reservations fast and short. Maybe a helper
> would be appropriate for this since radeon and nouveau both seem to do this.
>
The problem is that as long as other users are waiting for idle with
reservation
held, for example to switch GPU engine or to delete a GPU bind, waiting
for reserve will in many case mean wait for GPU.
/Thomas
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-18 11:55 ` Thomas Hellstrom
@ 2012-10-18 14:45 ` Maarten Lankhorst
2012-10-18 16:43 ` Thomas Hellstrom
0 siblings, 1 reply; 15+ messages in thread
From: Maarten Lankhorst @ 2012-10-18 14:45 UTC (permalink / raw)
To: Thomas Hellstrom; +Cc: dri-devel
Op 18-10-12 13:55, Thomas Hellstrom schreef:
> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>> Hey,
>>>>
>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>
>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>> Hi, Maarten,
>>>>>>
>>>>>> As you know I have been having my doubts about this change.
>>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>>> process may have it and there is a substantial chance that that process
>>>>>> will also be waiting for idle while holding the reserve lock.
>>>> I think it makes perfect sense, the only times you want to read the fence
>>>> is when you want to change the members protected by the reservation.
>>> No, that's not true. A typical case (or the only case)
>>> is where you want to do a map with no_wait semantics. You will want
>>> to be able to access a buffer's results even if the eviction code
>>> is about to schedule an unbind from the GPU, and have the buffer
>>> reserved?
>> Well either block on reserve or return -EBUSY if reserved, presumably the
>> former..
>>
>> ttm_bo_vm_fault does the latter already, anyway
>
> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
> a waiting reserve but for different reasons. Typically a user-space app will keep
> asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
> be long as user-space apps keep caches.
> That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
>>
>> You don't need to hold the reservation while performing the wait itself though,
>> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>> take a ref to the sync_obj member and then wait after unreserving. You won't
>> reset sync_obj member to NULL in that case, but that should be harmless.
>> This will allow you to keep the reservations fast and short. Maybe a helper
>> would be appropriate for this since radeon and nouveau both seem to do this.
>>
>
> The problem is that as long as other users are waiting for idle with reservation
> held, for example to switch GPU engine or to delete a GPU bind, waiting
> for reserve will in many case mean wait for GPU.
This example sounds inefficient, I know nouveau can do this, but this essentially
just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
It wouldn't surprise me if performance in nouveau could be improved simply by
fixing those cases up instead, since it stalls the application completely too for other uses.
If reservations are held, it also becomes very easy to find all outliers, I didn't hook
this part of lockdep up yet, but I intend to. See Documentation/lockstat.txt .
Lockstat becomes slightly trickier since we do multi object reservation, but we
could follow the same path as lock_mutex_interruptible in the interrupted case.
If the waiting on a reservation becomes a problem, I intend to make it a very
measurable problem. :-)
And the only reason I haven't fixed the nouveau function I mentioned
is because I wanted to see if I could make this show up as issue, and
check how much it affects normal workloads.
All other places already did the reservation before wait, so it would be
really valuable to quantify how much of a problem this really is, and fix
up those pain points instead.
~Maarten
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-18 14:45 ` Maarten Lankhorst
@ 2012-10-18 16:43 ` Thomas Hellstrom
2012-10-18 17:04 ` Jerome Glisse
0 siblings, 1 reply; 15+ messages in thread
From: Thomas Hellstrom @ 2012-10-18 16:43 UTC (permalink / raw)
To: Maarten Lankhorst; +Cc: dri-devel
On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
> Op 18-10-12 13:55, Thomas Hellstrom schreef:
>> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>>> Hey,
>>>>>
>>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>>> Hi, Maarten,
>>>>>>>
>>>>>>> As you know I have been having my doubts about this change.
>>>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>>>> process may have it and there is a substantial chance that that process
>>>>>>> will also be waiting for idle while holding the reserve lock.
>>>>> I think it makes perfect sense, the only times you want to read the fence
>>>>> is when you want to change the members protected by the reservation.
>>>> No, that's not true. A typical case (or the only case)
>>>> is where you want to do a map with no_wait semantics. You will want
>>>> to be able to access a buffer's results even if the eviction code
>>>> is about to schedule an unbind from the GPU, and have the buffer
>>>> reserved?
>>> Well either block on reserve or return -EBUSY if reserved, presumably the
>>> former..
>>>
>>> ttm_bo_vm_fault does the latter already, anyway
>> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
>> a waiting reserve but for different reasons. Typically a user-space app will keep
>> asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
>> be long as user-space apps keep caches.
>> That's not the same as accessing a buffer after the GPU is done with it.
> Ah indeed.
>>> You don't need to hold the reservation while performing the wait itself though,
>>> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>>> take a ref to the sync_obj member and then wait after unreserving. You won't
>>> reset sync_obj member to NULL in that case, but that should be harmless.
>>> This will allow you to keep the reservations fast and short. Maybe a helper
>>> would be appropriate for this since radeon and nouveau both seem to do this.
>>>
>> The problem is that as long as other users are waiting for idle with reservation
>> held, for example to switch GPU engine or to delete a GPU bind, waiting
>> for reserve will in many case mean wait for GPU.
> This example sounds inefficient, I know nouveau can do this, but this essentially
> just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
> It wouldn't surprise me if performance in nouveau could be improved simply by
> fixing those cases up instead, since it stalls the application completely too for other uses.
>
Please see the Nouveau cpu_prep patch as well.
While there are a number of cases that can be fixed up, also in Radeon,
there's no way around that reservation
is a heavyweight lock that, particularly on simpler hardware without
support for fence ordering
with barriers and / or "semaphores" and accelerated eviction will be
held while waiting for idle.
As such, it is unsuitable to protect read access to the fence pointer.
If the intention is to keep a single fence pointer
I think the best solution is to allow reading the fence pointer outside
reservation, but make sure this can be done
atomically. If the intention is to protect an array or list of fence
pointers, I think a spinlock is the better solution.
/Thomas
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-18 16:43 ` Thomas Hellstrom
@ 2012-10-18 17:04 ` Jerome Glisse
2014-03-20 23:55 ` Dave Airlie
0 siblings, 1 reply; 15+ messages in thread
From: Jerome Glisse @ 2012-10-18 17:04 UTC (permalink / raw)
To: Thomas Hellstrom; +Cc: dri-devel
On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
> >Op 18-10-12 13:55, Thomas Hellstrom schreef:
> >>On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
> >>>Op 18-10-12 13:02, Thomas Hellstrom schreef:
> >>>>On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
> >>>>>Hey,
> >>>>>
> >>>>>Op 18-10-12 09:59, Thomas Hellstrom schreef:
> >>>>>>On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
> >>>>>>>Hi, Maarten,
> >>>>>>>
> >>>>>>>As you know I have been having my doubts about this change.
> >>>>>>>To me it seems insane to be forced to read the fence pointer under a
> >>>>>>>reserved lock, simply because when you take the reserve lock, another
> >>>>>>>process may have it and there is a substantial chance that that process
> >>>>>>>will also be waiting for idle while holding the reserve lock.
> >>>>>I think it makes perfect sense, the only times you want to read the fence
> >>>>>is when you want to change the members protected by the reservation.
> >>>>No, that's not true. A typical case (or the only case)
> >>>>is where you want to do a map with no_wait semantics. You will want
> >>>>to be able to access a buffer's results even if the eviction code
> >>>>is about to schedule an unbind from the GPU, and have the buffer
> >>>>reserved?
> >>>Well either block on reserve or return -EBUSY if reserved, presumably the
> >>>former..
> >>>
> >>>ttm_bo_vm_fault does the latter already, anyway
> >>ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
> >>a waiting reserve but for different reasons. Typically a user-space app will keep
> >>asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
> >>be long as user-space apps keep caches.
> >>That's not the same as accessing a buffer after the GPU is done with it.
> >Ah indeed.
> >>>You don't need to hold the reservation while performing the wait itself though,
> >>>you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
> >>>take a ref to the sync_obj member and then wait after unreserving. You won't
> >>>reset sync_obj member to NULL in that case, but that should be harmless.
> >>>This will allow you to keep the reservations fast and short. Maybe a helper
> >>>would be appropriate for this since radeon and nouveau both seem to do this.
> >>>
> >>The problem is that as long as other users are waiting for idle with reservation
> >>held, for example to switch GPU engine or to delete a GPU bind, waiting
> >>for reserve will in many case mean wait for GPU.
> >This example sounds inefficient, I know nouveau can do this, but this essentially
> >just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
> >It wouldn't surprise me if performance in nouveau could be improved simply by
> >fixing those cases up instead, since it stalls the application completely too for other uses.
> >
> Please see the Nouveau cpu_prep patch as well.
>
> While there are a number of cases that can be fixed up, also in
> Radeon, there's no way around that reservation
> is a heavyweight lock that, particularly on simpler hardware without
> support for fence ordering
> with barriers and / or "semaphores" and accelerated eviction will be
> held while waiting for idle.
>
> As such, it is unsuitable to protect read access to the fence
> pointer. If the intention is to keep a single fence pointer
> I think the best solution is to allow reading the fence pointer
> outside reservation, but make sure this can be done
> atomically. If the intention is to protect an array or list of fence
> pointers, I think a spinlock is the better solution.
>
> /Thomas
Just wanted to point out that like Thomas i am concern about having to
have object reserved when waiting on its associated fence. I fear it
will hurt us somehow.
I will try to spend couple days stress testing your branch on radeon
trying to see if it hurts performance anyhow with current use case.
Cheers,
Jerome
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2012-10-18 17:04 ` Jerome Glisse
@ 2014-03-20 23:55 ` Dave Airlie
2014-03-21 8:27 ` Thomas Hellstrom
0 siblings, 1 reply; 15+ messages in thread
From: Dave Airlie @ 2014-03-20 23:55 UTC (permalink / raw)
To: Jerome Glisse; +Cc: dri-devel
On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse <j.glisse@gmail.com> wrote:
> On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
>> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
>> >Op 18-10-12 13:55, Thomas Hellstrom schreef:
>> >>On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>> >>>Op 18-10-12 13:02, Thomas Hellstrom schreef:
>> >>>>On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>> >>>>>Hey,
>> >>>>>
>> >>>>>Op 18-10-12 09:59, Thomas Hellstrom schreef:
>> >>>>>>On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>> >>>>>>>Hi, Maarten,
>> >>>>>>>
>> >>>>>>>As you know I have been having my doubts about this change.
>> >>>>>>>To me it seems insane to be forced to read the fence pointer under a
>> >>>>>>>reserved lock, simply because when you take the reserve lock, another
>> >>>>>>>process may have it and there is a substantial chance that that process
>> >>>>>>>will also be waiting for idle while holding the reserve lock.
>> >>>>>I think it makes perfect sense, the only times you want to read the fence
>> >>>>>is when you want to change the members protected by the reservation.
>> >>>>No, that's not true. A typical case (or the only case)
>> >>>>is where you want to do a map with no_wait semantics. You will want
>> >>>>to be able to access a buffer's results even if the eviction code
>> >>>>is about to schedule an unbind from the GPU, and have the buffer
>> >>>>reserved?
>> >>>Well either block on reserve or return -EBUSY if reserved, presumably the
>> >>>former..
>> >>>
>> >>>ttm_bo_vm_fault does the latter already, anyway
>> >>ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
>> >>a waiting reserve but for different reasons. Typically a user-space app will keep
>> >>asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
>> >>be long as user-space apps keep caches.
>> >>That's not the same as accessing a buffer after the GPU is done with it.
>> >Ah indeed.
>> >>>You don't need to hold the reservation while performing the wait itself though,
>> >>>you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>> >>>take a ref to the sync_obj member and then wait after unreserving. You won't
>> >>>reset sync_obj member to NULL in that case, but that should be harmless.
>> >>>This will allow you to keep the reservations fast and short. Maybe a helper
>> >>>would be appropriate for this since radeon and nouveau both seem to do this.
>> >>>
>> >>The problem is that as long as other users are waiting for idle with reservation
>> >>held, for example to switch GPU engine or to delete a GPU bind, waiting
>> >>for reserve will in many case mean wait for GPU.
>> >This example sounds inefficient, I know nouveau can do this, but this essentially
>> >just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
>> >It wouldn't surprise me if performance in nouveau could be improved simply by
>> >fixing those cases up instead, since it stalls the application completely too for other uses.
>> >
>> Please see the Nouveau cpu_prep patch as well.
>>
>> While there are a number of cases that can be fixed up, also in
>> Radeon, there's no way around that reservation
>> is a heavyweight lock that, particularly on simpler hardware without
>> support for fence ordering
>> with barriers and / or "semaphores" and accelerated eviction will be
>> held while waiting for idle.
>>
>> As such, it is unsuitable to protect read access to the fence
>> pointer. If the intention is to keep a single fence pointer
>> I think the best solution is to allow reading the fence pointer
>> outside reservation, but make sure this can be done
>> atomically. If the intention is to protect an array or list of fence
>> pointers, I think a spinlock is the better solution.
>>
>> /Thomas
>
> Just wanted to point out that like Thomas i am concern about having to
> have object reserved when waiting on its associated fence. I fear it
> will hurt us somehow.
>
> I will try to spend couple days stress testing your branch on radeon
> trying to see if it hurts performance anyhow with current use case.
>
I've been trying to figure out what to do with Maarten's patches going
forward since they are essential for all kinds of SoC people,
However I'm still not 100% sure I believe either the fact that the
problem is anything more than a microoptimisation, and possibly a
premature one at that, this needs someone to start suggesting
benchmarks we can run and a driver set to run them on, otherwise I'm
starting to tend towards we are taking about an optimisation we can
fix later,
The other option is to somehow merge this stuff under an option that
allows us to test it using a command line option, but I don't think
that is sane either,
So Maarten, Jerome, Thomas, please start discussing actual real world
tests you think merging this code will affect and then I can make a
better consideration.
Dave.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2014-03-20 23:55 ` Dave Airlie
@ 2014-03-21 8:27 ` Thomas Hellstrom
2014-03-21 12:12 ` Maarten Lankhorst
0 siblings, 1 reply; 15+ messages in thread
From: Thomas Hellstrom @ 2014-03-21 8:27 UTC (permalink / raw)
To: Dave Airlie; +Cc: dri-devel
On 03/21/2014 12:55 AM, Dave Airlie wrote:
> On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse <j.glisse@gmail.com> wrote:
>> On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
>>> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
>>>> Op 18-10-12 13:55, Thomas Hellstrom schreef:
>>>>> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>>>>>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>>>>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>>>>>> Hey,
>>>>>>>>
>>>>>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>>>>>> Hi, Maarten,
>>>>>>>>>>
>>>>>>>>>> As you know I have been having my doubts about this change.
>>>>>>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>>>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>>>>>>> process may have it and there is a substantial chance that that process
>>>>>>>>>> will also be waiting for idle while holding the reserve lock.
>>>>>>>> I think it makes perfect sense, the only times you want to read the fence
>>>>>>>> is when you want to change the members protected by the reservation.
>>>>>>> No, that's not true. A typical case (or the only case)
>>>>>>> is where you want to do a map with no_wait semantics. You will want
>>>>>>> to be able to access a buffer's results even if the eviction code
>>>>>>> is about to schedule an unbind from the GPU, and have the buffer
>>>>>>> reserved?
>>>>>> Well either block on reserve or return -EBUSY if reserved, presumably the
>>>>>> former..
>>>>>>
>>>>>> ttm_bo_vm_fault does the latter already, anyway
>>>>> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
>>>>> a waiting reserve but for different reasons. Typically a user-space app will keep
>>>>> asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
>>>>> be long as user-space apps keep caches.
>>>>> That's not the same as accessing a buffer after the GPU is done with it.
>>>> Ah indeed.
>>>>>> You don't need to hold the reservation while performing the wait itself though,
>>>>>> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>>>>>> take a ref to the sync_obj member and then wait after unreserving. You won't
>>>>>> reset sync_obj member to NULL in that case, but that should be harmless.
>>>>>> This will allow you to keep the reservations fast and short. Maybe a helper
>>>>>> would be appropriate for this since radeon and nouveau both seem to do this.
>>>>>>
>>>>> The problem is that as long as other users are waiting for idle with reservation
>>>>> held, for example to switch GPU engine or to delete a GPU bind, waiting
>>>>> for reserve will in many case mean wait for GPU.
>>>> This example sounds inefficient, I know nouveau can do this, but this essentially
>>>> just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
>>>> It wouldn't surprise me if performance in nouveau could be improved simply by
>>>> fixing those cases up instead, since it stalls the application completely too for other uses.
>>>>
>>> Please see the Nouveau cpu_prep patch as well.
>>>
>>> While there are a number of cases that can be fixed up, also in
>>> Radeon, there's no way around that reservation
>>> is a heavyweight lock that, particularly on simpler hardware without
>>> support for fence ordering
>>> with barriers and / or "semaphores" and accelerated eviction will be
>>> held while waiting for idle.
>>>
>>> As such, it is unsuitable to protect read access to the fence
>>> pointer. If the intention is to keep a single fence pointer
>>> I think the best solution is to allow reading the fence pointer
>>> outside reservation, but make sure this can be done
>>> atomically. If the intention is to protect an array or list of fence
>>> pointers, I think a spinlock is the better solution.
>>>
>>> /Thomas
>> Just wanted to point out that like Thomas i am concern about having to
>> have object reserved when waiting on its associated fence. I fear it
>> will hurt us somehow.
>>
>> I will try to spend couple days stress testing your branch on radeon
>> trying to see if it hurts performance anyhow with current use case.
>>
> I've been trying to figure out what to do with Maarten's patches going
> forward since they are essential for all kinds of SoC people,
>
> However I'm still not 100% sure I believe either the fact that the
> problem is anything more than a microoptimisation, and possibly a
> premature one at that, this needs someone to start suggesting
> benchmarks we can run and a driver set to run them on, otherwise I'm
> starting to tend towards we are taking about an optimisation we can
> fix later,
>
> The other option is to somehow merge this stuff under an option that
> allows us to test it using a command line option, but I don't think
> that is sane either,
>
> So Maarten, Jerome, Thomas, please start discussing actual real world
> tests you think merging this code will affect and then I can make a
> better consideration.
>
> Dave.
Dave,
This is IMHO a fundamental design discussion, not a micro-optimization.
I'm pretty sure all sorts of horrendous things could be done to the DRM
design without affecting real world application performance.
In TTM data protection is primarily done with spinlocks: This serves two
purposes.
a) You can't unnecessarily hold a data protection lock while waiting for
GPU, which is typically a very stupid thing to do (perhaps not so in
this particular case) but when the sleep while atomic or locking
inversion kernel warning turns up, that should at least
ring a bell. Trying to implement dma-buf fencing using the TTM fencing
callbacks will AFAICT cause a locking inversion.
b) Spinlocks essentially go away on UP systems. The whole reservation
path was essentially lock-free on UP systems pre dma-buf integration,
and with very few atomic locking operations even on SMP systems. It was
all prompted by a test some years ago (by Eric Anholt IIRC) showing that
locking operations turned up quite high on Intel driver profiling.
If we start protecting data with reservations, when we also export the
reservation locks, so that people find them a convenient "serialize work
on this buffer" lock, all kind of interesting things might happen, and
we might end up in a situation
similar to protecting everything with struct_mutex.
This is why I dislike this change. In particular when there is a very
simple remedy.
But if I can get an ACK to convert the reservation object fence pointers
and associated operations on them to be rcu-safe when I have some time
left, I'd be prepared to accept this patch series in it's current state.
/Thomas
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2014-03-21 8:27 ` Thomas Hellstrom
@ 2014-03-21 12:12 ` Maarten Lankhorst
2014-03-21 13:04 ` Thomas Hellstrom
0 siblings, 1 reply; 15+ messages in thread
From: Maarten Lankhorst @ 2014-03-21 12:12 UTC (permalink / raw)
To: Thomas Hellstrom, Dave Airlie; +Cc: Daniel Vetter, dri-devel
Hey,
op 21-03-14 09:27, Thomas Hellstrom schreef:
> On 03/21/2014 12:55 AM, Dave Airlie wrote:
>> On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse <j.glisse@gmail.com> wrote:
>>> On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
>>>> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
>>>>> Op 18-10-12 13:55, Thomas Hellstrom schreef:
>>>>>> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>>>>>>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>>>>>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>>>>>>> Hey,
>>>>>>>>>
>>>>>>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>>>>>>> Hi, Maarten,
>>>>>>>>>>>
>>>>>>>>>>> As you know I have been having my doubts about this change.
>>>>>>>>>>> To me it seems insane to be forced to read the fence pointer under a
>>>>>>>>>>> reserved lock, simply because when you take the reserve lock, another
>>>>>>>>>>> process may have it and there is a substantial chance that that process
>>>>>>>>>>> will also be waiting for idle while holding the reserve lock.
>>>>>>>>> I think it makes perfect sense, the only times you want to read the fence
>>>>>>>>> is when you want to change the members protected by the reservation.
>>>>>>>> No, that's not true. A typical case (or the only case)
>>>>>>>> is where you want to do a map with no_wait semantics. You will want
>>>>>>>> to be able to access a buffer's results even if the eviction code
>>>>>>>> is about to schedule an unbind from the GPU, and have the buffer
>>>>>>>> reserved?
>>>>>>> Well either block on reserve or return -EBUSY if reserved, presumably the
>>>>>>> former..
>>>>>>>
>>>>>>> ttm_bo_vm_fault does the latter already, anyway
>>>>>> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really
>>>>>> a waiting reserve but for different reasons. Typically a user-space app will keep
>>>>>> asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may
>>>>>> be long as user-space apps keep caches.
>>>>>> That's not the same as accessing a buffer after the GPU is done with it.
>>>>> Ah indeed.
>>>>>>> You don't need to hold the reservation while performing the wait itself though,
>>>>>>> you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so
>>>>>>> take a ref to the sync_obj member and then wait after unreserving. You won't
>>>>>>> reset sync_obj member to NULL in that case, but that should be harmless.
>>>>>>> This will allow you to keep the reservations fast and short. Maybe a helper
>>>>>>> would be appropriate for this since radeon and nouveau both seem to do this.
>>>>>>>
>>>>>> The problem is that as long as other users are waiting for idle with reservation
>>>>>> held, for example to switch GPU engine or to delete a GPU bind, waiting
>>>>>> for reserve will in many case mean wait for GPU.
>>>>> This example sounds inefficient, I know nouveau can do this, but this essentially
>>>>> just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close?
>>>>> It wouldn't surprise me if performance in nouveau could be improved simply by
>>>>> fixing those cases up instead, since it stalls the application completely too for other uses.
>>>>>
>>>> Please see the Nouveau cpu_prep patch as well.
>>>>
>>>> While there are a number of cases that can be fixed up, also in
>>>> Radeon, there's no way around that reservation
>>>> is a heavyweight lock that, particularly on simpler hardware without
>>>> support for fence ordering
>>>> with barriers and / or "semaphores" and accelerated eviction will be
>>>> held while waiting for idle.
>>>>
>>>> As such, it is unsuitable to protect read access to the fence
>>>> pointer. If the intention is to keep a single fence pointer
>>>> I think the best solution is to allow reading the fence pointer
>>>> outside reservation, but make sure this can be done
>>>> atomically. If the intention is to protect an array or list of fence
>>>> pointers, I think a spinlock is the better solution.
>>>>
>>>> /Thomas
>>> Just wanted to point out that like Thomas i am concern about having to
>>> have object reserved when waiting on its associated fence. I fear it
>>> will hurt us somehow.
>>>
>>> I will try to spend couple days stress testing your branch on radeon
>>> trying to see if it hurts performance anyhow with current use case.
>>>
>> I've been trying to figure out what to do with Maarten's patches going
>> forward since they are essential for all kinds of SoC people,
>>
>> However I'm still not 100% sure I believe either the fact that the
>> problem is anything more than a microoptimisation, and possibly a
>> premature one at that, this needs someone to start suggesting
>> benchmarks we can run and a driver set to run them on, otherwise I'm
>> starting to tend towards we are taking about an optimisation we can
>> fix later,
>>
>> The other option is to somehow merge this stuff under an option that
>> allows us to test it using a command line option, but I don't think
>> that is sane either,
>>
>> So Maarten, Jerome, Thomas, please start discussing actual real world
>> tests you think merging this code will affect and then I can make a
>> better consideration.
>>
>> Dave.
> Dave,
>
> This is IMHO a fundamental design discussion, not a micro-optimization.
>
> I'm pretty sure all sorts of horrendous things could be done to the DRM
> design without affecting real world application performance.
>
> In TTM data protection is primarily done with spinlocks: This serves two
> purposes.
>
> a) You can't unnecessarily hold a data protection lock while waiting for
> GPU, which is typically a very stupid thing to do (perhaps not so in
> this particular case) but when the sleep while atomic or locking
> inversion kernel warning turns up, that should at least
> ring a bell. Trying to implement dma-buf fencing using the TTM fencing
> callbacks will AFAICT cause a locking inversion.
>
> b) Spinlocks essentially go away on UP systems. The whole reservation
> path was essentially lock-free on UP systems pre dma-buf integration,
> and with very few atomic locking operations even on SMP systems. It was
> all prompted by a test some years ago (by Eric Anholt IIRC) showing that
> locking operations turned up quite high on Intel driver profiling.
>
> If we start protecting data with reservations, when we also export the
> reservation locks, so that people find them a convenient "serialize work
> on this buffer" lock, all kind of interesting things might happen, and
> we might end up in a situation
> similar to protecting everything with struct_mutex.
>
> This is why I dislike this change. In particular when there is a very
> simple remedy.
>
> But if I can get an ACK to convert the reservation object fence pointers
> and associated operations on them to be rcu-safe when I have some time
> left, I'd be prepared to accept this patch series in it's current state.
RCU could be a useful way to deal with this. But in my case I've shown there are very few places where it's needed, core ttm does not need it at all.
This is why I wanted to leave it to individual drivers to implement it.
I think kfree_rcu for free in the driver itself should be enough,
and obtaining in the driver would require something like this, similar to whats in rcuref.txt:
rcu_read_lock();
f = rcu_dereference(bo->fence);
if (f && !kref_get_unless-zero(&f->kref)) f = NULL;
rcu_read_unlock();
if (f) {
// do stuff here
...
// free fence
kref_put(&f->kref, fence_put_with_kfree_rcu);
}
Am I wrong or is something like this enough to make sure fence is still alive?
There might still be a small bug when bo->fence's refcount is decreased before bo->fence is set to null. I haven't checked core ttm so that might need fixing.
I added some more people to CC. It might be worthwhile having this in the core fence code and delete all fences with rcu, but I'm not completely certain about that yet.
~Maarten
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2014-03-21 12:12 ` Maarten Lankhorst
@ 2014-03-21 13:04 ` Thomas Hellstrom
2014-03-25 14:23 ` Maarten Lankhorst
0 siblings, 1 reply; 15+ messages in thread
From: Thomas Hellstrom @ 2014-03-21 13:04 UTC (permalink / raw)
To: Maarten Lankhorst; +Cc: Daniel Vetter, dri-devel
On 03/21/2014 01:12 PM, Maarten Lankhorst wrote:
> Hey,
>
> op 21-03-14 09:27, Thomas Hellstrom schreef:
>> On 03/21/2014 12:55 AM, Dave Airlie wrote:
>>> On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse <j.glisse@gmail.com>
>>> wrote:
>>>> On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
>>>>> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
>>>>>> Op 18-10-12 13:55, Thomas Hellstrom schreef:
>>>>>>> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>>>>>>>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>>>>>>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>>>>>>>> Hey,
>>>>>>>>>>
>>>>>>>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>>>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>>>>>>>> Hi, Maarten,
>>>>>>>>>>>>
>>>>>>>>>>>> As you know I have been having my doubts about this change.
>>>>>>>>>>>> To me it seems insane to be forced to read the fence
>>>>>>>>>>>> pointer under a
>>>>>>>>>>>> reserved lock, simply because when you take the reserve
>>>>>>>>>>>> lock, another
>>>>>>>>>>>> process may have it and there is a substantial chance that
>>>>>>>>>>>> that process
>>>>>>>>>>>> will also be waiting for idle while holding the reserve lock.
>>>>>>>>>> I think it makes perfect sense, the only times you want to
>>>>>>>>>> read the fence
>>>>>>>>>> is when you want to change the members protected by the
>>>>>>>>>> reservation.
>>>>>>>>> No, that's not true. A typical case (or the only case)
>>>>>>>>> is where you want to do a map with no_wait semantics. You will
>>>>>>>>> want
>>>>>>>>> to be able to access a buffer's results even if the eviction code
>>>>>>>>> is about to schedule an unbind from the GPU, and have the buffer
>>>>>>>>> reserved?
>>>>>>>> Well either block on reserve or return -EBUSY if reserved,
>>>>>>>> presumably the
>>>>>>>> former..
>>>>>>>>
>>>>>>>> ttm_bo_vm_fault does the latter already, anyway
>>>>>>> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem,
>>>>>>> it's really
>>>>>>> a waiting reserve but for different reasons. Typically a
>>>>>>> user-space app will keep
>>>>>>> asynchronous maps to TTM during a buffer lifetime, and the
>>>>>>> buffer lifetime may
>>>>>>> be long as user-space apps keep caches.
>>>>>>> That's not the same as accessing a buffer after the GPU is done
>>>>>>> with it.
>>>>>> Ah indeed.
>>>>>>>> You don't need to hold the reservation while performing the
>>>>>>>> wait itself though,
>>>>>>>> you could check if ttm_bo_wait(no_wait_gpu = true) returns
>>>>>>>> -EBUSY, and if so
>>>>>>>> take a ref to the sync_obj member and then wait after
>>>>>>>> unreserving. You won't
>>>>>>>> reset sync_obj member to NULL in that case, but that should be
>>>>>>>> harmless.
>>>>>>>> This will allow you to keep the reservations fast and short.
>>>>>>>> Maybe a helper
>>>>>>>> would be appropriate for this since radeon and nouveau both
>>>>>>>> seem to do this.
>>>>>>>>
>>>>>>> The problem is that as long as other users are waiting for idle
>>>>>>> with reservation
>>>>>>> held, for example to switch GPU engine or to delete a GPU bind,
>>>>>>> waiting
>>>>>>> for reserve will in many case mean wait for GPU.
>>>>>> This example sounds inefficient, I know nouveau can do this, but
>>>>>> this essentially
>>>>>> just stalls the gpu entirely. I think guess you mean functions
>>>>>> like nouveau_gem_object_close?
>>>>>> It wouldn't surprise me if performance in nouveau could be
>>>>>> improved simply by
>>>>>> fixing those cases up instead, since it stalls the application
>>>>>> completely too for other uses.
>>>>>>
>>>>> Please see the Nouveau cpu_prep patch as well.
>>>>>
>>>>> While there are a number of cases that can be fixed up, also in
>>>>> Radeon, there's no way around that reservation
>>>>> is a heavyweight lock that, particularly on simpler hardware without
>>>>> support for fence ordering
>>>>> with barriers and / or "semaphores" and accelerated eviction will be
>>>>> held while waiting for idle.
>>>>>
>>>>> As such, it is unsuitable to protect read access to the fence
>>>>> pointer. If the intention is to keep a single fence pointer
>>>>> I think the best solution is to allow reading the fence pointer
>>>>> outside reservation, but make sure this can be done
>>>>> atomically. If the intention is to protect an array or list of fence
>>>>> pointers, I think a spinlock is the better solution.
>>>>>
>>>>> /Thomas
>>>> Just wanted to point out that like Thomas i am concern about having to
>>>> have object reserved when waiting on its associated fence. I fear it
>>>> will hurt us somehow.
>>>>
>>>> I will try to spend couple days stress testing your branch on radeon
>>>> trying to see if it hurts performance anyhow with current use case.
>>>>
>>> I've been trying to figure out what to do with Maarten's patches going
>>> forward since they are essential for all kinds of SoC people,
>>>
>>> However I'm still not 100% sure I believe either the fact that the
>>> problem is anything more than a microoptimisation, and possibly a
>>> premature one at that, this needs someone to start suggesting
>>> benchmarks we can run and a driver set to run them on, otherwise I'm
>>> starting to tend towards we are taking about an optimisation we can
>>> fix later,
>>>
>>> The other option is to somehow merge this stuff under an option that
>>> allows us to test it using a command line option, but I don't think
>>> that is sane either,
>>>
>>> So Maarten, Jerome, Thomas, please start discussing actual real world
>>> tests you think merging this code will affect and then I can make a
>>> better consideration.
>>>
>>> Dave.
>> Dave,
>>
>> This is IMHO a fundamental design discussion, not a micro-optimization.
>>
>> I'm pretty sure all sorts of horrendous things could be done to the DRM
>> design without affecting real world application performance.
>>
>> In TTM data protection is primarily done with spinlocks: This serves two
>> purposes.
>>
>> a) You can't unnecessarily hold a data protection lock while waiting for
>> GPU, which is typically a very stupid thing to do (perhaps not so in
>> this particular case) but when the sleep while atomic or locking
>> inversion kernel warning turns up, that should at least
>> ring a bell. Trying to implement dma-buf fencing using the TTM fencing
>> callbacks will AFAICT cause a locking inversion.
>>
>> b) Spinlocks essentially go away on UP systems. The whole reservation
>> path was essentially lock-free on UP systems pre dma-buf integration,
>> and with very few atomic locking operations even on SMP systems. It was
>> all prompted by a test some years ago (by Eric Anholt IIRC) showing that
>> locking operations turned up quite high on Intel driver profiling.
>>
>> If we start protecting data with reservations, when we also export the
>> reservation locks, so that people find them a convenient "serialize work
>> on this buffer" lock, all kind of interesting things might happen, and
>> we might end up in a situation
>> similar to protecting everything with struct_mutex.
>>
>> This is why I dislike this change. In particular when there is a very
>> simple remedy.
>>
>> But if I can get an ACK to convert the reservation object fence pointers
>> and associated operations on them to be rcu-safe when I have some time
>> left, I'd be prepared to accept this patch series in it's current state.
> RCU could be a useful way to deal with this. But in my case I've shown
> there are very few places where it's needed, core ttm does not need it
> at all.
> This is why I wanted to leave it to individual drivers to implement it.
>
> I think kfree_rcu for free in the driver itself should be enough,
> and obtaining in the driver would require something like this, similar
> to whats in rcuref.txt:
>
> rcu_read_lock();
> f = rcu_dereference(bo->fence);
> if (f && !kref_get_unless-zero(&f->kref)) f = NULL;
> rcu_read_unlock();
>
> if (f) {
> // do stuff here
> ...
>
> // free fence
> kref_put(&f->kref, fence_put_with_kfree_rcu);
> }
>
> Am I wrong or is something like this enough to make sure fence is
> still alive?
No, you're correct.
And a bo check for idle would amount to (since we don't care if the
fence refcount is zero).
rcu_read_lock()
f = rcu_dereference(bo->fence);
signaled = !f || f->signaled;
rcu_read_unlock().
/Thomas
> There might still be a small bug when bo->fence's refcount is
> decreased before bo->fence is set to null. I haven't checked core ttm
> so that might need fixing.
>
> I added some more people to CC. It might be worthwhile having this in
> the core fence code and delete all fences with rcu, but I'm not
> completely certain about that yet.
>
> ~Maarten
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] drm/ttm: remove fence_lock
2014-03-21 13:04 ` Thomas Hellstrom
@ 2014-03-25 14:23 ` Maarten Lankhorst
0 siblings, 0 replies; 15+ messages in thread
From: Maarten Lankhorst @ 2014-03-25 14:23 UTC (permalink / raw)
To: Thomas Hellstrom; +Cc: Daniel Vetter, dri-devel
Hey,
op 21-03-14 14:04, Thomas Hellstrom schreef:
> On 03/21/2014 01:12 PM, Maarten Lankhorst wrote:
>> Hey,
>>
>> op 21-03-14 09:27, Thomas Hellstrom schreef:
>>> On 03/21/2014 12:55 AM, Dave Airlie wrote:
>>>> On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse <j.glisse@gmail.com>
>>>> wrote:
>>>>> On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
>>>>>> On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
>>>>>>> Op 18-10-12 13:55, Thomas Hellstrom schreef:
>>>>>>>> On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
>>>>>>>>> Op 18-10-12 13:02, Thomas Hellstrom schreef:
>>>>>>>>>> On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
>>>>>>>>>>> Hey,
>>>>>>>>>>>
>>>>>>>>>>> Op 18-10-12 09:59, Thomas Hellstrom schreef:
>>>>>>>>>>>> On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
>>>>>>>>>>>>> Hi, Maarten,
>>>>>>>>>>>>>
>>>>>>>>>>>>> As you know I have been having my doubts about this change.
>>>>>>>>>>>>> To me it seems insane to be forced to read the fence
>>>>>>>>>>>>> pointer under a
>>>>>>>>>>>>> reserved lock, simply because when you take the reserve
>>>>>>>>>>>>> lock, another
>>>>>>>>>>>>> process may have it and there is a substantial chance that
>>>>>>>>>>>>> that process
>>>>>>>>>>>>> will also be waiting for idle while holding the reserve lock.
>>>>>>>>>>> I think it makes perfect sense, the only times you want to
>>>>>>>>>>> read the fence
>>>>>>>>>>> is when you want to change the members protected by the
>>>>>>>>>>> reservation.
>>>>>>>>>> No, that's not true. A typical case (or the only case)
>>>>>>>>>> is where you want to do a map with no_wait semantics. You will
>>>>>>>>>> want
>>>>>>>>>> to be able to access a buffer's results even if the eviction code
>>>>>>>>>> is about to schedule an unbind from the GPU, and have the buffer
>>>>>>>>>> reserved?
>>>>>>>>> Well either block on reserve or return -EBUSY if reserved,
>>>>>>>>> presumably the
>>>>>>>>> former..
>>>>>>>>>
>>>>>>>>> ttm_bo_vm_fault does the latter already, anyway
>>>>>>>> ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem,
>>>>>>>> it's really
>>>>>>>> a waiting reserve but for different reasons. Typically a
>>>>>>>> user-space app will keep
>>>>>>>> asynchronous maps to TTM during a buffer lifetime, and the
>>>>>>>> buffer lifetime may
>>>>>>>> be long as user-space apps keep caches.
>>>>>>>> That's not the same as accessing a buffer after the GPU is done
>>>>>>>> with it.
>>>>>>> Ah indeed.
>>>>>>>>> You don't need to hold the reservation while performing the
>>>>>>>>> wait itself though,
>>>>>>>>> you could check if ttm_bo_wait(no_wait_gpu = true) returns
>>>>>>>>> -EBUSY, and if so
>>>>>>>>> take a ref to the sync_obj member and then wait after
>>>>>>>>> unreserving. You won't
>>>>>>>>> reset sync_obj member to NULL in that case, but that should be
>>>>>>>>> harmless.
>>>>>>>>> This will allow you to keep the reservations fast and short.
>>>>>>>>> Maybe a helper
>>>>>>>>> would be appropriate for this since radeon and nouveau both
>>>>>>>>> seem to do this.
>>>>>>>>>
>>>>>>>> The problem is that as long as other users are waiting for idle
>>>>>>>> with reservation
>>>>>>>> held, for example to switch GPU engine or to delete a GPU bind,
>>>>>>>> waiting
>>>>>>>> for reserve will in many case mean wait for GPU.
>>>>>>> This example sounds inefficient, I know nouveau can do this, but
>>>>>>> this essentially
>>>>>>> just stalls the gpu entirely. I think guess you mean functions
>>>>>>> like nouveau_gem_object_close?
>>>>>>> It wouldn't surprise me if performance in nouveau could be
>>>>>>> improved simply by
>>>>>>> fixing those cases up instead, since it stalls the application
>>>>>>> completely too for other uses.
>>>>>>>
>>>>>> Please see the Nouveau cpu_prep patch as well.
>>>>>>
>>>>>> While there are a number of cases that can be fixed up, also in
>>>>>> Radeon, there's no way around that reservation
>>>>>> is a heavyweight lock that, particularly on simpler hardware without
>>>>>> support for fence ordering
>>>>>> with barriers and / or "semaphores" and accelerated eviction will be
>>>>>> held while waiting for idle.
>>>>>>
>>>>>> As such, it is unsuitable to protect read access to the fence
>>>>>> pointer. If the intention is to keep a single fence pointer
>>>>>> I think the best solution is to allow reading the fence pointer
>>>>>> outside reservation, but make sure this can be done
>>>>>> atomically. If the intention is to protect an array or list of fence
>>>>>> pointers, I think a spinlock is the better solution.
>>>>>>
>>>>>> /Thomas
>>>>> Just wanted to point out that like Thomas i am concern about having to
>>>>> have object reserved when waiting on its associated fence. I fear it
>>>>> will hurt us somehow.
>>>>>
>>>>> I will try to spend couple days stress testing your branch on radeon
>>>>> trying to see if it hurts performance anyhow with current use case.
>>>>>
>>>> I've been trying to figure out what to do with Maarten's patches going
>>>> forward since they are essential for all kinds of SoC people,
>>>>
>>>> However I'm still not 100% sure I believe either the fact that the
>>>> problem is anything more than a microoptimisation, and possibly a
>>>> premature one at that, this needs someone to start suggesting
>>>> benchmarks we can run and a driver set to run them on, otherwise I'm
>>>> starting to tend towards we are taking about an optimisation we can
>>>> fix later,
>>>>
>>>> The other option is to somehow merge this stuff under an option that
>>>> allows us to test it using a command line option, but I don't think
>>>> that is sane either,
>>>>
>>>> So Maarten, Jerome, Thomas, please start discussing actual real world
>>>> tests you think merging this code will affect and then I can make a
>>>> better consideration.
>>>>
>>>> Dave.
>>> Dave,
>>>
>>> This is IMHO a fundamental design discussion, not a micro-optimization.
>>>
>>> I'm pretty sure all sorts of horrendous things could be done to the DRM
>>> design without affecting real world application performance.
>>>
>>> In TTM data protection is primarily done with spinlocks: This serves two
>>> purposes.
>>>
>>> a) You can't unnecessarily hold a data protection lock while waiting for
>>> GPU, which is typically a very stupid thing to do (perhaps not so in
>>> this particular case) but when the sleep while atomic or locking
>>> inversion kernel warning turns up, that should at least
>>> ring a bell. Trying to implement dma-buf fencing using the TTM fencing
>>> callbacks will AFAICT cause a locking inversion.
>>>
>>> b) Spinlocks essentially go away on UP systems. The whole reservation
>>> path was essentially lock-free on UP systems pre dma-buf integration,
>>> and with very few atomic locking operations even on SMP systems. It was
>>> all prompted by a test some years ago (by Eric Anholt IIRC) showing that
>>> locking operations turned up quite high on Intel driver profiling.
>>>
>>> If we start protecting data with reservations, when we also export the
>>> reservation locks, so that people find them a convenient "serialize work
>>> on this buffer" lock, all kind of interesting things might happen, and
>>> we might end up in a situation
>>> similar to protecting everything with struct_mutex.
>>>
>>> This is why I dislike this change. In particular when there is a very
>>> simple remedy.
>>>
>>> But if I can get an ACK to convert the reservation object fence pointers
>>> and associated operations on them to be rcu-safe when I have some time
>>> left, I'd be prepared to accept this patch series in it's current state.
>> RCU could be a useful way to deal with this. But in my case I've shown
>> there are very few places where it's needed, core ttm does not need it
>> at all.
>> This is why I wanted to leave it to individual drivers to implement it.
>>
>> I think kfree_rcu for free in the driver itself should be enough,
>> and obtaining in the driver would require something like this, similar
>> to whats in rcuref.txt:
>>
>> rcu_read_lock();
>> f = rcu_dereference(bo->fence);
>> if (f && !kref_get_unless-zero(&f->kref)) f = NULL;
>> rcu_read_unlock();
>>
>> if (f) {
>> // do stuff here
>> ...
>>
>> // free fence
>> kref_put(&f->kref, fence_put_with_kfree_rcu);
>> }
>>
>> Am I wrong or is something like this enough to make sure fence is
>> still alive?
> No, you're correct.
>
> And a bo check for idle would amount to (since we don't care if the
> fence refcount is zero).
>
> rcu_read_lock()
> f = rcu_dereference(bo->fence);
> signaled = !f || f->signaled;
> rcu_read_unlock().
>
> /Thomas
>
This is very easy to implement when there is only 1 fence slot, bo->fence being equal to reservation_object->fence_excl.
It appears to break down when slightly when I implement it on top of shared fences.
My diff is against git://git.linaro.org/people/sumit.semwal/linux-3.x.git for-next-fences
shared_max_fence is held in reservation_object to prevent the reallocation in reservation_object_reserve_shared_fence
every time the same reservation_object gets > 4 shared fences; if it happens once, it's likely going to happen again on the same object.
Anyhow does the below look sane to you? This has only been tested by my compiler, I haven't checked if this boots.
---
commit c73b87d33fd08f7c1b0a381b08b3626128f8f3b8
Author: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Date: Tue Mar 25 15:10:21 2014 +0100
add rcu to fence HACK WIP DO NOT USE KILLS YOUR POT PLANTS PETS AND FAMILY
diff --git a/drivers/base/dma-buf.c b/drivers/base/dma-buf.c
index 96338bf7f457..0a88c10b3db9 100644
--- a/drivers/base/dma-buf.c
+++ b/drivers/base/dma-buf.c
@@ -134,7 +134,10 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll)
{
struct dma_buf *dmabuf;
struct reservation_object *resv;
+ struct reservation_object_shared *shared;
+ struct fence *fence_excl;
unsigned long events;
+ unsigned shared_count;
dmabuf = file->private_data;
if (!dmabuf || !dmabuf->resv)
@@ -148,14 +151,18 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll)
if (!events)
return 0;
- ww_mutex_lock(&resv->lock, NULL);
+ rcu_read_lock();
- if (resv->fence_excl && (!(events & POLLOUT) ||
- resv->fence_shared_count == 0)) {
+ shared = rcu_dereference(resv->shared);
+ fence_excl = rcu_dereference(resv->fence_excl);
+ shared_count = ACCESS_ONCE(shared->count);
+
+ if (fence_excl && (!(events & POLLOUT) ||
+ (!shared || shared_count == 0))) {
struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_excl;
unsigned long pevents = POLLIN;
- if (resv->fence_shared_count == 0)
+ if (!shared || shared_count == 0)
pevents |= POLLOUT;
spin_lock_irq(&dmabuf->poll.lock);
@@ -167,19 +174,26 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll)
spin_unlock_irq(&dmabuf->poll.lock);
if (events & pevents) {
- if (!fence_add_callback(resv->fence_excl,
- &dcb->cb, dma_buf_poll_cb))
+ if (!kref_get_unless_zero(&fence_excl->refcount)) {
+ /* force a recheck */
+ events &= ~pevents;
+ dma_buf_poll_cb(NULL, &dcb->cb);
+ } else if (!fence_add_callback(fence_excl, &dcb->cb,
+ dma_buf_poll_cb)) {
events &= ~pevents;
- else
+ fence_put(fence_excl);
+ } else {
/*
* No callback queued, wake up any additional
* waiters.
*/
+ fence_put(fence_excl);
dma_buf_poll_cb(NULL, &dcb->cb);
+ }
}
}
- if ((events & POLLOUT) && resv->fence_shared_count > 0) {
+ if ((events & POLLOUT) && shared && shared_count > 0) {
struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_shared;
int i;
@@ -194,20 +208,34 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll)
if (!(events & POLLOUT))
goto out;
- for (i = 0; i < resv->fence_shared_count; ++i)
- if (!fence_add_callback(resv->fence_shared[i],
- &dcb->cb, dma_buf_poll_cb)) {
+ for (i = 0; i < shared_count; ++i) {
+ struct fence *fence = ACCESS_ONCE(shared->fence[i]);
+ if (!kref_get_unless_zero(&fence->refcount)) {
+ /*
+ * fence refcount dropped to zero, this means
+ * that the shared object has been freed from under us.
+ * call dma_buf_poll_cb and force a recheck!
+ */
events &= ~POLLOUT;
+ dma_buf_poll_cb(NULL, &dcb->cb);
break;
}
+ if (!fence_add_callback(fence, &dcb->cb,
+ dma_buf_poll_cb)) {
+ fence_put(fence);
+ events &= ~POLLOUT;
+ break;
+ }
+ fence_put(fence);
+ }
/* No callback queued, wake up any additional waiters. */
- if (i == resv->fence_shared_count)
+ if (i == shared_count)
dma_buf_poll_cb(NULL, &dcb->cb);
}
out:
- ww_mutex_unlock(&resv->lock);
+ rcu_read_unlock();
return events;
}
diff --git a/drivers/base/fence.c b/drivers/base/fence.c
index 8fff13fb86cf..be03a9e8ad8b 100644
--- a/drivers/base/fence.c
+++ b/drivers/base/fence.c
@@ -170,7 +170,7 @@ void release_fence(struct kref *kref)
if (fence->ops->release)
fence->ops->release(fence);
else
- kfree(fence);
+ kfree_rcu(fence, rcu);
}
EXPORT_SYMBOL(release_fence);
diff --git a/include/linux/fence.h b/include/linux/fence.h
index 65f2a01ee7e4..d19620405c34 100644
--- a/include/linux/fence.h
+++ b/include/linux/fence.h
@@ -40,6 +40,7 @@ struct fence_cb;
* struct fence - software synchronization primitive
* @refcount: refcount for this fence
* @ops: fence_ops associated with this fence
+ * @rcu: used for releasing fence with kfree_rcu
* @cb_list: list of all callbacks to call
* @lock: spin_lock_irqsave used for locking
* @context: execution context this fence belongs to, returned by
@@ -73,6 +74,7 @@ struct fence_cb;
struct fence {
struct kref refcount;
const struct fence_ops *ops;
+ struct rcu_head rcu;
struct list_head cb_list;
spinlock_t *lock;
unsigned context, seqno;
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index f3f57460a205..91576fabafdb 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -42,6 +42,7 @@
#include <linux/ww_mutex.h>
#include <linux/fence.h>
#include <linux/slab.h>
+#include <linux/rcupdate.h>
extern struct ww_class reservation_ww_class;
@@ -49,8 +50,12 @@ struct reservation_object {
struct ww_mutex lock;
struct fence *fence_excl;
- struct fence **fence_shared;
- u32 fence_shared_count, fence_shared_max;
+ u32 shared_max_fence;
+ struct reservation_object_shared {
+ struct rcu_head rcu;
+ u32 count;
+ struct fence *fence[];
+ } *shared;
};
static inline void
@@ -58,8 +63,8 @@ reservation_object_init(struct reservation_object *obj)
{
ww_mutex_init(&obj->lock, &reservation_ww_class);
- obj->fence_shared_count = obj->fence_shared_max = 0;
- obj->fence_shared = NULL;
+ obj->shared = NULL;
+ obj->shared_max_fence = 4;
obj->fence_excl = NULL;
}
@@ -70,11 +75,97 @@ reservation_object_fini(struct reservation_object *obj)
if (obj->fence_excl)
fence_put(obj->fence_excl);
- for (i = 0; i < obj->fence_shared_count; ++i)
- fence_put(obj->fence_shared[i]);
- kfree(obj->fence_shared);
+ if (obj->shared) {
+ for (i = 0; i < obj->shared->count; ++i)
+ fence_put(obj->shared->fence[i]);
+
+ /*
+ * This object should be dead and all references must have
+ * been released to it, so no need to free with rcu.
+ */
+ kfree(obj->shared);
+ }
ww_mutex_destroy(&obj->lock);
}
+/*
+ * Reserve space to add a shared fence to a reservation_object,
+ * must be called with obj->lock held.
+ */
+static inline int
+reservation_object_reserve_shared_fence(struct reservation_object *obj)
+{
+ struct reservation_object_shared *shared, *old;
+ u32 max = obj->shared_max_fence;
+
+ if (obj->shared) {
+ if (obj->shared->count < max)
+ return 0;
+ max *= 2;
+ }
+
+ shared = kmalloc(offsetof(typeof(*shared), fence[max]), GFP_KERNEL);
+ if (!shared)
+ return -ENOMEM;
+ old = obj->shared;
+
+ if (old) {
+ shared->count = old->count;
+ memcpy(shared->fence, old->fence, old->count * sizeof(*old->fence));
+ } else {
+ shared->count = 0;
+ }
+ rcu_assign_pointer(obj->shared, shared);
+ obj->shared_max_fence = max;
+ kfree_rcu(old, rcu);
+ return 0;
+}
+
+/*
+ * Add a fence to a shared slot, obj->lock must be held, and
+ * reservation_object_reserve_shared_fence has been called.
+ */
+
+static inline void
+reservation_object_add_shared_fence(struct reservation_object *obj,
+ struct fence *fence)
+{
+ unsigned i;
+
+ BUG_ON(obj->shared->count == obj->shared_max_fence);
+ fence_get(fence);
+
+ for (i = 0; i < obj->shared->count; ++i)
+ if (obj->shared->fence[i]->context == fence->context) {
+ struct fence *old = obj->shared->fence[i];
+ rcu_assign_pointer(obj->shared->fence[i], fence);
+ fence_put(old);
+ return;
+ }
+
+ obj->shared->fence[obj->shared->count] = fence;
+ smp_wmb();
+ obj->shared->count++;
+}
+
+/*
+ * May be called after adding an exclusive to wipe all shared fences.
+ */
+
+static inline void
+reservation_object_clear_shared(struct reservation_object *obj)
+{
+ struct reservation_object_shared *old = obj->shared;
+ unsigned i;
+
+ if (!old)
+ return;
+
+ rcu_assign_pointer(obj->shared, NULL);
+ for (i = 0; i < old->count; ++i)
+ fence_put(old->fence[i]);
+ kfree_rcu(old, rcu);
+}
+
#endif /* _LINUX_RESERVATION_H */
^ permalink raw reply related [flat|nested] 15+ messages in thread
end of thread, other threads:[~2014-03-25 14:23 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-10-12 15:23 [PATCH] drm/ttm: remove fence_lock Maarten Lankhorst
2012-10-18 7:28 ` Thomas Hellstrom
2012-10-18 7:59 ` Thomas Hellstrom
2012-10-18 8:37 ` Maarten Lankhorst
2012-10-18 11:02 ` Thomas Hellstrom
2012-10-18 11:38 ` Maarten Lankhorst
2012-10-18 11:55 ` Thomas Hellstrom
2012-10-18 14:45 ` Maarten Lankhorst
2012-10-18 16:43 ` Thomas Hellstrom
2012-10-18 17:04 ` Jerome Glisse
2014-03-20 23:55 ` Dave Airlie
2014-03-21 8:27 ` Thomas Hellstrom
2014-03-21 12:12 ` Maarten Lankhorst
2014-03-21 13:04 ` Thomas Hellstrom
2014-03-25 14:23 ` Maarten Lankhorst
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.