More tweaks

public inbox for intel-gfx@lists.freedesktop.org
 help / color / mirror / Atom feed

* More tweaks
@ 2017-07-26 13:25 Chris Wilson
  2017-07-26 13:25 ` [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages Chris Wilson
                   ` (16 more replies)
  0 siblings, 17 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:25 UTC (permalink / raw)
  To: intel-gfx

All other the shop, sent individually before pulled together for fun.
Some improvements for the shrinker, some improvements for fencing, and
execbuf tweaks.
-Chris

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages
  2017-07-26 13:25 More tweaks Chris Wilson
@ 2017-07-26 13:25 ` Chris Wilson
  2017-08-07 20:15   ` Matthew Auld
                     ` (2 more replies)
  2017-07-26 13:25 ` [PATCH 02/16] drm/i915: Pin fence for iomap Chris Wilson
                   ` (15 subsequent siblings)
  16 siblings, 3 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:25 UTC (permalink / raw)
  To: intel-gfx

We use WC pages for coherent writes into the ppGTT on !llc
architectuures. However, to create a WC page requires a stop_machine(),
i.e. is very slow. To compensate we currently keep a per-vm cache of
recently freed pages, but we still see the slow startup of new contexts.
We can amoritize that cost slightly by allocating WC pages in small
batches (PAGEVEC_SIZE == 14) and since creating a WC page implies a
stop_machine() there is no penalty for keeping that stash global.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h     |   3 +
 drivers/gpu/drm/i915/i915_gem_gtt.c | 121 +++++++++++++++++++++++++++++-------
 2 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2c7456f4ed38..fd62be74a422 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1464,6 +1464,9 @@ struct i915_gem_mm {
 	struct llist_head free_list;
 	struct work_struct free_work;
 
+	/** Small stash of WC pages */
+	struct pagevec wc_stash;
+
 	/** Usable portion of the GTT for GEM */
 	dma_addr_t stolen_base; /* limited to low memory (32-bit) */
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 10aa7762d9a6..ad4e48435853 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -351,39 +351,85 @@ static gen6_pte_t iris_pte_encode(dma_addr_t addr,
 
 static struct page *vm_alloc_page(struct i915_address_space *vm, gfp_t gfp)
 {
-	struct page *page;
+	struct pagevec *pvec = &vm->free_pages;
 
 	if (I915_SELFTEST_ONLY(should_fail(&vm->fault_attr, 1)))
 		i915_gem_shrink_all(vm->i915);
 
-	if (vm->free_pages.nr)
-		return vm->free_pages.pages[--vm->free_pages.nr];
+	if (likely(pvec->nr))
+		return pvec->pages[--pvec->nr];
+
+	if (!vm->pt_kmap_wc)
+		return alloc_page(gfp);
+
+	lockdep_assert_held(&vm->i915->drm.struct_mutex);
+
+	/* Look in our global stash of WC pages... */
+	pvec = &vm->i915->mm.wc_stash;
+	if (likely(pvec->nr))
+		return pvec->pages[--pvec->nr];
 
-	page = alloc_page(gfp);
-	if (!page)
+	/* Otherwise batch allocate pages to amoritize cost of set_pages_wc. */
+	do {
+		struct page *page;
+
+		page = alloc_page(gfp);
+		if (unlikely(!page))
+			break;
+
+		pvec->pages[pvec->nr++] = page;
+	} while (pagevec_space(pvec));
+
+	if (unlikely(!pvec->nr))
 		return NULL;
 
-	if (vm->pt_kmap_wc)
-		set_pages_array_wc(&page, 1);
+	set_pages_array_wc(pvec->pages, pvec->nr);
 
-	return page;
+	return pvec->pages[--pvec->nr];
 }
 
-static void vm_free_pages_release(struct i915_address_space *vm)
+static void vm_free_pages_release(struct i915_address_space *vm,
+				  bool immediate)
 {
-	GEM_BUG_ON(!pagevec_count(&vm->free_pages));
+	struct pagevec *pvec = &vm->free_pages;
+
+	GEM_BUG_ON(!pagevec_count(pvec));
+
+	if (vm->pt_kmap_wc) {
+		struct pagevec *stash = &vm->i915->mm.wc_stash;
 
-	if (vm->pt_kmap_wc)
-		set_pages_array_wb(vm->free_pages.pages,
-				   pagevec_count(&vm->free_pages));
+		/* When we use WC, first fill up the global stash and then
+		 * only if full immediately free the overflow.
+		 */
+
+		lockdep_assert_held(&vm->i915->drm.struct_mutex);
+		if (pagevec_space(stash)) {
+			do {
+				stash->pages[stash->nr++] =
+					pvec->pages[--pvec->nr];
+				if (!pvec->nr)
+					return;
+			} while (pagevec_space(stash));
+
+			/* As we have made some room in the VM's free_pages,
+			 * we can wait for it to fill again. Unless we are
+			 * inside i915_address_space_fini() and must
+			 * immediately release the pages!
+			 */
+			if (!immediate)
+				return;
+		}
+
+		set_pages_array_wb(pvec->pages, pvec->nr);
+	}
 
-	__pagevec_release(&vm->free_pages);
+	__pagevec_release(pvec);
 }
 
 static void vm_free_page(struct i915_address_space *vm, struct page *page)
 {
 	if (!pagevec_add(&vm->free_pages, page))
-		vm_free_pages_release(vm);
+		vm_free_pages_release(vm, false);
 }
 
 static int __setup_page_dma(struct i915_address_space *vm,
@@ -447,12 +493,31 @@ static void fill_page_dma_32(struct i915_address_space *vm,
 static int
 setup_scratch_page(struct i915_address_space *vm, gfp_t gfp)
 {
-	return __setup_page_dma(vm, &vm->scratch_page, gfp | __GFP_ZERO);
+	struct page *page;
+	dma_addr_t addr;
+
+	page = alloc_page(gfp | __GFP_ZERO);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	addr = dma_map_page(vm->dma, page, 0, PAGE_SIZE,
+			    PCI_DMA_BIDIRECTIONAL);
+	if (unlikely(dma_mapping_error(vm->dma, addr))) {
+		__free_page(page);
+		return -ENOMEM;
+	}
+
+	vm->scratch_page.page = page;
+	vm->scratch_page.daddr = addr;
+	return 0;
 }
 
 static void cleanup_scratch_page(struct i915_address_space *vm)
 {
-	cleanup_page_dma(vm, &vm->scratch_page);
+	struct i915_page_dma *p = &vm->scratch_page;
+
+	dma_unmap_page(vm->dma, p->daddr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+	__free_page(p->page);
 }
 
 static struct i915_page_table *alloc_pt(struct i915_address_space *vm)
@@ -1332,18 +1397,18 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 		1ULL << 48 :
 		1ULL << 32;
 
-	ret = gen8_init_scratch(&ppgtt->base);
-	if (ret) {
-		ppgtt->base.total = 0;
-		return ret;
-	}
-
 	/* There are only few exceptions for gen >=6. chv and bxt.
 	 * And we are not sure about the latter so play safe for now.
 	 */
 	if (IS_CHERRYVIEW(dev_priv) || IS_BROXTON(dev_priv))
 		ppgtt->base.pt_kmap_wc = true;
 
+	ret = gen8_init_scratch(&ppgtt->base);
+	if (ret) {
+		ppgtt->base.total = 0;
+		return ret;
+	}
+
 	if (use_4lvl(vm)) {
 		ret = setup_px(&ppgtt->base, &ppgtt->pml4);
 		if (ret)
@@ -1867,7 +1932,7 @@ static void i915_address_space_init(struct i915_address_space *vm,
 static void i915_address_space_fini(struct i915_address_space *vm)
 {
 	if (pagevec_count(&vm->free_pages))
-		vm_free_pages_release(vm);
+		vm_free_pages_release(vm, true);
 
 	i915_gem_timeline_fini(&vm->timeline);
 	drm_mm_takedown(&vm->mm);
@@ -2593,6 +2658,7 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv)
 {
 	struct i915_ggtt *ggtt = &dev_priv->ggtt;
 	struct i915_vma *vma, *vn;
+	struct pagevec *pvec;
 
 	ggtt->base.closed = true;
 
@@ -2616,6 +2682,13 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv)
 	}
 
 	ggtt->base.cleanup(&ggtt->base);
+
+	pvec = &dev_priv->mm.wc_stash;
+	if (pvec->nr) {
+		set_pages_array_wb(pvec->pages, pvec->nr);
+		__pagevec_release(pvec);
+	}
+
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 	arch_phys_wc_del(ggtt->mtrr);
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 02/16] drm/i915: Pin fence for iomap
  2017-07-26 13:25 More tweaks Chris Wilson
  2017-07-26 13:25 ` [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages Chris Wilson
@ 2017-07-26 13:25 ` Chris Wilson
  2017-07-26 13:25 ` [PATCH 03/16] drm/i915: Consolidate get_fence with pin_fence Chris Wilson
                   ` (14 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:25 UTC (permalink / raw)
  To: intel-gfx

Acquire the fence register for the iomap in i915_vma_pin_iomap() on
behalf of the caller.

We probably want for the caller to specify whether the fence should be
pinned for their usage, but at the moment all callers do want the
associated fence, or none, so take it on their behalf.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_vma.c                  | 19 +++++++++++++++++++
 drivers/gpu/drm/i915/i915_vma.h                  |  7 +------
 drivers/gpu/drm/i915/selftests/i915_gem_object.c |  8 --------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 958be0a95960..d7330d3eeab6 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -278,6 +278,7 @@ int i915_vma_bind(struct i915_vma *vma, enum i915_cache_level cache_level,
 void __iomem *i915_vma_pin_iomap(struct i915_vma *vma)
 {
 	void __iomem *ptr;
+	int ret;
 
 	/* Access through the GTT requires the device to be awake. */
 	assert_rpm_wakelock_held(vma->vm->i915);
@@ -301,9 +302,27 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma)
 	}
 
 	__i915_vma_pin(vma);
+
+	ret = i915_vma_get_fence(vma);
+	if (ret) {
+		__i915_vma_unpin(vma);
+		return IO_ERR_PTR(ret);
+	}
+	i915_vma_pin_fence(vma);
+
 	return ptr;
 }
 
+void i915_vma_unpin_iomap(struct i915_vma *vma)
+{
+	lockdep_assert_held(&vma->obj->base.dev->struct_mutex);
+
+	GEM_BUG_ON(vma->iomap == NULL);
+
+	i915_vma_unpin_fence(vma);
+	i915_vma_unpin(vma);
+}
+
 void i915_vma_unpin_and_release(struct i915_vma **p_vma)
 {
 	struct i915_vma *vma;
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 20cf272c97b1..1d67f52ee836 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -319,12 +319,7 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma);
  * Callers must hold the struct_mutex. This function is only valid to be
  * called on a VMA previously iomapped by the caller with i915_vma_pin_iomap().
  */
-static inline void i915_vma_unpin_iomap(struct i915_vma *vma)
-{
-	lockdep_assert_held(&vma->obj->base.dev->struct_mutex);
-	GEM_BUG_ON(vma->iomap == NULL);
-	i915_vma_unpin(vma);
-}
+void i915_vma_unpin_iomap(struct i915_vma *vma);
 
 static inline struct page *i915_vma_first_page(struct i915_vma *vma)
 {
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_object.c b/drivers/gpu/drm/i915/selftests/i915_gem_object.c
index 8f011c447e41..1b8774a42e48 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_object.c
@@ -251,14 +251,6 @@ static int check_partial_mapping(struct drm_i915_gem_object *obj,
 			return PTR_ERR(io);
 		}
 
-		err = i915_vma_get_fence(vma);
-		if (err) {
-			pr_err("Failed to get fence for partial view: offset=%lu\n",
-			       page);
-			i915_vma_unpin_iomap(vma);
-			return err;
-		}
-
 		iowrite32(page, io + n * PAGE_SIZE/sizeof(*io));
 		i915_vma_unpin_iomap(vma);
 
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 03/16] drm/i915: Consolidate get_fence with pin_fence
  2017-07-26 13:25 More tweaks Chris Wilson
  2017-07-26 13:25 ` [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages Chris Wilson
  2017-07-26 13:25 ` [PATCH 02/16] drm/i915: Pin fence for iomap Chris Wilson
@ 2017-07-26 13:25 ` Chris Wilson
  2017-07-26 13:25 ` [PATCH 04/16] drm/i915: Emit pipelined fence changes Chris Wilson
                   ` (13 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:25 UTC (permalink / raw)
  To: intel-gfx

Following the pattern now used for obj->mm.pages, use just pin_fence and
unpin_fence to control access to the fence registers. I.e. instead of
calling get_fence(); pin_fence(), we now just need to call pin_fence().
This will make it easier to reduce the locking requirements around
fence registers.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h            |  4 ----
 drivers/gpu/drm/i915/i915_gem.c            |  3 ++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 10 ++++-----
 drivers/gpu/drm/i915/i915_gem_fence_reg.c  | 33 +++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_vma.c            |  3 +--
 drivers/gpu/drm/i915/i915_vma.h            | 20 ++++++++----------
 drivers/gpu/drm/i915/intel_display.c       |  3 +--
 7 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fd62be74a422..fa132cb50d5d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3591,10 +3591,6 @@ i915_vm_to_ppgtt(struct i915_address_space *vm)
 	return container_of(vm, struct i915_hw_ppgtt, base);
 }
 
-/* i915_gem_fence_reg.c */
-int __must_check i915_vma_get_fence(struct i915_vma *vma);
-int __must_check i915_vma_put_fence(struct i915_vma *vma);
-
 void i915_gem_revoke_fences(struct drm_i915_private *dev_priv);
 void i915_gem_restore_fences(struct drm_i915_private *dev_priv);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 6faabf34f142..3ab358c5db41 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1941,7 +1941,7 @@ int i915_gem_fault(struct vm_fault *vmf)
 	if (ret)
 		goto err_unpin;
 
-	ret = i915_vma_get_fence(vma);
+	ret = i915_vma_pin_fence(vma);
 	if (ret)
 		goto err_unpin;
 
@@ -1957,6 +1957,7 @@ int i915_gem_fault(struct vm_fault *vmf)
 			       min_t(u64, vma->size, area->vm_end - area->vm_start),
 			       &ggtt->mappable);
 
+	i915_vma_unpin_fence(vma);
 err_unpin:
 	__i915_vma_unpin(vma);
 err_unlock:
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 5fa44767c29e..1c6476cb1e08 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -365,12 +365,12 @@ eb_pin_vma(struct i915_execbuffer *eb,
 		return;
 
 	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
-		if (unlikely(i915_vma_get_fence(vma))) {
+		if (unlikely(i915_vma_pin_fence(vma))) {
 			i915_vma_unpin(vma);
 			return;
 		}
 
-		if (i915_vma_pin_fence(vma))
+		if (vma->fence)
 			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
 
@@ -384,7 +384,7 @@ __eb_unreserve_vma(struct i915_vma *vma,
 	GEM_BUG_ON(!(entry->flags & __EXEC_OBJECT_HAS_PIN));
 
 	if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
-		i915_vma_unpin_fence(vma);
+		__i915_vma_unpin_fence(vma);
 
 	__i915_vma_unpin(vma);
 }
@@ -561,13 +561,13 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
 	}
 
 	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
-		err = i915_vma_get_fence(vma);
+		err = i915_vma_pin_fence(vma);
 		if (unlikely(err)) {
 			i915_vma_unpin(vma);
 			return err;
 		}
 
-		if (i915_vma_pin_fence(vma))
+		if (vma->fence)
 			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 5fe2cd8c8f28..38fd077941d3 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -280,8 +280,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
  *
  * 0 on success, negative error code on failure.
  */
-int
-i915_vma_put_fence(struct i915_vma *vma)
+int i915_vma_put_fence(struct i915_vma *vma)
 {
 	struct drm_i915_fence_reg *fence = vma->fence;
 
@@ -299,6 +298,8 @@ static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
 	struct drm_i915_fence_reg *fence;
 
 	list_for_each_entry(fence, &dev_priv->mm.fence_list, link) {
+		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
+
 		if (fence->pin_count)
 			continue;
 
@@ -313,7 +314,7 @@ static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
 }
 
 /**
- * i915_vma_get_fence - set up fencing for a vma
+ * i915_vma_pin_fence - set up fencing for a vma
  * @vma: vma to map through a fence reg
  *
  * When mapping objects through the GTT, userspace wants to be able to write
@@ -331,10 +332,11 @@ static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
  * 0 on success, negative error code on failure.
  */
 int
-i915_vma_get_fence(struct i915_vma *vma)
+i915_vma_pin_fence(struct i915_vma *vma)
 {
 	struct drm_i915_fence_reg *fence;
 	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
+	int err;
 
 	/* Note that we revoke fences on runtime suspend. Therefore the user
 	 * must keep the device awake whilst using the fence.
@@ -344,6 +346,8 @@ i915_vma_get_fence(struct i915_vma *vma)
 	/* Just update our place in the LRU if our fence is getting reused. */
 	if (vma->fence) {
 		fence = vma->fence;
+		GEM_BUG_ON(fence->vma != vma);
+		fence->pin_count++;
 		if (!fence->dirty) {
 			list_move_tail(&fence->link,
 				       &fence->i915->mm.fence_list);
@@ -353,10 +357,25 @@ i915_vma_get_fence(struct i915_vma *vma)
 		fence = fence_find(vma->vm->i915);
 		if (IS_ERR(fence))
 			return PTR_ERR(fence);
+
+		GEM_BUG_ON(fence->pin_count);
+		fence->pin_count++;
 	} else
 		return 0;
 
-	return fence_update(fence, set);
+	err = fence_update(fence, set);
+	if (err)
+		goto out_unpin;
+
+	GEM_BUG_ON(fence->vma != set);
+	GEM_BUG_ON(vma->fence != (set ? fence : NULL));
+
+	if (set)
+		return 0;
+
+out_unpin:
+	fence->pin_count--;
+	return err;
 }
 
 /**
@@ -378,6 +397,8 @@ void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
 	for (i = 0; i < dev_priv->num_fence_regs; i++) {
 		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
 
+		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
+
 		if (fence->vma)
 			i915_gem_release_mmap(fence->vma->obj);
 	}
@@ -399,6 +420,8 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
 		struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
 		struct i915_vma *vma = reg->vma;
 
+		GEM_BUG_ON(vma && vma->fence != reg);
+
 		/*
 		 * Commit delayed tiling changes if we have an object still
 		 * attached to the fence, otherwise just clear the fence.
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index d7330d3eeab6..efbfee8eac99 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -303,12 +303,11 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma)
 
 	__i915_vma_pin(vma);
 
-	ret = i915_vma_get_fence(vma);
+	ret = i915_vma_pin_fence(vma);
 	if (ret) {
 		__i915_vma_unpin(vma);
 		return IO_ERR_PTR(ret);
 	}
-	i915_vma_pin_fence(vma);
 
 	return ptr;
 }
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 1d67f52ee836..7c2c614f96f7 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -342,15 +342,13 @@ static inline struct page *i915_vma_first_page(struct i915_vma *vma)
  *
  * True if the vma has a fence, false otherwise.
  */
-static inline bool
-i915_vma_pin_fence(struct i915_vma *vma)
+int i915_vma_pin_fence(struct i915_vma *vma);
+int __must_check i915_vma_put_fence(struct i915_vma *vma);
+
+static inline void __i915_vma_unpin_fence(struct i915_vma *vma)
 {
-	lockdep_assert_held(&vma->obj->base.dev->struct_mutex);
-	if (vma->fence) {
-		vma->fence->pin_count++;
-		return true;
-	} else
-		return false;
+	GEM_BUG_ON(vma->fence->pin_count <= 0);
+	vma->fence->pin_count--;
 }
 
 /**
@@ -365,10 +363,8 @@ static inline void
 i915_vma_unpin_fence(struct i915_vma *vma)
 {
 	lockdep_assert_held(&vma->obj->base.dev->struct_mutex);
-	if (vma->fence) {
-		GEM_BUG_ON(vma->fence->pin_count <= 0);
-		vma->fence->pin_count--;
-	}
+	if (vma->fence)
+		__i915_vma_unpin_fence(vma);
 }
 
 #endif
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index f7b128c33aa1..a890ed2fd37c 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -2178,8 +2178,7 @@ intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, unsigned int rotation)
 		 * something and try to run the system in a "less than optimal"
 		 * mode that matches the user configuration.
 		 */
-		if (i915_vma_get_fence(vma) == 0)
-			i915_vma_pin_fence(vma);
+		i915_vma_pin_fence(vma);
 	}
 
 	i915_vma_get(vma);
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 04/16] drm/i915: Emit pipelined fence changes
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (2 preceding siblings ...)
  2017-07-26 13:25 ` [PATCH 03/16] drm/i915: Consolidate get_fence with pin_fence Chris Wilson
@ 2017-07-26 13:25 ` Chris Wilson
  2017-07-26 13:25 ` [PATCH 05/16] drm/i915: Refactor testing obj->mm.pages Chris Wilson
                   ` (12 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:25 UTC (permalink / raw)
  To: intel-gfx

Many years ago, long before requests, we tried doing this. We never
quite got it right, but now with requests we have the tracking to do the
job properly!

One of the stall points for gen2/gen3 is the use of fence registers for
GPU operations. There are only a few available, and currently if we
exhaust the available fence register we must stall the GPU between
batches. By pipelining the fence register writes, we can avoid the stall
and continuously emit the batches. The challenge is remembering to wait
for those pipelined LRI before accessing the fence with the CPU, and
that is what our request tracking makes easy.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gvt/aperture_gm.c     |   2 +-
 drivers/gpu/drm/i915/i915_debugfs.c        |   2 +-
 drivers/gpu/drm/i915/i915_gem.c            |   1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  15 ++-
 drivers/gpu/drm/i915/i915_gem_fence_reg.c  | 206 ++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_gem_fence_reg.h  |   3 +-
 drivers/gpu/drm/i915/i915_vma.c            |   1 +
 drivers/gpu/drm/i915/i915_vma.h            |  13 +-
 8 files changed, 202 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/aperture_gm.c b/drivers/gpu/drm/i915/gvt/aperture_gm.c
index 325618d969fe..c34cd56eb83a 100644
--- a/drivers/gpu/drm/i915/gvt/aperture_gm.c
+++ b/drivers/gpu/drm/i915/gvt/aperture_gm.c
@@ -196,7 +196,7 @@ static int alloc_vgpu_fence(struct intel_vgpu *vgpu)
 	i = 0;
 	list_for_each_safe(pos, q, &dev_priv->mm.fence_list) {
 		reg = list_entry(pos, struct drm_i915_fence_reg, link);
-		if (reg->pin_count || reg->vma)
+		if (atomic_read(&reg->pin_count) || reg->vma)
 			continue;
 		list_del(pos);
 		vgpu->fence.regs[i] = reg;
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 6480897bcaf4..77c0c255c477 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -896,7 +896,7 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
 		struct i915_vma *vma = dev_priv->fence_regs[i].vma;
 
 		seq_printf(m, "Fence %d, pin count = %d, object = ",
-			   i, dev_priv->fence_regs[i].pin_count);
+			   i, atomic_read(&dev_priv->fence_regs[i].pin_count));
 		if (!vma)
 			seq_puts(m, "unused");
 		else
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3ab358c5db41..218ede624af9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4905,6 +4905,7 @@ i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
 		fence->i915 = dev_priv;
 		fence->id = i;
 		list_add_tail(&fence->link, &dev_priv->mm.fence_list);
+		init_request_active(&fence->pipelined, NULL);
 	}
 	i915_gem_restore_fences(dev_priv);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 1c6476cb1e08..4ffc2a4b0f2d 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -365,11 +365,12 @@ eb_pin_vma(struct i915_execbuffer *eb,
 		return;
 
 	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
-		if (unlikely(i915_vma_pin_fence(vma))) {
+		if (unlikely(i915_vma_reserve_fence(vma))) {
 			i915_vma_unpin(vma);
 			return;
 		}
 
+		entry->flags &= ~EXEC_OBJECT_ASYNC;
 		if (vma->fence)
 			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
@@ -561,12 +562,13 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
 	}
 
 	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
-		err = i915_vma_pin_fence(vma);
+		err = i915_vma_reserve_fence(vma);
 		if (unlikely(err)) {
 			i915_vma_unpin(vma);
 			return err;
 		}
 
+		entry->flags &= ~EXEC_OBJECT_ASYNC;
 		if (vma->fence)
 			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
@@ -1850,6 +1852,12 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
 		if (entry->flags & EXEC_OBJECT_ASYNC)
 			goto skip_flushes;
 
+		if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+			err = i915_vma_emit_pipelined_fence(vma, eb->request);
+			if (err)
+				return err;
+		}
+
 		err = i915_gem_request_await_object
 			(eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
 		if (err)
@@ -1934,9 +1942,6 @@ void i915_vma_move_to_active(struct i915_vma *vma,
 		obj->base.read_domains = 0;
 	}
 	obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
-
-	if (flags & EXEC_OBJECT_NEEDS_FENCE)
-		i915_gem_active_set(&vma->last_fence, req);
 }
 
 static int i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 38fd077941d3..c5a00ff44786 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -55,10 +55,9 @@
  * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
  */
 
-#define pipelined 0
-
-static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
-				 struct i915_vma *vma)
+static int i965_write_fence_reg(struct drm_i915_fence_reg *fence,
+				struct i915_vma *vma,
+				struct drm_i915_gem_request *pipelined)
 {
 	i915_reg_t fence_reg_lo, fence_reg_hi;
 	int fence_pitch_shift;
@@ -110,11 +109,30 @@ static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
 		I915_WRITE(fence_reg_hi, upper_32_bits(val));
 		I915_WRITE(fence_reg_lo, lower_32_bits(val));
 		POSTING_READ(fence_reg_lo);
+	} else {
+		u32 *cs;
+
+		cs = intel_ring_begin(pipelined, 8);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_LOAD_REGISTER_IMM(3);
+		*cs++ = i915_mmio_reg_offset(fence_reg_lo);
+		*cs++ = 0;
+		*cs++ = i915_mmio_reg_offset(fence_reg_hi);
+		*cs++ = upper_32_bits(val);
+		*cs++ = i915_mmio_reg_offset(fence_reg_lo);
+		*cs++ = lower_32_bits(val);
+		*cs++ = MI_NOOP;
+		intel_ring_advance(pipelined, cs);
 	}
+
+	return 0;
 }
 
-static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
-				 struct i915_vma *vma)
+static int i915_write_fence_reg(struct drm_i915_fence_reg *fence,
+				struct i915_vma *vma,
+				struct drm_i915_gem_request *pipelined)
 {
 	u32 val;
 
@@ -150,11 +168,26 @@ static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
 
 		I915_WRITE(reg, val);
 		POSTING_READ(reg);
+	} else {
+		u32 *cs;
+
+		cs = intel_ring_begin(pipelined, 4);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_LOAD_REGISTER_IMM(1);
+		*cs++ = i915_mmio_reg_offset(FENCE_REG(fence->id));
+		*cs++ = val;
+		*cs++ = MI_NOOP;
+		intel_ring_advance(pipelined, cs);
 	}
+
+	return 0;
 }
 
-static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
-				 struct i915_vma *vma)
+static int i830_write_fence_reg(struct drm_i915_fence_reg *fence,
+				struct i915_vma *vma,
+				struct drm_i915_gem_request *pipelined)
 {
 	u32 val;
 
@@ -182,29 +215,49 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
 
 		I915_WRITE(reg, val);
 		POSTING_READ(reg);
+	} else {
+		u32 *cs;
+
+		cs = intel_ring_begin(pipelined, 4);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_LOAD_REGISTER_IMM(1);
+		*cs++ = i915_mmio_reg_offset(FENCE_REG(fence->id));
+		*cs++ = val;
+		*cs++ = MI_NOOP;
+		intel_ring_advance(pipelined, cs);
 	}
+
+	return 0;
 }
 
-static void fence_write(struct drm_i915_fence_reg *fence,
-			struct i915_vma *vma)
+static int fence_write(struct drm_i915_fence_reg *fence,
+		       struct i915_vma *vma,
+		       struct drm_i915_gem_request *rq)
 {
+	int err;
+
 	/* Previous access through the fence register is marshalled by
 	 * the mb() inside the fault handlers (i915_gem_release_mmaps)
 	 * and explicitly managed for internal users.
 	 */
 
 	if (IS_GEN2(fence->i915))
-		i830_write_fence_reg(fence, vma);
+		err = i830_write_fence_reg(fence, vma, rq);
 	else if (IS_GEN3(fence->i915))
-		i915_write_fence_reg(fence, vma);
+		err = i915_write_fence_reg(fence, vma, rq);
 	else
-		i965_write_fence_reg(fence, vma);
+		err = i965_write_fence_reg(fence, vma, rq);
+	if (err)
+		return err;
 
 	/* Access through the fenced region afterwards is
 	 * ordered by the posting reads whilst writing the registers.
 	 */
 
 	fence->dirty = false;
+	return 0;
 }
 
 static int fence_update(struct drm_i915_fence_reg *fence,
@@ -212,17 +265,15 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 {
 	int ret;
 
+	ret = i915_gem_active_retire(&fence->pipelined,
+				     &fence->i915->drm.struct_mutex);
+	if (ret)
+		return ret;
+
 	if (vma) {
 		if (!i915_vma_is_map_and_fenceable(vma))
 			return -EINVAL;
 
-		if (WARN(!i915_gem_object_get_stride(vma->obj) ||
-			 !i915_gem_object_get_tiling(vma->obj),
-			 "bogus fence setup with stride: 0x%x, tiling mode: %i\n",
-			 i915_gem_object_get_stride(vma->obj),
-			 i915_gem_object_get_tiling(vma->obj)))
-			return -EINVAL;
-
 		ret = i915_gem_active_retire(&vma->last_fence,
 					     &vma->obj->base.dev->struct_mutex);
 		if (ret)
@@ -253,7 +304,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 	 * to the runtime resume, see i915_gem_restore_fences().
 	 */
 	if (intel_runtime_pm_get_if_in_use(fence->i915)) {
-		fence_write(fence, vma);
+		fence_write(fence, vma, NULL);
 		intel_runtime_pm_put(fence->i915);
 	}
 
@@ -287,7 +338,9 @@ int i915_vma_put_fence(struct i915_vma *vma)
 	if (!fence)
 		return 0;
 
-	if (fence->pin_count)
+	GEM_BUG_ON(fence->vma != vma);
+
+	if (atomic_read(&fence->pin_count))
 		return -EBUSY;
 
 	return fence_update(fence, NULL);
@@ -300,7 +353,7 @@ static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
 	list_for_each_entry(fence, &dev_priv->mm.fence_list, link) {
 		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
 
-		if (fence->pin_count)
+		if (atomic_read(&fence->pin_count))
 			continue;
 
 		return fence;
@@ -344,11 +397,16 @@ i915_vma_pin_fence(struct i915_vma *vma)
 	assert_rpm_wakelock_held(vma->vm->i915);
 
 	/* Just update our place in the LRU if our fence is getting reused. */
-	if (vma->fence) {
-		fence = vma->fence;
+	fence = vma->fence;
+	if (fence) {
 		GEM_BUG_ON(fence->vma != vma);
-		fence->pin_count++;
+		atomic_inc(&fence->pin_count);
 		if (!fence->dirty) {
+			err = i915_gem_active_retire(&fence->pipelined,
+						     &fence->i915->drm.struct_mutex);
+			if (err)
+				goto out_unpin;
+
 			list_move_tail(&fence->link,
 				       &fence->i915->mm.fence_list);
 			return 0;
@@ -358,8 +416,8 @@ i915_vma_pin_fence(struct i915_vma *vma)
 		if (IS_ERR(fence))
 			return PTR_ERR(fence);
 
-		GEM_BUG_ON(fence->pin_count);
-		fence->pin_count++;
+		GEM_BUG_ON(atomic_read(&fence->pin_count));
+		atomic_inc(&fence->pin_count);
 	} else
 		return 0;
 
@@ -374,10 +432,97 @@ i915_vma_pin_fence(struct i915_vma *vma)
 		return 0;
 
 out_unpin:
-	fence->pin_count--;
+	atomic_dec(&fence->pin_count);
 	return err;
 }
 
+int i915_vma_reserve_fence(struct i915_vma *vma)
+{
+	struct drm_i915_fence_reg *fence;
+
+	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
+	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
+	GEM_BUG_ON(!i915_vma_is_pinned(vma));
+
+	fence = vma->fence;
+	if (!fence) {
+		if (!i915_gem_object_is_tiled(vma->obj))
+			return 0;
+
+		if (!i915_vma_is_map_and_fenceable(vma))
+			return -EINVAL;
+
+		fence = fence_find(vma->vm->i915);
+		if (IS_ERR(fence))
+			return PTR_ERR(fence);
+
+		vma->fence = fence;
+
+		if (fence->vma) {
+			i915_gem_release_mmap(fence->vma->obj);
+			fence->vma->fence = NULL;
+		}
+		fence->vma = vma;
+		fence->dirty = true;
+	}
+	atomic_inc(&fence->pin_count);
+	list_move_tail(&fence->link, &fence->i915->mm.fence_list);
+
+	GEM_BUG_ON(!i915_gem_object_is_tiled(vma->obj));
+	GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
+	GEM_BUG_ON(vma->node.size != vma->fence_size);
+	GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_alignment));
+
+	return 0;
+}
+
+int i915_vma_emit_pipelined_fence(struct i915_vma *vma,
+				  struct drm_i915_gem_request *rq)
+{
+	struct drm_i915_fence_reg *fence = vma->fence;
+	struct drm_i915_gem_request *prev;
+	int err;
+
+	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
+	GEM_BUG_ON(fence && !atomic_read(&fence->pin_count));
+
+	if (!fence)
+		goto out;
+
+	prev = i915_gem_active_raw(&fence->pipelined,
+				   &fence->i915->drm.struct_mutex);
+	if (prev) {
+		err = i915_gem_request_await_dma_fence(rq, &prev->fence);
+		if (err)
+			return err;
+	}
+
+	if (!fence->dirty)
+		goto out;
+
+	GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
+
+	if (fence->vma) {
+		prev = i915_gem_active_raw(&fence->vma->last_fence,
+					   &fence->i915->drm.struct_mutex);
+		if (prev) {
+			err = i915_gem_request_await_dma_fence(rq,
+							       &prev->fence);
+			if (err)
+				return err;
+		}
+	}
+
+	err = fence_write(fence, vma, rq);
+	if (err)
+		return err;
+
+	i915_gem_active_set(&fence->pipelined, rq);
+out:
+	i915_gem_active_set(&vma->last_fence, rq);
+	return 0;
+}
+
 /**
  * i915_gem_revoke_fences - revoke fence state
  * @dev_priv: i915 device private
@@ -428,6 +573,7 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
 		 */
 		if (vma && !i915_gem_object_is_tiled(vma->obj)) {
 			GEM_BUG_ON(!reg->dirty);
+			GEM_BUG_ON(atomic_read(&reg->pin_count));
 			GEM_BUG_ON(!list_empty(&vma->obj->userfault_link));
 
 			list_move(&reg->link, &dev_priv->mm.fence_list);
@@ -435,7 +581,7 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
 			vma = NULL;
 		}
 
-		fence_write(reg, vma);
+		fence_write(reg, vma, NULL);
 		reg->vma = vma;
 	}
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index 99a31ded4dfd..f69d894997fb 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -36,7 +36,7 @@ struct drm_i915_fence_reg {
 	struct list_head link;
 	struct drm_i915_private *i915;
 	struct i915_vma *vma;
-	int pin_count;
+	atomic_t pin_count;
 	int id;
 	/**
 	 * Whether the tiling parameters for the currently
@@ -47,6 +47,7 @@ struct drm_i915_fence_reg {
 	 * command (such as BLT on gen2/3), as a "fence".
 	 */
 	bool dirty;
+	struct i915_gem_active pipelined;
 };
 
 #endif
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index efbfee8eac99..0c489090d4ab 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -728,6 +728,7 @@ int i915_vma_unbind(struct i915_vma *vma)
 		__i915_vma_iounmap(vma);
 		vma->flags &= ~I915_VMA_CAN_FENCE;
 	}
+	GEM_BUG_ON(vma->fence);
 
 	if (likely(!vma->vm->closed)) {
 		trace_i915_vma_unbind(vma);
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 7c2c614f96f7..4808601b91f1 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -347,8 +347,8 @@ int __must_check i915_vma_put_fence(struct i915_vma *vma);
 
 static inline void __i915_vma_unpin_fence(struct i915_vma *vma)
 {
-	GEM_BUG_ON(vma->fence->pin_count <= 0);
-	vma->fence->pin_count--;
+	GEM_BUG_ON(atomic_read(&vma->fence->pin_count) <= 0);
+	atomic_dec(&vma->fence->pin_count);
 }
 
 /**
@@ -362,10 +362,17 @@ static inline void __i915_vma_unpin_fence(struct i915_vma *vma)
 static inline void
 i915_vma_unpin_fence(struct i915_vma *vma)
 {
-	lockdep_assert_held(&vma->obj->base.dev->struct_mutex);
+	/* The assumption is if the caller has a fence, the caller owns a pin
+	 * on that fence, i.e. that vma->fence cannot become NULL prior to us
+	 * releasing our pin.
+	 */
 	if (vma->fence)
 		__i915_vma_unpin_fence(vma);
 }
 
+int __must_check i915_vma_reserve_fence(struct i915_vma *vma);
+int i915_vma_emit_pipelined_fence(struct i915_vma *vma,
+				  struct drm_i915_gem_request *rq);
+
 #endif
 
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 05/16] drm/i915: Refactor testing obj->mm.pages
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (3 preceding siblings ...)
  2017-07-26 13:25 ` [PATCH 04/16] drm/i915: Emit pipelined fence changes Chris Wilson
@ 2017-07-26 13:25 ` Chris Wilson
  2017-08-10 16:26   ` Joonas Lahtinen
  2017-07-26 13:26 ` [PATCH 06/16] drm/i915: Move dev_priv->mm.[un]bound_list to its own lock Chris Wilson
                   ` (11 subsequent siblings)
  16 siblings, 1 reply; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:25 UTC (permalink / raw)
  To: intel-gfx

Since we occasionally stuff an error pointer into obj->mm.pages for a
semi-permanent or even permanent failure, we have to be more careful and
not just test against NULL when deciding if the object has a complete
set of its concurrent pages.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h              | 10 ++++++++--
 drivers/gpu/drm/i915/i915_gem.c              | 21 +++++++++++----------
 drivers/gpu/drm/i915/i915_gem_clflush.c      |  1 +
 drivers/gpu/drm/i915/i915_gem_render_state.c |  2 +-
 drivers/gpu/drm/i915/i915_gem_shrinker.c     | 10 +++++-----
 drivers/gpu/drm/i915/i915_gem_tiling.c       |  2 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c      |  2 +-
 7 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fa132cb50d5d..a908d1ba05d5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3387,10 +3387,16 @@ i915_gem_object_pin_pages(struct drm_i915_gem_object *obj)
 	return __i915_gem_object_get_pages(obj);
 }
 
+static inline bool
+i915_gem_object_has_pages(struct drm_i915_gem_object *obj)
+{
+	return !IS_ERR_OR_NULL(READ_ONCE(obj->mm.pages));
+}
+
 static inline void
 __i915_gem_object_pin_pages(struct drm_i915_gem_object *obj)
 {
-	GEM_BUG_ON(!obj->mm.pages);
+	GEM_BUG_ON(!i915_gem_object_has_pages(obj));
 
 	atomic_inc(&obj->mm.pages_pin_count);
 }
@@ -3404,8 +3410,8 @@ i915_gem_object_has_pinned_pages(struct drm_i915_gem_object *obj)
 static inline void
 __i915_gem_object_unpin_pages(struct drm_i915_gem_object *obj)
 {
+	GEM_BUG_ON(!i915_gem_object_has_pages(obj));
 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
-	GEM_BUG_ON(!obj->mm.pages);
 
 	atomic_dec(&obj->mm.pages_pin_count);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 218ede624af9..858b3bb6846e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -584,7 +584,7 @@ i915_gem_object_attach_phys(struct drm_i915_gem_object *obj,
 		return ret;
 
 	__i915_gem_object_put_pages(obj, I915_MM_NORMAL);
-	if (obj->mm.pages)
+	if (i915_gem_object_has_pages(obj))
 		return -EBUSY;
 
 	GEM_BUG_ON(obj->ops != &i915_gem_object_ops);
@@ -2204,7 +2204,7 @@ void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
 	struct address_space *mapping;
 
 	lockdep_assert_held(&obj->mm.lock);
-	GEM_BUG_ON(obj->mm.pages);
+	GEM_BUG_ON(i915_gem_object_has_pages(obj));
 
 	switch (obj->mm.madv) {
 	case I915_MADV_DONTNEED:
@@ -2267,7 +2267,7 @@ void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
 		return;
 
 	GEM_BUG_ON(obj->bind_count);
-	if (!READ_ONCE(obj->mm.pages))
+	if (!i915_gem_object_has_pages(obj))
 		return;
 
 	/* May be called by shrinker from within get_pages() (on another bo) */
@@ -2546,7 +2546,7 @@ int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
 	if (err)
 		return err;
 
-	if (unlikely(IS_ERR_OR_NULL(obj->mm.pages))) {
+	if (unlikely(!i915_gem_object_has_pages(obj))) {
 		err = ____i915_gem_object_get_pages(obj);
 		if (err)
 			goto unlock;
@@ -2624,7 +2624,7 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 
 	pinned = true;
 	if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
-		if (unlikely(IS_ERR_OR_NULL(obj->mm.pages))) {
+		if (unlikely(!i915_gem_object_has_pages(obj))) {
 			ret = ____i915_gem_object_get_pages(obj);
 			if (ret)
 				goto err_unlock;
@@ -2634,7 +2634,7 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 		atomic_inc(&obj->mm.pages_pin_count);
 		pinned = false;
 	}
-	GEM_BUG_ON(!obj->mm.pages);
+	GEM_BUG_ON(!i915_gem_object_has_pages(obj));
 
 	ptr = page_unpack_bits(obj->mm.mapping, &has_type);
 	if (ptr && has_type != type) {
@@ -2689,7 +2689,7 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
 	 * allows it to avoid the cost of retrieving a page (either swapin
 	 * or clearing-before-use) before it is overwritten.
 	 */
-	if (READ_ONCE(obj->mm.pages))
+	if (i915_gem_object_has_pages(obj))
 		return -ENODEV;
 
 	/* Before the pages are instantiated the object is treated as being
@@ -4240,7 +4240,7 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 	if (err)
 		goto out;
 
-	if (obj->mm.pages &&
+	if (i915_gem_object_has_pages(obj) &&
 	    i915_gem_object_is_tiled(obj) &&
 	    dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
 		if (obj->mm.madv == I915_MADV_WILLNEED) {
@@ -4259,7 +4259,8 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 		obj->mm.madv = args->madv;
 
 	/* if the object is no longer attached, discard its backing storage */
-	if (obj->mm.madv == I915_MADV_DONTNEED && !obj->mm.pages)
+	if (obj->mm.madv == I915_MADV_DONTNEED &&
+	    !i915_gem_object_has_pages(obj))
 		i915_gem_object_truncate(obj);
 
 	args->retained = obj->mm.madv != __I915_MADV_PURGED;
@@ -4450,7 +4451,7 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
 			atomic_set(&obj->mm.pages_pin_count, 0);
 		__i915_gem_object_put_pages(obj, I915_MM_NORMAL);
-		GEM_BUG_ON(obj->mm.pages);
+		GEM_BUG_ON(i915_gem_object_has_pages(obj));
 
 		if (obj->base.import_attach)
 			drm_prime_gem_destroy(&obj->base, NULL);
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index 348b29a845c9..3bc4d6b080f7 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -70,6 +70,7 @@ static const struct dma_fence_ops i915_clflush_ops = {
 
 static void __i915_do_clflush(struct drm_i915_gem_object *obj)
 {
+	GEM_BUG_ON(!i915_gem_object_has_pages(obj));
 	drm_clflush_sg(obj->mm.pages);
 	intel_fb_obj_flush(obj, ORIGIN_CPU);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c
index 7032c542a9b1..241d827b85fb 100644
--- a/drivers/gpu/drm/i915/i915_gem_render_state.c
+++ b/drivers/gpu/drm/i915/i915_gem_render_state.c
@@ -229,7 +229,7 @@ int i915_gem_render_state_emit(struct drm_i915_gem_request *req)
 		return 0;
 
 	/* Recreate the page after shrinking */
-	if (!so->vma->obj->mm.pages)
+	if (!i915_gem_object_has_pages(so->vma->obj))
 		so->batch_offset = -1;
 
 	ret = i915_vma_pin(so->vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index 1032f98add11..1668a62619fb 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -92,7 +92,7 @@ static bool swap_available(void)
 
 static bool can_release_pages(struct drm_i915_gem_object *obj)
 {
-	if (!obj->mm.pages)
+	if (!i915_gem_object_has_pages(obj))
 		return false;
 
 	/* Consider only shrinkable ojects. */
@@ -124,7 +124,7 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj)
 {
 	if (i915_gem_object_unbind(obj) == 0)
 		__i915_gem_object_put_pages(obj, I915_MM_SHRINKER);
-	return !READ_ONCE(obj->mm.pages);
+	return !i915_gem_object_has_pages(obj);
 }
 
 /**
@@ -238,7 +238,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
 				/* May arrive from get_pages on another bo */
 				mutex_lock_nested(&obj->mm.lock,
 						  I915_MM_SHRINKER);
-				if (!obj->mm.pages) {
+				if (!i915_gem_object_has_pages(obj)) {
 					__i915_gem_object_invalidate(obj);
 					list_del_init(&obj->global_link);
 					count += obj->base.size >> PAGE_SHIFT;
@@ -396,7 +396,7 @@ i915_gem_shrinker_oom(struct notifier_block *nb, unsigned long event, void *ptr)
 	 */
 	unbound = bound = unevictable = 0;
 	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) {
-		if (!obj->mm.pages)
+		if (!i915_gem_object_has_pages(obj))
 			continue;
 
 		if (!can_release_pages(obj))
@@ -405,7 +405,7 @@ i915_gem_shrinker_oom(struct notifier_block *nb, unsigned long event, void *ptr)
 			unbound += obj->base.size >> PAGE_SHIFT;
 	}
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) {
-		if (!obj->mm.pages)
+		if (!i915_gem_object_has_pages(obj))
 			continue;
 
 		if (!can_release_pages(obj))
diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c
index fb5231f98c0d..1294cf695df0 100644
--- a/drivers/gpu/drm/i915/i915_gem_tiling.c
+++ b/drivers/gpu/drm/i915/i915_gem_tiling.c
@@ -269,7 +269,7 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj,
 	 * due to the change in swizzling.
 	 */
 	mutex_lock(&obj->mm.lock);
-	if (obj->mm.pages &&
+	if (i915_gem_object_has_pages(obj) &&
 	    obj->mm.madv == I915_MADV_WILLNEED &&
 	    i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
 		if (tiling == I915_TILING_NONE) {
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index ccd09e8419f5..f5d4f9875e32 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -82,7 +82,7 @@ static void cancel_userptr(struct work_struct *work)
 	/* We are inside a kthread context and can't be interrupted */
 	if (i915_gem_object_unbind(obj) == 0)
 		__i915_gem_object_put_pages(obj, I915_MM_NORMAL);
-	WARN_ONCE(obj->mm.pages,
+	WARN_ONCE(i915_gem_object_has_pages(obj),
 		  "Failed to release pages: bind_count=%d, pages_pin_count=%d, pin_display=%d\n",
 		  obj->bind_count,
 		  atomic_read(&obj->mm.pages_pin_count),
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 06/16] drm/i915: Move dev_priv->mm.[un]bound_list to its own lock
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (4 preceding siblings ...)
  2017-07-26 13:25 ` [PATCH 05/16] drm/i915: Refactor testing obj->mm.pages Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-08-10 16:22   ` Joonas Lahtinen
  2017-07-26 13:26 ` [PATCH 07/16] drm/i915: Trim struct_mutex hold duration for i915_gem_free_objects Chris Wilson
                   ` (10 subsequent siblings)
  16 siblings, 1 reply; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

Remove the struct_mutex requirement around dev_priv->mm.bound_list and
dev_priv->mm.unbound_list by giving it its own spinlock. This reduces
one more requirement for struct_mutex and in the process gives us
slightly more accurate unbound_list tracking, which should improve the
shrinker.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c               | 39 ++++++++++++---
 drivers/gpu/drm/i915/i915_drv.h                   |  2 +
 drivers/gpu/drm/i915/i915_gem.c                   | 41 ++++++++++++---
 drivers/gpu/drm/i915/i915_gem_gtt.c               |  3 +-
 drivers/gpu/drm/i915/i915_gem_object.h            |  7 ++-
 drivers/gpu/drm/i915/i915_gem_shrinker.c          | 61 ++++++++++-------------
 drivers/gpu/drm/i915/i915_gem_stolen.c            |  5 +-
 drivers/gpu/drm/i915/i915_vma.c                   | 16 ++++--
 drivers/gpu/drm/i915/selftests/i915_gem_context.c |  2 +-
 drivers/gpu/drm/i915/selftests/i915_gem_evict.c   |  8 ++-
 10 files changed, 120 insertions(+), 64 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 77c0c255c477..73e0a325614d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -238,7 +238,9 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data)
 		goto out;
 
 	total_obj_size = total_gtt_size = count = 0;
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) {
+
+	spin_lock(&dev_priv->mm.obj_lock);
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) {
 		if (count == total)
 			break;
 
@@ -250,7 +252,7 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data)
 		total_gtt_size += i915_gem_obj_total_ggtt_size(obj);
 
 	}
-	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) {
+	list_for_each_entry(obj, &dev_priv->mm.unbound_list, mm.link) {
 		if (count == total)
 			break;
 
@@ -260,6 +262,7 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data)
 		objects[count++] = obj;
 		total_obj_size += obj->base.size;
 	}
+	spin_unlock(&dev_priv->mm.obj_lock);
 
 	sort(objects, count, sizeof(*objects), obj_rank_by_stolen, NULL);
 
@@ -418,7 +421,9 @@ static int i915_gem_object_info(struct seq_file *m, void *data)
 	size = count = 0;
 	mapped_size = mapped_count = 0;
 	purgeable_size = purgeable_count = 0;
-	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) {
+
+	spin_lock(&dev_priv->mm.obj_lock);
+	list_for_each_entry(obj, &dev_priv->mm.unbound_list, mm.link) {
 		size += obj->base.size;
 		++count;
 
@@ -435,7 +440,7 @@ static int i915_gem_object_info(struct seq_file *m, void *data)
 	seq_printf(m, "%u unbound objects, %llu bytes\n", count, size);
 
 	size = count = dpy_size = dpy_count = 0;
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) {
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) {
 		size += obj->base.size;
 		++count;
 
@@ -454,6 +459,8 @@ static int i915_gem_object_info(struct seq_file *m, void *data)
 			mapped_size += obj->base.size;
 		}
 	}
+	spin_unlock(&dev_priv->mm.obj_lock);
+
 	seq_printf(m, "%u bound objects, %llu bytes\n",
 		   count, size);
 	seq_printf(m, "%u purgeable objects, %llu bytes\n",
@@ -514,31 +521,49 @@ static int i915_gem_gtt_info(struct seq_file *m, void *data)
 	struct drm_i915_private *dev_priv = node_to_i915(node);
 	struct drm_device *dev = &dev_priv->drm;
 	bool show_pin_display_only = !!node->info_ent->data;
+	struct drm_i915_gem_object **objects;
 	struct drm_i915_gem_object *obj;
 	u64 total_obj_size, total_gtt_size;
+	unsigned long nobject, n;
 	int count, ret;
 
+	nobject = READ_ONCE(dev_priv->mm.object_count);
+	objects = kvmalloc_array(nobject, sizeof(*objects), GFP_KERNEL);
+	if (!objects)
+		return -ENOMEM;
+
 	ret = mutex_lock_interruptible(&dev->struct_mutex);
 	if (ret)
 		return ret;
 
-	total_obj_size = total_gtt_size = count = 0;
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) {
+	count = 0;
+	spin_lock(&dev_priv->mm.obj_lock);
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) {
 		if (show_pin_display_only && !obj->pin_display)
 			continue;
 
+		objects[count++] = obj;
+		if (count == nobject)
+			break;
+	}
+	spin_unlock(&dev_priv->mm.obj_lock);
+
+	total_obj_size = total_gtt_size = 0;
+	for (n = 0;  n < count; n++) {
+		obj = objects[n];
+
 		seq_puts(m, "   ");
 		describe_obj(m, obj);
 		seq_putc(m, '\n');
 		total_obj_size += obj->base.size;
 		total_gtt_size += i915_gem_obj_total_ggtt_size(obj);
-		count++;
 	}
 
 	mutex_unlock(&dev->struct_mutex);
 
 	seq_printf(m, "Total %d objects, %llu bytes, %llu GTT size\n",
 		   count, total_obj_size, total_gtt_size);
+	kvfree(objects);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a908d1ba05d5..856214fe28b2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1443,6 +1443,8 @@ struct i915_gem_mm {
 	 * always the inner lock when overlapping with struct_mutex. */
 	struct mutex stolen_lock;
 
+	spinlock_t obj_lock;
+
 	/** List of all objects in gtt_space. Used to restore gtt
 	 * mappings on resume */
 	struct list_head bound_list;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 858b3bb6846e..bcfaa86451bf 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1576,9 +1576,12 @@ static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
 		list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
 	}
 
+	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
 	i915 = to_i915(obj->base.dev);
+	spin_lock(&i915->mm.obj_lock);
 	list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
-	list_move_tail(&obj->global_link, list);
+	list_move_tail(&obj->mm.link, list);
+	spin_unlock(&i915->mm.obj_lock);
 }
 
 /**
@@ -2261,6 +2264,7 @@ static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
 void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
 				 enum i915_mm_subclass subclass)
 {
+	struct drm_i915_private *i915 = to_i915(obj->base.dev);
 	struct sg_table *pages;
 
 	if (i915_gem_object_has_pinned_pages(obj))
@@ -2281,6 +2285,10 @@ void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
 	pages = fetch_and_zero(&obj->mm.pages);
 	GEM_BUG_ON(!pages);
 
+	spin_lock(&i915->mm.obj_lock);
+	list_del(&obj->mm.link);
+	spin_unlock(&i915->mm.obj_lock);
+
 	if (obj->mm.mapping) {
 		void *ptr;
 
@@ -2497,6 +2505,8 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 				 struct sg_table *pages)
 {
+	struct drm_i915_private *i915 = to_i915(obj->base.dev);
+
 	lockdep_assert_held(&obj->mm.lock);
 
 	obj->mm.get_page.sg_pos = pages->sgl;
@@ -2505,11 +2515,15 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 	obj->mm.pages = pages;
 
 	if (i915_gem_object_is_tiled(obj) &&
-	    to_i915(obj->base.dev)->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
+	    i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
 		GEM_BUG_ON(obj->mm.quirked);
 		__i915_gem_object_pin_pages(obj);
 		obj->mm.quirked = true;
 	}
+
+	spin_lock(&i915->mm.obj_lock);
+	list_add(&obj->mm.link, &i915->mm.unbound_list);
+	spin_unlock(&i915->mm.obj_lock);
 }
 
 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
@@ -4286,7 +4300,6 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 {
 	mutex_init(&obj->mm.lock);
 
-	INIT_LIST_HEAD(&obj->global_link);
 	INIT_LIST_HEAD(&obj->userfault_link);
 	INIT_LIST_HEAD(&obj->vma_list);
 	INIT_LIST_HEAD(&obj->batch_pool_link);
@@ -4434,7 +4447,18 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		GEM_BUG_ON(!list_empty(&obj->vma_list));
 		GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
 
-		list_del(&obj->global_link);
+		/* This serializes freeing with the shrinker. Since the free
+		 * is delayed, first by RCU then by the workqueue, we want the
+		 * shrinker to be able to free pages of unreferenced objects,
+		 * or else we may oom whilst there are plenty of deferred
+		 * freed objects.
+		 */
+		if (i915_gem_object_has_pages(obj)) {
+			spin_lock(&i915->mm.obj_lock);
+			list_del_init(&obj->mm.link);
+			spin_unlock(&i915->mm.obj_lock);
+		}
+
 	}
 	intel_runtime_pm_put(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
@@ -4951,11 +4975,14 @@ i915_gem_load_init(struct drm_i915_private *dev_priv)
 		goto err_priorities;
 
 	INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
+
+	spin_lock_init(&dev_priv->mm.obj_lock);
 	init_llist_head(&dev_priv->mm.free_list);
 	INIT_LIST_HEAD(&dev_priv->mm.unbound_list);
 	INIT_LIST_HEAD(&dev_priv->mm.bound_list);
 	INIT_LIST_HEAD(&dev_priv->mm.fence_list);
 	INIT_LIST_HEAD(&dev_priv->mm.userfault_list);
+
 	INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
 			  i915_gem_retire_work_handler);
 	INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
@@ -5040,12 +5067,12 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
 	i915_gem_shrink(dev_priv, -1UL, I915_SHRINK_UNBOUND);
 	i915_gem_drain_freed_objects(dev_priv);
 
-	mutex_lock(&dev_priv->drm.struct_mutex);
+	spin_lock(&dev_priv->mm.obj_lock);
 	for (p = phases; *p; p++) {
-		list_for_each_entry(obj, *p, global_link)
+		list_for_each_entry(obj, *p, mm.link)
 			__start_cpu_write(obj);
 	}
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+	spin_unlock(&dev_priv->mm.obj_lock);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index ad4e48435853..1158559d1ab2 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -3182,8 +3182,7 @@ void i915_gem_restore_gtt_mappings(struct drm_i915_private *dev_priv)
 	ggtt->base.closed = true; /* skip rewriting PTE on VMA unbind */
 
 	/* clflush objects bound into the GGTT and rebind them. */
-	list_for_each_entry_safe(obj, on,
-				 &dev_priv->mm.bound_list, global_link) {
+	list_for_each_entry_safe(obj, on, &dev_priv->mm.bound_list, mm.link) {
 		bool ggtt_bound = false;
 		struct i915_vma *vma;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index 5b19a4916a4d..b6425f163c8e 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -90,7 +90,6 @@ struct drm_i915_gem_object {
 
 	/** Stolen memory for this object, instead of being backed by shmem. */
 	struct drm_mm_node *stolen;
-	struct list_head global_link;
 	union {
 		struct rcu_head rcu;
 		struct llist_node freed;
@@ -152,6 +151,12 @@ struct drm_i915_gem_object {
 		} get_page;
 
 		/**
+		 * Element within i915->mm.unbound_list or i915->mm.bound_list,
+		 * locked by i915->mm.obj_lock.
+		 */
+		struct list_head link;
+
+		/**
 		 * Advice: are the backing pages purgeable?
 		 */
 		unsigned int madv:2;
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index 1668a62619fb..ad129783d124 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -92,9 +92,6 @@ static bool swap_available(void)
 
 static bool can_release_pages(struct drm_i915_gem_object *obj)
 {
-	if (!i915_gem_object_has_pages(obj))
-		return false;
-
 	/* Consider only shrinkable ojects. */
 	if (!i915_gem_object_is_shrinkable(obj))
 		return false;
@@ -208,15 +205,20 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
 			continue;
 
 		INIT_LIST_HEAD(&still_in_list);
+
+		/*
+		 * We serialize our access to unreferenced objects through
+		 * the use of the struct_mutex. While the objects are not
+		 * yet freed (due to RCU then a workqueue) we still want
+		 * to be able to shrink their pages, so they remain on
+		 * the unbound/bound list until actually freed.
+		 */
+		spin_lock(&dev_priv->mm.obj_lock);
 		while (count < target &&
 		       (obj = list_first_entry_or_null(phase->list,
 						       typeof(*obj),
-						       global_link))) {
-			list_move_tail(&obj->global_link, &still_in_list);
-			if (!obj->mm.pages) {
-				list_del_init(&obj->global_link);
-				continue;
-			}
+						       mm.link))) {
+			list_move_tail(&obj->mm.link, &still_in_list);
 
 			if (flags & I915_SHRINK_PURGEABLE &&
 			    obj->mm.madv != I915_MADV_DONTNEED)
@@ -234,19 +236,23 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
 			if (!can_release_pages(obj))
 				continue;
 
+			spin_unlock(&dev_priv->mm.obj_lock);
+
 			if (unsafe_drop_pages(obj)) {
 				/* May arrive from get_pages on another bo */
 				mutex_lock_nested(&obj->mm.lock,
 						  I915_MM_SHRINKER);
 				if (!i915_gem_object_has_pages(obj)) {
 					__i915_gem_object_invalidate(obj);
-					list_del_init(&obj->global_link);
 					count += obj->base.size >> PAGE_SHIFT;
 				}
 				mutex_unlock(&obj->mm.lock);
 			}
+
+			spin_lock(&dev_priv->mm.obj_lock);
 		}
 		list_splice_tail(&still_in_list, phase->list);
+		spin_unlock(&dev_priv->mm.obj_lock);
 	}
 
 	if (flags & I915_SHRINK_BOUND)
@@ -293,25 +299,18 @@ i915_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc)
 	struct drm_i915_private *dev_priv =
 		container_of(shrinker, struct drm_i915_private, mm.shrinker);
 	struct drm_i915_gem_object *obj;
-	unsigned long count;
-	bool unlock;
-
-	if (!shrinker_lock(dev_priv, &unlock))
-		return 0;
-
-	i915_gem_retire_requests(dev_priv);
+	unsigned long count = 0;
 
-	count = 0;
-	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link)
+	spin_lock(&dev_priv->mm.obj_lock);
+	list_for_each_entry(obj, &dev_priv->mm.unbound_list, mm.link)
 		if (can_release_pages(obj))
 			count += obj->base.size >> PAGE_SHIFT;
 
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) {
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) {
 		if (!i915_gem_object_is_active(obj) && can_release_pages(obj))
 			count += obj->base.size >> PAGE_SHIFT;
 	}
-
-	shrinker_unlock(dev_priv, unlock);
+	spin_unlock(&dev_priv->mm.obj_lock);
 
 	return count;
 }
@@ -383,10 +382,6 @@ i915_gem_shrinker_oom(struct notifier_block *nb, unsigned long event, void *ptr)
 		container_of(nb, struct drm_i915_private, mm.oom_notifier);
 	struct drm_i915_gem_object *obj;
 	unsigned long unevictable, bound, unbound, freed_pages;
-	bool unlock;
-
-	if (!shrinker_lock_uninterruptible(dev_priv, &unlock, 5000))
-		return NOTIFY_DONE;
 
 	freed_pages = i915_gem_shrink_all(dev_priv);
 
@@ -395,26 +390,20 @@ i915_gem_shrinker_oom(struct notifier_block *nb, unsigned long event, void *ptr)
 	 * being pointed to by hardware.
 	 */
 	unbound = bound = unevictable = 0;
-	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) {
-		if (!i915_gem_object_has_pages(obj))
-			continue;
-
+	spin_lock(&dev_priv->mm.obj_lock);
+	list_for_each_entry(obj, &dev_priv->mm.unbound_list, mm.link) {
 		if (!can_release_pages(obj))
 			unevictable += obj->base.size >> PAGE_SHIFT;
 		else
 			unbound += obj->base.size >> PAGE_SHIFT;
 	}
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) {
-		if (!i915_gem_object_has_pages(obj))
-			continue;
-
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) {
 		if (!can_release_pages(obj))
 			unevictable += obj->base.size >> PAGE_SHIFT;
 		else
 			bound += obj->base.size >> PAGE_SHIFT;
 	}
-
-	shrinker_unlock(dev_priv, unlock);
+	spin_unlock(&dev_priv->mm.obj_lock);
 
 	if (freed_pages || unbound || bound)
 		pr_info("Purging GPU memory, %lu pages freed, "
diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
index c11c915382e7..a6e49d1bd389 100644
--- a/drivers/gpu/drm/i915/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
@@ -717,8 +717,11 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_i915_private *dev_priv
 	vma->flags |= I915_VMA_GLOBAL_BIND;
 	__i915_vma_set_map_and_fenceable(vma);
 	list_move_tail(&vma->vm_link, &ggtt->base.inactive_list);
-	list_move_tail(&obj->global_link, &dev_priv->mm.bound_list);
+
+	spin_lock(&dev_priv->mm.obj_lock);
+	list_move_tail(&obj->mm.link, &dev_priv->mm.bound_list);
 	obj->bind_count++;
+	spin_unlock(&dev_priv->mm.obj_lock);
 
 	return obj;
 
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 0c489090d4ab..fbb12a39ffc1 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -58,8 +58,10 @@ i915_vma_retire(struct i915_gem_active *active,
 	 * so that we don't steal from recently used but inactive objects
 	 * (unless we are forced to ofc!)
 	 */
+	spin_lock(&rq->i915->mm.obj_lock);
 	if (obj->bind_count)
-		list_move_tail(&obj->global_link, &rq->i915->mm.bound_list);
+		list_move_tail(&obj->mm.link, &rq->i915->mm.bound_list);
+	spin_unlock(&rq->i915->mm.obj_lock);
 
 	obj->mm.dirty = true; /* be paranoid  */
 
@@ -515,9 +517,13 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
 	GEM_BUG_ON(!i915_gem_valid_gtt_space(vma, obj->cache_level));
 
-	list_move_tail(&obj->global_link, &dev_priv->mm.bound_list);
 	list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
+
+	spin_lock(&dev_priv->mm.obj_lock);
+	list_move_tail(&obj->mm.link, &dev_priv->mm.bound_list);
 	obj->bind_count++;
+	spin_unlock(&dev_priv->mm.obj_lock);
+
 	GEM_BUG_ON(atomic_read(&obj->mm.pages_pin_count) < obj->bind_count);
 
 	return 0;
@@ -530,6 +536,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 static void
 i915_vma_remove(struct i915_vma *vma)
 {
+	struct drm_i915_private *i915 = vma->vm->i915;
 	struct drm_i915_gem_object *obj = vma->obj;
 
 	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
@@ -541,9 +548,10 @@ i915_vma_remove(struct i915_vma *vma)
 	/* Since the unbound list is global, only move to that list if
 	 * no more VMAs exist.
 	 */
+	spin_lock(&i915->mm.obj_lock);
 	if (--obj->bind_count == 0)
-		list_move_tail(&obj->global_link,
-			       &to_i915(obj->base.dev)->mm.unbound_list);
+		list_move_tail(&obj->mm.link, &i915->mm.unbound_list);
+	spin_unlock(&i915->mm.obj_lock);
 
 	/* And finally now the object is completely decoupled from this vma,
 	 * we can drop its hold on the backing storage and allow it to be
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 12b85b3278cd..6a98a5e86d49 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -415,7 +415,7 @@ static int fake_aliasing_ppgtt_enable(struct drm_i915_private *i915)
 	if (err)
 		return err;
 
-	list_for_each_entry(obj, &i915->mm.bound_list, global_link) {
+	list_for_each_entry(obj, &i915->mm.bound_list, mm.link) {
 		struct i915_vma *vma;
 
 		vma = i915_vma_instance(obj, &i915->ggtt.base, NULL);
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index 5ea373221f49..d423a46bf019 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
@@ -47,7 +47,7 @@ static int populate_ggtt(struct drm_i915_private *i915)
 
 	if (!list_empty(&i915->mm.unbound_list)) {
 		size = 0;
-		list_for_each_entry(obj, &i915->mm.unbound_list, global_link)
+		list_for_each_entry(obj, &i915->mm.unbound_list, mm.link)
 			size++;
 
 		pr_err("Found %lld objects unbound!\n", size);
@@ -74,10 +74,10 @@ static void cleanup_objects(struct drm_i915_private *i915)
 {
 	struct drm_i915_gem_object *obj, *on;
 
-	list_for_each_entry_safe(obj, on, &i915->mm.unbound_list, global_link)
+	list_for_each_entry_safe(obj, on, &i915->mm.unbound_list, mm.link)
 		i915_gem_object_put(obj);
 
-	list_for_each_entry_safe(obj, on, &i915->mm.bound_list, global_link)
+	list_for_each_entry_safe(obj, on, &i915->mm.bound_list, mm.link)
 		i915_gem_object_put(obj);
 
 	mutex_unlock(&i915->drm.struct_mutex);
@@ -149,8 +149,6 @@ static int igt_overcommit(void *arg)
 		goto cleanup;
 	}
 
-	list_move(&obj->global_link, &i915->mm.unbound_list);
-
 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
 	if (!IS_ERR(vma) || PTR_ERR(vma) != -ENOSPC) {
 		pr_err("Failed to evict+insert, i915_gem_object_ggtt_pin returned err=%d\n", (int)PTR_ERR(vma));
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 07/16] drm/i915: Trim struct_mutex hold duration for i915_gem_free_objects
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (5 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 06/16] drm/i915: Move dev_priv->mm.[un]bound_list to its own lock Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-08-11  7:36   ` Joonas Lahtinen
  2017-07-26 13:26 ` [PATCH 08/16] drm/i915: Only free the oldest stale object before a fresh allocation Chris Wilson
                   ` (9 subsequent siblings)
  16 siblings, 1 reply; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

We free objects in bulk after they wait for their RCU grace period.
Currently, we take struct_mutex and unbind all the objects. This can lead
to a long lock duration during which time those objects have their pages
unfreeable (i.e. the shrinker is prevented from reaping those pages). If
we only process a single object under the struct_mutex and then free the
pages, the number of objects locked away from the shrinker is minimal
and we allow regular clients better access to struct_mutex if they need
it.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index bcfaa86451bf..12bec1eabe46 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4430,13 +4430,14 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 {
 	struct drm_i915_gem_object *obj, *on;
 
-	mutex_lock(&i915->drm.struct_mutex);
 	intel_runtime_pm_get(i915);
-	llist_for_each_entry(obj, freed, freed) {
+	llist_for_each_entry_safe(obj, on, freed, freed) {
 		struct i915_vma *vma, *vn;
 
 		trace_i915_gem_object_destroy(obj);
 
+		mutex_lock(&i915->drm.struct_mutex);
+
 		GEM_BUG_ON(i915_gem_object_is_active(obj));
 		list_for_each_entry_safe(vma, vn,
 					 &obj->vma_list, obj_link) {
@@ -4459,13 +4460,8 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 			spin_unlock(&i915->mm.obj_lock);
 		}
 
-	}
-	intel_runtime_pm_put(i915);
-	mutex_unlock(&i915->drm.struct_mutex);
+		mutex_unlock(&i915->drm.struct_mutex);
 
-	cond_resched();
-
-	llist_for_each_entry_safe(obj, on, freed, freed) {
 		GEM_BUG_ON(obj->bind_count);
 		GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
 
@@ -4486,7 +4482,11 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 
 		kfree(obj->bit_17);
 		i915_gem_object_free(obj);
+
+		if (on)
+			cond_resched();
 	}
+	intel_runtime_pm_put(i915);
 }
 
 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 08/16] drm/i915: Only free the oldest stale object before a fresh allocation
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (6 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 07/16] drm/i915: Trim struct_mutex hold duration for i915_gem_free_objects Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-07-26 13:26 ` [PATCH 09/16] drm/i915: Track user GTT faulting per-vma Chris Wilson
                   ` (8 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

Inspired by Tvrtko's critique of the reaping of the stale contexts
before allocating a new one, also limit the freed object reaping to the
oldest stale object before allocating a fresh object. Unlike contexts,
objects may have radically different sizes of backing storage, but
similar to contexts, whilst we want to prevent starvation due to
excessive freed lists, we also want do not want to delay fresh
allocations for too long. Only freeing the oldest on the freed object
list before each allocation is a reasonable compromise.

v2: Only a single consumer of llist_del_first() is allowed (although
multiple llist_add are still allowed in parallel). Unlike
i915_gem_context, i915_gem_flush_free_objects() is itself not serialized
and so we need to add our own spinlock. Otherwise KASAN eventually spots
a use-after-free for the race on *first->next.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> #v1
---
 drivers/gpu/drm/i915/i915_drv.h |  1 +
 drivers/gpu/drm/i915/i915_gem.c | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 856214fe28b2..1760a309b488 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1465,6 +1465,7 @@ struct i915_gem_mm {
 	 */
 	struct llist_head free_list;
 	struct work_struct free_work;
+	spinlock_t free_lock;
 
 	/** Small stash of WC pages */
 	struct pagevec wc_stash;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 12bec1eabe46..1895d49e9f69 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4493,9 +4493,18 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
 {
 	struct llist_node *freed;
 
-	freed = llist_del_all(&i915->mm.free_list);
-	if (unlikely(freed))
+	/* Free the oldest, most stale object to keep the free_list short */
+	freed = NULL;
+	if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
+		/* Only one consumer of llist_del_first() allowed */
+		spin_lock(&i915->mm.free_lock);
+		freed = llist_del_first(&i915->mm.free_list);
+		spin_unlock(&i915->mm.free_lock);
+	}
+	if (unlikely(freed)) {
+		freed->next = NULL;
 		__i915_gem_free_objects(i915, freed);
+	}
 }
 
 static void __i915_gem_free_work(struct work_struct *work)
@@ -4977,6 +4986,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv)
 	INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
 
 	spin_lock_init(&dev_priv->mm.obj_lock);
+	spin_lock_init(&dev_priv->mm.free_lock);
 	init_llist_head(&dev_priv->mm.free_list);
 	INIT_LIST_HEAD(&dev_priv->mm.unbound_list);
 	INIT_LIST_HEAD(&dev_priv->mm.bound_list);
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 09/16] drm/i915: Track user GTT faulting per-vma
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (7 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 08/16] drm/i915: Only free the oldest stale object before a fresh allocation Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-07-26 13:26 ` [PATCH 10/16] drm/i915: Try a minimal attempt to insert the whole object for relocations Chris Wilson
                   ` (7 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

We don't wish to refault the entire object (other vma) when unbinding
one partial vma. To do this track which vma have been faulted into the
user's address space.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c       |  2 +-
 drivers/gpu/drm/i915/i915_gem.c           | 50 +++++++++++++++++++++----------
 drivers/gpu/drm/i915/i915_gem_evict.c     |  2 +-
 drivers/gpu/drm/i915/i915_gem_fence_reg.c | 10 ++++---
 drivers/gpu/drm/i915/i915_gem_object.h    |  1 +
 drivers/gpu/drm/i915/i915_vma.c           | 26 +++++++++++++++-
 drivers/gpu/drm/i915/i915_vma.h           | 21 ++++++++++++-
 7 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 73e0a325614d..0877040458ff 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -96,7 +96,7 @@ static char get_tiling_flag(struct drm_i915_gem_object *obj)
 
 static char get_global_flag(struct drm_i915_gem_object *obj)
 {
-	return !list_empty(&obj->userfault_link) ? 'g' : ' ';
+	return obj->userfault_count ? 'g' : ' ';
 }
 
 static char get_pin_mapped_flag(struct drm_i915_gem_object *obj)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1895d49e9f69..dfbf75bb6cf8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1948,18 +1948,22 @@ int i915_gem_fault(struct vm_fault *vmf)
 	if (ret)
 		goto err_unpin;
 
-	/* Mark as being mmapped into userspace for later revocation */
-	assert_rpm_wakelock_held(dev_priv);
-	if (list_empty(&obj->userfault_link))
-		list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
-
 	/* Finally, remap it using the new GTT offset */
 	ret = remap_io_mapping(area,
 			       area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
 			       (ggtt->mappable_base + vma->node.start) >> PAGE_SHIFT,
 			       min_t(u64, vma->size, area->vm_end - area->vm_start),
 			       &ggtt->mappable);
+	if (ret)
+		goto err_fence;
 
+	/* Mark as being mmapped into userspace for later revocation */
+	assert_rpm_wakelock_held(dev_priv);
+	if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
+		list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
+	GEM_BUG_ON(!obj->userfault_count);
+
+err_fence:
 	i915_vma_unpin_fence(vma);
 err_unpin:
 	__i915_vma_unpin(vma);
@@ -2012,6 +2016,25 @@ int i915_gem_fault(struct vm_fault *vmf)
 	return ret;
 }
 
+static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
+{
+	struct i915_vma *vma;
+
+	GEM_BUG_ON(!obj->userfault_count);
+
+	obj->userfault_count = 0;
+	list_del(&obj->userfault_link);
+	drm_vma_node_unmap(&obj->base.vma_node,
+			   obj->base.dev->anon_inode->i_mapping);
+
+	list_for_each_entry(vma, &obj->vma_list, obj_link) {
+		if (!i915_vma_is_ggtt(vma))
+			break;
+
+		i915_vma_unset_userfault(vma);
+	}
+}
+
 /**
  * i915_gem_release_mmap - remove physical page mappings
  * @obj: obj in question
@@ -2042,12 +2065,10 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
 	lockdep_assert_held(&i915->drm.struct_mutex);
 	intel_runtime_pm_get(i915);
 
-	if (list_empty(&obj->userfault_link))
+	if (!obj->userfault_count)
 		goto out;
 
-	list_del_init(&obj->userfault_link);
-	drm_vma_node_unmap(&obj->base.vma_node,
-			   obj->base.dev->anon_inode->i_mapping);
+	__i915_gem_object_release_mmap(obj);
 
 	/* Ensure that the CPU's PTE are revoked and there are not outstanding
 	 * memory transactions from userspace before we return. The TLB
@@ -2075,11 +2096,8 @@ void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
 	 */
 
 	list_for_each_entry_safe(obj, on,
-				 &dev_priv->mm.userfault_list, userfault_link) {
-		list_del_init(&obj->userfault_link);
-		drm_vma_node_unmap(&obj->base.vma_node,
-				   obj->base.dev->anon_inode->i_mapping);
-	}
+				 &dev_priv->mm.userfault_list, userfault_link)
+		__i915_gem_object_release_mmap(obj);
 
 	/* The fence will be lost when the device powers down. If any were
 	 * in use by hardware (i.e. they are pinned), we should not be powering
@@ -2102,7 +2120,7 @@ void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
 		if (!reg->vma)
 			continue;
 
-		GEM_BUG_ON(!list_empty(&reg->vma->obj->userfault_link));
+		GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
 		reg->dirty = true;
 	}
 }
@@ -4300,7 +4318,6 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 {
 	mutex_init(&obj->mm.lock);
 
-	INIT_LIST_HEAD(&obj->userfault_link);
 	INIT_LIST_HEAD(&obj->vma_list);
 	INIT_LIST_HEAD(&obj->batch_pool_link);
 
@@ -4463,6 +4480,7 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		mutex_unlock(&i915->drm.struct_mutex);
 
 		GEM_BUG_ON(obj->bind_count);
+		GEM_BUG_ON(obj->userfault_count);
 		GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
 
 		if (obj->ops->release)
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index a193f1b36c67..f4ea181d46fb 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -82,7 +82,7 @@ mark_free(struct drm_mm_scan *scan,
 	if (i915_vma_is_pinned(vma))
 		return false;
 
-	if (flags & PIN_NONFAULT && !list_empty(&vma->obj->userfault_link))
+	if (flags & PIN_NONFAULT && i915_vma_has_userfault(vma))
 		return false;
 
 	list_add(&vma->evict_link, unwind);
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index c5a00ff44786..ec5c42c3dc91 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -291,7 +291,8 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		/* Ensure that all userspace CPU access is completed before
 		 * stealing the fence.
 		 */
-		i915_gem_release_mmap(fence->vma->obj);
+		GEM_BUG_ON(fence->vma->fence != fence);
+		i915_vma_revoke_mmap(fence->vma);
 
 		fence->vma->fence = NULL;
 		fence->vma = NULL;
@@ -459,7 +460,8 @@ int i915_vma_reserve_fence(struct i915_vma *vma)
 		vma->fence = fence;
 
 		if (fence->vma) {
-			i915_gem_release_mmap(fence->vma->obj);
+			GEM_BUG_ON(fence->vma->fence != fence);
+			i915_vma_revoke_mmap(fence->vma);
 			fence->vma->fence = NULL;
 		}
 		fence->vma = vma;
@@ -545,7 +547,7 @@ void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
 		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
 
 		if (fence->vma)
-			i915_gem_release_mmap(fence->vma->obj);
+			i915_vma_revoke_mmap(fence->vma);
 	}
 }
 
@@ -574,7 +576,7 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
 		if (vma && !i915_gem_object_is_tiled(vma->obj)) {
 			GEM_BUG_ON(!reg->dirty);
 			GEM_BUG_ON(atomic_read(&reg->pin_count));
-			GEM_BUG_ON(!list_empty(&vma->obj->userfault_link));
+			GEM_BUG_ON(i915_vma_has_userfault(vma));
 
 			list_move(&reg->link, &dev_priv->mm.fence_list);
 			vma->fence = NULL;
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index b6425f163c8e..83e364f32253 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -98,6 +98,7 @@ struct drm_i915_gem_object {
 	/**
 	 * Whether the object is currently in the GGTT mmap.
 	 */
+	unsigned int userfault_count;
 	struct list_head userfault_link;
 
 	struct list_head batch_pool_link;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index fbb12a39ffc1..96cb0752e8e9 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -668,6 +668,29 @@ static void __i915_vma_iounmap(struct i915_vma *vma)
 	vma->iomap = NULL;
 }
 
+void i915_vma_revoke_mmap(struct i915_vma *vma)
+{
+	struct drm_vma_offset_node *node = &vma->obj->base.vma_node;
+
+	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
+
+	if (!i915_vma_has_userfault(vma))
+		return;
+
+	GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
+	GEM_BUG_ON(!vma->obj->userfault_count);
+
+	unmap_mapping_range(vma->vm->i915->drm.anon_inode->i_mapping,
+			    drm_vma_node_offset_addr(node) +
+			    (vma->ggtt_view.partial.offset << PAGE_SHIFT),
+			    vma->size,
+			    1);
+
+	i915_vma_unset_userfault(vma);
+	if (!--vma->obj->userfault_count)
+		list_del(&vma->obj->userfault_link);
+}
+
 int i915_vma_unbind(struct i915_vma *vma)
 {
 	struct drm_i915_gem_object *obj = vma->obj;
@@ -731,12 +754,13 @@ int i915_vma_unbind(struct i915_vma *vma)
 			return ret;
 
 		/* Force a pagefault for domain tracking on next user access */
-		i915_gem_release_mmap(obj);
+		i915_vma_revoke_mmap(vma);
 
 		__i915_vma_iounmap(vma);
 		vma->flags &= ~I915_VMA_CAN_FENCE;
 	}
 	GEM_BUG_ON(vma->fence);
+	GEM_BUG_ON(i915_vma_has_userfault(vma));
 
 	if (likely(!vma->vm->closed)) {
 		trace_i915_vma_unbind(vma);
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 4808601b91f1..16bb1734783e 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -59,7 +59,7 @@ struct i915_vma {
 	u32 fence_size;
 	u32 fence_alignment;
 
-	unsigned int flags;
+	unsigned long flags;
 	/**
 	 * How many users have pinned this object in GTT space. The following
 	 * users can each hold at most one reference: pwrite/pread, execbuffer
@@ -81,6 +81,8 @@ struct i915_vma {
 #define I915_VMA_GGTT		BIT(8)
 #define I915_VMA_CAN_FENCE	BIT(9)
 #define I915_VMA_CLOSED		BIT(10)
+#define I915_VMA_USERFAULT_BIT	11
+#define I915_VMA_USERFAULT	BIT(I915_VMA_USERFAULT_BIT)
 
 	unsigned int active;
 	struct i915_gem_active last_read[I915_NUM_ENGINES];
@@ -143,6 +145,22 @@ static inline bool i915_vma_is_closed(const struct i915_vma *vma)
 	return vma->flags & I915_VMA_CLOSED;
 }
 
+static inline bool i915_vma_set_userfault(struct i915_vma *vma)
+{
+	GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
+	return __test_and_set_bit(I915_VMA_USERFAULT_BIT, &vma->flags);
+}
+
+static inline void i915_vma_unset_userfault(struct i915_vma *vma)
+{
+	return __clear_bit(I915_VMA_USERFAULT_BIT, &vma->flags);
+}
+
+static inline bool i915_vma_has_userfault(const struct i915_vma *vma)
+{
+	return test_bit(I915_VMA_USERFAULT_BIT, &vma->flags);
+}
+
 static inline unsigned int i915_vma_get_active(const struct i915_vma *vma)
 {
 	return vma->active;
@@ -241,6 +259,7 @@ bool i915_gem_valid_gtt_space(struct i915_vma *vma, unsigned long cache_level);
 bool i915_vma_misplaced(const struct i915_vma *vma,
 			u64 size, u64 alignment, u64 flags);
 void __i915_vma_set_map_and_fenceable(struct i915_vma *vma);
+void i915_vma_revoke_mmap(struct i915_vma *vma);
 int __must_check i915_vma_unbind(struct i915_vma *vma);
 void i915_vma_unlink_ctx(struct i915_vma *vma);
 void i915_vma_close(struct i915_vma *vma);
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 10/16] drm/i915: Try a minimal attempt to insert the whole object for relocations
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (8 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 09/16] drm/i915: Track user GTT faulting per-vma Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-07-26 13:26 ` [PATCH 11/16] drm/i915: Check context status before looking up our obj/vma Chris Wilson
                   ` (6 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

As we have a lightweight fallback to insert a single page into the
aperture, try to avoid any heavier evictions when attempting to insert
the entire object.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 4ffc2a4b0f2d..9aa9e751f36b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1059,7 +1059,9 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
 			return ERR_PTR(err);
 
 		vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
-					       PIN_MAPPABLE | PIN_NONBLOCK);
+					       PIN_MAPPABLE |
+					       PIN_NONBLOCK |
+					       PIN_NONFAULT);
 		if (IS_ERR(vma)) {
 			memset(&cache->node, 0, sizeof(cache->node));
 			err = drm_mm_insert_node_in_range
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 11/16] drm/i915: Check context status before looking up our obj/vma
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (9 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 10/16] drm/i915: Try a minimal attempt to insert the whole object for relocations Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-07-26 13:26 ` [PATCH 12/16] drm/i915: Convert execbuf to use struct-of-array packing for critical fields Chris Wilson
                   ` (5 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

Since we keep the context around across the slow lookup where we may
drop the struct_mutex, we should double check that the context is still
valid upon reacquisition.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 9aa9e751f36b..8a1aaa9cd569 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -680,13 +680,6 @@ static int eb_select_context(struct i915_execbuffer *eb)
 	if (unlikely(!ctx))
 		return -ENOENT;
 
-	if (unlikely(i915_gem_context_is_banned(ctx))) {
-		DRM_DEBUG("Context %u tried to submit while banned\n",
-			  ctx->user_handle);
-		i915_gem_context_put(ctx);
-		return -EIO;
-	}
-
 	eb->ctx = ctx;
 	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
 
@@ -708,6 +701,12 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 	int slow_pass = -1;
 	int err;
 
+	if (unlikely(i915_gem_context_is_closed(eb->ctx)))
+		return -ENOENT;
+
+	if (unlikely(i915_gem_context_is_banned(eb->ctx)))
+		return -EIO;
+
 	INIT_LIST_HEAD(&eb->relocs);
 	INIT_LIST_HEAD(&eb->unbound);
 
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 12/16] drm/i915: Convert execbuf to use struct-of-array packing for critical fields
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (10 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 11/16] drm/i915: Check context status before looking up our obj/vma Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-07-26 13:26 ` [PATCH 13/16] drm/i915: Simplify eb_lookup_vmas() Chris Wilson
                   ` (4 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

When userspace is doing most of the work, avoiding relocs (using
NO_RELOC) and opting out of implicit synchronisation (using ASYNC), we
still spend a lot of time processing the arrays in execbuf, even though
we now should have nothing to do most of the time. One issue that
becomes readily apparent in profiling anv is that iterating over the
large execobj[] is unfriendly to the loop prefetchers of the CPU and it
much prefers iterating over a pair of arrays rather than one big array.

v2: Clear vma[] on construction to handle errors during vma lookup

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_evict.c      |   4 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 310 +++++++++++++++--------------
 drivers/gpu/drm/i915/i915_vma.h            |   2 +-
 3 files changed, 161 insertions(+), 155 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index f4ea181d46fb..933ee8ecfa54 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -318,8 +318,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 		/* Overlap of objects in the same batch? */
 		if (i915_vma_is_pinned(vma)) {
 			ret = -ENOSPC;
-			if (vma->exec_entry &&
-			    vma->exec_entry->flags & EXEC_OBJECT_PINNED)
+			if (vma->exec_flags &&
+			    *vma->exec_flags & EXEC_OBJECT_PINNED)
 				ret = -EINVAL;
 			break;
 		}
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 8a1aaa9cd569..92d88ec271c1 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -191,6 +191,8 @@ struct i915_execbuffer {
 	struct drm_file *file; /** per-file lookup tables and limits */
 	struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
 	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
+	struct i915_vma **vma;
+	unsigned int *flags;
 
 	struct intel_engine_cs *engine; /** engine to queue the request to */
 	struct i915_gem_context *ctx; /** context for building the request */
@@ -244,13 +246,7 @@ struct i915_execbuffer {
 	struct hlist_head *buckets; /** ht for relocation handles */
 };
 
-/*
- * As an alternative to creating a hashtable of handle-to-vma for a batch,
- * we used the last available reserved field in the execobject[] and stash
- * a link from the execobj to its vma.
- */
-#define __exec_to_vma(ee) (ee)->rsvd2
-#define exec_to_vma(ee) u64_to_ptr(struct i915_vma, __exec_to_vma(ee))
+#define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags])
 
 /*
  * Used to convert any address to canonical form.
@@ -319,86 +315,83 @@ static int eb_create(struct i915_execbuffer *eb)
 
 static bool
 eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
-		 const struct i915_vma *vma)
+		 const struct i915_vma *vma,
+		 unsigned int flags)
 {
-	if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
-		return true;
-
 	if (vma->node.size < entry->pad_to_size)
 		return true;
 
 	if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
 		return true;
 
-	if (entry->flags & EXEC_OBJECT_PINNED &&
+	if (flags & EXEC_OBJECT_PINNED &&
 	    vma->node.start != entry->offset)
 		return true;
 
-	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
+	if (flags & __EXEC_OBJECT_NEEDS_BIAS &&
 	    vma->node.start < BATCH_OFFSET_BIAS)
 		return true;
 
-	if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
+	if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
 	    (vma->node.start + vma->node.size - 1) >> 32)
 		return true;
 
 	return false;
 }
 
-static inline void
+static inline bool
 eb_pin_vma(struct i915_execbuffer *eb,
-	   struct drm_i915_gem_exec_object2 *entry,
+	   const struct drm_i915_gem_exec_object2 *entry,
 	   struct i915_vma *vma)
 {
-	u64 flags;
+	unsigned int exec_flags = *vma->exec_flags;
+	u64 pin_flags;
 
 	if (vma->node.size)
-		flags = vma->node.start;
+		pin_flags = vma->node.start;
 	else
-		flags = entry->offset & PIN_OFFSET_MASK;
+		pin_flags = entry->offset & PIN_OFFSET_MASK;
 
-	flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED;
-	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_GTT))
-		flags |= PIN_GLOBAL;
+	pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED;
+	if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_GTT))
+		pin_flags |= PIN_GLOBAL;
 
-	if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
-		return;
+	if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags)))
+		return false;
 
-	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+	if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) {
 		if (unlikely(i915_vma_reserve_fence(vma))) {
 			i915_vma_unpin(vma);
-			return;
+			return false;
 		}
 
-		entry->flags &= ~EXEC_OBJECT_ASYNC;
+		exec_flags &= ~EXEC_OBJECT_ASYNC;
 		if (vma->fence)
-			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+			exec_flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
 
-	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+	*vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN;
+	return !eb_vma_misplaced(entry, vma, exec_flags);
 }
 
-static inline void
-__eb_unreserve_vma(struct i915_vma *vma,
-		   const struct drm_i915_gem_exec_object2 *entry)
+static inline void __eb_unreserve_vma(struct i915_vma *vma, unsigned int flags)
 {
-	GEM_BUG_ON(!(entry->flags & __EXEC_OBJECT_HAS_PIN));
+	GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN));
 
-	if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
+	if (unlikely(flags & __EXEC_OBJECT_HAS_FENCE))
 		__i915_vma_unpin_fence(vma);
 
 	__i915_vma_unpin(vma);
 }
 
 static inline void
-eb_unreserve_vma(struct i915_vma *vma,
-		 struct drm_i915_gem_exec_object2 *entry)
+eb_unreserve_vma(struct i915_vma *vma, unsigned int *flags)
 {
-	if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
+	if (!(*flags & __EXEC_OBJECT_HAS_PIN))
 		return;
 
-	__eb_unreserve_vma(vma, entry);
-	entry->flags &= ~__EXEC_OBJECT_RESERVED;
+	__eb_unreserve_vma(vma, *flags);
+	*flags &= ~__EXEC_OBJECT_RESERVED;
 }
 
 static int
@@ -428,7 +421,7 @@ eb_validate_vma(struct i915_execbuffer *eb,
 		entry->pad_to_size = 0;
 	}
 
-	if (unlikely(vma->exec_entry)) {
+	if (unlikely(vma->exec_flags)) {
 		DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
 			  entry->handle, (int)(entry - eb->exec));
 		return -EINVAL;
@@ -441,14 +434,25 @@ eb_validate_vma(struct i915_execbuffer *eb,
 	 */
 	entry->offset = gen8_noncanonical_addr(entry->offset);
 
+	if (!eb->reloc_cache.has_fence) {
+		entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
+	} else {
+		if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE ||
+		     eb->reloc_cache.needs_unfenced) &&
+		    i915_gem_object_is_tiled(vma->obj))
+			entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
+	}
+
+	if (!(entry->flags & EXEC_OBJECT_PINNED))
+		entry->flags |= eb->context_flags;
+
 	return 0;
 }
 
 static int
-eb_add_vma(struct i915_execbuffer *eb,
-	   struct drm_i915_gem_exec_object2 *entry,
-	   struct i915_vma *vma)
+eb_add_vma(struct i915_execbuffer *eb, unsigned int i, struct i915_vma *vma)
 {
+	struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
 	int err;
 
 	GEM_BUG_ON(i915_vma_is_closed(vma));
@@ -469,40 +473,28 @@ eb_add_vma(struct i915_execbuffer *eb,
 	if (entry->relocation_count)
 		list_add_tail(&vma->reloc_link, &eb->relocs);
 
-	if (!eb->reloc_cache.has_fence) {
-		entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
-	} else {
-		if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE ||
-		     eb->reloc_cache.needs_unfenced) &&
-		    i915_gem_object_is_tiled(vma->obj))
-			entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
-	}
-
-	if (!(entry->flags & EXEC_OBJECT_PINNED))
-		entry->flags |= eb->context_flags;
-
 	/*
 	 * Stash a pointer from the vma to execobj, so we can query its flags,
 	 * size, alignment etc as provided by the user. Also we stash a pointer
 	 * to the vma inside the execobj so that we can use a direct lookup
 	 * to find the right target VMA when doing relocations.
 	 */
-	vma->exec_entry = entry;
-	__exec_to_vma(entry) = (uintptr_t)vma;
+	eb->vma[i] = vma;
+	eb->flags[i] = entry->flags;
+	vma->exec_flags = &eb->flags[i];
 
 	err = 0;
-	eb_pin_vma(eb, entry, vma);
-	if (eb_vma_misplaced(entry, vma)) {
-		eb_unreserve_vma(vma, entry);
-
-		list_add_tail(&vma->exec_link, &eb->unbound);
-		if (drm_mm_node_allocated(&vma->node))
-			err = i915_vma_unbind(vma);
-	} else {
+	if (eb_pin_vma(eb, entry, vma)) {
 		if (entry->offset != vma->node.start) {
 			entry->offset = vma->node.start | UPDATE;
 			eb->args->flags |= __EXEC_HAS_RELOC;
 		}
+	} else {
+		eb_unreserve_vma(vma, vma->exec_flags);
+
+		list_add_tail(&vma->exec_link, &eb->unbound);
+		if (drm_mm_node_allocated(&vma->node))
+			err = i915_vma_unbind(vma);
 	}
 	return err;
 }
@@ -527,32 +519,35 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
 static int eb_reserve_vma(const struct i915_execbuffer *eb,
 			  struct i915_vma *vma)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	u64 flags;
+	struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma);
+	unsigned int exec_flags = *vma->exec_flags;
+	u64 pin_flags;
 	int err;
 
-	flags = PIN_USER | PIN_NONBLOCK;
-	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
-		flags |= PIN_GLOBAL;
+	pin_flags = PIN_USER | PIN_NONBLOCK;
+	if (exec_flags & EXEC_OBJECT_NEEDS_GTT)
+		pin_flags |= PIN_GLOBAL;
 
 	/*
 	 * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
 	 * limit address to the first 4GBs for unflagged objects.
 	 */
-	if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
-		flags |= PIN_ZONE_4G;
+	if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
+		pin_flags |= PIN_ZONE_4G;
 
-	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
-		flags |= PIN_MAPPABLE;
+	if (exec_flags & __EXEC_OBJECT_NEEDS_MAP)
+		pin_flags |= PIN_MAPPABLE;
 
-	if (entry->flags & EXEC_OBJECT_PINNED) {
-		flags |= entry->offset | PIN_OFFSET_FIXED;
-		flags &= ~PIN_NONBLOCK; /* force overlapping PINNED checks */
-	} else if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS) {
-		flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
+	if (exec_flags & EXEC_OBJECT_PINNED) {
+		pin_flags |= entry->offset | PIN_OFFSET_FIXED;
+		pin_flags &= ~PIN_NONBLOCK; /* force overlapping checks */
+	} else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS) {
+		pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
 	}
 
-	err = i915_vma_pin(vma, entry->pad_to_size, entry->alignment, flags);
+	err = i915_vma_pin(vma,
+			   entry->pad_to_size, entry->alignment,
+			   pin_flags);
 	if (err)
 		return err;
 
@@ -561,20 +556,20 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
 		eb->args->flags |= __EXEC_HAS_RELOC;
 	}
 
-	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+	if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) {
 		err = i915_vma_reserve_fence(vma);
 		if (unlikely(err)) {
 			i915_vma_unpin(vma);
 			return err;
 		}
 
-		entry->flags &= ~EXEC_OBJECT_ASYNC;
+		exec_flags &= ~EXEC_OBJECT_ASYNC;
 		if (vma->fence)
-			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+			exec_flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
 
-	entry->flags |= __EXEC_OBJECT_HAS_PIN;
-	GEM_BUG_ON(eb_vma_misplaced(entry, vma));
+	*vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN;
+	GEM_BUG_ON(eb_vma_misplaced(entry, vma, exec_flags));
 
 	return 0;
 }
@@ -616,18 +611,18 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		INIT_LIST_HEAD(&eb->unbound);
 		INIT_LIST_HEAD(&last);
 		for (i = 0; i < count; i++) {
-			struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+			unsigned int flags = eb->flags[i];
+			struct i915_vma *vma = eb->vma[i];
 
-			if (entry->flags & EXEC_OBJECT_PINNED &&
-			    entry->flags & __EXEC_OBJECT_HAS_PIN)
+			if (flags & EXEC_OBJECT_PINNED &&
+			    flags & __EXEC_OBJECT_HAS_PIN)
 				continue;
 
-			vma = exec_to_vma(entry);
-			eb_unreserve_vma(vma, entry);
+			eb_unreserve_vma(vma, &eb->flags[i]);
 
-			if (entry->flags & EXEC_OBJECT_PINNED)
+			if (flags & EXEC_OBJECT_PINNED)
 				list_add(&vma->exec_link, &eb->unbound);
-			else if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+			else if (flags & __EXEC_OBJECT_NEEDS_MAP)
 				list_add_tail(&vma->exec_link, &eb->unbound);
 			else
 				list_add_tail(&vma->exec_link, &last);
@@ -715,18 +710,15 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 	GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS);
 
 	for (i = 0; i < count; i++) {
-		__exec_to_vma(&eb->exec[i]) = 0;
-
 		hlist_for_each_entry(vma,
 				     ht_head(lut, eb->exec[i].handle),
 				     ctx_node) {
 			if (vma->ctx_handle != eb->exec[i].handle)
 				continue;
 
-			err = eb_add_vma(eb, &eb->exec[i], vma);
+			err = eb_add_vma(eb, i, vma);
 			if (unlikely(err))
 				return err;
-
 			goto next_vma;
 		}
 
@@ -747,7 +739,7 @@ next_vma: ;
 	for (i = slow_pass; i < count; i++) {
 		struct drm_i915_gem_object *obj;
 
-		if (__exec_to_vma(&eb->exec[i]))
+		if (eb->vma[i])
 			continue;
 
 		obj = to_intel_bo(idr_find(idr, eb->exec[i].handle));
@@ -759,14 +751,17 @@ next_vma: ;
 			goto err;
 		}
 
-		__exec_to_vma(&eb->exec[i]) = INTERMEDIATE | (uintptr_t)obj;
+		eb->vma[i] = (struct i915_vma *)
+			ptr_pack_bits(obj, INTERMEDIATE, 1);
 	}
 	spin_unlock(&eb->file->table_lock);
 
 	for (i = slow_pass; i < count; i++) {
 		struct drm_i915_gem_object *obj;
+		unsigned int is_obj;
 
-		if (!(__exec_to_vma(&eb->exec[i]) & INTERMEDIATE))
+		obj = (typeof(obj))ptr_unpack_bits(eb->vma[i], &is_obj, 1);
+		if (!is_obj)
 			continue;
 
 		/*
@@ -777,8 +772,6 @@ next_vma: ;
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
-		obj = u64_to_ptr(typeof(*obj),
-				 __exec_to_vma(&eb->exec[i]) & ~INTERMEDIATE);
 		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
@@ -802,14 +795,17 @@ next_vma: ;
 			i915_vma_get(vma);
 		}
 
-		err = eb_add_vma(eb, &eb->exec[i], vma);
+		err = eb_add_vma(eb, i, vma);
 		if (unlikely(err))
 			goto err;
 
+		GEM_BUG_ON(vma != eb->vma[i]);
+		GEM_BUG_ON(vma->exec_flags != &eb->flags[i]);
+
 		/* Only after we validated the user didn't use our bits */
 		if (vma->ctx != eb->ctx) {
 			i915_vma_get(vma);
-			eb->exec[i].flags |= __EXEC_OBJECT_HAS_REF;
+			*vma->exec_flags |= __EXEC_OBJECT_HAS_REF;
 		}
 	}
 
@@ -823,7 +819,8 @@ next_vma: ;
 out:
 	/* take note of the batch buffer before we might reorder the lists */
 	i = eb_batch_index(eb);
-	eb->batch = exec_to_vma(&eb->exec[i]);
+	eb->batch = eb->vma[i];
+	GEM_BUG_ON(eb->batch->exec_flags != &eb->flags[i]);
 
 	/*
 	 * SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -834,18 +831,18 @@ next_vma: ;
 	 * Note that actual hangs have only been observed on gen7, but for
 	 * paranoia do it everywhere.
 	 */
-	if (!(eb->exec[i].flags & EXEC_OBJECT_PINNED))
-		eb->exec[i].flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if (!(eb->flags[i] & EXEC_OBJECT_PINNED))
+		eb->flags[i] |= __EXEC_OBJECT_NEEDS_BIAS;
 	if (eb->reloc_cache.has_fence)
-		eb->exec[i].flags |= EXEC_OBJECT_NEEDS_FENCE;
+		eb->flags[i] |= EXEC_OBJECT_NEEDS_FENCE;
 
 	eb->args->flags |= __EXEC_VALIDATED;
 	return eb_reserve(eb);
 
 err:
 	for (i = slow_pass; i < count; i++) {
-		if (__exec_to_vma(&eb->exec[i]) & INTERMEDIATE)
-			__exec_to_vma(&eb->exec[i]) = 0;
+		if (ptr_unmask_bits(eb->vma[i], 1))
+			eb->vma[i] = NULL;
 	}
 	lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
 	return err;
@@ -858,7 +855,7 @@ eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle)
 	if (eb->lut_size < 0) {
 		if (handle >= -eb->lut_size)
 			return NULL;
-		return exec_to_vma(&eb->exec[handle]);
+		return eb->vma[handle];
 	} else {
 		struct hlist_head *head;
 		struct i915_vma *vma;
@@ -878,24 +875,21 @@ static void eb_release_vmas(const struct i915_execbuffer *eb)
 	unsigned int i;
 
 	for (i = 0; i < count; i++) {
-		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
-		struct i915_vma *vma = exec_to_vma(entry);
+		struct i915_vma *vma = eb->vma[i];
+		unsigned int flags = eb->flags[i];
 
 		if (!vma)
 			continue;
 
-		GEM_BUG_ON(vma->exec_entry != entry);
-		vma->exec_entry = NULL;
-		__exec_to_vma(entry) = 0;
+		GEM_BUG_ON(vma->exec_flags != &eb->flags[i]);
+		vma->exec_flags = NULL;
+		eb->vma[i] = NULL;
 
-		if (entry->flags & __EXEC_OBJECT_HAS_PIN)
-			__eb_unreserve_vma(vma, entry);
+		if (flags & __EXEC_OBJECT_HAS_PIN)
+			__eb_unreserve_vma(vma, flags);
 
-		if (entry->flags & __EXEC_OBJECT_HAS_REF)
+		if (flags & __EXEC_OBJECT_HAS_REF)
 			i915_vma_put(vma);
-
-		entry->flags &=
-			~(__EXEC_OBJECT_RESERVED | __EXEC_OBJECT_HAS_REF);
 	}
 }
 
@@ -1386,7 +1380,7 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	}
 
 	if (reloc->write_domain) {
-		target->exec_entry->flags |= EXEC_OBJECT_WRITE;
+		*target->exec_flags |= EXEC_OBJECT_WRITE;
 
 		/*
 		 * Sandybridge PPGTT errata: We need a global gtt mapping
@@ -1438,7 +1432,7 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	 * do relocations we are already stalling, disable the user's opt
 	 * of our synchronisation.
 	 */
-	vma->exec_entry->flags &= ~EXEC_OBJECT_ASYNC;
+	*vma->exec_flags &= ~EXEC_OBJECT_ASYNC;
 
 	/* and update the user's relocation entry */
 	return relocate_entry(vma, reloc, eb, target);
@@ -1449,7 +1443,7 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
 	struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
 	struct drm_i915_gem_relocation_entry __user *urelocs;
-	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma);
 	unsigned int remain;
 
 	urelocs = u64_to_user_ptr(entry->relocs_ptr);
@@ -1532,7 +1526,7 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
 static int
 eb_relocate_vma_slow(struct i915_execbuffer *eb, struct i915_vma *vma)
 {
-	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma);
 	struct drm_i915_gem_relocation_entry *relocs =
 		u64_to_ptr(typeof(*relocs), entry->relocs_ptr);
 	unsigned int i;
@@ -1736,6 +1730,8 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
 	if (err)
 		goto err;
 
+	GEM_BUG_ON(!eb->batch);
+
 	list_for_each_entry(vma, &eb->relocs, reloc_link) {
 		if (!have_copy) {
 			pagefault_disable();
@@ -1829,11 +1825,11 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
 	int err;
 
 	for (i = 0; i < count; i++) {
-		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
-		struct i915_vma *vma = exec_to_vma(entry);
+		unsigned int flags = eb->flags[i];
+		struct i915_vma *vma = eb->vma[i];
 		struct drm_i915_gem_object *obj = vma->obj;
 
-		if (entry->flags & EXEC_OBJECT_CAPTURE) {
+		if (flags & EXEC_OBJECT_CAPTURE) {
 			struct i915_gem_capture_list *capture;
 
 			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
@@ -1841,41 +1837,42 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
 				return -ENOMEM;
 
 			capture->next = eb->request->capture_list;
-			capture->vma = vma;
+			capture->vma = eb->vma[i];
 			eb->request->capture_list = capture;
 		}
 
 		if (unlikely(obj->cache_dirty && !obj->cache_coherent)) {
 			if (i915_gem_clflush_object(obj, 0))
-				entry->flags &= ~EXEC_OBJECT_ASYNC;
+				flags &= ~EXEC_OBJECT_ASYNC;
 		}
 
-		if (entry->flags & EXEC_OBJECT_ASYNC)
-			goto skip_flushes;
+		if (flags & EXEC_OBJECT_ASYNC)
+			continue;
 
-		if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
-			err = i915_vma_emit_pipelined_fence(vma, eb->request);
+		if (unlikely(flags & EXEC_OBJECT_NEEDS_FENCE)) {
+			err = i915_vma_emit_pipelined_fence(eb->vma[i],
+							    eb->request);
 			if (err)
 				return err;
 		}
 
 		err = i915_gem_request_await_object
-			(eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
+			(eb->request, obj, flags & EXEC_OBJECT_WRITE);
 		if (err)
 			return err;
-
-skip_flushes:
-		i915_vma_move_to_active(vma, eb->request, entry->flags);
-		__eb_unreserve_vma(vma, entry);
-		vma->exec_entry = NULL;
 	}
 
 	for (i = 0; i < count; i++) {
-		const struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
-		struct i915_vma *vma = exec_to_vma(entry);
+		unsigned int flags = eb->flags[i];
+		struct i915_vma *vma = eb->vma[i];
+
+		i915_vma_move_to_active(vma, eb->request, flags);
+		eb_export_fence(vma, eb->request, flags);
 
-		eb_export_fence(vma, eb->request, entry->flags);
-		if (unlikely(entry->flags & __EXEC_OBJECT_HAS_REF))
+		__eb_unreserve_vma(vma, flags);
+		vma->exec_flags = NULL;
+
+		if (unlikely(flags & __EXEC_OBJECT_HAS_REF))
 			i915_vma_put(vma);
 	}
 	eb->exec = NULL;
@@ -1999,11 +1996,11 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 	if (IS_ERR(vma))
 		goto out;
 
-	vma->exec_entry =
-		memset(&eb->exec[eb->buffer_count++],
-		       0, sizeof(*vma->exec_entry));
-	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF;
-	__exec_to_vma(vma->exec_entry) = (uintptr_t)i915_vma_get(vma);
+	eb->vma[eb->buffer_count] = i915_vma_get(vma);
+	eb->flags[eb->buffer_count] =
+		__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF;
+	vma->exec_flags = &eb->flags[eb->buffer_count];
+	eb->buffer_count++;
 
 out:
 	i915_gem_object_unpin_pages(shadow_batch_obj);
@@ -2142,7 +2139,12 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.args = args;
 	if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
 		args->flags |= __EXEC_HAS_RELOC;
+
 	eb.exec = exec;
+	eb.vma = memset(exec + args->buffer_count + 1, 0,
+			(args->buffer_count + 1) * sizeof(*eb.vma));
+	eb.flags = (unsigned int *)(eb.vma + args->buffer_count + 1);
+
 	eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
 	if (USES_FULL_PPGTT(eb.i915))
 		eb.invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
@@ -2230,7 +2232,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		goto err_vma;
 	}
 
-	if (unlikely(eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE)) {
+	if (unlikely(*eb.batch->exec_flags & EXEC_OBJECT_WRITE)) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		err = -EINVAL;
 		goto err_vma;
@@ -2374,7 +2376,9 @@ int
 i915_gem_execbuffer(struct drm_device *dev, void *data,
 		    struct drm_file *file)
 {
-	const size_t sz = sizeof(struct drm_i915_gem_exec_object2);
+	const size_t sz = (sizeof(struct drm_i915_gem_exec_object2) +
+			   sizeof(struct i915_vma *) +
+			   sizeof(unsigned int));
 	struct drm_i915_gem_execbuffer *args = data;
 	struct drm_i915_gem_execbuffer2 exec2;
 	struct drm_i915_gem_exec_object *exec_list = NULL;
@@ -2465,7 +2469,9 @@ int
 i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		     struct drm_file *file)
 {
-	const size_t sz = sizeof(struct drm_i915_gem_exec_object2);
+	const size_t sz = (sizeof(struct drm_i915_gem_exec_object2) +
+			   sizeof(struct i915_vma *) +
+			   sizeof(unsigned int));
 	struct drm_i915_gem_execbuffer2 *args = data;
 	struct drm_i915_gem_exec_object2 *exec2_list;
 	int err;
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 16bb1734783e..74a16cd5dc21 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -114,7 +114,7 @@ struct i915_vma {
 	/**
 	 * Used for performing relocations during execbuffer insertion.
 	 */
-	struct drm_i915_gem_exec_object2 *exec_entry;
+	unsigned int *exec_flags;
 	struct hlist_node exec_node;
 	u32 exec_handle;
 
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 13/16] drm/i915: Simplify eb_lookup_vmas()
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (11 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 12/16] drm/i915: Convert execbuf to use struct-of-array packing for critical fields Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-07-26 13:26 ` [PATCH 14/16] drm/i915: Replace execbuf vma ht with an idr Chris Wilson
                   ` (3 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

Since the introduction of being able to perform a lockless lookup of an
object (i915_gem_object_get_rcu() in fbbd37b36fa5 ("drm/i915: Move object
release to a freelist + worker") we no longer need to split the
object/vma lookup into 3 phases and so combine them into a much simpler
single loop.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 122 +++++++++--------------------
 1 file changed, 36 insertions(+), 86 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 92d88ec271c1..ed906093fb63 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -450,7 +450,9 @@ eb_validate_vma(struct i915_execbuffer *eb,
 }
 
 static int
-eb_add_vma(struct i915_execbuffer *eb, unsigned int i, struct i915_vma *vma)
+eb_add_vma(struct i915_execbuffer *eb,
+	   unsigned int i, struct i915_vma *vma,
+	   unsigned int flags)
 {
 	struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
 	int err;
@@ -480,7 +482,7 @@ eb_add_vma(struct i915_execbuffer *eb, unsigned int i, struct i915_vma *vma)
 	 * to find the right target VMA when doing relocations.
 	 */
 	eb->vma[i] = vma;
-	eb->flags[i] = entry->flags;
+	eb->flags[i] = entry->flags | flags;
 	vma->exec_flags = &eb->flags[i];
 
 	err = 0;
@@ -687,13 +689,9 @@ static int eb_select_context(struct i915_execbuffer *eb)
 
 static int eb_lookup_vmas(struct i915_execbuffer *eb)
 {
-#define INTERMEDIATE BIT(0)
-	const unsigned int count = eb->buffer_count;
 	struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;
-	struct i915_vma *vma;
-	struct idr *idr;
+	struct drm_i915_gem_object *uninitialized_var(obj);
 	unsigned int i;
-	int slow_pass = -1;
 	int err;
 
 	if (unlikely(i915_gem_context_is_closed(eb->ctx)))
@@ -709,82 +707,38 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 		flush_work(&lut->resize);
 	GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS);
 
-	for (i = 0; i < count; i++) {
-		hlist_for_each_entry(vma,
-				     ht_head(lut, eb->exec[i].handle),
-				     ctx_node) {
-			if (vma->ctx_handle != eb->exec[i].handle)
-				continue;
-
-			err = eb_add_vma(eb, i, vma);
-			if (unlikely(err))
-				return err;
-			goto next_vma;
-		}
-
-		if (slow_pass < 0)
-			slow_pass = i;
-next_vma: ;
-	}
+	for (i = 0; i < eb->buffer_count; i++) {
+		u32 handle = eb->exec[i].handle;
+		struct hlist_head *hl = ht_head(lut, handle);
+		unsigned int flags = 0;
+		struct i915_vma *vma;
 
-	if (slow_pass < 0)
-		goto out;
+		hlist_for_each_entry(vma, hl, ctx_node) {
+			GEM_BUG_ON(vma->ctx != eb->ctx);
 
-	spin_lock(&eb->file->table_lock);
-	/*
-	 * Grab a reference to the object and release the lock so we can lookup
-	 * or create the VMA without using GFP_ATOMIC
-	 */
-	idr = &eb->file->object_idr;
-	for (i = slow_pass; i < count; i++) {
-		struct drm_i915_gem_object *obj;
+			if (vma->ctx_handle != handle)
+				continue;
 
-		if (eb->vma[i])
-			continue;
+			goto add_vma;
+		}
 
-		obj = to_intel_bo(idr_find(idr, eb->exec[i].handle));
+		obj = i915_gem_object_lookup(eb->file, handle);
 		if (unlikely(!obj)) {
-			spin_unlock(&eb->file->table_lock);
-			DRM_DEBUG("Invalid object handle %d at index %d\n",
-				  eb->exec[i].handle, i);
 			err = -ENOENT;
-			goto err;
+			goto err_vma;
 		}
 
-		eb->vma[i] = (struct i915_vma *)
-			ptr_pack_bits(obj, INTERMEDIATE, 1);
-	}
-	spin_unlock(&eb->file->table_lock);
-
-	for (i = slow_pass; i < count; i++) {
-		struct drm_i915_gem_object *obj;
-		unsigned int is_obj;
-
-		obj = (typeof(obj))ptr_unpack_bits(eb->vma[i], &is_obj, 1);
-		if (!is_obj)
-			continue;
-
-		/*
-		 * NOTE: We can leak any vmas created here when something fails
-		 * later on. But that's no issue since vma_unbind can deal with
-		 * vmas which are not actually bound. And since only
-		 * lookup_or_create exists as an interface to get at the vma
-		 * from the (obj, vm) we don't run the risk of creating
-		 * duplicated vmas for the same vm.
-		 */
 		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
-			DRM_DEBUG("Failed to lookup VMA\n");
 			err = PTR_ERR(vma);
-			goto err;
+			goto err_obj;
 		}
 
 		/* First come, first served */
 		if (!vma->ctx) {
 			vma->ctx = eb->ctx;
-			vma->ctx_handle = eb->exec[i].handle;
-			hlist_add_head(&vma->ctx_node,
-				       ht_head(lut, eb->exec[i].handle));
+			vma->ctx_handle = handle;
+			hlist_add_head(&vma->ctx_node, hl);
 			lut->ht_count++;
 			lut->ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
 			if (i915_vma_is_ggtt(vma)) {
@@ -792,21 +746,19 @@ next_vma: ;
 				obj->vma_hashed = vma;
 			}
 
-			i915_vma_get(vma);
+			/* transfer ref to ctx */
+			obj = NULL;
+		} else {
+			flags = __EXEC_OBJECT_HAS_REF;
 		}
 
-		err = eb_add_vma(eb, i, vma);
+add_vma:
+		err = eb_add_vma(eb, i, vma, flags);
 		if (unlikely(err))
-			goto err;
+			goto err_obj;
 
 		GEM_BUG_ON(vma != eb->vma[i]);
 		GEM_BUG_ON(vma->exec_flags != &eb->flags[i]);
-
-		/* Only after we validated the user didn't use our bits */
-		if (vma->ctx != eb->ctx) {
-			i915_vma_get(vma);
-			*vma->exec_flags |= __EXEC_OBJECT_HAS_REF;
-		}
 	}
 
 	if (lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
@@ -816,7 +768,6 @@ next_vma: ;
 			lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
 	}
 
-out:
 	/* take note of the batch buffer before we might reorder the lists */
 	i = eb_batch_index(eb);
 	eb->batch = eb->vma[i];
@@ -839,14 +790,13 @@ next_vma: ;
 	eb->args->flags |= __EXEC_VALIDATED;
 	return eb_reserve(eb);
 
-err:
-	for (i = slow_pass; i < count; i++) {
-		if (ptr_unmask_bits(eb->vma[i], 1))
-			eb->vma[i] = NULL;
-	}
+err_obj:
+	if (obj)
+		i915_gem_object_put(obj);
+err_vma:
+	eb->vma[i] = NULL;
 	lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
 	return err;
-#undef INTERMEDIATE
 }
 
 static struct i915_vma *
@@ -879,7 +829,7 @@ static void eb_release_vmas(const struct i915_execbuffer *eb)
 		unsigned int flags = eb->flags[i];
 
 		if (!vma)
-			continue;
+			break;
 
 		GEM_BUG_ON(vma->exec_flags != &eb->flags[i]);
 		vma->exec_flags = NULL;
@@ -2141,8 +2091,8 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		args->flags |= __EXEC_HAS_RELOC;
 
 	eb.exec = exec;
-	eb.vma = memset(exec + args->buffer_count + 1, 0,
-			(args->buffer_count + 1) * sizeof(*eb.vma));
+	eb.vma = (struct i915_vma **)(exec + args->buffer_count + 1);
+	eb.vma[0] = NULL;
 	eb.flags = (unsigned int *)(eb.vma + args->buffer_count + 1);
 
 	eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 14/16] drm/i915: Replace execbuf vma ht with an idr
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (12 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 13/16] drm/i915: Simplify eb_lookup_vmas() Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-07-26 13:26 ` [PATCH 15/16] drm/i915: Trivial grammar fix s/opt of/opt out of/ in comment Chris Wilson
                   ` (2 subsequent siblings)
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

This was the competing idea long ago, but it was only with the rewrite
of the idr as an radixtree and using the radixtree directly ourselves,
along with the realisation that we can store the vma directly in the
radixtree and only need a list for the reverse mapping, that made the
patch performant enough to displace using a hashtable. Though the vma ht
is fast and doesn't require and extra allocation (as we can embed the node
inside the vma), it does require a thread for resizing and serialization
and will have the occasional slow lookup. That is hairy enough to
investigate alternatives and favour them if equivalent in peak performance.
One advantage of allocating an indirection entry is that we can support a
single shared bo between many clients, something that was done on a
first-come first-serve basis for shared GGTT vma previously. To offset
the extra allocations, we create yet another kmem_cache for them.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  6 --
 drivers/gpu/drm/i915/i915_drv.h               |  1 +
 drivers/gpu/drm/i915/i915_gem.c               | 46 +++++++++-----
 drivers/gpu/drm/i915/i915_gem_context.c       | 87 ++++++---------------------
 drivers/gpu/drm/i915/i915_gem_context.h       | 39 ++++--------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c    | 79 ++++++++----------------
 drivers/gpu/drm/i915/i915_gem_object.h        | 23 ++++++-
 drivers/gpu/drm/i915/i915_vma.c               | 22 -------
 drivers/gpu/drm/i915/i915_vma.h               |  4 --
 drivers/gpu/drm/i915/selftests/mock_context.c | 15 ++---
 lib/radix-tree.c                              |  1 +
 11 files changed, 115 insertions(+), 208 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 0877040458ff..d399e7af8e6d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1979,12 +1979,6 @@ static int i915_context_status(struct seq_file *m, void *unused)
 			seq_putc(m, '\n');
 		}
 
-		seq_printf(m,
-			   "\tvma hashtable size=%u (actual %lu), count=%u\n",
-			   ctx->vma_lut.ht_size,
-			   BIT(ctx->vma_lut.ht_bits),
-			   ctx->vma_lut.ht_count);
-
 		seq_putc(m, '\n');
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1760a309b488..7be6de7099a3 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2122,6 +2122,7 @@ struct drm_i915_private {
 
 	struct kmem_cache *objects;
 	struct kmem_cache *vmas;
+	struct kmem_cache *luts;
 	struct kmem_cache *requests;
 	struct kmem_cache *dependencies;
 	struct kmem_cache *priorities;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index dfbf75bb6cf8..c952543dfadf 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3313,25 +3313,33 @@ i915_gem_idle_work_handler(struct work_struct *work)
 
 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
 {
+	struct drm_i915_private *i915 = to_i915(gem->dev);
 	struct drm_i915_gem_object *obj = to_intel_bo(gem);
 	struct drm_i915_file_private *fpriv = file->driver_priv;
-	struct i915_vma *vma, *vn;
+	struct i915_lut_handle *lut, *ln;
 
-	mutex_lock(&obj->base.dev->struct_mutex);
-	list_for_each_entry_safe(vma, vn, &obj->vma_list, obj_link)
-		if (vma->vm->file == fpriv)
+	mutex_lock(&i915->drm.struct_mutex);
+
+	list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
+		struct i915_gem_context *ctx = lut->ctx;
+		struct i915_vma *vma;
+
+		if (ctx->file_priv != fpriv)
+			continue;
+
+		vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
+
+		if (!i915_vma_is_ggtt(vma))
 			i915_vma_close(vma);
 
-	vma = obj->vma_hashed;
-	if (vma && vma->ctx->file_priv == fpriv)
-		i915_vma_unlink_ctx(vma);
+		list_del(&lut->obj_link);
+		list_del(&lut->ctx_link);
 
-	if (i915_gem_object_is_active(obj) &&
-	    !i915_gem_object_has_active_reference(obj)) {
-		i915_gem_object_set_active_reference(obj);
-		i915_gem_object_get(obj);
+		kmem_cache_free(i915->luts, lut);
+		__i915_gem_object_release_unless_active(obj);
 	}
-	mutex_unlock(&obj->base.dev->struct_mutex);
+
+	mutex_unlock(&i915->drm.struct_mutex);
 }
 
 static unsigned long to_wait_timeout(s64 timeout_ns)
@@ -4319,6 +4327,7 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 	mutex_init(&obj->mm.lock);
 
 	INIT_LIST_HEAD(&obj->vma_list);
+	INIT_LIST_HEAD(&obj->lut_list);
 	INIT_LIST_HEAD(&obj->batch_pool_link);
 
 	obj->ops = ops;
@@ -4583,8 +4592,8 @@ void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
 {
 	lockdep_assert_held(&obj->base.dev->struct_mutex);
 
-	GEM_BUG_ON(i915_gem_object_has_active_reference(obj));
-	if (i915_gem_object_is_active(obj))
+	if (!i915_gem_object_has_active_reference(obj) &&
+	    i915_gem_object_is_active(obj))
 		i915_gem_object_set_active_reference(obj);
 	else
 		i915_gem_object_put(obj);
@@ -4977,12 +4986,16 @@ i915_gem_load_init(struct drm_i915_private *dev_priv)
 	if (!dev_priv->vmas)
 		goto err_objects;
 
+	dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
+	if (!dev_priv->luts)
+		goto err_vmas;
+
 	dev_priv->requests = KMEM_CACHE(drm_i915_gem_request,
 					SLAB_HWCACHE_ALIGN |
 					SLAB_RECLAIM_ACCOUNT |
 					SLAB_TYPESAFE_BY_RCU);
 	if (!dev_priv->requests)
-		goto err_vmas;
+		goto err_luts;
 
 	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
 					    SLAB_HWCACHE_ALIGN |
@@ -5030,6 +5043,8 @@ i915_gem_load_init(struct drm_i915_private *dev_priv)
 	kmem_cache_destroy(dev_priv->dependencies);
 err_requests:
 	kmem_cache_destroy(dev_priv->requests);
+err_luts:
+	kmem_cache_destroy(dev_priv->luts);
 err_vmas:
 	kmem_cache_destroy(dev_priv->vmas);
 err_objects:
@@ -5052,6 +5067,7 @@ void i915_gem_load_cleanup(struct drm_i915_private *dev_priv)
 	kmem_cache_destroy(dev_priv->priorities);
 	kmem_cache_destroy(dev_priv->dependencies);
 	kmem_cache_destroy(dev_priv->requests);
+	kmem_cache_destroy(dev_priv->luts);
 	kmem_cache_destroy(dev_priv->vmas);
 	kmem_cache_destroy(dev_priv->objects);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index ed91ac8ca832..358aadef09af 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -93,69 +93,28 @@
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
 
-/* Initial size (as log2) to preallocate the handle->object hashtable */
-#define VMA_HT_BITS 2u /* 4 x 2 pointers, 64 bytes minimum */
-
-static void resize_vma_ht(struct work_struct *work)
+static void lut_close(struct i915_gem_context *ctx)
 {
-	struct i915_gem_context_vma_lut *lut =
-		container_of(work, typeof(*lut), resize);
-	unsigned int bits, new_bits, size, i;
-	struct hlist_head *new_ht;
-
-	GEM_BUG_ON(!(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS));
-
-	bits = 1 + ilog2(4*lut->ht_count/3 + 1);
-	new_bits = min_t(unsigned int,
-			 max(bits, VMA_HT_BITS),
-			 sizeof(unsigned int) * BITS_PER_BYTE - 1);
-	if (new_bits == lut->ht_bits)
-		goto out;
-
-	new_ht = kzalloc(sizeof(*new_ht)<<new_bits, GFP_KERNEL | __GFP_NOWARN);
-	if (!new_ht)
-		new_ht = vzalloc(sizeof(*new_ht)<<new_bits);
-	if (!new_ht)
-		/* Pretend resize succeeded and stop calling us for a bit! */
-		goto out;
-
-	size = BIT(lut->ht_bits);
-	for (i = 0; i < size; i++) {
-		struct i915_vma *vma;
-		struct hlist_node *tmp;
+	struct i915_lut_handle *lut, *ln;
+	struct radix_tree_iter iter;
+	void __rcu **slot;
 
-		hlist_for_each_entry_safe(vma, tmp, &lut->ht[i], ctx_node)
-			hlist_add_head(&vma->ctx_node,
-				       &new_ht[hash_32(vma->ctx_handle,
-						       new_bits)]);
+	list_for_each_entry_safe(lut, ln, &ctx->handles_list, ctx_link) {
+		list_del(&lut->obj_link);
+		kmem_cache_free(ctx->i915->luts, lut);
 	}
-	kvfree(lut->ht);
-	lut->ht = new_ht;
-	lut->ht_bits = new_bits;
-out:
-	smp_store_release(&lut->ht_size, BIT(bits));
-	GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS);
-}
 
-static void vma_lut_free(struct i915_gem_context *ctx)
-{
-	struct i915_gem_context_vma_lut *lut = &ctx->vma_lut;
-	unsigned int i, size;
+	radix_tree_for_each_slot(slot, &ctx->handles_vma, &iter, 0) {
+		struct i915_vma *vma = rcu_dereference_raw(*slot);
+		struct drm_i915_gem_object *obj = vma->obj;
 
-	if (lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS)
-		cancel_work_sync(&lut->resize);
+		radix_tree_iter_delete(&ctx->handles_vma, &iter, slot);
 
-	size = BIT(lut->ht_bits);
-	for (i = 0; i < size; i++) {
-		struct i915_vma *vma;
+		if (!i915_vma_is_ggtt(vma))
+			i915_vma_close(vma);
 
-		hlist_for_each_entry(vma, &lut->ht[i], ctx_node) {
-			vma->obj->vma_hashed = NULL;
-			vma->ctx = NULL;
-			i915_vma_put(vma);
-		}
+		__i915_gem_object_release_unless_active(obj);
 	}
-	kvfree(lut->ht);
 }
 
 static void i915_gem_context_free(struct i915_gem_context *ctx)
@@ -165,7 +124,6 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
 	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915_gem_context_is_closed(ctx));
 
-	vma_lut_free(ctx);
 	i915_ppgtt_put(ctx->ppgtt);
 
 	for (i = 0; i < I915_NUM_ENGINES; i++) {
@@ -239,8 +197,11 @@ void i915_gem_context_release(struct kref *ref)
 static void context_close(struct i915_gem_context *ctx)
 {
 	i915_gem_context_set_closed(ctx);
+
+	lut_close(ctx);
 	if (ctx->ppgtt)
 		i915_ppgtt_close(&ctx->ppgtt->base);
+
 	ctx->file_priv = ERR_PTR(-EBADF);
 	i915_gem_context_put(ctx);
 }
@@ -313,16 +274,8 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 	ctx->i915 = dev_priv;
 	ctx->priority = I915_PRIORITY_NORMAL;
 
-	ctx->vma_lut.ht_bits = VMA_HT_BITS;
-	ctx->vma_lut.ht_size = BIT(VMA_HT_BITS);
-	BUILD_BUG_ON(BIT(VMA_HT_BITS) == I915_CTX_RESIZE_IN_PROGRESS);
-	ctx->vma_lut.ht = kcalloc(ctx->vma_lut.ht_size,
-				  sizeof(*ctx->vma_lut.ht),
-				  GFP_KERNEL);
-	if (!ctx->vma_lut.ht)
-		goto err_out;
-
-	INIT_WORK(&ctx->vma_lut.resize, resize_vma_ht);
+	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
+	INIT_LIST_HEAD(&ctx->handles_list);
 
 	/* Default context will never have a file_priv */
 	ret = DEFAULT_CONTEXT_HANDLE;
@@ -372,8 +325,6 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 	put_pid(ctx->pid);
 	idr_remove(&file_priv->context_idr, ctx->user_handle);
 err_lut:
-	kvfree(ctx->vma_lut.ht);
-err_out:
 	context_close(ctx);
 	return ERR_PTR(ret);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 2d02918a449e..44688e22a5c2 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -27,6 +27,7 @@
 
 #include <linux/bitops.h>
 #include <linux/list.h>
+#include <linux/radix-tree.h>
 
 struct pid;
 
@@ -149,32 +150,6 @@ struct i915_gem_context {
 	/** ggtt_offset_bias: placement restriction for context objects */
 	u32 ggtt_offset_bias;
 
-	struct i915_gem_context_vma_lut {
-		/** ht_size: last request size to allocate the hashtable for. */
-		unsigned int ht_size;
-#define I915_CTX_RESIZE_IN_PROGRESS BIT(0)
-		/** ht_bits: real log2(size) of hashtable. */
-		unsigned int ht_bits;
-		/** ht_count: current number of entries inside the hashtable */
-		unsigned int ht_count;
-
-		/** ht: the array of buckets comprising the simple hashtable */
-		struct hlist_head *ht;
-
-		/**
-		 * resize: After an execbuf completes, we check the load factor
-		 * of the hashtable. If the hashtable is too full, or too empty,
-		 * we schedule a task to resize the hashtable. During the
-		 * resize, the entries are moved between different buckets and
-		 * so we cannot simultaneously read the hashtable as it is
-		 * being resized (unlike rhashtable). Therefore we treat the
-		 * active work as a strong barrier, pausing a subsequent
-		 * execbuf to wait for the resize worker to complete, if
-		 * required.
-		 */
-		struct work_struct resize;
-	} vma_lut;
-
 	/** engine: per-engine logical HW state */
 	struct intel_context {
 		struct i915_vma *state;
@@ -205,6 +180,18 @@ struct i915_gem_context {
 
 	/** remap_slice: Bitmask of cache lines that need remapping */
 	u8 remap_slice;
+
+	/** handles_vma: rbtree to look up our context specific obj/vma for
+	 * the user handle. (user handles are per fd, but the binding is
+	 * per vm, which may be one per context or shared with the global GTT)
+	 */
+	struct radix_tree_root handles_vma;
+
+	/** handles_list: reverse list of all the rbtree entries in use for
+	 * this context, which allows us to free all the allocations on
+	 * context close.
+	 */
+	struct list_head handles_list;
 };
 
 static inline bool i915_gem_context_is_closed(const struct i915_gem_context *ctx)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index ed906093fb63..1ce8e5065041 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -450,9 +450,7 @@ eb_validate_vma(struct i915_execbuffer *eb,
 }
 
 static int
-eb_add_vma(struct i915_execbuffer *eb,
-	   unsigned int i, struct i915_vma *vma,
-	   unsigned int flags)
+eb_add_vma(struct i915_execbuffer *eb, unsigned int i, struct i915_vma *vma)
 {
 	struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
 	int err;
@@ -482,7 +480,7 @@ eb_add_vma(struct i915_execbuffer *eb,
 	 * to find the right target VMA when doing relocations.
 	 */
 	eb->vma[i] = vma;
-	eb->flags[i] = entry->flags | flags;
+	eb->flags[i] = entry->flags;
 	vma->exec_flags = &eb->flags[i];
 
 	err = 0;
@@ -648,19 +646,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
 	} while (1);
 }
 
-static inline struct hlist_head *
-ht_head(const  struct i915_gem_context_vma_lut *lut, u32 handle)
-{
-	return &lut->ht[hash_32(handle, lut->ht_bits)];
-}
-
-static inline bool
-ht_needs_resize(const struct i915_gem_context_vma_lut *lut)
-{
-	return (4*lut->ht_count > 3*lut->ht_size ||
-		4*lut->ht_count + 1 < lut->ht_size);
-}
-
 static unsigned int eb_batch_index(const struct i915_execbuffer *eb)
 {
 	if (eb->args->flags & I915_EXEC_BATCH_FIRST)
@@ -689,7 +674,7 @@ static int eb_select_context(struct i915_execbuffer *eb)
 
 static int eb_lookup_vmas(struct i915_execbuffer *eb)
 {
-	struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;
+	struct radix_tree_root *handles_vma = &eb->ctx->handles_vma;
 	struct drm_i915_gem_object *uninitialized_var(obj);
 	unsigned int i;
 	int err;
@@ -703,24 +688,14 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 	INIT_LIST_HEAD(&eb->relocs);
 	INIT_LIST_HEAD(&eb->unbound);
 
-	if (unlikely(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS))
-		flush_work(&lut->resize);
-	GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS);
-
 	for (i = 0; i < eb->buffer_count; i++) {
 		u32 handle = eb->exec[i].handle;
-		struct hlist_head *hl = ht_head(lut, handle);
-		unsigned int flags = 0;
+		struct i915_lut_handle *lut;
 		struct i915_vma *vma;
 
-		hlist_for_each_entry(vma, hl, ctx_node) {
-			GEM_BUG_ON(vma->ctx != eb->ctx);
-
-			if (vma->ctx_handle != handle)
-				continue;
-
+		vma = radix_tree_lookup(handles_vma, handle);
+		if (likely(vma))
 			goto add_vma;
-		}
 
 		obj = i915_gem_object_lookup(eb->file, handle);
 		if (unlikely(!obj)) {
@@ -734,26 +709,28 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 			goto err_obj;
 		}
 
-		/* First come, first served */
-		if (!vma->ctx) {
-			vma->ctx = eb->ctx;
-			vma->ctx_handle = handle;
-			hlist_add_head(&vma->ctx_node, hl);
-			lut->ht_count++;
-			lut->ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
-			if (i915_vma_is_ggtt(vma)) {
-				GEM_BUG_ON(obj->vma_hashed);
-				obj->vma_hashed = vma;
-			}
+		lut = kmem_cache_alloc(eb->i915->luts, GFP_KERNEL);
+		if (unlikely(!lut)) {
+			err = -ENOMEM;
+			goto err_obj;
+		}
 
-			/* transfer ref to ctx */
-			obj = NULL;
-		} else {
-			flags = __EXEC_OBJECT_HAS_REF;
+		err = radix_tree_insert(handles_vma, handle, vma);
+		if (unlikely(err)) {
+			kfree(lut);
+			goto err_obj;
 		}
 
+		list_add(&lut->obj_link, &obj->lut_list);
+		list_add(&lut->ctx_link, &eb->ctx->handles_list);
+		lut->ctx = eb->ctx;
+		lut->handle = handle;
+
+		/* transfer ref to ctx */
+		obj = NULL;
+
 add_vma:
-		err = eb_add_vma(eb, i, vma, flags);
+		err = eb_add_vma(eb, i, vma);
 		if (unlikely(err))
 			goto err_obj;
 
@@ -761,13 +738,6 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 		GEM_BUG_ON(vma->exec_flags != &eb->flags[i]);
 	}
 
-	if (lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
-		if (ht_needs_resize(lut))
-			queue_work(system_highpri_wq, &lut->resize);
-		else
-			lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
-	}
-
 	/* take note of the batch buffer before we might reorder the lists */
 	i = eb_batch_index(eb);
 	eb->batch = eb->vma[i];
@@ -795,7 +765,6 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 		i915_gem_object_put(obj);
 err_vma:
 	eb->vma[i] = NULL;
-	lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
 	return err;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index 83e364f32253..6a1826b07b7d 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -35,6 +35,19 @@
 
 #include "i915_selftest.h"
 
+/*
+ * struct i915_lut_handle tracks the fast lookups from handle to vma used
+ * for execbuf. Although we use a radixtree for that mapping, in order to
+ * remove them as the object or context is closed, we need a secondary list
+ * and a translation entry (i915_lut_handle).
+ */
+struct i915_lut_handle {
+	struct list_head obj_link;
+	struct list_head ctx_link;
+	struct i915_gem_context *ctx;
+	u32 handle;
+};
+
 struct drm_i915_gem_object_ops {
 	unsigned int flags;
 #define I915_GEM_OBJECT_HAS_STRUCT_PAGE BIT(0)
@@ -86,7 +99,15 @@ struct drm_i915_gem_object {
 	 * They are also added to @vma_list for easy iteration.
 	 */
 	struct rb_root vma_tree;
-	struct i915_vma *vma_hashed;
+
+	/**
+	 * @lut_list: List of vma lookup entries in use for this object.
+	 *
+	 * If this object is closed, we need to remove all of its VMA from
+	 * the fast lookup index in associated contexts; @lut_list provides
+	 * this translation from object to context->handles_vma.
+	 */
+	struct list_head lut_list;
 
 	/** Stolen memory for this object, instead of being backed by shmem. */
 	struct drm_mm_node *stolen;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 96cb0752e8e9..f524bf6d3182 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -623,33 +623,11 @@ static void i915_vma_destroy(struct i915_vma *vma)
 	kmem_cache_free(to_i915(vma->obj->base.dev)->vmas, vma);
 }
 
-void i915_vma_unlink_ctx(struct i915_vma *vma)
-{
-	struct i915_gem_context *ctx = vma->ctx;
-
-	if (ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
-		cancel_work_sync(&ctx->vma_lut.resize);
-		ctx->vma_lut.ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
-	}
-
-	__hlist_del(&vma->ctx_node);
-	ctx->vma_lut.ht_count--;
-
-	if (i915_vma_is_ggtt(vma))
-		vma->obj->vma_hashed = NULL;
-	vma->ctx = NULL;
-
-	i915_vma_put(vma);
-}
-
 void i915_vma_close(struct i915_vma *vma)
 {
 	GEM_BUG_ON(i915_vma_is_closed(vma));
 	vma->flags |= I915_VMA_CLOSED;
 
-	if (vma->ctx)
-		i915_vma_unlink_ctx(vma);
-
 	list_del(&vma->obj_link);
 	rb_erase(&vma->obj_node, &vma->obj->vma_tree);
 
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 74a16cd5dc21..96d1061028e2 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -117,10 +117,6 @@ struct i915_vma {
 	unsigned int *exec_flags;
 	struct hlist_node exec_node;
 	u32 exec_handle;
-
-	struct i915_gem_context *ctx;
-	struct hlist_node ctx_node;
-	u32 ctx_handle;
 };
 
 struct i915_vma *
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index d436f2d5089b..098ce643ad07 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -40,18 +40,13 @@ mock_context(struct drm_i915_private *i915,
 	INIT_LIST_HEAD(&ctx->link);
 	ctx->i915 = i915;
 
-	ctx->vma_lut.ht_bits = VMA_HT_BITS;
-	ctx->vma_lut.ht_size = BIT(VMA_HT_BITS);
-	ctx->vma_lut.ht = kcalloc(ctx->vma_lut.ht_size,
-				  sizeof(*ctx->vma_lut.ht),
-				  GFP_KERNEL);
-	if (!ctx->vma_lut.ht)
-		goto err_free;
+	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
+	INIT_LIST_HEAD(&ctx->handles_list);
 
 	ret = ida_simple_get(&i915->contexts.hw_ida,
 			     0, MAX_CONTEXT_HW_ID, GFP_KERNEL);
 	if (ret < 0)
-		goto err_vma_ht;
+		goto err_handles;
 	ctx->hw_id = ret;
 
 	if (name) {
@@ -66,9 +61,7 @@ mock_context(struct drm_i915_private *i915,
 
 	return ctx;
 
-err_vma_ht:
-	kvfree(ctx->vma_lut.ht);
-err_free:
+err_handles:
 	kfree(ctx);
 	return NULL;
 
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 898e87998417..3527eb364964 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2022,6 +2022,7 @@ void radix_tree_iter_delete(struct radix_tree_root *root,
 	if (__radix_tree_delete(root, iter->node, slot))
 		iter->index = iter->next_index;
 }
+EXPORT_SYMBOL(radix_tree_iter_delete);
 
 /**
  * radix_tree_delete_item - delete an item from a radix tree
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 15/16] drm/i915: Trivial grammar fix s/opt of/opt out of/ in comment
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (13 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 14/16] drm/i915: Replace execbuf vma ht with an idr Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-07-26 13:26 ` [PATCH 16/16] drm/i915: Mark the GT as busy before idling the previous request Chris Wilson
  2017-07-26 14:13 ` ✓ Fi.CI.BAT: success for series starting with [01/16] drm/i915: Keep a small stash of preallocated WC pages Patchwork
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

The word out was dropped from the sentence across the line break, put it
back.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 1ce8e5065041..407bf82682ac 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1349,7 +1349,7 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	 * patching using the GPU (though that should be serialised by the
 	 * timeline). To be completely sure, and since we are required to
 	 * do relocations we are already stalling, disable the user's opt
-	 * of our synchronisation.
+	 * out of our synchronisation.
 	 */
 	*vma->exec_flags &= ~EXEC_OBJECT_ASYNC;
 
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 16/16] drm/i915: Mark the GT as busy before idling the previous request
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (14 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 15/16] drm/i915: Trivial grammar fix s/opt of/opt out of/ in comment Chris Wilson
@ 2017-07-26 13:26 ` Chris Wilson
  2017-07-26 14:13 ` ✓ Fi.CI.BAT: success for series starting with [01/16] drm/i915: Keep a small stash of preallocated WC pages Patchwork
  16 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-07-26 13:26 UTC (permalink / raw)
  To: intel-gfx

In a synchronous setup, we may retire the last request before we
complete allocating the next request. As the last request is retired, we
queue a timer to mark the device as idle, and promptly have to execute
ad cancel that timer once we complete allocating the request and need to
keep the device awake. If we rearrange the mark_busy() to occur before
we retire the previous request, we can skip this ping-pong.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.c | 61 ++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 9eedd33eb524..87b34153945c 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -265,6 +265,13 @@ static int reserve_seqno(struct intel_engine_cs *engine)
 
 static void unreserve_seqno(struct intel_engine_cs *engine)
 {
+	if (!--engine->i915->gt.active_requests) {
+		GEM_BUG_ON(!engine->i915->gt.awake);
+		mod_delayed_work(engine->i915->wq,
+				 &engine->i915->gt.idle_work,
+				 msecs_to_jiffies(100));
+	}
+
 	GEM_BUG_ON(!engine->timeline->inflight_seqnos);
 	engine->timeline->inflight_seqnos--;
 }
@@ -333,12 +340,6 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	list_del_init(&request->link);
 	spin_unlock_irq(&engine->timeline->lock);
 
-	if (!--request->i915->gt.active_requests) {
-		GEM_BUG_ON(!request->i915->gt.awake);
-		mod_delayed_work(request->i915->wq,
-				 &request->i915->gt.idle_work,
-				 msecs_to_jiffies(100));
-	}
 	unreserve_seqno(request->engine);
 	advance_ring(request);
 
@@ -540,6 +541,26 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	return NOTIFY_DONE;
 }
 
+static void mark_busy(struct drm_i915_private *i915)
+{
+	if (i915->gt.awake)
+		return;
+
+	GEM_BUG_ON(!i915->gt.active_requests);
+
+	intel_runtime_pm_get_noresume(i915);
+	i915->gt.awake = true;
+
+	intel_enable_gt_powersave(i915);
+	i915_update_gfx_val(i915);
+	if (INTEL_GEN(i915) >= 6)
+		gen6_rps_busy(i915);
+
+	queue_delayed_work(i915->wq,
+			   &i915->gt.retire_work,
+			   round_jiffies_up_relative(HZ));
+}
+
 /**
  * i915_gem_request_alloc - allocate a request structure
  *
@@ -579,6 +600,9 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	if (ret)
 		goto err_unpin;
 
+	if (!dev_priv->gt.active_requests++)
+		mark_busy(dev_priv);
+
 	/* Move the oldest request to the slab-cache (if not in use!) */
 	req = list_first_entry_or_null(&engine->timeline->requests,
 				       typeof(*req), link);
@@ -863,28 +887,6 @@ i915_gem_request_await_object(struct drm_i915_gem_request *to,
 	return ret;
 }
 
-static void i915_gem_mark_busy(const struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	if (dev_priv->gt.awake)
-		return;
-
-	GEM_BUG_ON(!dev_priv->gt.active_requests);
-
-	intel_runtime_pm_get_noresume(dev_priv);
-	dev_priv->gt.awake = true;
-
-	intel_enable_gt_powersave(dev_priv);
-	i915_update_gfx_val(dev_priv);
-	if (INTEL_GEN(dev_priv) >= 6)
-		gen6_rps_busy(dev_priv);
-
-	queue_delayed_work(dev_priv->wq,
-			   &dev_priv->gt.retire_work,
-			   round_jiffies_up_relative(HZ));
-}
-
 /*
  * NB: This function is not allowed to fail. Doing so would mean the the
  * request is not being tracked for completion but the work itself is
@@ -966,9 +968,6 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	list_add_tail(&request->ring_link, &ring->request_list);
 	request->emitted_jiffies = jiffies;
 
-	if (!request->i915->gt.active_requests++)
-		i915_gem_mark_busy(engine);
-
 	/* Let the backend know a new request has arrived that may need
 	 * to adjust the existing execution schedule due to a high priority
 	 * request - i.e. we may want to preempt the current request in order
-- 
2.13.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [01/16] drm/i915: Keep a small stash of preallocated WC pages
  2017-07-26 13:25 More tweaks Chris Wilson
                   ` (15 preceding siblings ...)
  2017-07-26 13:26 ` [PATCH 16/16] drm/i915: Mark the GT as busy before idling the previous request Chris Wilson
@ 2017-07-26 14:13 ` Patchwork
  16 siblings, 0 replies; 28+ messages in thread
From: Patchwork @ 2017-07-26 14:13 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/16] drm/i915: Keep a small stash of preallocated WC pages
URL   : https://patchwork.freedesktop.org/series/27934/
State : success

== Summary ==

Series 27934v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/27934/revisions/1/mbox/

Test gem_exec_flush:
        Subgroup basic-batch-kernel-default-uc:
                fail       -> PASS       (fi-snb-2600) fdo#100007
Test kms_pipe_crc_basic:
        Subgroup hang-read-crc-pipe-b:
                pass       -> DMESG-WARN (fi-pnv-d510) fdo#101597
        Subgroup suspend-read-crc-pipe-b:
                dmesg-warn -> PASS       (fi-byt-j1900) fdo#101705

fdo#100007 https://bugs.freedesktop.org/show_bug.cgi?id=100007
fdo#101597 https://bugs.freedesktop.org/show_bug.cgi?id=101597
fdo#101705 https://bugs.freedesktop.org/show_bug.cgi?id=101705

fi-bdw-5557u     total:280  pass:269  dwarn:0   dfail:0   fail:0   skip:11  time:446s
fi-bdw-gvtdvm    total:280  pass:266  dwarn:0   dfail:0   fail:0   skip:14  time:432s
fi-blb-e6850     total:280  pass:225  dwarn:1   dfail:0   fail:0   skip:54  time:359s
fi-bsw-n3050     total:280  pass:244  dwarn:0   dfail:0   fail:0   skip:36  time:540s
fi-bxt-j4205     total:280  pass:261  dwarn:0   dfail:0   fail:0   skip:19  time:515s
fi-byt-j1900     total:280  pass:256  dwarn:0   dfail:0   fail:0   skip:24  time:490s
fi-byt-n2820     total:280  pass:252  dwarn:0   dfail:0   fail:0   skip:28  time:489s
fi-glk-2a        total:280  pass:261  dwarn:0   dfail:0   fail:0   skip:19  time:600s
fi-hsw-4770r     total:280  pass:264  dwarn:0   dfail:0   fail:0   skip:16  time:415s
fi-ilk-650       total:280  pass:230  dwarn:0   dfail:0   fail:0   skip:50  time:419s
fi-ivb-3520m     total:280  pass:262  dwarn:0   dfail:0   fail:0   skip:18  time:501s
fi-ivb-3770      total:280  pass:262  dwarn:0   dfail:0   fail:0   skip:18  time:472s
fi-kbl-7500u     total:280  pass:262  dwarn:0   dfail:0   fail:0   skip:18  time:471s
fi-kbl-7560u     total:280  pass:270  dwarn:0   dfail:0   fail:0   skip:10  time:575s
fi-kbl-r         total:280  pass:262  dwarn:0   dfail:0   fail:0   skip:18  time:593s
fi-pnv-d510      total:280  pass:223  dwarn:2   dfail:0   fail:0   skip:55  time:562s
fi-skl-6260u     total:280  pass:270  dwarn:0   dfail:0   fail:0   skip:10  time:466s
fi-skl-6700hq    total:280  pass:263  dwarn:0   dfail:0   fail:0   skip:17  time:587s
fi-skl-6700k     total:280  pass:262  dwarn:0   dfail:0   fail:0   skip:18  time:475s
fi-skl-6770hq    total:280  pass:270  dwarn:0   dfail:0   fail:0   skip:10  time:481s
fi-skl-gvtdvm    total:280  pass:267  dwarn:0   dfail:0   fail:0   skip:13  time:433s
fi-skl-x1585l    total:280  pass:269  dwarn:0   dfail:0   fail:0   skip:11  time:474s
fi-snb-2520m     total:280  pass:252  dwarn:0   dfail:0   fail:0   skip:28  time:547s
fi-snb-2600      total:280  pass:250  dwarn:0   dfail:0   fail:1   skip:29  time:405s

7631f077f1c06e8884f5f590eb06c1197f5a1d52 drm-tip: 2017y-07m-26d-12h-14m-27s UTC integration manifest
21cf46c067a7 drm/i915: Mark the GT as busy before idling the previous request
19e0003f2a83 drm/i915: Trivial grammar fix s/opt of/opt out of/ in comment
a3179841487b drm/i915: Replace execbuf vma ht with an idr
8a86af0c8f42 drm/i915: Simplify eb_lookup_vmas()
0baad0bc6c3b drm/i915: Convert execbuf to use struct-of-array packing for critical fields
e4705b93afb2 drm/i915: Check context status before looking up our obj/vma
0c1fec9d97ef drm/i915: Try a minimal attempt to insert the whole object for relocations
9cf50f150e31 drm/i915: Track user GTT faulting per-vma
f74e4bac67cd drm/i915: Only free the oldest stale object before a fresh allocation
85783a2ef630 drm/i915: Trim struct_mutex hold duration for i915_gem_free_objects
1f745cd254c9 drm/i915: Move dev_priv->mm.[un]bound_list to its own lock
e23ce06f9c6f drm/i915: Refactor testing obj->mm.pages
73d722c5c8ee drm/i915: Emit pipelined fence changes
cb760103e99a drm/i915: Consolidate get_fence with pin_fence
64f9b1827142 drm/i915: Pin fence for iomap
d45576a67fde drm/i915: Keep a small stash of preallocated WC pages

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_5281/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages
  2017-07-26 13:25 ` [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages Chris Wilson
@ 2017-08-07 20:15   ` Matthew Auld
  2017-08-11  7:34   ` Joonas Lahtinen
  2017-08-11  9:51   ` Jani Nikula
  2 siblings, 0 replies; 28+ messages in thread
From: Matthew Auld @ 2017-08-07 20:15 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Intel Graphics Development

On 26 July 2017 at 14:25, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> We use WC pages for coherent writes into the ppGTT on !llc
> architectuures. However, to create a WC page requires a stop_machine(),
architectures

> i.e. is very slow. To compensate we currently keep a per-vm cache of
> recently freed pages, but we still see the slow startup of new contexts.
> We can amoritize that cost slightly by allocating WC pages in small
> batches (PAGEVEC_SIZE == 14) and since creating a WC page implies a
> stop_machine() there is no penalty for keeping that stash global.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

fwiw,
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 06/16] drm/i915: Move dev_priv->mm.[un]bound_list to its own lock
  2017-07-26 13:26 ` [PATCH 06/16] drm/i915: Move dev_priv->mm.[un]bound_list to its own lock Chris Wilson
@ 2017-08-10 16:22   ` Joonas Lahtinen
  2017-08-12 10:26     ` Chris Wilson
  0 siblings, 1 reply; 28+ messages in thread
From: Joonas Lahtinen @ 2017-08-10 16:22 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On ke, 2017-07-26 at 14:26 +0100, Chris Wilson wrote:
> Remove the struct_mutex requirement around dev_priv->mm.bound_list and
> dev_priv->mm.unbound_list by giving it its own spinlock. This reduces
> one more requirement for struct_mutex and in the process gives us
> slightly more accurate unbound_list tracking, which should improve the
> shrinker.

Please mention the global_link -> mm.link rename too.

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

<SNIP>

> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1443,6 +1443,8 @@ struct i915_gem_mm {
>  	 * always the inner lock when overlapping with struct_mutex. */
>  	struct mutex stolen_lock;

Please comment what this lock achieves, here too. Maybe cross-link with
the other doc?

> +	spinlock_t obj_lock;
> +
>  	/** List of all objects in gtt_space. Used to restore gtt
>  	 * mappings on resume */

<SNIP>

> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -92,9 +92,6 @@ static bool swap_available(void)
>  
>  static bool can_release_pages(struct drm_i915_gem_object *obj)
>  {
> -	if (!i915_gem_object_has_pages(obj))
> -		return false;

So you think the inaccuracies we get for being lockless don't matter
compared to better forward progress?

Maybe the cons should be documented in the commit message to help in
bisecting.

> @@ -149,8 +149,6 @@ static int igt_overcommit(void *arg)
>  		goto cleanup;
>  	}
>  
> -	list_move(&obj->global_link, &i915->mm.unbound_list);

Accidentally dropped line?

> -
>  	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
>  	if (!IS_ERR(vma) || PTR_ERR(vma) != -ENOSPC) {
>  		pr_err("Failed to evict+insert, i915_gem_object_ggtt_pin returned err=%d\n", (int)PTR_ERR(vma));
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 05/16] drm/i915: Refactor testing obj->mm.pages
  2017-07-26 13:25 ` [PATCH 05/16] drm/i915: Refactor testing obj->mm.pages Chris Wilson
@ 2017-08-10 16:26   ` Joonas Lahtinen
  0 siblings, 0 replies; 28+ messages in thread
From: Joonas Lahtinen @ 2017-08-10 16:26 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On ke, 2017-07-26 at 14:25 +0100, Chris Wilson wrote:
> Since we occasionally stuff an error pointer into obj->mm.pages for a
> semi-permanent or even permanent failure, we have to be more careful and
> not just test against NULL when deciding if the object has a complete
> set of its concurrent pages.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Ok, this is a good consolidation.

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages
  2017-07-26 13:25 ` [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages Chris Wilson
  2017-08-07 20:15   ` Matthew Auld
@ 2017-08-11  7:34   ` Joonas Lahtinen
  2017-08-11 11:55     ` Chris Wilson
  2017-08-11  9:51   ` Jani Nikula
  2 siblings, 1 reply; 28+ messages in thread
From: Joonas Lahtinen @ 2017-08-11  7:34 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On ke, 2017-07-26 at 14:25 +0100, Chris Wilson wrote:
> We use WC pages for coherent writes into the ppGTT on !llc
> architectuures. However, to create a WC page requires a stop_machine(),
> i.e. is very slow. To compensate we currently keep a per-vm cache of
> recently freed pages, but we still see the slow startup of new contexts.
> We can amoritize that cost slightly by allocating WC pages in small
> batches (PAGEVEC_SIZE == 14) and since creating a WC page implies a
> stop_machine() there is no penalty for keeping that stash global.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

<SNIP>

> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1464,6 +1464,9 @@ struct i915_gem_mm {
>  	struct llist_head free_list;
>  	struct work_struct free_work;
>  
> +	/** Small stash of WC pages */

Please briefly add the "why".

> +	struct pagevec wc_stash;
> +
>  	/** Usable portion of the GTT for GEM */
>  	dma_addr_t stolen_base; /* limited to low memory (32-bit) */
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 10aa7762d9a6..ad4e48435853 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -351,39 +351,85 @@ static gen6_pte_t iris_pte_encode(dma_addr_t addr,
>  
>  static struct page *vm_alloc_page(struct i915_address_space *vm, gfp_t gfp)
>  {
> -	struct page *page;
> +	struct pagevec *pvec = &vm->free_pages;
>  
>  	if (I915_SELFTEST_ONLY(should_fail(&vm->fault_attr, 1)))
>  		i915_gem_shrink_all(vm->i915);
>  
> -	if (vm->free_pages.nr)
> -		return vm->free_pages.pages[--vm->free_pages.nr];
> +	if (likely(pvec->nr))
> +		return pvec->pages[--pvec->nr];
> +
> +	if (!vm->pt_kmap_wc)
> +		return alloc_page(gfp);
> +
> +	lockdep_assert_held(&vm->i915->drm.struct_mutex);

Lift this to the beginning of the func, should trigger easier.
 
> -static void vm_free_pages_release(struct i915_address_space *vm)
> +static void vm_free_pages_release(struct i915_address_space *vm,
> +				  bool immediate)
>  {
> -	GEM_BUG_ON(!pagevec_count(&vm->free_pages));
> +	struct pagevec *pvec = &vm->free_pages;
> +
> +	GEM_BUG_ON(!pagevec_count(pvec));
> +
> +	if (vm->pt_kmap_wc) {
> +		struct pagevec *stash = &vm->i915->mm.wc_stash;
>  
> -	if (vm->pt_kmap_wc)
> -		set_pages_array_wb(vm->free_pages.pages,
> -				   pagevec_count(&vm->free_pages));
> +		/* When we use WC, first fill up the global stash and then
> +		 * only if full immediately free the overflow.
> +		 */
> +
> +		lockdep_assert_held(&vm->i915->drm.struct_mutex);

I'd again lift this up to the beginning.

> @@ -447,12 +493,31 @@ static void fill_page_dma_32(struct i915_address_space *vm,
>  static int
>  setup_scratch_page(struct i915_address_space *vm, gfp_t gfp)
>  {
> -	return __setup_page_dma(vm, &vm->scratch_page, gfp | __GFP_ZERO);
> +	struct page *page;
> +	dma_addr_t addr;
> +
> +	page = alloc_page(gfp | __GFP_ZERO);
> +	if (unlikely(!page))
> +		return -ENOMEM;
> +
> +	addr = dma_map_page(vm->dma, page, 0, PAGE_SIZE,
> +			    PCI_DMA_BIDIRECTIONAL);
> +	if (unlikely(dma_mapping_error(vm->dma, addr))) {
> +		__free_page(page);
> +		return -ENOMEM;
> +	}
> +
> +	vm->scratch_page.page = page;
> +	vm->scratch_page.daddr = addr;
> +	return 0;

Unrelated hunk, please split.

>  }
>  
>  static void cleanup_scratch_page(struct i915_address_space *vm)
>  {
> -	cleanup_page_dma(vm, &vm->scratch_page);
> +	struct i915_page_dma *p = &vm->scratch_page;
> +
> +	dma_unmap_page(vm->dma, p->daddr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
> +	__free_page(p->page);

Ditto.

>  }
>  
>  static struct i915_page_table *alloc_pt(struct i915_address_space *vm)
> @@ -1332,18 +1397,18 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>  		1ULL << 48 :
>  		1ULL << 32;
>  
> -	ret = gen8_init_scratch(&ppgtt->base);
> -	if (ret) {
> -		ppgtt->base.total = 0;
> -		return ret;
> -	}
> -
>  	/* There are only few exceptions for gen >=6. chv and bxt.
>  	 * And we are not sure about the latter so play safe for now.
>  	 */
>  	if (IS_CHERRYVIEW(dev_priv) || IS_BROXTON(dev_priv))
>  		ppgtt->base.pt_kmap_wc = true;
>  
> +	ret = gen8_init_scratch(&ppgtt->base);
> +	if (ret) {
> +		ppgtt->base.total = 0;
> +		return ret;
> +	}
> +

More unrelated hunks.

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 07/16] drm/i915: Trim struct_mutex hold duration for i915_gem_free_objects
  2017-07-26 13:26 ` [PATCH 07/16] drm/i915: Trim struct_mutex hold duration for i915_gem_free_objects Chris Wilson
@ 2017-08-11  7:36   ` Joonas Lahtinen
  0 siblings, 0 replies; 28+ messages in thread
From: Joonas Lahtinen @ 2017-08-11  7:36 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On ke, 2017-07-26 at 14:26 +0100, Chris Wilson wrote:
> We free objects in bulk after they wait for their RCU grace period.
> Currently, we take struct_mutex and unbind all the objects. This can lead
> to a long lock duration during which time those objects have their pages
> unfreeable (i.e. the shrinker is prevented from reaping those pages). If
> we only process a single object under the struct_mutex and then free the
> pages, the number of objects locked away from the shrinker is minimal
> and we allow regular clients better access to struct_mutex if they need
> it.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages
  2017-07-26 13:25 ` [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages Chris Wilson
  2017-08-07 20:15   ` Matthew Auld
  2017-08-11  7:34   ` Joonas Lahtinen
@ 2017-08-11  9:51   ` Jani Nikula
  2 siblings, 0 replies; 28+ messages in thread
From: Jani Nikula @ 2017-08-11  9:51 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On Wed, 26 Jul 2017, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> We use WC pages for coherent writes into the ppGTT on !llc
> architectuures. However, to create a WC page requires a stop_machine(),
> i.e. is very slow. To compensate we currently keep a per-vm cache of
> recently freed pages, but we still see the slow startup of new contexts.
> We can amoritize that cost slightly by allocating WC pages in small
> batches (PAGEVEC_SIZE == 14) and since creating a WC page implies a
> stop_machine() there is no penalty for keeping that stash global.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_drv.h     |   3 +
>  drivers/gpu/drm/i915/i915_gem_gtt.c | 121 +++++++++++++++++++++++++++++-------
>  2 files changed, 100 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2c7456f4ed38..fd62be74a422 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1464,6 +1464,9 @@ struct i915_gem_mm {
>  	struct llist_head free_list;
>  	struct work_struct free_work;
>  
> +	/** Small stash of WC pages */
> +	struct pagevec wc_stash;
> +
>  	/** Usable portion of the GTT for GEM */

These are not proper kernel-doc comments.

BR,
Jani.

>  	dma_addr_t stolen_base; /* limited to low memory (32-bit) */
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 10aa7762d9a6..ad4e48435853 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -351,39 +351,85 @@ static gen6_pte_t iris_pte_encode(dma_addr_t addr,
>  
>  static struct page *vm_alloc_page(struct i915_address_space *vm, gfp_t gfp)
>  {
> -	struct page *page;
> +	struct pagevec *pvec = &vm->free_pages;
>  
>  	if (I915_SELFTEST_ONLY(should_fail(&vm->fault_attr, 1)))
>  		i915_gem_shrink_all(vm->i915);
>  
> -	if (vm->free_pages.nr)
> -		return vm->free_pages.pages[--vm->free_pages.nr];
> +	if (likely(pvec->nr))
> +		return pvec->pages[--pvec->nr];
> +
> +	if (!vm->pt_kmap_wc)
> +		return alloc_page(gfp);
> +
> +	lockdep_assert_held(&vm->i915->drm.struct_mutex);
> +
> +	/* Look in our global stash of WC pages... */
> +	pvec = &vm->i915->mm.wc_stash;
> +	if (likely(pvec->nr))
> +		return pvec->pages[--pvec->nr];
>  
> -	page = alloc_page(gfp);
> -	if (!page)
> +	/* Otherwise batch allocate pages to amoritize cost of set_pages_wc. */
> +	do {
> +		struct page *page;
> +
> +		page = alloc_page(gfp);
> +		if (unlikely(!page))
> +			break;
> +
> +		pvec->pages[pvec->nr++] = page;
> +	} while (pagevec_space(pvec));
> +
> +	if (unlikely(!pvec->nr))
>  		return NULL;
>  
> -	if (vm->pt_kmap_wc)
> -		set_pages_array_wc(&page, 1);
> +	set_pages_array_wc(pvec->pages, pvec->nr);
>  
> -	return page;
> +	return pvec->pages[--pvec->nr];
>  }
>  
> -static void vm_free_pages_release(struct i915_address_space *vm)
> +static void vm_free_pages_release(struct i915_address_space *vm,
> +				  bool immediate)
>  {
> -	GEM_BUG_ON(!pagevec_count(&vm->free_pages));
> +	struct pagevec *pvec = &vm->free_pages;
> +
> +	GEM_BUG_ON(!pagevec_count(pvec));
> +
> +	if (vm->pt_kmap_wc) {
> +		struct pagevec *stash = &vm->i915->mm.wc_stash;
>  
> -	if (vm->pt_kmap_wc)
> -		set_pages_array_wb(vm->free_pages.pages,
> -				   pagevec_count(&vm->free_pages));
> +		/* When we use WC, first fill up the global stash and then
> +		 * only if full immediately free the overflow.
> +		 */
> +
> +		lockdep_assert_held(&vm->i915->drm.struct_mutex);
> +		if (pagevec_space(stash)) {
> +			do {
> +				stash->pages[stash->nr++] =
> +					pvec->pages[--pvec->nr];
> +				if (!pvec->nr)
> +					return;
> +			} while (pagevec_space(stash));
> +
> +			/* As we have made some room in the VM's free_pages,
> +			 * we can wait for it to fill again. Unless we are
> +			 * inside i915_address_space_fini() and must
> +			 * immediately release the pages!
> +			 */
> +			if (!immediate)
> +				return;
> +		}
> +
> +		set_pages_array_wb(pvec->pages, pvec->nr);
> +	}
>  
> -	__pagevec_release(&vm->free_pages);
> +	__pagevec_release(pvec);
>  }
>  
>  static void vm_free_page(struct i915_address_space *vm, struct page *page)
>  {
>  	if (!pagevec_add(&vm->free_pages, page))
> -		vm_free_pages_release(vm);
> +		vm_free_pages_release(vm, false);
>  }
>  
>  static int __setup_page_dma(struct i915_address_space *vm,
> @@ -447,12 +493,31 @@ static void fill_page_dma_32(struct i915_address_space *vm,
>  static int
>  setup_scratch_page(struct i915_address_space *vm, gfp_t gfp)
>  {
> -	return __setup_page_dma(vm, &vm->scratch_page, gfp | __GFP_ZERO);
> +	struct page *page;
> +	dma_addr_t addr;
> +
> +	page = alloc_page(gfp | __GFP_ZERO);
> +	if (unlikely(!page))
> +		return -ENOMEM;
> +
> +	addr = dma_map_page(vm->dma, page, 0, PAGE_SIZE,
> +			    PCI_DMA_BIDIRECTIONAL);
> +	if (unlikely(dma_mapping_error(vm->dma, addr))) {
> +		__free_page(page);
> +		return -ENOMEM;
> +	}
> +
> +	vm->scratch_page.page = page;
> +	vm->scratch_page.daddr = addr;
> +	return 0;
>  }
>  
>  static void cleanup_scratch_page(struct i915_address_space *vm)
>  {
> -	cleanup_page_dma(vm, &vm->scratch_page);
> +	struct i915_page_dma *p = &vm->scratch_page;
> +
> +	dma_unmap_page(vm->dma, p->daddr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
> +	__free_page(p->page);
>  }
>  
>  static struct i915_page_table *alloc_pt(struct i915_address_space *vm)
> @@ -1332,18 +1397,18 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
>  		1ULL << 48 :
>  		1ULL << 32;
>  
> -	ret = gen8_init_scratch(&ppgtt->base);
> -	if (ret) {
> -		ppgtt->base.total = 0;
> -		return ret;
> -	}
> -
>  	/* There are only few exceptions for gen >=6. chv and bxt.
>  	 * And we are not sure about the latter so play safe for now.
>  	 */
>  	if (IS_CHERRYVIEW(dev_priv) || IS_BROXTON(dev_priv))
>  		ppgtt->base.pt_kmap_wc = true;
>  
> +	ret = gen8_init_scratch(&ppgtt->base);
> +	if (ret) {
> +		ppgtt->base.total = 0;
> +		return ret;
> +	}
> +
>  	if (use_4lvl(vm)) {
>  		ret = setup_px(&ppgtt->base, &ppgtt->pml4);
>  		if (ret)
> @@ -1867,7 +1932,7 @@ static void i915_address_space_init(struct i915_address_space *vm,
>  static void i915_address_space_fini(struct i915_address_space *vm)
>  {
>  	if (pagevec_count(&vm->free_pages))
> -		vm_free_pages_release(vm);
> +		vm_free_pages_release(vm, true);
>  
>  	i915_gem_timeline_fini(&vm->timeline);
>  	drm_mm_takedown(&vm->mm);
> @@ -2593,6 +2658,7 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv)
>  {
>  	struct i915_ggtt *ggtt = &dev_priv->ggtt;
>  	struct i915_vma *vma, *vn;
> +	struct pagevec *pvec;
>  
>  	ggtt->base.closed = true;
>  
> @@ -2616,6 +2682,13 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv)
>  	}
>  
>  	ggtt->base.cleanup(&ggtt->base);
> +
> +	pvec = &dev_priv->mm.wc_stash;
> +	if (pvec->nr) {
> +		set_pages_array_wb(pvec->pages, pvec->nr);
> +		__pagevec_release(pvec);
> +	}
> +
>  	mutex_unlock(&dev_priv->drm.struct_mutex);
>  
>  	arch_phys_wc_del(ggtt->mtrr);

-- 
Jani Nikula, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages
  2017-08-11  7:34   ` Joonas Lahtinen
@ 2017-08-11 11:55     ` Chris Wilson
  0 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-08-11 11:55 UTC (permalink / raw)
  To: Joonas Lahtinen, intel-gfx

Quoting Joonas Lahtinen (2017-08-11 08:34:02)
> On ke, 2017-07-26 at 14:25 +0100, Chris Wilson wrote:
> > -     if (vm->pt_kmap_wc)
> > -             set_pages_array_wb(vm->free_pages.pages,
> > -                                pagevec_count(&vm->free_pages));
> > +             /* When we use WC, first fill up the global stash and then
> > +              * only if full immediately free the overflow.
> > +              */
> > +
> > +             lockdep_assert_held(&vm->i915->drm.struct_mutex);
> 
> I'd again lift this up to the beginning.

I placed it here to show that only this path required the extra lock
with a plan that we would take a more specialised lock at this point.

> 
> > @@ -447,12 +493,31 @@ static void fill_page_dma_32(struct i915_address_space *vm,
> >  static int
> >  setup_scratch_page(struct i915_address_space *vm, gfp_t gfp)
> >  {
> > -     return __setup_page_dma(vm, &vm->scratch_page, gfp | __GFP_ZERO);
> > +     struct page *page;
> > +     dma_addr_t addr;
> > +
> > +     page = alloc_page(gfp | __GFP_ZERO);
> > +     if (unlikely(!page))
> > +             return -ENOMEM;
> > +
> > +     addr = dma_map_page(vm->dma, page, 0, PAGE_SIZE,
> > +                         PCI_DMA_BIDIRECTIONAL);
> > +     if (unlikely(dma_mapping_error(vm->dma, addr))) {
> > +             __free_page(page);
> > +             return -ENOMEM;
> > +     }
> > +
> > +     vm->scratch_page.page = page;
> > +     vm->scratch_page.daddr = addr;
> > +     return 0;
> 
> Unrelated hunk, please split.
> 
> >  }
> >  
> >  static void cleanup_scratch_page(struct i915_address_space *vm)
> >  {
> > -     cleanup_page_dma(vm, &vm->scratch_page);
> > +     struct i915_page_dma *p = &vm->scratch_page;
> > +
> > +     dma_unmap_page(vm->dma, p->daddr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
> > +     __free_page(p->page);
> 
> Ditto.
> 
> >  }
> >  
> >  static struct i915_page_table *alloc_pt(struct i915_address_space *vm)
> > @@ -1332,18 +1397,18 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
> >               1ULL << 48 :
> >               1ULL << 32;
> >  
> > -     ret = gen8_init_scratch(&ppgtt->base);
> > -     if (ret) {
> > -             ppgtt->base.total = 0;
> > -             return ret;
> > -     }
> > -
> >       /* There are only few exceptions for gen >=6. chv and bxt.
> >        * And we are not sure about the latter so play safe for now.
> >        */
> >       if (IS_CHERRYVIEW(dev_priv) || IS_BROXTON(dev_priv))
> >               ppgtt->base.pt_kmap_wc = true;
> >  
> > +     ret = gen8_init_scratch(&ppgtt->base);
> > +     if (ret) {
> > +             ppgtt->base.total = 0;
> > +             return ret;
> > +     }
> > +
> 
> More unrelated hunks.

Ah, but they weren't.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 06/16] drm/i915: Move dev_priv->mm.[un]bound_list to its own lock
  2017-08-10 16:22   ` Joonas Lahtinen
@ 2017-08-12 10:26     ` Chris Wilson
  2017-08-12 11:34       ` Chris Wilson
  2017-08-12 11:41       ` Chris Wilson
  0 siblings, 2 replies; 28+ messages in thread
From: Chris Wilson @ 2017-08-12 10:26 UTC (permalink / raw)
  To: Joonas Lahtinen, intel-gfx

Quoting Joonas Lahtinen (2017-08-10 17:22:49)
> On ke, 2017-07-26 at 14:26 +0100, Chris Wilson wrote:
> > +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > @@ -92,9 +92,6 @@ static bool swap_available(void)
> >  
> >  static bool can_release_pages(struct drm_i915_gem_object *obj)
> >  {
> > -     if (!i915_gem_object_has_pages(obj))
> > -             return false;
> 
> So you think the inaccuracies we get for being lockless don't matter
> compared to better forward progress?

Actually, looking at this again, it is all improvement. We no longer
have a scan over the inactive/active list here, just the bound/unbound
list, so the locking is equivalent. And we gain better accounting going
in (no longer do we have an untracked phase for objects that have pages
prior to ever seeing the struct_mutex and being placed on the unbound_list.

We still have an unpleasant walk over many 10s of thousands of objects.
Hmm, I wonder if sticking a cond_resched_lock() in there is sensible.
Probably.

> Maybe the cons should be documented in the commit message to help in
> bisecting.
> 
> > @@ -149,8 +149,6 @@ static int igt_overcommit(void *arg)
> >               goto cleanup;
> >       }
> >  
> > -     list_move(&obj->global_link, &i915->mm.unbound_list);
> 
> Accidentally dropped line?

No, a big change was that in dropping the struct_mutex requirement for
the unbound_list, we could do the list_add in i915_gem_object_init() and
so not need this.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 06/16] drm/i915: Move dev_priv->mm.[un]bound_list to its own lock
  2017-08-12 10:26     ` Chris Wilson
@ 2017-08-12 11:34       ` Chris Wilson
  2017-08-12 11:41       ` Chris Wilson
  1 sibling, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-08-12 11:34 UTC (permalink / raw)
  To: Joonas Lahtinen, intel-gfx

Quoting Chris Wilson (2017-08-12 11:26:08)
> Quoting Joonas Lahtinen (2017-08-10 17:22:49)
> > On ke, 2017-07-26 at 14:26 +0100, Chris Wilson wrote:
> > > +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > > @@ -92,9 +92,6 @@ static bool swap_available(void)
> > >  
> > >  static bool can_release_pages(struct drm_i915_gem_object *obj)
> > >  {
> > > -     if (!i915_gem_object_has_pages(obj))
> > > -             return false;
> > 
> > So you think the inaccuracies we get for being lockless don't matter
> > compared to better forward progress?
> 
> Actually, looking at this again, it is all improvement. We no longer
> have a scan over the inactive/active list here, just the bound/unbound
> list, so the locking is equivalent. And we gain better accounting going
> in (no longer do we have an untracked phase for objects that have pages
> prior to ever seeing the struct_mutex and being placed on the unbound_list.
> 
> We still have an unpleasant walk over many 10s of thousands of objects.
> Hmm, I wonder if sticking a cond_resched_lock() in there is sensible.
> Probably.

Scratch that, too complicated. If we do end up rescheduling, the count
may be even more wrong, or we restart entirely defeating the purpose.

It's not even that useful. Every time the system decides to reclaim a
page, we free a random number of pages. Not that I have a better answer,
just unease that we end up sacrificing too many of our own objects for
the buffercache.  At the back of my mind, I've considered bumping the
shrinker->batch to say 1024 (equivalent to 4MiB), but the heuristic is so
nebulous it's hard to know the impact it will have on the balance between
slabs.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 06/16] drm/i915: Move dev_priv->mm.[un]bound_list to its own lock
  2017-08-12 10:26     ` Chris Wilson
  2017-08-12 11:34       ` Chris Wilson
@ 2017-08-12 11:41       ` Chris Wilson
  1 sibling, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2017-08-12 11:41 UTC (permalink / raw)
  To: Joonas Lahtinen, intel-gfx

Quoting Chris Wilson (2017-08-12 11:26:08)
> Quoting Joonas Lahtinen (2017-08-10 17:22:49)
> > On ke, 2017-07-26 at 14:26 +0100, Chris Wilson wrote:
> > > +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > > @@ -92,9 +92,6 @@ static bool swap_available(void)
> > >  
> > >  static bool can_release_pages(struct drm_i915_gem_object *obj)
> > >  {
> > > -     if (!i915_gem_object_has_pages(obj))
> > > -             return false;
> > 
> > So you think the inaccuracies we get for being lockless don't matter
> > compared to better forward progress?
> 
> Actually, looking at this again, it is all improvement.

Bah. Forgot i915_gem_object_is_active() depends on the explicit
retirement and not the resv object lookup.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2017-08-12 11:41 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-07-26 13:25 More tweaks Chris Wilson
2017-07-26 13:25 ` [PATCH 01/16] drm/i915: Keep a small stash of preallocated WC pages Chris Wilson
2017-08-07 20:15   ` Matthew Auld
2017-08-11  7:34   ` Joonas Lahtinen
2017-08-11 11:55     ` Chris Wilson
2017-08-11  9:51   ` Jani Nikula
2017-07-26 13:25 ` [PATCH 02/16] drm/i915: Pin fence for iomap Chris Wilson
2017-07-26 13:25 ` [PATCH 03/16] drm/i915: Consolidate get_fence with pin_fence Chris Wilson
2017-07-26 13:25 ` [PATCH 04/16] drm/i915: Emit pipelined fence changes Chris Wilson
2017-07-26 13:25 ` [PATCH 05/16] drm/i915: Refactor testing obj->mm.pages Chris Wilson
2017-08-10 16:26   ` Joonas Lahtinen
2017-07-26 13:26 ` [PATCH 06/16] drm/i915: Move dev_priv->mm.[un]bound_list to its own lock Chris Wilson
2017-08-10 16:22   ` Joonas Lahtinen
2017-08-12 10:26     ` Chris Wilson
2017-08-12 11:34       ` Chris Wilson
2017-08-12 11:41       ` Chris Wilson
2017-07-26 13:26 ` [PATCH 07/16] drm/i915: Trim struct_mutex hold duration for i915_gem_free_objects Chris Wilson
2017-08-11  7:36   ` Joonas Lahtinen
2017-07-26 13:26 ` [PATCH 08/16] drm/i915: Only free the oldest stale object before a fresh allocation Chris Wilson
2017-07-26 13:26 ` [PATCH 09/16] drm/i915: Track user GTT faulting per-vma Chris Wilson
2017-07-26 13:26 ` [PATCH 10/16] drm/i915: Try a minimal attempt to insert the whole object for relocations Chris Wilson
2017-07-26 13:26 ` [PATCH 11/16] drm/i915: Check context status before looking up our obj/vma Chris Wilson
2017-07-26 13:26 ` [PATCH 12/16] drm/i915: Convert execbuf to use struct-of-array packing for critical fields Chris Wilson
2017-07-26 13:26 ` [PATCH 13/16] drm/i915: Simplify eb_lookup_vmas() Chris Wilson
2017-07-26 13:26 ` [PATCH 14/16] drm/i915: Replace execbuf vma ht with an idr Chris Wilson
2017-07-26 13:26 ` [PATCH 15/16] drm/i915: Trivial grammar fix s/opt of/opt out of/ in comment Chris Wilson
2017-07-26 13:26 ` [PATCH 16/16] drm/i915: Mark the GT as busy before idling the previous request Chris Wilson
2017-07-26 14:13 ` ✓ Fi.CI.BAT: success for series starting with [01/16] drm/i915: Keep a small stash of preallocated WC pages Patchwork

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox