public inbox for intel-gfx@lists.freedesktop.org
* [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work
@ 2015-03-16 16:00 Michel Thierry
  2015-03-16 16:00 ` [PATCH 1/5] drm/i915: page table generalizations Michel Thierry
                   ` (5 more replies)
  0 siblings, 6 replies; 15+ messages in thread
From: Michel Thierry @ 2015-03-16 16:00 UTC (permalink / raw)
  To: intel-gfx

Splitting out this prep work should ease code review and help identify
problems (it also shrinks the Gen8 patch series, which is of more interest).

Four of these patches have already been sent as part of the main
patchset; only "page table generalizations" is brand new (suggested by
Mika) and should help clean up some of the code.

Ben Widawsky (4):
  drm/i915: Extract context switch skip and add pd load logic
  drm/i915: Track GEN6 page table usage
  drm/i915: Track page table reload need
  drm/i915: Initialize all contexts

Michel Thierry (1):
  drm/i915: page table generalizations

 drivers/gpu/drm/i915/i915_gem.c            |   9 +
 drivers/gpu/drm/i915/i915_gem_context.c    | 103 +++++++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  11 +
 drivers/gpu/drm/i915/i915_gem_gtt.c        | 364 ++++++++++++++++++-----------
 drivers/gpu/drm/i915/i915_gem_gtt.h        | 102 +++++++-
 5 files changed, 434 insertions(+), 155 deletions(-)

-- 
2.1.1


* [PATCH 1/5] drm/i915: page table generalizations
  2015-03-16 16:00 [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Michel Thierry
@ 2015-03-16 16:00 ` Michel Thierry
  2015-03-16 16:00 ` [PATCH 2/5] drm/i915: Extract context switch skip and add pd load logic Michel Thierry
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 15+ messages in thread
From: Michel Thierry @ 2015-03-16 16:00 UTC (permalink / raw)
  To: intel-gfx

No functional changes, but this improves code clarity and removes some
duplicated defines.
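
As a quick sanity check of the unified helpers (assuming 4K pages, per
the definitions in i915_gem_gtt.h below):

	I915_PTES(pte_len) = PAGE_SIZE / pte_len
	GEN6_PTES = 4096 / sizeof(gen6_pte_t) = 4096 / 4 = 1024 PTEs per page table
	GEN8_PTES = 4096 / sizeof(gen8_pte_t) = 4096 / 8 =  512 PTEs per page table
	I915_PDES = 512 PDEs per page directory, on every platform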

Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c | 160 ++++++++++++++++++------------------
 drivers/gpu/drm/i915/i915_gem_gtt.h |  28 ++++---
 2 files changed, 96 insertions(+), 92 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 2034f7c..6d85bff 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -147,11 +147,11 @@ static void ppgtt_bind_vma(struct i915_vma *vma,
 			   u32 flags);
 static void ppgtt_unbind_vma(struct i915_vma *vma);
 
-static inline gen8_gtt_pte_t gen8_pte_encode(dma_addr_t addr,
-					     enum i915_cache_level level,
-					     bool valid)
+static inline gen8_pte_t gen8_pte_encode(dma_addr_t addr,
+					 enum i915_cache_level level,
+					 bool valid)
 {
-	gen8_gtt_pte_t pte = valid ? _PAGE_PRESENT | _PAGE_RW : 0;
+	gen8_pte_t pte = valid ? _PAGE_PRESENT | _PAGE_RW : 0;
 	pte |= addr;
 
 	switch (level) {
@@ -169,11 +169,11 @@ static inline gen8_gtt_pte_t gen8_pte_encode(dma_addr_t addr,
 	return pte;
 }
 
-static inline gen8_ppgtt_pde_t gen8_pde_encode(struct drm_device *dev,
-					     dma_addr_t addr,
-					     enum i915_cache_level level)
+static inline gen8_pde_t gen8_pde_encode(struct drm_device *dev,
+					  dma_addr_t addr,
+					  enum i915_cache_level level)
 {
-	gen8_ppgtt_pde_t pde = _PAGE_PRESENT | _PAGE_RW;
+	gen8_pde_t pde = _PAGE_PRESENT | _PAGE_RW;
 	pde |= addr;
 	if (level != I915_CACHE_NONE)
 		pde |= PPAT_CACHED_PDE_INDEX;
@@ -182,11 +182,11 @@ static inline gen8_ppgtt_pde_t gen8_pde_encode(struct drm_device *dev,
 	return pde;
 }
 
-static gen6_gtt_pte_t snb_pte_encode(dma_addr_t addr,
-				     enum i915_cache_level level,
-				     bool valid, u32 unused)
+static gen6_pte_t snb_pte_encode(dma_addr_t addr,
+				 enum i915_cache_level level,
+				 bool valid, u32 unused)
 {
-	gen6_gtt_pte_t pte = valid ? GEN6_PTE_VALID : 0;
+	gen6_pte_t pte = valid ? GEN6_PTE_VALID : 0;
 	pte |= GEN6_PTE_ADDR_ENCODE(addr);
 
 	switch (level) {
@@ -204,11 +204,11 @@ static gen6_gtt_pte_t snb_pte_encode(dma_addr_t addr,
 	return pte;
 }
 
-static gen6_gtt_pte_t ivb_pte_encode(dma_addr_t addr,
-				     enum i915_cache_level level,
-				     bool valid, u32 unused)
+static gen6_pte_t ivb_pte_encode(dma_addr_t addr,
+				 enum i915_cache_level level,
+				 bool valid, u32 unused)
 {
-	gen6_gtt_pte_t pte = valid ? GEN6_PTE_VALID : 0;
+	gen6_pte_t pte = valid ? GEN6_PTE_VALID : 0;
 	pte |= GEN6_PTE_ADDR_ENCODE(addr);
 
 	switch (level) {
@@ -228,11 +228,11 @@ static gen6_gtt_pte_t ivb_pte_encode(dma_addr_t addr,
 	return pte;
 }
 
-static gen6_gtt_pte_t byt_pte_encode(dma_addr_t addr,
-				     enum i915_cache_level level,
-				     bool valid, u32 flags)
+static gen6_pte_t byt_pte_encode(dma_addr_t addr,
+				 enum i915_cache_level level,
+				 bool valid, u32 flags)
 {
-	gen6_gtt_pte_t pte = valid ? GEN6_PTE_VALID : 0;
+	gen6_pte_t pte = valid ? GEN6_PTE_VALID : 0;
 	pte |= GEN6_PTE_ADDR_ENCODE(addr);
 
 	if (!(flags & PTE_READ_ONLY))
@@ -244,11 +244,11 @@ static gen6_gtt_pte_t byt_pte_encode(dma_addr_t addr,
 	return pte;
 }
 
-static gen6_gtt_pte_t hsw_pte_encode(dma_addr_t addr,
-				     enum i915_cache_level level,
-				     bool valid, u32 unused)
+static gen6_pte_t hsw_pte_encode(dma_addr_t addr,
+				 enum i915_cache_level level,
+				 bool valid, u32 unused)
 {
-	gen6_gtt_pte_t pte = valid ? GEN6_PTE_VALID : 0;
+	gen6_pte_t pte = valid ? GEN6_PTE_VALID : 0;
 	pte |= HSW_PTE_ADDR_ENCODE(addr);
 
 	if (level != I915_CACHE_NONE)
@@ -257,11 +257,11 @@ static gen6_gtt_pte_t hsw_pte_encode(dma_addr_t addr,
 	return pte;
 }
 
-static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
-				      enum i915_cache_level level,
-				      bool valid, u32 unused)
+static gen6_pte_t iris_pte_encode(dma_addr_t addr,
+				  enum i915_cache_level level,
+				  bool valid, u32 unused)
 {
-	gen6_gtt_pte_t pte = valid ? GEN6_PTE_VALID : 0;
+	gen6_pte_t pte = valid ? GEN6_PTE_VALID : 0;
 	pte |= HSW_PTE_ADDR_ENCODE(addr);
 
 	switch (level) {
@@ -323,7 +323,7 @@ static int alloc_pt_range(struct i915_page_directory_entry *pd, uint16_t pde, si
 	int i, ret;
 
 	/* 512 is the max page tables per page_directory on any platform. */
-	if (WARN_ON(pde + count > GEN6_PPGTT_PD_ENTRIES))
+	if (WARN_ON(pde + count > I915_PDES))
 		return -EINVAL;
 
 	for (i = pde; i < pde + count; i++) {
@@ -401,7 +401,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
 	int i, ret;
 
 	/* bit of a hack to find the actual last used pd */
-	int used_pd = ppgtt->num_pd_entries / GEN8_PDES_PER_PAGE;
+	int used_pd = ppgtt->num_pd_entries / I915_PDES;
 
 	for (i = used_pd - 1; i >= 0; i--) {
 		dma_addr_t addr = ppgtt->pdp.page_directory[i]->daddr;
@@ -420,7 +420,7 @@ static void gen8_ppgtt_clear_range(struct i915_address_space *vm,
 {
 	struct i915_hw_ppgtt *ppgtt =
 		container_of(vm, struct i915_hw_ppgtt, base);
-	gen8_gtt_pte_t *pt_vaddr, scratch_pte;
+	gen8_pte_t *pt_vaddr, scratch_pte;
 	unsigned pdpe = start >> GEN8_PDPE_SHIFT & GEN8_PDPE_MASK;
 	unsigned pde = start >> GEN8_PDE_SHIFT & GEN8_PDE_MASK;
 	unsigned pte = start >> GEN8_PTE_SHIFT & GEN8_PTE_MASK;
@@ -451,8 +451,8 @@ static void gen8_ppgtt_clear_range(struct i915_address_space *vm,
 		page_table = pt->page;
 
 		last_pte = pte + num_entries;
-		if (last_pte > GEN8_PTES_PER_PAGE)
-			last_pte = GEN8_PTES_PER_PAGE;
+		if (last_pte > GEN8_PTES)
+			last_pte = GEN8_PTES;
 
 		pt_vaddr = kmap_atomic(page_table);
 
@@ -466,7 +466,7 @@ static void gen8_ppgtt_clear_range(struct i915_address_space *vm,
 		kunmap_atomic(pt_vaddr);
 
 		pte = 0;
-		if (++pde == GEN8_PDES_PER_PAGE) {
+		if (++pde == I915_PDES) {
 			pdpe++;
 			pde = 0;
 		}
@@ -480,7 +480,7 @@ static void gen8_ppgtt_insert_entries(struct i915_address_space *vm,
 {
 	struct i915_hw_ppgtt *ppgtt =
 		container_of(vm, struct i915_hw_ppgtt, base);
-	gen8_gtt_pte_t *pt_vaddr;
+	gen8_pte_t *pt_vaddr;
 	unsigned pdpe = start >> GEN8_PDPE_SHIFT & GEN8_PDPE_MASK;
 	unsigned pde = start >> GEN8_PDE_SHIFT & GEN8_PDE_MASK;
 	unsigned pte = start >> GEN8_PTE_SHIFT & GEN8_PTE_MASK;
@@ -503,12 +503,12 @@ static void gen8_ppgtt_insert_entries(struct i915_address_space *vm,
 		pt_vaddr[pte] =
 			gen8_pte_encode(sg_page_iter_dma_address(&sg_iter),
 					cache_level, true);
-		if (++pte == GEN8_PTES_PER_PAGE) {
+		if (++pte == GEN8_PTES) {
 			if (!HAS_LLC(ppgtt->base.dev))
 				drm_clflush_virt_range(pt_vaddr, PAGE_SIZE);
 			kunmap_atomic(pt_vaddr);
 			pt_vaddr = NULL;
-			if (++pde == GEN8_PDES_PER_PAGE) {
+			if (++pde == I915_PDES) {
 				pdpe++;
 				pde = 0;
 			}
@@ -529,7 +529,7 @@ static void gen8_free_page_tables(struct i915_page_directory_entry *pd, struct d
 	if (!pd->page)
 		return;
 
-	for (i = 0; i < GEN8_PDES_PER_PAGE; i++) {
+	for (i = 0; i < I915_PDES; i++) {
 		if (WARN_ON(!pd->page_table[i]))
 			continue;
 
@@ -565,7 +565,7 @@ static void gen8_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
 		pci_unmap_page(hwdev, ppgtt->pdp.page_directory[i]->daddr, PAGE_SIZE,
 			       PCI_DMA_BIDIRECTIONAL);
 
-		for (j = 0; j < GEN8_PDES_PER_PAGE; j++) {
+		for (j = 0; j < I915_PDES; j++) {
 			struct i915_page_directory_entry *pd = ppgtt->pdp.page_directory[i];
 			struct i915_page_table_entry *pt;
 			dma_addr_t addr;
@@ -598,7 +598,7 @@ static int gen8_ppgtt_allocate_page_tables(struct i915_hw_ppgtt *ppgtt)
 
 	for (i = 0; i < ppgtt->num_pd_pages; i++) {
 		ret = alloc_pt_range(ppgtt->pdp.page_directory[i],
-				     0, GEN8_PDES_PER_PAGE, ppgtt->base.dev);
+				     0, I915_PDES, ppgtt->base.dev);
 		if (ret)
 			goto unwind_out;
 	}
@@ -648,7 +648,7 @@ static int gen8_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt,
 	if (ret)
 		goto err_out;
 
-	ppgtt->num_pd_entries = max_pdp * GEN8_PDES_PER_PAGE;
+	ppgtt->num_pd_entries = max_pdp * I915_PDES;
 
 	return 0;
 
@@ -710,7 +710,7 @@ static int gen8_ppgtt_setup_page_tables(struct i915_hw_ppgtt *ppgtt,
 static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt, uint64_t size)
 {
 	const int max_pdp = DIV_ROUND_UP(size, 1 << 30);
-	const int min_pt_pages = GEN8_PDES_PER_PAGE * max_pdp;
+	const int min_pt_pages = I915_PDES * max_pdp;
 	int i, j, ret;
 
 	if (size % (1<<30))
@@ -733,7 +733,7 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt, uint64_t size)
 		if (ret)
 			goto bail;
 
-		for (j = 0; j < GEN8_PDES_PER_PAGE; j++) {
+		for (j = 0; j < I915_PDES; j++) {
 			ret = gen8_ppgtt_setup_page_tables(ppgtt, i, j);
 			if (ret)
 				goto bail;
@@ -750,9 +750,9 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt, uint64_t size)
 	 */
 	for (i = 0; i < GEN8_LEGACY_PDPES; i++) {
 		struct i915_page_directory_entry *pd = ppgtt->pdp.page_directory[i];
-		gen8_ppgtt_pde_t *pd_vaddr;
+		gen8_pde_t *pd_vaddr;
 		pd_vaddr = kmap_atomic(ppgtt->pdp.page_directory[i]->page);
-		for (j = 0; j < GEN8_PDES_PER_PAGE; j++) {
+		for (j = 0; j < I915_PDES; j++) {
 			struct i915_page_table_entry *pt = pd->page_table[j];
 			dma_addr_t addr = pt->daddr;
 			pd_vaddr[j] = gen8_pde_encode(ppgtt->base.dev, addr,
@@ -770,11 +770,11 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt, uint64_t size)
 	ppgtt->base.start = 0;
 
 	/* This is the area that we advertise as usable for the caller */
-	ppgtt->base.total = max_pdp * GEN8_PDES_PER_PAGE * GEN8_PTES_PER_PAGE * PAGE_SIZE;
+	ppgtt->base.total = max_pdp * I915_PDES * GEN8_PTES * PAGE_SIZE;
 
 	/* Set all ptes to a valid scratch page. Also above requested space */
 	ppgtt->base.clear_range(&ppgtt->base, 0,
-				ppgtt->num_pd_pages * GEN8_PTES_PER_PAGE * PAGE_SIZE,
+				ppgtt->num_pd_pages * GEN8_PTES * PAGE_SIZE,
 				true);
 
 	DRM_DEBUG_DRIVER("Allocated %d pages for page directories (%d wasted)\n",
@@ -794,22 +794,22 @@ static void gen6_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
 {
 	struct drm_i915_private *dev_priv = ppgtt->base.dev->dev_private;
 	struct i915_address_space *vm = &ppgtt->base;
-	gen6_gtt_pte_t __iomem *pd_addr;
-	gen6_gtt_pte_t scratch_pte;
+	gen6_pte_t __iomem *pd_addr;
+	gen6_pte_t scratch_pte;
 	uint32_t pd_entry;
 	int pte, pde;
 
 	scratch_pte = vm->pte_encode(vm->scratch.addr, I915_CACHE_LLC, true, 0);
 
-	pd_addr = (gen6_gtt_pte_t __iomem *)dev_priv->gtt.gsm +
-		ppgtt->pd.pd_offset / sizeof(gen6_gtt_pte_t);
+	pd_addr = (gen6_pte_t __iomem *)dev_priv->gtt.gsm +
+		ppgtt->pd.pd_offset / sizeof(gen6_pte_t);
 
 	seq_printf(m, "  VM %p (pd_offset %x-%x):\n", vm,
 		   ppgtt->pd.pd_offset,
 		   ppgtt->pd.pd_offset + ppgtt->num_pd_entries);
 	for (pde = 0; pde < ppgtt->num_pd_entries; pde++) {
 		u32 expected;
-		gen6_gtt_pte_t *pt_vaddr;
+		gen6_pte_t *pt_vaddr;
 		dma_addr_t pt_addr = ppgtt->pd.page_table[pde]->daddr;
 		pd_entry = readl(pd_addr + pde);
 		expected = (GEN6_PDE_ADDR_ENCODE(pt_addr) | GEN6_PDE_VALID);
@@ -822,9 +822,9 @@ static void gen6_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
 		seq_printf(m, "\tPDE: %x\n", pd_entry);
 
 		pt_vaddr = kmap_atomic(ppgtt->pd.page_table[pde]->page);
-		for (pte = 0; pte < I915_PPGTT_PT_ENTRIES; pte+=4) {
+		for (pte = 0; pte < GEN6_PTES; pte+=4) {
 			unsigned long va =
-				(pde * PAGE_SIZE * I915_PPGTT_PT_ENTRIES) +
+				(pde * PAGE_SIZE * GEN6_PTES) +
 				(pte * PAGE_SIZE);
 			int i;
 			bool found = false;
@@ -850,13 +850,13 @@ static void gen6_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
 static void gen6_write_pdes(struct i915_hw_ppgtt *ppgtt)
 {
 	struct drm_i915_private *dev_priv = ppgtt->base.dev->dev_private;
-	gen6_gtt_pte_t __iomem *pd_addr;
+	gen6_pte_t __iomem *pd_addr;
 	uint32_t pd_entry;
 	int i;
 
 	WARN_ON(ppgtt->pd.pd_offset & 0x3f);
-	pd_addr = (gen6_gtt_pte_t __iomem*)dev_priv->gtt.gsm +
-		ppgtt->pd.pd_offset / sizeof(gen6_gtt_pte_t);
+	pd_addr = (gen6_pte_t __iomem*)dev_priv->gtt.gsm +
+		ppgtt->pd.pd_offset / sizeof(gen6_pte_t);
 	for (i = 0; i < ppgtt->num_pd_entries; i++) {
 		dma_addr_t pt_addr;
 
@@ -1022,19 +1022,19 @@ static void gen6_ppgtt_clear_range(struct i915_address_space *vm,
 {
 	struct i915_hw_ppgtt *ppgtt =
 		container_of(vm, struct i915_hw_ppgtt, base);
-	gen6_gtt_pte_t *pt_vaddr, scratch_pte;
+	gen6_pte_t *pt_vaddr, scratch_pte;
 	unsigned first_entry = start >> PAGE_SHIFT;
 	unsigned num_entries = length >> PAGE_SHIFT;
-	unsigned act_pt = first_entry / I915_PPGTT_PT_ENTRIES;
-	unsigned first_pte = first_entry % I915_PPGTT_PT_ENTRIES;
+	unsigned act_pt = first_entry / GEN6_PTES;
+	unsigned first_pte = first_entry % GEN6_PTES;
 	unsigned last_pte, i;
 
 	scratch_pte = vm->pte_encode(vm->scratch.addr, I915_CACHE_LLC, true, 0);
 
 	while (num_entries) {
 		last_pte = first_pte + num_entries;
-		if (last_pte > I915_PPGTT_PT_ENTRIES)
-			last_pte = I915_PPGTT_PT_ENTRIES;
+		if (last_pte > GEN6_PTES)
+			last_pte = GEN6_PTES;
 
 		pt_vaddr = kmap_atomic(ppgtt->pd.page_table[act_pt]->page);
 
@@ -1056,10 +1056,10 @@ static void gen6_ppgtt_insert_entries(struct i915_address_space *vm,
 {
 	struct i915_hw_ppgtt *ppgtt =
 		container_of(vm, struct i915_hw_ppgtt, base);
-	gen6_gtt_pte_t *pt_vaddr;
+	gen6_pte_t *pt_vaddr;
 	unsigned first_entry = start >> PAGE_SHIFT;
-	unsigned act_pt = first_entry / I915_PPGTT_PT_ENTRIES;
-	unsigned act_pte = first_entry % I915_PPGTT_PT_ENTRIES;
+	unsigned act_pt = first_entry / GEN6_PTES;
+	unsigned act_pte = first_entry % GEN6_PTES;
 	struct sg_page_iter sg_iter;
 
 	pt_vaddr = NULL;
@@ -1071,7 +1071,7 @@ static void gen6_ppgtt_insert_entries(struct i915_address_space *vm,
 			vm->pte_encode(sg_page_iter_dma_address(&sg_iter),
 				       cache_level, true, flags);
 
-		if (++act_pte == I915_PPGTT_PT_ENTRIES) {
+		if (++act_pte == GEN6_PTES) {
 			kunmap_atomic(pt_vaddr);
 			pt_vaddr = NULL;
 			act_pt++;
@@ -1150,7 +1150,7 @@ alloc:
 	if (ppgtt->node.start < dev_priv->gtt.mappable_end)
 		DRM_DEBUG("Forced to use aperture for PDEs\n");
 
-	ppgtt->num_pd_entries = GEN6_PPGTT_PD_ENTRIES;
+	ppgtt->num_pd_entries = I915_PDES;
 	return 0;
 }
 
@@ -1230,11 +1230,11 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 	ppgtt->base.insert_entries = gen6_ppgtt_insert_entries;
 	ppgtt->base.cleanup = gen6_ppgtt_cleanup;
 	ppgtt->base.start = 0;
-	ppgtt->base.total = ppgtt->num_pd_entries * I915_PPGTT_PT_ENTRIES * PAGE_SIZE;
+	ppgtt->base.total = ppgtt->num_pd_entries * GEN6_PTES * PAGE_SIZE;
 	ppgtt->debug_dump = gen6_dump_ppgtt;
 
 	ppgtt->pd.pd_offset =
-		ppgtt->node.start / PAGE_SIZE * sizeof(gen6_gtt_pte_t);
+		ppgtt->node.start / PAGE_SIZE * sizeof(gen6_pte_t);
 
 	ppgtt->base.clear_range(&ppgtt->base, 0, ppgtt->base.total, true);
 
@@ -1540,7 +1540,7 @@ int i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj)
 	return 0;
 }
 
-static inline void gen8_set_pte(void __iomem *addr, gen8_gtt_pte_t pte)
+static inline void gen8_set_pte(void __iomem *addr, gen8_pte_t pte)
 {
 #ifdef writeq
 	writeq(pte, addr);
@@ -1557,8 +1557,8 @@ static void gen8_ggtt_insert_entries(struct i915_address_space *vm,
 {
 	struct drm_i915_private *dev_priv = vm->dev->dev_private;
 	unsigned first_entry = start >> PAGE_SHIFT;
-	gen8_gtt_pte_t __iomem *gtt_entries =
-		(gen8_gtt_pte_t __iomem *)dev_priv->gtt.gsm + first_entry;
+	gen8_pte_t __iomem *gtt_entries =
+		(gen8_pte_t __iomem *)dev_priv->gtt.gsm + first_entry;
 	int i = 0;
 	struct sg_page_iter sg_iter;
 	dma_addr_t addr = 0; /* shut up gcc */
@@ -1603,8 +1603,8 @@ static void gen6_ggtt_insert_entries(struct i915_address_space *vm,
 {
 	struct drm_i915_private *dev_priv = vm->dev->dev_private;
 	unsigned first_entry = start >> PAGE_SHIFT;
-	gen6_gtt_pte_t __iomem *gtt_entries =
-		(gen6_gtt_pte_t __iomem *)dev_priv->gtt.gsm + first_entry;
+	gen6_pte_t __iomem *gtt_entries =
+		(gen6_pte_t __iomem *)dev_priv->gtt.gsm + first_entry;
 	int i = 0;
 	struct sg_page_iter sg_iter;
 	dma_addr_t addr = 0;
@@ -1642,8 +1642,8 @@ static void gen8_ggtt_clear_range(struct i915_address_space *vm,
 	struct drm_i915_private *dev_priv = vm->dev->dev_private;
 	unsigned first_entry = start >> PAGE_SHIFT;
 	unsigned num_entries = length >> PAGE_SHIFT;
-	gen8_gtt_pte_t scratch_pte, __iomem *gtt_base =
-		(gen8_gtt_pte_t __iomem *) dev_priv->gtt.gsm + first_entry;
+	gen8_pte_t scratch_pte, __iomem *gtt_base =
+		(gen8_pte_t __iomem *) dev_priv->gtt.gsm + first_entry;
 	const int max_entries = gtt_total_entries(dev_priv->gtt) - first_entry;
 	int i;
 
@@ -1668,8 +1668,8 @@ static void gen6_ggtt_clear_range(struct i915_address_space *vm,
 	struct drm_i915_private *dev_priv = vm->dev->dev_private;
 	unsigned first_entry = start >> PAGE_SHIFT;
 	unsigned num_entries = length >> PAGE_SHIFT;
-	gen6_gtt_pte_t scratch_pte, __iomem *gtt_base =
-		(gen6_gtt_pte_t __iomem *) dev_priv->gtt.gsm + first_entry;
+	gen6_pte_t scratch_pte, __iomem *gtt_base =
+		(gen6_pte_t __iomem *) dev_priv->gtt.gsm + first_entry;
 	const int max_entries = gtt_total_entries(dev_priv->gtt) - first_entry;
 	int i;
 
@@ -2181,7 +2181,7 @@ static int gen8_gmch_probe(struct drm_device *dev,
 		gtt_size = gen8_get_total_gtt_size(snb_gmch_ctl);
 	}
 
-	*gtt_total = (gtt_size / sizeof(gen8_gtt_pte_t)) << PAGE_SHIFT;
+	*gtt_total = (gtt_size / sizeof(gen8_pte_t)) << PAGE_SHIFT;
 
 	if (IS_CHERRYVIEW(dev))
 		chv_setup_private_ppat(dev_priv);
@@ -2226,7 +2226,7 @@ static int gen6_gmch_probe(struct drm_device *dev,
 	*stolen = gen6_get_stolen_size(snb_gmch_ctl);
 
 	gtt_size = gen6_get_total_gtt_size(snb_gmch_ctl);
-	*gtt_total = (gtt_size / sizeof(gen6_gtt_pte_t)) << PAGE_SHIFT;
+	*gtt_total = (gtt_size / sizeof(gen6_pte_t)) << PAGE_SHIFT;
 
 	ret = ggtt_probe_common(dev, gtt_size);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index c9e93f5..5ca7c5e 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -36,13 +36,13 @@
 
 struct drm_i915_file_private;
 
-typedef uint32_t gen6_gtt_pte_t;
-typedef uint64_t gen8_gtt_pte_t;
-typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
+typedef uint32_t gen6_pte_t;
+typedef uint64_t gen8_pte_t;
+typedef uint64_t gen8_pde_t;
 
 #define gtt_total_entries(gtt) ((gtt).base.total >> PAGE_SHIFT)
 
-#define I915_PPGTT_PT_ENTRIES		(PAGE_SIZE / sizeof(gen6_gtt_pte_t))
+
 /* gen6-hsw has bit 11-4 for physical addr bit 39-32 */
 #define GEN6_GTT_ADDR_ENCODE(addr)	((addr) | (((addr) >> 28) & 0xff0))
 #define GEN6_PTE_ADDR_ENCODE(addr)	GEN6_GTT_ADDR_ENCODE(addr)
@@ -51,8 +51,13 @@ typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
 #define GEN6_PTE_UNCACHED		(1 << 1)
 #define GEN6_PTE_VALID			(1 << 0)
 
-#define GEN6_PPGTT_PD_ENTRIES		512
-#define GEN6_PD_SIZE			(GEN6_PPGTT_PD_ENTRIES * PAGE_SIZE)
+#define I915_PTES(pte_len)		(PAGE_SIZE / (pte_len))
+#define I915_PTE_MASK(pte_len)		(I915_PTES(pte_len) - 1)
+#define I915_PDES			512
+#define I915_PDE_MASK			(I915_PDES - 1)
+
+#define GEN6_PTES			I915_PTES(sizeof(gen6_pte_t))
+#define GEN6_PD_SIZE		        (I915_PDES * PAGE_SIZE)
 #define GEN6_PD_ALIGN			(PAGE_SIZE * 16)
 #define GEN6_PDE_VALID			(1 << 0)
 
@@ -89,8 +94,7 @@ typedef gen8_gtt_pte_t gen8_ppgtt_pde_t;
 #define GEN8_PTE_SHIFT			12
 #define GEN8_PTE_MASK			0x1ff
 #define GEN8_LEGACY_PDPES		4
-#define GEN8_PTES_PER_PAGE		(PAGE_SIZE / sizeof(gen8_gtt_pte_t))
-#define GEN8_PDES_PER_PAGE		(PAGE_SIZE / sizeof(gen8_ppgtt_pde_t))
+#define GEN8_PTES			I915_PTES(sizeof(gen8_pte_t))
 
 #define PPAT_UNCACHED_INDEX		(_PAGE_PWT | _PAGE_PCD)
 #define PPAT_CACHED_PDE_INDEX		0 /* WB LLC */
@@ -199,7 +203,7 @@ struct i915_page_directory_entry {
 		dma_addr_t daddr;
 	};
 
-	struct i915_page_table_entry *page_table[GEN6_PPGTT_PD_ENTRIES]; /* PDEs */
+	struct i915_page_table_entry *page_table[I915_PDES]; /* PDEs */
 };
 
 struct i915_page_directory_pointer_entry {
@@ -243,9 +247,9 @@ struct i915_address_space {
 	struct list_head inactive_list;
 
 	/* FIXME: Need a more generic return type */
-	gen6_gtt_pte_t (*pte_encode)(dma_addr_t addr,
-				     enum i915_cache_level level,
-				     bool valid, u32 flags); /* Create a valid PTE */
+	gen6_pte_t (*pte_encode)(dma_addr_t addr,
+				 enum i915_cache_level level,
+				 bool valid, u32 flags); /* Create a valid PTE */
 	void (*clear_range)(struct i915_address_space *vm,
 			    uint64_t start,
 			    uint64_t length,
-- 
2.1.1


* [PATCH 2/5] drm/i915: Extract context switch skip and add pd load logic
  2015-03-16 16:00 [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Michel Thierry
  2015-03-16 16:00 ` [PATCH 1/5] drm/i915: page table generalizations Michel Thierry
@ 2015-03-16 16:00 ` Michel Thierry
  2015-03-16 16:00 ` [PATCH 3/5] drm/i915: Track GEN6 page table usage Michel Thierry
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 15+ messages in thread
From: Michel Thierry @ 2015-03-16 16:00 UTC (permalink / raw)
  To: intel-gfx

From: Ben Widawsky <benjamin.widawsky@intel.com>

On Gen8, PDPs are saved and restored with legacy contexts (legacy
contexts only exist on the render ring), so change the ordering of LRI
vs MI_SET_CONTEXT for the initialization of the context. The only cases
in which we need to update the PDPs manually are when MI_RESTORE_INHIBIT
has been set in MI_SET_CONTEXT (i.e. when the context is not yet
initialized or it is the default context).

Legacy submission is not available post GEN8, so it isn't necessary to
add extra checks for newer generations.
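
To summarize the resulting ordering, as encoded by the new
needs_pd_load_pre/needs_pd_load_post checks below:

	Gen6/7, any ring:           PD load via LRI before MI_SET_CONTEXT
	Gen8, non-render rings:     PD load via LRI (no legacy context to restore)
	Gen8, RCS, initialized ctx: no manual load, PDPs restored with the context
	Gen8, RCS, MI_RESTORE_INHIBIT (uninitialized or default ctx):
	                            PD load after MI_SET_CONTEXT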

v2: Use new functions to replace the logic right away (Daniel)
v3: Add missing pd load logic.
v4: Add warning in case pd_load_pre & pd_load_post are true, and add
missing trace_switch_mm. Cleaned up pd_load conditions. Add more
information about when is pd_load_post needed. (Mika)

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Michel Thierry <michel.thierry@intel.com> (v2+)
---
 drivers/gpu/drm/i915/i915_gem_context.c | 75 ++++++++++++++++++++++++++++++++-
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 70346b0..b6ea85d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -569,6 +569,56 @@ mi_set_context(struct intel_engine_cs *ring,
 	return ret;
 }
 
+static inline bool should_skip_switch(struct intel_engine_cs *ring,
+				      struct intel_context *from,
+				      struct intel_context *to)
+{
+	if (from == to && !to->remap_slice)
+		return true;
+
+	return false;
+}
+
+static bool
+needs_pd_load_pre(struct intel_engine_cs *ring, struct intel_context *to)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
+	if (!to->ppgtt)
+		return false;
+
+	if (INTEL_INFO(ring->dev)->gen < 8)
+		return true;
+
+	if (ring != &dev_priv->ring[RCS])
+		return true;
+
+	return false;
+}
+
+static bool
+needs_pd_load_post(struct intel_engine_cs *ring, struct intel_context *to)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
+	if (!to->ppgtt)
+		return false;
+
+	if (!IS_GEN8(ring->dev))
+		return false;
+
+	if (ring != &dev_priv->ring[RCS])
+		return false;
+
+	if (!to->legacy_hw_ctx.initialized)
+		return true;
+
+	if (i915_gem_context_is_default(to))
+		return true;
+
+	return false;
+}
+
 static int do_switch(struct intel_engine_cs *ring,
 		     struct intel_context *to)
 {
@@ -584,7 +634,7 @@ static int do_switch(struct intel_engine_cs *ring,
 		BUG_ON(!i915_gem_obj_is_pinned(from->legacy_hw_ctx.rcs_state));
 	}
 
-	if (from == to && !to->remap_slice)
+	if (should_skip_switch(ring, from, to))
 		return 0;
 
 	/* Trying to pin first makes error handling easier. */
@@ -602,7 +652,14 @@ static int do_switch(struct intel_engine_cs *ring,
 	 */
 	from = ring->last_context;
 
-	if (to->ppgtt) {
+	/* We should never emit switch_mm more than once */
+	WARN_ON(needs_pd_load_pre(ring, to) && needs_pd_load_post(ring, to));
+
+	if (needs_pd_load_pre(ring, to)) {
+		/* Older GENs and non render rings still want the load first,
+		 * "PP_DCLV followed by PP_DIR_BASE register through Load
+		 * Register Immediate commands in Ring Buffer before submitting
+		 * a context."*/
 		trace_switch_mm(ring, to);
 		ret = to->ppgtt->switch_mm(to->ppgtt, ring);
 		if (ret)
@@ -644,6 +701,20 @@ static int do_switch(struct intel_engine_cs *ring,
 	if (ret)
 		goto unpin_out;
 
+	if (needs_pd_load_post(ring, to)) {
+		trace_switch_mm(ring, to);
+		ret = to->ppgtt->switch_mm(to->ppgtt, ring);
+		/* The hardware context switch is emitted, but we haven't
+		 * actually changed the state - so it's probably safe to bail
+		 * here. Still, let the user know something dangerous has
+		 * happened.
+		 */
+		if (ret) {
+			DRM_ERROR("Failed to change address space on context switch\n");
+			goto unpin_out;
+		}
+	}
+
 	for (i = 0; i < MAX_L3_SLICES; i++) {
 		if (!(to->remap_slice & (1<<i)))
 			continue;
-- 
2.1.1


* [PATCH 3/5] drm/i915: Track GEN6 page table usage
  2015-03-16 16:00 [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Michel Thierry
  2015-03-16 16:00 ` [PATCH 1/5] drm/i915: page table generalizations Michel Thierry
  2015-03-16 16:00 ` [PATCH 2/5] drm/i915: Extract context switch skip and add pd load logic Michel Thierry
@ 2015-03-16 16:00 ` Michel Thierry
  2015-03-19 16:08   ` Daniel Vetter
  2015-03-19 16:13   ` Daniel Vetter
  2015-03-16 16:00 ` [PATCH 4/5] drm/i915: Track page table reload need Michel Thierry
                   ` (2 subsequent siblings)
  5 siblings, 2 replies; 15+ messages in thread
From: Michel Thierry @ 2015-03-16 16:00 UTC (permalink / raw)
  To: intel-gfx

From: Ben Widawsky <benjamin.widawsky@intel.com>

Instead of implementing the full tracking + dynamic allocation, this
patch does a bit less than half of the work by tracking and warning on
unexpected conditions. The tracking itself follows which PTEs within a
page table are currently being used for objects. The next patch will
modify this to actually allocate the page tables only when necessary.

With the current patch there isn't much standing in the way of a gen
agnostic range allocation function. However, the next patch will add
more gen specificity, which makes having separate functions a bit easier
to manage.

One important change introduced here is that DMA mappings are
created/destroyed at the same time that page directories/tables are
allocated/deallocated.

Notice that aliasing PPGTT is not managed here. The patch which actually
begins dynamic allocation/teardown explains the reasoning for this.
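
As a worked example of the new PTE helpers (defined in i915_gem_gtt.h
below), take an 8K range that straddles a Gen6 PDE boundary; with
GEN6_PDE_SHIFT = 22 each page table covers 4MB, i.e. 1024 PTEs:

	addr = 0x3ff000, length = 0x2000 (the last page of PDE 0 plus one more)
	gen6_pte_index(0x3ff000)         = (0x3ff000 >> 12) & 1023 = 1023
	gen6_pte_count(0x3ff000, 0x2000) = 1024 - 1023 = 1 (clipped at the boundary)

gen6_alloc_va_range therefore marks a single PTE in the first page
table, and the gen6_for_each_pde loop advances to the next PDE for the
remaining page.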

v2: s/pdp.page_directory/pdp.page_directories
Make a scratch page allocation helper

v3: Rebase and expand commit message.

v4: Allocate required pagetables only when it is needed, _bind_to_vm
instead of bind_vma (Daniel).

v5: Rebased to remove the unnecessary noise in the diff, also:
 - PDE mask is GEN agnostic, renamed GEN6_PDE_MASK to I915_PDE_MASK.
 - Removed unnecessary checks in gen6_alloc_va_range.
 - Changed map/unmap_px_single macros to use dma functions directly and
   be part of a static inline function instead.
 - Moved drm_device plumbing through page tables operation to its own
   patch.
 - Moved allocate/teardown_va_range calls until they are fully
   implemented (in subsequent patch).
 - Merged pt and scratch_pt unmap_and_free path.
 - Moved scratch page allocator helper to the patch that will use it.

v6: Reduce complexity by not tearing down pagetables dynamically; the
same can be achieved while freeing empty vms. (Daniel)

v7: s/i915_dma_map_px_single/i915_dma_map_single
s/gen6_write_pdes/gen6_write_pde
Prevent a NULL case when only GGTT is available. (Mika)

v8: Rebased after s/page_tables/page_table/.

v9: Reworked i915_pte_index and i915_pte_count.
Also exercise bitmap allocation here (gen6_alloc_va_range) and fix
incorrect write_page_range in i915_gem_restore_gtt_mappings (Mika).

Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Michel Thierry <michel.thierry@intel.com> (v3+)
---
 drivers/gpu/drm/i915/i915_gem.c     |   9 ++
 drivers/gpu/drm/i915/i915_gem_gtt.c | 199 +++++++++++++++++++++++++-----------
 drivers/gpu/drm/i915/i915_gem_gtt.h |  73 +++++++++++++
 3 files changed, 219 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e154770..07d427a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3600,6 +3600,15 @@ search_free:
 	if (ret)
 		goto err_remove_node;
 
+	/*  allocate before insert / bind */
+	if (vma->vm->allocate_va_range) {
+		ret = vma->vm->allocate_va_range(vma->vm,
+						vma->node.start,
+						vma->node.size);
+		if (ret)
+			goto err_remove_node;
+	}
+
 	trace_i915_vma_bind(vma, flags);
 	ret = i915_vma_bind(vma, obj->cache_level,
 			    flags & PIN_GLOBAL ? GLOBAL_BIND : 0);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 6d85bff..c50b4ab 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -278,29 +278,88 @@ static gen6_pte_t iris_pte_encode(dma_addr_t addr,
 	return pte;
 }
 
-static void unmap_and_free_pt(struct i915_page_table_entry *pt, struct drm_device *dev)
+#define i915_dma_unmap_single(px, dev) \
+	__i915_dma_unmap_single((px)->daddr, dev)
+
+static inline void __i915_dma_unmap_single(dma_addr_t daddr,
+					struct drm_device *dev)
+{
+	struct device *device = &dev->pdev->dev;
+
+	dma_unmap_page(device, daddr, 4096, PCI_DMA_BIDIRECTIONAL);
+}
+
+/**
+ * i915_dma_map_single() - Create a dma mapping for a page table/dir/etc.
+ * @px:	Page table/dir/etc to get a DMA map for
+ * @dev:	drm device
+ *
+ * Page table allocations are unified across all gens. They always require a
+ * single 4k allocation, as well as a DMA mapping. If we keep the structs
+ * symmetric here, the simple macro covers us for every page table type.
+ *
+ * Return: 0 if success.
+ */
+#define i915_dma_map_single(px, dev) \
+	i915_dma_map_page_single((px)->page, (dev), &(px)->daddr)
+
+static inline int i915_dma_map_page_single(struct page *page,
+					   struct drm_device *dev,
+					   dma_addr_t *daddr)
+{
+	struct device *device = &dev->pdev->dev;
+
+	*daddr = dma_map_page(device, page, 0, 4096, PCI_DMA_BIDIRECTIONAL);
+	return dma_mapping_error(device, *daddr);
+}
+
+static void unmap_and_free_pt(struct i915_page_table_entry *pt,
+			       struct drm_device *dev)
 {
 	if (WARN_ON(!pt->page))
 		return;
+
+	i915_dma_unmap_single(pt, dev);
 	__free_page(pt->page);
+	kfree(pt->used_ptes);
 	kfree(pt);
 }
 
 static struct i915_page_table_entry *alloc_pt_single(struct drm_device *dev)
 {
 	struct i915_page_table_entry *pt;
+	const size_t count = INTEL_INFO(dev)->gen >= 8 ?
+		GEN8_PTES : GEN6_PTES;
+	int ret = -ENOMEM;
 
 	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
 	if (!pt)
 		return ERR_PTR(-ENOMEM);
 
+	pt->used_ptes = kcalloc(BITS_TO_LONGS(count), sizeof(*pt->used_ptes),
+				GFP_KERNEL);
+
+	if (!pt->used_ptes)
+		goto fail_bitmap;
+
 	pt->page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-	if (!pt->page) {
-		kfree(pt);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!pt->page)
+		goto fail_page;
+
+	ret = i915_dma_map_single(pt, dev);
+	if (ret)
+		goto fail_dma;
 
 	return pt;
+
+fail_dma:
+	__free_page(pt->page);
+fail_page:
+	kfree(pt->used_ptes);
+fail_bitmap:
+	kfree(pt);
+
+	return ERR_PTR(ret);
 }
 
 /**
@@ -847,26 +906,36 @@ static void gen6_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
 	}
 }
 
-static void gen6_write_pdes(struct i915_hw_ppgtt *ppgtt)
+/* Write pde (index) from the page directory @pd to the page table @pt */
+static void gen6_write_pde(struct i915_page_directory_entry *pd,
+			    const int pde, struct i915_page_table_entry *pt)
 {
-	struct drm_i915_private *dev_priv = ppgtt->base.dev->dev_private;
-	gen6_pte_t __iomem *pd_addr;
-	uint32_t pd_entry;
-	int i;
+	/* Caller needs to make sure the write completes if necessary */
+	struct i915_hw_ppgtt *ppgtt =
+		container_of(pd, struct i915_hw_ppgtt, pd);
+	u32 pd_entry;
 
-	WARN_ON(ppgtt->pd.pd_offset & 0x3f);
-	pd_addr = (gen6_pte_t __iomem*)dev_priv->gtt.gsm +
-		ppgtt->pd.pd_offset / sizeof(gen6_pte_t);
-	for (i = 0; i < ppgtt->num_pd_entries; i++) {
-		dma_addr_t pt_addr;
+	pd_entry = GEN6_PDE_ADDR_ENCODE(pt->daddr);
+	pd_entry |= GEN6_PDE_VALID;
 
-		pt_addr = ppgtt->pd.page_table[i]->daddr;
-		pd_entry = GEN6_PDE_ADDR_ENCODE(pt_addr);
-		pd_entry |= GEN6_PDE_VALID;
+	writel(pd_entry, ppgtt->pd_addr + pde);
+}
 
-		writel(pd_entry, pd_addr + i);
-	}
-	readl(pd_addr);
+/* Write all the page tables found in the ppgtt structure to incrementing page
+ * directories. */
+static void gen6_write_page_range(struct drm_i915_private *dev_priv,
+				  struct i915_page_directory_entry *pd,
+				  uint32_t start, uint32_t length)
+{
+	struct i915_page_table_entry *pt;
+	uint32_t pde, temp;
+
+	gen6_for_each_pde(pt, pd, start, length, temp, pde)
+		gen6_write_pde(pd, pde, pt);
+
+	/* Make sure write is complete before other code can use this page
+	 * table. Also required for WC mapped PTEs */
+	readl(dev_priv->gtt.gsm);
 }
 
 static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
@@ -1092,6 +1161,28 @@ static void gen6_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
 			       4096, PCI_DMA_BIDIRECTIONAL);
 }
 
+static int gen6_alloc_va_range(struct i915_address_space *vm,
+			       uint64_t start, uint64_t length)
+{
+	struct i915_hw_ppgtt *ppgtt =
+				container_of(vm, struct i915_hw_ppgtt, base);
+	struct i915_page_table_entry *pt;
+	uint32_t pde, temp;
+
+	gen6_for_each_pde(pt, &ppgtt->pd, start, length, temp, pde) {
+		DECLARE_BITMAP(tmp_bitmap, GEN6_PTES);
+
+		bitmap_zero(tmp_bitmap, GEN6_PTES);
+		bitmap_set(tmp_bitmap, gen6_pte_index(start),
+			   gen6_pte_count(start, length));
+
+		bitmap_or(pt->used_ptes, pt->used_ptes, tmp_bitmap,
+				GEN6_PTES);
+	}
+
+	return 0;
+}
+
 static void gen6_ppgtt_free(struct i915_hw_ppgtt *ppgtt)
 {
 	int i;
@@ -1138,20 +1229,24 @@ alloc:
 					       0, dev_priv->gtt.base.total,
 					       0);
 		if (ret)
-			return ret;
+			goto err_out;
 
 		retried = true;
 		goto alloc;
 	}
 
 	if (ret)
-		return ret;
+		goto err_out;
+
 
 	if (ppgtt->node.start < dev_priv->gtt.mappable_end)
 		DRM_DEBUG("Forced to use aperture for PDEs\n");
 
 	ppgtt->num_pd_entries = I915_PDES;
 	return 0;
+
+err_out:
+	return ret;
 }
 
 static int gen6_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt)
@@ -1173,30 +1268,6 @@ static int gen6_ppgtt_alloc(struct i915_hw_ppgtt *ppgtt)
 	return 0;
 }
 
-static int gen6_ppgtt_setup_page_tables(struct i915_hw_ppgtt *ppgtt)
-{
-	struct drm_device *dev = ppgtt->base.dev;
-	int i;
-
-	for (i = 0; i < ppgtt->num_pd_entries; i++) {
-		struct page *page;
-		dma_addr_t pt_addr;
-
-		page = ppgtt->pd.page_table[i]->page;
-		pt_addr = pci_map_page(dev->pdev, page, 0, 4096,
-				       PCI_DMA_BIDIRECTIONAL);
-
-		if (pci_dma_mapping_error(dev->pdev, pt_addr)) {
-			gen6_ppgtt_unmap_pages(ppgtt);
-			return -EIO;
-		}
-
-		ppgtt->pd.page_table[i]->daddr = pt_addr;
-	}
-
-	return 0;
-}
-
 static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 {
 	struct drm_device *dev = ppgtt->base.dev;
@@ -1220,12 +1291,7 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 	if (ret)
 		return ret;
 
-	ret = gen6_ppgtt_setup_page_tables(ppgtt);
-	if (ret) {
-		gen6_ppgtt_free(ppgtt);
-		return ret;
-	}
-
+	ppgtt->base.allocate_va_range = gen6_alloc_va_range;
 	ppgtt->base.clear_range = gen6_ppgtt_clear_range;
 	ppgtt->base.insert_entries = gen6_ppgtt_insert_entries;
 	ppgtt->base.cleanup = gen6_ppgtt_cleanup;
@@ -1236,13 +1302,17 @@ static int gen6_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 	ppgtt->pd.pd_offset =
 		ppgtt->node.start / PAGE_SIZE * sizeof(gen6_pte_t);
 
+	ppgtt->pd_addr = (gen6_pte_t __iomem *)dev_priv->gtt.gsm +
+		ppgtt->pd.pd_offset / sizeof(gen6_pte_t);
+
 	ppgtt->base.clear_range(&ppgtt->base, 0, ppgtt->base.total, true);
 
+	gen6_write_page_range(dev_priv, &ppgtt->pd, 0, ppgtt->base.total);
+
 	DRM_DEBUG_DRIVER("Allocated pde space (%lldM) at GTT entry: %llx\n",
 			 ppgtt->node.size >> 20,
 			 ppgtt->node.start / PAGE_SIZE);
 
-	gen6_write_pdes(ppgtt);
 	DRM_DEBUG("Adding PPGTT at offset %x\n",
 		  ppgtt->pd.pd_offset << 10);
 
@@ -1513,15 +1583,20 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 		return;
 	}
 
-	list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
-		/* TODO: Perhaps it shouldn't be gen6 specific */
-		if (i915_is_ggtt(vm)) {
-			if (dev_priv->mm.aliasing_ppgtt)
-				gen6_write_pdes(dev_priv->mm.aliasing_ppgtt);
-			continue;
-		}
+	if (USES_PPGTT(dev)) {
+		list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
+			/* TODO: Perhaps it shouldn't be gen6 specific */
+
+			struct i915_hw_ppgtt *ppgtt =
+					container_of(vm, struct i915_hw_ppgtt,
+						     base);
 
-		gen6_write_pdes(container_of(vm, struct i915_hw_ppgtt, base));
+			if (i915_is_ggtt(vm))
+				ppgtt = dev_priv->mm.aliasing_ppgtt;
+
+			gen6_write_page_range(dev_priv, &ppgtt->pd,
+					      0, ppgtt->base.total);
+		}
 	}
 
 	i915_ggtt_flush(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 5ca7c5e..c30af62 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -55,10 +55,12 @@ typedef uint64_t gen8_pde_t;
 #define I915_PTE_MASK(pte_len)		(I915_PTES(pte_len) - 1)
 #define I915_PDES			512
 #define I915_PDE_MASK			(I915_PDES - 1)
+#define NUM_PTE(pde_shift)     (1 << (pde_shift - PAGE_SHIFT))
 
 #define GEN6_PTES			I915_PTES(sizeof(gen6_pte_t))
 #define GEN6_PD_SIZE		        (I915_PDES * PAGE_SIZE)
 #define GEN6_PD_ALIGN			(PAGE_SIZE * 16)
+#define GEN6_PDE_SHIFT			22
 #define GEN6_PDE_VALID			(1 << 0)
 
 #define GEN7_PTE_CACHE_L3_LLC		(3 << 1)
@@ -194,6 +196,8 @@ struct i915_vma {
 struct i915_page_table_entry {
 	struct page *page;
 	dma_addr_t daddr;
+
+	unsigned long *used_ptes;
 };
 
 struct i915_page_directory_entry {
@@ -250,6 +254,9 @@ struct i915_address_space {
 	gen6_pte_t (*pte_encode)(dma_addr_t addr,
 				 enum i915_cache_level level,
 				 bool valid, u32 flags); /* Create a valid PTE */
+	int (*allocate_va_range)(struct i915_address_space *vm,
+				 uint64_t start,
+				 uint64_t length);
 	void (*clear_range)(struct i915_address_space *vm,
 			    uint64_t start,
 			    uint64_t length,
@@ -302,12 +309,78 @@ struct i915_hw_ppgtt {
 
 	struct drm_i915_file_private *file_priv;
 
+	gen6_pte_t __iomem *pd_addr;
+
 	int (*enable)(struct i915_hw_ppgtt *ppgtt);
 	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
 			 struct intel_engine_cs *ring);
 	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
 };
 
+/* For each pde, iterates over every pde between start and start + length.
+ * If start and start+length are not perfectly divisible, the macro will round
+ * down, and up as needed. The macro modifies pde, start, and length. Dev is
+ * only used to differentiate shift values. Temp is temp.  On gen6/7, start = 0,
+ * and length = 2G effectively iterates over every PDE in the system.
+ *
+ * XXX: temp is not actually needed, but it saves doing the ALIGN operation.
+ */
+#define gen6_for_each_pde(pt, pd, start, length, temp, iter) \
+	for (iter = gen6_pde_index(start), pt = (pd)->page_table[iter]; \
+	     length > 0 && iter < I915_PDES; \
+	     pt = (pd)->page_table[++iter], \
+	     temp = ALIGN(start+1, 1 << GEN6_PDE_SHIFT) - start, \
+	     temp = min_t(unsigned, temp, length), \
+	     start += temp, length -= temp)
+
+static inline uint32_t i915_pte_index(uint64_t address, uint32_t pde_shift)
+{
+	const uint32_t mask = NUM_PTE(pde_shift) - 1;
+
+	return (address >> PAGE_SHIFT) & mask;
+}
+
+/* Helper to count the number of PTEs within the given length. This count
+ * does not cross a page table boundary, so the max value would be
+ * GEN6_PTES for GEN6, and GEN8_PTES for GEN8.
+*/
+static inline uint32_t i915_pte_count(uint64_t addr, size_t length,
+					uint32_t pde_shift)
+{
+	const uint64_t mask = ~((1 << pde_shift) - 1);
+	uint64_t end;
+
+	BUG_ON(length == 0);
+	BUG_ON(offset_in_page(addr|length));
+
+	end = addr + length;
+
+	if ((addr & mask) != (end & mask))
+		return NUM_PTE(pde_shift) - i915_pte_index(addr, pde_shift);
+
+	return i915_pte_index(end, pde_shift) - i915_pte_index(addr, pde_shift);
+}
+
+static inline uint32_t i915_pde_index(uint64_t addr, uint32_t shift)
+{
+	return (addr >> shift) & I915_PDE_MASK;
+}
+
+static inline uint32_t gen6_pte_index(uint32_t addr)
+{
+	return i915_pte_index(addr, GEN6_PDE_SHIFT);
+}
+
+static inline size_t gen6_pte_count(uint32_t addr, uint32_t length)
+{
+	return i915_pte_count(addr, length, GEN6_PDE_SHIFT);
+}
+
+static inline uint32_t gen6_pde_index(uint32_t addr)
+{
+	return i915_pde_index(addr, GEN6_PDE_SHIFT);
+}
+
 int i915_gem_gtt_init(struct drm_device *dev);
 void i915_gem_init_global_gtt(struct drm_device *dev);
 void i915_global_gtt_cleanup(struct drm_device *dev);
-- 
2.1.1


* [PATCH 4/5] drm/i915: Track page table reload need
  2015-03-16 16:00 [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Michel Thierry
                   ` (2 preceding siblings ...)
  2015-03-16 16:00 ` [PATCH 3/5] drm/i915: Track GEN6 page table usage Michel Thierry
@ 2015-03-16 16:00 ` Michel Thierry
  2015-03-19  9:01   ` Mika Kuoppala
  2015-03-19 12:53   ` [PATCH] " Michel Thierry
  2015-03-16 16:00 ` [PATCH 5/5] drm/i915: Initialize all contexts Michel Thierry
  2015-03-19 13:32 ` [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Mika Kuoppala
  5 siblings, 2 replies; 15+ messages in thread
From: Michel Thierry @ 2015-03-16 16:00 UTC (permalink / raw)
  To: intel-gfx

From: Ben Widawsky <benjamin.widawsky@intel.com>

This patch was formerly known as "Force pd restore when PDEs change,
gen6-7". I had to change the name because it is needed for GEN8 too.

The real issue this is trying to solve is when a new object is mapped
into the current address space. The GPU does not snoop the new mapping
so we must do the gen specific action to reload the page tables.

GEN8 and GEN7 do differ in the way they load page tables for the RCS.
GEN8 does so with the context restore, while GEN7 requires the proper
load commands in the command streamer. Non-render is similar for both.

Caveat for GEN7
The docs say you cannot change the PDEs of a currently running context.
We never map new PDEs of a running context, and expect them to be
present - so I think this is okay. (We can unmap, but this should also
be okay since we only unmap unreferenced objects that the GPU shouldn't
be trying to va->pa xlate.) The MI_SET_CONTEXT command does have a flag
to signal that even if the context is the same, force a reload. It's
unclear exactly what this does, but I have a hunch it's the right thing
to do.

The logic assumes that we always emit a context switch after mapping new
PDEs, and before we submit a batch. This is the case today, and has been
the case since the inception of hardware contexts. A note in the comment
lets the user know.

It's not just for gen8: if the mappings of the current context change,
we need a context reload for the switch to take effect.
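
Condensed, the mechanism added below is:

	/* On any PDE change, mark the page tables stale for every ring: */
	ppgtt->pd_dirty_rings = INTEL_INFO(ppgtt->base.dev)->ring_mask;

	/* In do_switch(), each ring clears its own bit when it reloads: */
	else if (to->ppgtt &&
		 test_and_clear_bit(ring->id, &to->ppgtt->pd_dirty_rings))
		hw_flags |= MI_FORCE_RESTORE;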

v2: Rebased after ppgtt clean up patches. Split the warning for aliasing
and true ppgtt options. And do not break aliasing ppgtt, where to->ppgtt
is always null.

v3: Invalidate PPGTT TLBs inside alloc_va_range.

v4: Rename ppgtt_invalidate_tlbs to mark_tlbs_dirty and move
pd_dirty_rings from i915_address_space to i915_hw_ppgtt. Fixes when
neither ctx->ppgtt and aliasing_ppgtt exist.

v5: Removed references to teardown_va_range.

v6: Updated needs_pd_load_pre/post.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Michel Thierry <michel.thierry@intel.com> (v2+)
---
 drivers/gpu/drm/i915/i915_gem_context.c    | 26 ++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 +++++++++++
 drivers/gpu/drm/i915/i915_gem_gtt.c        | 11 +++++++++++
 drivers/gpu/drm/i915/i915_gem_gtt.h        |  1 +
 4 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index b6ea85d..9197ff4 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -573,8 +573,20 @@ static inline bool should_skip_switch(struct intel_engine_cs *ring,
 				      struct intel_context *from,
 				      struct intel_context *to)
 {
-	if (from == to && !to->remap_slice)
-		return true;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
+	if (to->remap_slice)
+		return false;
+
+	if (to->ppgtt) {
+		if (from == to && !test_bit(ring->id,
+				&to->ppgtt->pd_dirty_rings))
+			return true;
+	} else if (dev_priv->mm.aliasing_ppgtt) {
+		if (from == to && !test_bit(ring->id,
+				&dev_priv->mm.aliasing_ppgtt->pd_dirty_rings))
+			return true;
+	}
 
 	return false;
 }
@@ -610,10 +622,7 @@ needs_pd_load_post(struct intel_engine_cs *ring, struct intel_context *to)
 	if (ring != &dev_priv->ring[RCS])
 		return false;
 
-	if (!to->legacy_hw_ctx.initialized)
-		return true;
-
-	if (i915_gem_context_is_default(to))
+	if (&to->ppgtt->pd_dirty_rings)
 		return true;
 
 	return false;
@@ -664,6 +673,9 @@ static int do_switch(struct intel_engine_cs *ring,
 		ret = to->ppgtt->switch_mm(to->ppgtt, ring);
 		if (ret)
 			goto unpin_out;
+
+		/* Doing a PD load always reloads the page dirs */
+		clear_bit(ring->id, &to->ppgtt->pd_dirty_rings);
 	}
 
 	if (ring != &dev_priv->ring[RCS]) {
@@ -696,6 +708,8 @@ static int do_switch(struct intel_engine_cs *ring,
 
 	if (!to->legacy_hw_ctx.initialized || i915_gem_context_is_default(to))
 		hw_flags |= MI_RESTORE_INHIBIT;
+	else if (to->ppgtt && test_and_clear_bit(ring->id, &to->ppgtt->pd_dirty_rings))
+		hw_flags |= MI_FORCE_RESTORE;
 
 	ret = mi_set_context(ring, to, hw_flags);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index dc10bc4..fd0241a 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1187,6 +1187,13 @@ i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
 	if (ret)
 		goto error;
 
+	if (ctx->ppgtt)
+		WARN(ctx->ppgtt->pd_dirty_rings & (1<<ring->id),
+			"%s didn't clear reload\n", ring->name);
+	else if (dev_priv->mm.aliasing_ppgtt)
+		WARN(dev_priv->mm.aliasing_ppgtt->pd_dirty_rings &
+			(1<<ring->id), "%s didn't clear reload\n", ring->name);
+
 	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
 	instp_mask = I915_EXEC_CONSTANTS_MASK;
 	switch (instp_mode) {
@@ -1456,6 +1463,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		goto err;
 
+	/* XXX: Reserve may have changed PDEs, which means we must do a
+	 * context switch before we can coherently read some of the reserved
+	 * VMAs. */
+
 	/* The objects are in their final locations, apply the relocations. */
 	if (need_relocs)
 		ret = i915_gem_execbuffer_relocate(eb);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index c50b4ab..1758fd9 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1161,6 +1161,16 @@ static void gen6_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
 			       4096, PCI_DMA_BIDIRECTIONAL);
 }
 
+/* PDE TLBs are a pain to invalidate pre GEN8. It requires a context reload. If we
+ * are switching between contexts with the same LRCA, we also must do a force
+ * restore.
+ */
+static inline void mark_tlbs_dirty(struct i915_hw_ppgtt *ppgtt)
+{
+	/* If current vm != vm, */
+	ppgtt->pd_dirty_rings = INTEL_INFO(ppgtt->base.dev)->ring_mask;
+}
+
 static int gen6_alloc_va_range(struct i915_address_space *vm,
 			       uint64_t start, uint64_t length)
 {
@@ -1180,6 +1190,7 @@ static int gen6_alloc_va_range(struct i915_address_space *vm,
 				GEN6_PTES);
 	}
 
+	mark_tlbs_dirty(ppgtt);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index c30af62..074b1fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -300,6 +300,7 @@ struct i915_hw_ppgtt {
 	struct i915_address_space base;
 	struct kref ref;
 	struct drm_mm_node node;
+	unsigned long pd_dirty_rings;
 	unsigned num_pd_entries;
 	unsigned num_pd_pages; /* gen8+ */
 	union {
-- 
2.1.1


* [PATCH 5/5] drm/i915: Initialize all contexts
  2015-03-16 16:00 [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Michel Thierry
                   ` (3 preceding siblings ...)
  2015-03-16 16:00 ` [PATCH 4/5] drm/i915: Track page table reload need Michel Thierry
@ 2015-03-16 16:00 ` Michel Thierry
  2015-03-16 21:39   ` shuang.he
  2015-03-19 13:32 ` [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Mika Kuoppala
  5 siblings, 1 reply; 15+ messages in thread
From: Michel Thierry @ 2015-03-16 16:00 UTC (permalink / raw)
  To: intel-gfx

From: Ben Widawsky <benjamin.widawsky@intel.com>

The problem is we're going to switch to a new context, which could be
the default context. The plan was to use restore inhibit, which would be
fine, except if we are using dynamic page tables (which we will). If we
use dynamic page tables and we don't load new page tables, the previous
page tables might go away, and future operations will fault.

CTXA runs.
switch to default, restore inhibit
CTXA dies and has its address space taken away.
Run CTXB; it tries to save using context A's address space - this fails.

The general solution is to make sure every context has its own state,
and its own address space. For cases when we must inhibit the restore,
the first thing we do is load a valid address space. I thought this
would be enough, but apparently there are references within the context
itself which refer to the old address space - therefore, we also must
reinitialize.
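
The resulting flow in do_switch() is roughly this (condensed from the
hunks below):

	if (!to->legacy_hw_ctx.initialized)
		hw_flags |= MI_RESTORE_INHIBIT;	/* context image not valid yet */

	mi_set_context(ring, to, hw_flags);

	if (needs_pd_load_post(ring, to, hw_flags))	/* restore was inhibited */
		to->ppgtt->switch_mm(to->ppgtt, ring);	/* load a valid address space */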

v2: to->ppgtt is only valid in full ppgtt.
v3: Rebased.
v4: Make post PDP update clearer.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Michel Thierry <michel.thierry@intel.com> (v2+)
---
 drivers/gpu/drm/i915/i915_gem_context.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 9197ff4..f3e84c4 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -609,7 +609,8 @@ needs_pd_load_pre(struct intel_engine_cs *ring, struct intel_context *to)
 }
 
 static bool
-needs_pd_load_post(struct intel_engine_cs *ring, struct intel_context *to)
+needs_pd_load_post(struct intel_engine_cs *ring, struct intel_context *to,
+		u32 hw_flags)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 
@@ -622,7 +623,7 @@ needs_pd_load_post(struct intel_engine_cs *ring, struct intel_context *to)
 	if (ring != &dev_priv->ring[RCS])
 		return false;
 
-	if (&to->ppgtt->pd_dirty_rings)
+	if (hw_flags & MI_RESTORE_INHIBIT)
 		return true;
 
 	return false;
@@ -661,9 +662,6 @@ static int do_switch(struct intel_engine_cs *ring,
 	 */
 	from = ring->last_context;
 
-	/* We should never emit switch_mm more than once */
-	WARN_ON(needs_pd_load_pre(ring, to) && needs_pd_load_post(ring, to));
-
 	if (needs_pd_load_pre(ring, to)) {
 		/* Older GENs and non render rings still want the load first,
 		 * "PP_DCLV followed by PP_DIR_BASE register through Load
@@ -706,16 +704,28 @@ static int do_switch(struct intel_engine_cs *ring,
 			goto unpin_out;
 	}
 
-	if (!to->legacy_hw_ctx.initialized || i915_gem_context_is_default(to))
+	if (!to->legacy_hw_ctx.initialized) {
 		hw_flags |= MI_RESTORE_INHIBIT;
-	else if (to->ppgtt && test_and_clear_bit(ring->id, &to->ppgtt->pd_dirty_rings))
+		/* NB: If we inhibit the restore, the context is not allowed to
+		 * die because future work may end up depending on valid address
+		 * space. This means we must enforce that a page table load
+		 * occurs whenever this happens. */
+	} else if (to->ppgtt &&
+			test_and_clear_bit(ring->id, &to->ppgtt->pd_dirty_rings))
 		hw_flags |= MI_FORCE_RESTORE;
 
+	/* We should never emit switch_mm more than once */
+	WARN_ON(needs_pd_load_pre(ring, to) &&
+			needs_pd_load_post(ring, to, hw_flags));
+
 	ret = mi_set_context(ring, to, hw_flags);
 	if (ret)
 		goto unpin_out;
 
-	if (needs_pd_load_post(ring, to)) {
+	/* GEN8 does *not* require an explicit reload if the PDPs have been
+	 * setup, and we do not wish to move them.
+	 */
+	if (needs_pd_load_post(ring, to, hw_flags)) {
 		trace_switch_mm(ring, to);
 		ret = to->ppgtt->switch_mm(to->ppgtt, ring);
 		/* The hardware context switch is emitted, but we haven't
@@ -766,7 +776,7 @@ static int do_switch(struct intel_engine_cs *ring,
 		i915_gem_context_unreference(from);
 	}
 
-	uninitialized = !to->legacy_hw_ctx.initialized && from == NULL;
+	uninitialized = !to->legacy_hw_ctx.initialized;
 	to->legacy_hw_ctx.initialized = true;
 
 done:
-- 
2.1.1


* Re: [PATCH 5/5] drm/i915: Initialize all contexts
  2015-03-16 16:00 ` [PATCH 5/5] drm/i915: Initialize all contexts Michel Thierry
@ 2015-03-16 21:39   ` shuang.he
  0 siblings, 0 replies; 15+ messages in thread
From: shuang.he @ 2015-03-16 21:39 UTC (permalink / raw)
  To: shuang.he, ethan.gao, intel-gfx, michel.thierry

Tested-By: PRC QA PRTS (Patch Regression Test System Contact: shuang.he@intel.com)
Task id: 5961
-------------------------------------Summary-------------------------------------
Platform          Delta          drm-intel-nightly          Series Applied
PNV                 -1              275/275              274/275
ILK                                  303/303              303/303
SNB                                  279/279              279/279
IVB                                  343/343              343/343
BYT                                  287/287              287/287
HSW                                  361/361              361/361
BDW                                  308/308              308/308
-------------------------------------Detailed-------------------------------------
Platform  Test                                drm-intel-nightly          Series Applied
*PNV  igt_gem_userptr_blits_minor-unsync-interruptible      PASS(2)      DMESG_WARN(1)PASS(1)
Note: You need to pay more attention to lines starting with '*'
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] drm/i915: Track page table reload need
  2015-03-16 16:00 ` [PATCH 4/5] drm/i915: Track page table reload need Michel Thierry
@ 2015-03-19  9:01   ` Mika Kuoppala
  2015-03-19 11:26     ` Michel Thierry
  2015-03-19 12:53   ` [PATCH] " Michel Thierry
  1 sibling, 1 reply; 15+ messages in thread
From: Mika Kuoppala @ 2015-03-19  9:01 UTC (permalink / raw)
  To: Michel Thierry, intel-gfx

Michel Thierry <michel.thierry@intel.com> writes:

> From: Ben Widawsky <benjamin.widawsky@intel.com>
>
> This patch was formerly known as "Force pd restore when PDEs change,
> gen6-7." I had to change the name because it is needed for GEN8 too.
>
> The real issue this is trying to solve is when a new object is mapped
> into the current address space. The GPU does not snoop the new mapping
> so we must do the gen specific action to reload the page tables.
>
> GEN8 and GEN7 do differ in the way they load page tables for the RCS.
> GEN8 does so with the context restore, while GEN7 requires the proper
> load commands in the command streamer. Non-render is similar for both.
>
> Caveat for GEN7
> The docs say you cannot change the PDEs of a currently running context.
> We never map new PDEs of a running context, and expect them to be
> present - so I think this is okay. (We can unmap, but this should also
> be okay since we only unmap unreferenced objects that the GPU shouldn't
> be trying to va->pa xlate.) The MI_SET_CONTEXT command does have a flag
> to signal that even if the context is the same, force a reload. It's
> unclear exactly what this does, but I have a hunch it's the right thing
> to do.
>
> The logic assumes that we always emit a context switch after mapping new
> PDEs, and before we submit a batch. This is the case today, and has been
> the case since the inception of hardware contexts. A note in the comment
> lets the user know.
>
> It's not just for gen8. If the current context's mappings change, we
> need a context reload to switch.
>
> v2: Rebased after ppgtt clean up patches. Split the warning for aliasing
> and true ppgtt options. And do not break aliasing ppgtt, where to->ppgtt
> is always null.
>
> v3: Invalidate PPGTT TLBs inside alloc_va_range.
>
> v4: Rename ppgtt_invalidate_tlbs to mark_tlbs_dirty and move
> pd_dirty_rings from i915_address_space to i915_hw_ppgtt. Fixes the case
> when neither ctx->ppgtt nor aliasing_ppgtt exists.
>
> v5: Removed references to teardown_va_range.
>
> v6: Updated needs_pd_load_pre/post.
>
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com> (v2+)
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c    | 26 ++++++++++++++++++++------
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 +++++++++++
>  drivers/gpu/drm/i915/i915_gem_gtt.c        | 11 +++++++++++
>  drivers/gpu/drm/i915/i915_gem_gtt.h        |  1 +
>  4 files changed, 43 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index b6ea85d..9197ff4 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -573,8 +573,20 @@ static inline bool should_skip_switch(struct intel_engine_cs *ring,
>  				      struct intel_context *from,
>  				      struct intel_context *to)
>  {
> -	if (from == to && !to->remap_slice)
> -		return true;
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +
> +	if (to->remap_slice)
> +		return false;
> +
> +	if (to->ppgtt) {
> +		if (from == to && !test_bit(ring->id,
> +				&to->ppgtt->pd_dirty_rings))
> +			return true;
> +	} else if (dev_priv->mm.aliasing_ppgtt) {
> +		if (from == to && !test_bit(ring->id,
> +				&dev_priv->mm.aliasing_ppgtt->pd_dirty_rings))
> +			return true;
> +	}
>  
>  	return false;
>  }
> @@ -610,10 +622,7 @@ needs_pd_load_post(struct intel_engine_cs *ring, struct intel_context *to)
>  	if (ring != &dev_priv->ring[RCS])
>  		return false;
>  
> -	if (!to->legacy_hw_ctx.initialized)
> -		return true;
> -
> -	if (i915_gem_context_is_default(to))
> +	if (&to->ppgtt->pd_dirty_rings)

if (to->ppgtt->pd_dirty_rings) ?

>  		return true;
>  
>  	return false;
> @@ -664,6 +673,9 @@ static int do_switch(struct intel_engine_cs *ring,
>  		ret = to->ppgtt->switch_mm(to->ppgtt, ring);
>  		if (ret)
>  			goto unpin_out;
> +
> +		/* Doing a PD load always reloads the page dirs */
> +		clear_bit(ring->id, &to->ppgtt->pd_dirty_rings);
>  	}
>  
>  	if (ring != &dev_priv->ring[RCS]) {
> @@ -696,6 +708,8 @@ static int do_switch(struct intel_engine_cs *ring,
>  
>  	if (!to->legacy_hw_ctx.initialized || i915_gem_context_is_default(to))
>  		hw_flags |= MI_RESTORE_INHIBIT;
> +	else if (to->ppgtt && test_and_clear_bit(ring->id, &to->ppgtt->pd_dirty_rings))
> +		hw_flags |= MI_FORCE_RESTORE;
>  
>  	ret = mi_set_context(ring, to, hw_flags);
>  	if (ret)
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index dc10bc4..fd0241a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1187,6 +1187,13 @@ i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
>  	if (ret)
>  		goto error;
>  
> +	if (ctx->ppgtt)
> +		WARN(ctx->ppgtt->pd_dirty_rings & (1<<ring->id),
> +			"%s didn't clear reload\n", ring->name);
> +	else if (dev_priv->mm.aliasing_ppgtt)
> +		WARN(dev_priv->mm.aliasing_ppgtt->pd_dirty_rings &
> +			(1<<ring->id), "%s didn't clear reload\n", ring->name);
> +
>  	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
>  	instp_mask = I915_EXEC_CONSTANTS_MASK;
>  	switch (instp_mode) {
> @@ -1456,6 +1463,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	if (ret)
>  		goto err;
>  
> +	/* XXX: Reserve has possibly changed PDEs which means we must do a
> +	 * context switch before we can coherently read some of the reserved
> +	 * VMAs. */
> +

> With ggtt we should always be safe as the ptes are snooped. And we
> don't do dynamic allocation on the ggtt anyway. So why do we need the
> ctx switch?

> As I fail to connect the dots, could you please elaborate on the comment
section.

--Mika

>  	/* The objects are in their final locations, apply the relocations. */
>  	if (need_relocs)
>  		ret = i915_gem_execbuffer_relocate(eb);
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index c50b4ab..1758fd9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1161,6 +1161,16 @@ static void gen6_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
>  			       4096, PCI_DMA_BIDIRECTIONAL);
>  }
>  
> +/* PDE TLBs are a pain to invalidate pre GEN8. It requires a context reload. If we
> + * are switching between contexts with the same LRCA, we also must do a force
> + * restore.
> + */
> +static inline void mark_tlbs_dirty(struct i915_hw_ppgtt *ppgtt)
> +{
> +	/* If current vm != vm, */
> +	ppgtt->pd_dirty_rings = INTEL_INFO(ppgtt->base.dev)->ring_mask;
> +}
> +
>  static int gen6_alloc_va_range(struct i915_address_space *vm,
>  			       uint64_t start, uint64_t length)
>  {
> @@ -1180,6 +1190,7 @@ static int gen6_alloc_va_range(struct i915_address_space *vm,
>  				GEN6_PTES);
>  	}
>  
> +	mark_tlbs_dirty(ppgtt);
>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index c30af62..074b1fc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -300,6 +300,7 @@ struct i915_hw_ppgtt {
>  	struct i915_address_space base;
>  	struct kref ref;
>  	struct drm_mm_node node;
> +	unsigned long pd_dirty_rings;
>  	unsigned num_pd_entries;
>  	unsigned num_pd_pages; /* gen8+ */
>  	union {
> -- 
> 2.1.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread
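
The one-liner Mika flags above is a classic address-of slip: the
address of a struct member is never NULL, so
"if (&to->ppgtt->pd_dirty_rings)" is unconditionally true (gcc's
-Waddress diagnoses exactly this). A standalone illustration with a
hypothetical struct:

#include <stdio.h>

struct ppgtt_sketch {
	unsigned long pd_dirty_rings;
};

int main(void)
{
	struct ppgtt_sketch p = { .pd_dirty_rings = 0 };

	if (&p.pd_dirty_rings)	/* address-of: always true */
		printf("branch taken even though the bitmask is 0\n");
	if (p.pd_dirty_rings)	/* value test: the intended check */
		printf("never printed while the bitmask is clean\n");
	return 0;
}

The v7 repost later in the thread drops the stray '&'.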

* Re: [PATCH 4/5] drm/i915: Track page table reload need
  2015-03-19  9:01   ` Mika Kuoppala
@ 2015-03-19 11:26     ` Michel Thierry
  0 siblings, 0 replies; 15+ messages in thread
From: Michel Thierry @ 2015-03-19 11:26 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

On 3/19/2015 9:01 AM, Mika Kuoppala wrote:
> Michel Thierry <michel.thierry@intel.com> writes:
>
>> From: Ben Widawsky <benjamin.widawsky@intel.com>
>>
>> This patch was formerly known as "Force pd restore when PDEs change,
>> gen6-7." I had to change the name because it is needed for GEN8 too.
>>
>> The real issue this is trying to solve is when a new object is mapped
>> into the current address space. The GPU does not snoop the new mapping
>> so we must do the gen specific action to reload the page tables.
>>
>> GEN8 and GEN7 do differ in the way they load page tables for the RCS.
>> GEN8 does so with the context restore, while GEN7 requires the proper
>> load commands in the command streamer. Non-render is similar for both.
>>
>> Caveat for GEN7
>> The docs say you cannot change the PDEs of a currently running context.
>> We never map new PDEs of a running context, and expect them to be
>> present - so I think this is okay. (We can unmap, but this should also
>> be okay since we only unmap unreferenced objects that the GPU shouldn't
>> be trying to va->pa xlate.) The MI_SET_CONTEXT command does have a flag
>> to signal that even if the context is the same, force a reload. It's
>> unclear exactly what this does, but I have a hunch it's the right thing
>> to do.
>>
>> The logic assumes that we always emit a context switch after mapping new
>> PDEs, and before we submit a batch. This is the case today, and has been
>> the case since the inception of hardware contexts. A note in the comment
>> lets the user know.
>>
>> It's not just for gen8. If the current context's mappings change, we
>> need a context reload to switch.
>>
>> v2: Rebased after ppgtt clean up patches. Split the warning for aliasing
>> and true ppgtt options. And do not break aliasing ppgtt, where to->ppgtt
>> is always null.
>>
>> v3: Invalidate PPGTT TLBs inside alloc_va_range.
>>
>> v4: Rename ppgtt_invalidate_tlbs to mark_tlbs_dirty and move
>> pd_dirty_rings from i915_address_space to i915_hw_ppgtt. Fixes the case
>> when neither ctx->ppgtt nor aliasing_ppgtt exists.
>>
>> v5: Removed references to teardown_va_range.
>>
>> v6: Updated needs_pd_load_pre/post.
>>
>> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com> (v2+)
>> ---
>>   drivers/gpu/drm/i915/i915_gem_context.c    | 26 ++++++++++++++++++++------
>>   drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 +++++++++++
>>   drivers/gpu/drm/i915/i915_gem_gtt.c        | 11 +++++++++++
>>   drivers/gpu/drm/i915/i915_gem_gtt.h        |  1 +
>>   4 files changed, 43 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index b6ea85d..9197ff4 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -573,8 +573,20 @@ static inline bool should_skip_switch(struct intel_engine_cs *ring,
>>   				      struct intel_context *from,
>>   				      struct intel_context *to)
>>   {
>> -	if (from == to && !to->remap_slice)
>> -		return true;
>> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>> +
>> +	if (to->remap_slice)
>> +		return false;
>> +
>> +	if (to->ppgtt) {
>> +		if (from == to && !test_bit(ring->id,
>> +				&to->ppgtt->pd_dirty_rings))
>> +			return true;
>> +	} else if (dev_priv->mm.aliasing_ppgtt) {
>> +		if (from == to && !test_bit(ring->id,
>> +				&dev_priv->mm.aliasing_ppgtt->pd_dirty_rings))
>> +			return true;
>> +	}
>>   
>>   	return false;
>>   }
>> @@ -610,10 +622,7 @@ needs_pd_load_post(struct intel_engine_cs *ring, struct intel_context *to)
>>   	if (ring != &dev_priv->ring[RCS])
>>   		return false;
>>   
>> -	if (!to->legacy_hw_ctx.initialized)
>> -		return true;
>> -
>> -	if (i915_gem_context_is_default(to))
>> +	if (&to->ppgtt->pd_dirty_rings)
> if (to->ppgtt->pd_dirty_rings) ?
Sorry, wrong copy/paste from test/clear_bit functions.

>>   		return true;
>>   
>>   	return false;
>> @@ -664,6 +673,9 @@ static int do_switch(struct intel_engine_cs *ring,
>>   		ret = to->ppgtt->switch_mm(to->ppgtt, ring);
>>   		if (ret)
>>   			goto unpin_out;
>> +
>> +		/* Doing a PD load always reloads the page dirs */
>> +		clear_bit(ring->id, &to->ppgtt->pd_dirty_rings);
>>   	}
>>   
>>   	if (ring != &dev_priv->ring[RCS]) {
>> @@ -696,6 +708,8 @@ static int do_switch(struct intel_engine_cs *ring,
>>   
>>   	if (!to->legacy_hw_ctx.initialized || i915_gem_context_is_default(to))
>>   		hw_flags |= MI_RESTORE_INHIBIT;
>> +	else if (to->ppgtt && test_and_clear_bit(ring->id, &to->ppgtt->pd_dirty_rings))
>> +		hw_flags |= MI_FORCE_RESTORE;
>>   
>>   	ret = mi_set_context(ring, to, hw_flags);
>>   	if (ret)
>> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> index dc10bc4..fd0241a 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> @@ -1187,6 +1187,13 @@ i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
>>   	if (ret)
>>   		goto error;
>>   
>> +	if (ctx->ppgtt)
>> +		WARN(ctx->ppgtt->pd_dirty_rings & (1<<ring->id),
>> +			"%s didn't clear reload\n", ring->name);
>> +	else if (dev_priv->mm.aliasing_ppgtt)
>> +		WARN(dev_priv->mm.aliasing_ppgtt->pd_dirty_rings &
>> +			(1<<ring->id), "%s didn't clear reload\n", ring->name);
>> +
>>   	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
>>   	instp_mask = I915_EXEC_CONSTANTS_MASK;
>>   	switch (instp_mode) {
>> @@ -1456,6 +1463,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>>   	if (ret)
>>   		goto err;
>>   
>> +	/* XXX: Reserve has possibly changed PDEs which means we must do a
>> +	 * context switch before we can coherently read some of the reserved
>> +	 * VMAs. */
>> +
> With ggtt we should always be safe as the ptes are snooped. And we
> don't do dynamic allocation on the ggtt anyway. So why do we need the
> ctx switch?
>
> As I fail to connect the dots, could you please elaborate on the comment
> section.
Right, this comment is only applicable to full PPGTT, which won't happen 
here:
i915_gem_execbuffer_reserve()
->i915_gem_execbuffer_reserve_vma()
-->i915_gem_object_pin() (vma == NULL || !drm_mm_node_allocated(&vma->node))
--->i915_gem_object_bind_to_vm (allocate before insert / bind)
---->gen6_alloc_va_range

>
> --Mika
>
>>   	/* The objects are in their final locations, apply the relocations. */
>>   	if (need_relocs)
>>   		ret = i915_gem_execbuffer_relocate(eb);
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> index c50b4ab..1758fd9 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>> @@ -1161,6 +1161,16 @@ static void gen6_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
>>   			       4096, PCI_DMA_BIDIRECTIONAL);
>>   }
>>   
>> +/* PDE TLBs are a pain to invalidate pre GEN8. It requires a context reload. If we
>> + * are switching between contexts with the same LRCA, we also must do a force
>> + * restore.
>> + */
>> +static inline void mark_tlbs_dirty(struct i915_hw_ppgtt *ppgtt)
>> +{
>> +	/* If current vm != vm, */
>> +	ppgtt->pd_dirty_rings = INTEL_INFO(ppgtt->base.dev)->ring_mask;
>> +}
>> +
>>   static int gen6_alloc_va_range(struct i915_address_space *vm,
>>   			       uint64_t start, uint64_t length)
>>   {
>> @@ -1180,6 +1190,7 @@ static int gen6_alloc_va_range(struct i915_address_space *vm,
>>   				GEN6_PTES);
>>   	}
>>   
>> +	mark_tlbs_dirty(ppgtt);
>>   	return 0;
>>   }
>>   
>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> index c30af62..074b1fc 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>> @@ -300,6 +300,7 @@ struct i915_hw_ppgtt {
>>   	struct i915_address_space base;
>>   	struct kref ref;
>>   	struct drm_mm_node node;
>> +	unsigned long pd_dirty_rings;
>>   	unsigned num_pd_entries;
>>   	unsigned num_pd_pages; /* gen8+ */
>>   	union {
>> -- 
>> 2.1.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread
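
The exchange above settles the pd_dirty_rings contract: every ring's
bit is set whenever gen6_alloc_va_range() may have written new PDEs,
and a ring's bit is cleared by whatever reloads its page directory
(switch_mm on the pre-load path, MI_FORCE_RESTORE otherwise), so the
execbuffer WARN can assert the bit is gone by submission time. A
compact userspace sketch of that contract, with plain bit operations
standing in for the kernel's atomic test_and_clear_bit():

#include <assert.h>
#include <stdbool.h>

#define SKETCH_RING_MASK 0x7UL	/* pretend three rings: RCS, VCS, BCS */

static unsigned long pd_dirty_rings;

/* alloc_va_range() side: new PDEs may now be visible to every ring. */
static void mark_tlbs_dirty(void)
{
	pd_dirty_rings = SKETCH_RING_MASK;
}

/* do_switch() side: a PD load (or forced restore) cleans one ring. */
static bool test_and_clear_dirty(unsigned int ring_id)
{
	bool was_dirty = pd_dirty_rings & (1UL << ring_id);

	pd_dirty_rings &= ~(1UL << ring_id);
	return was_dirty;
}

int main(void)
{
	mark_tlbs_dirty();
	assert(test_and_clear_dirty(0));	/* switch reloads ring 0 */
	assert(!(pd_dirty_rings & (1UL << 0)));	/* submission WARN holds */
	assert(pd_dirty_rings & (1UL << 1));	/* other rings stay dirty */
	return 0;
}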

* [PATCH] drm/i915: Track page table reload need
  2015-03-16 16:00 ` [PATCH 4/5] drm/i915: Track page table reload need Michel Thierry
  2015-03-19  9:01   ` Mika Kuoppala
@ 2015-03-19 12:53   ` Michel Thierry
  1 sibling, 0 replies; 15+ messages in thread
From: Michel Thierry @ 2015-03-19 12:53 UTC (permalink / raw)
  To: intel-gfx

From: Ben Widawsky <benjamin.widawsky@intel.com>

This patch was formerly known as "Force pd restore when PDEs change,
gen6-7." I had to change the name because it is needed for GEN8 too.

The real issue this is trying to solve is when a new object is mapped
into the current address space. The GPU does not snoop the new mapping
so we must do the gen specific action to reload the page tables.

GEN8 and GEN7 do differ in the way they load page tables for the RCS.
GEN8 does so with the context restore, while GEN7 requires the proper
load commands in the command streamer. Non-render is similar for both.

Caveat for GEN7
The docs say you cannot change the PDEs of a currently running context.
We never map new PDEs of a running context, and expect them to be
present - so I think this is okay. (We can unmap, but this should also
be okay since we only unmap unreferenced objects that the GPU shouldn't
be trying to va->pa xlate.) The MI_SET_CONTEXT command does have a flag
to signal that even if the context is the same, force a reload. It's
unclear exactly what this does, but I have a hunch it's the right thing
to do.

The logic assumes that we always emit a context switch after mapping new
PDEs, and before we submit a batch. This is the case today, and has been
the case since the inception of hardware contexts. A note in the comment
lets the user know.

It's not just for gen8. If the current context's mappings change, we
need a context reload to switch.

v2: Rebased after ppgtt clean up patches. Split the warning for aliasing
and true ppgtt options. And do not break aliasing ppgtt, where to->ppgtt
is always null.

v3: Invalidate PPGTT TLBs inside alloc_va_range.

v4: Rename ppgtt_invalidate_tlbs to mark_tlbs_dirty and move
pd_dirty_rings from i915_address_space to i915_hw_ppgtt. Fixes the case
when neither ctx->ppgtt nor aliasing_ppgtt exists.

v5: Removed references to teardown_va_range.

v6: Updated needs_pd_load_pre/post.

v7: Fix pd_dirty_rings check in needs_pd_load_post, and update/move
comment about updated PDEs to object_pin/bind (Mika).

Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Michel Thierry <michel.thierry@intel.com> (v2+)
---
 drivers/gpu/drm/i915/i915_gem.c            |  4 ++++
 drivers/gpu/drm/i915/i915_gem_context.c    | 26 ++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  7 +++++++
 drivers/gpu/drm/i915/i915_gem_gtt.c        | 11 +++++++++++
 drivers/gpu/drm/i915/i915_gem_gtt.h        |  1 +
 5 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 979dcb6..eefba9d 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4247,6 +4247,10 @@ i915_gem_object_pin_view(struct drm_i915_gem_object *obj,
 
 	bound = vma ? vma->bound : 0;
 	if (vma == NULL || !drm_mm_node_allocated(&vma->node)) {
+		/* In true PPGTT, bind has possibly changed PDEs, which
+		 * means we must do a context switch before the GPU can
+		 * accurately read some of the VMAs.
+		 */
 		vma = i915_gem_object_bind_to_vm(obj, vm, alignment,
 						 flags, view);
 		if (IS_ERR(vma))
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index b6ea85d..dd9ab36 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -573,8 +573,20 @@ static inline bool should_skip_switch(struct intel_engine_cs *ring,
 				      struct intel_context *from,
 				      struct intel_context *to)
 {
-	if (from == to && !to->remap_slice)
-		return true;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
+	if (to->remap_slice)
+		return false;
+
+	if (to->ppgtt) {
+		if (from == to && !test_bit(ring->id,
+				&to->ppgtt->pd_dirty_rings))
+			return true;
+	} else if (dev_priv->mm.aliasing_ppgtt) {
+		if (from == to && !test_bit(ring->id,
+				&dev_priv->mm.aliasing_ppgtt->pd_dirty_rings))
+			return true;
+	}
 
 	return false;
 }
@@ -610,10 +622,7 @@ needs_pd_load_post(struct intel_engine_cs *ring, struct intel_context *to)
 	if (ring != &dev_priv->ring[RCS])
 		return false;
 
-	if (!to->legacy_hw_ctx.initialized)
-		return true;
-
-	if (i915_gem_context_is_default(to))
+	if (to->ppgtt->pd_dirty_rings)
 		return true;
 
 	return false;
@@ -664,6 +673,9 @@ static int do_switch(struct intel_engine_cs *ring,
 		ret = to->ppgtt->switch_mm(to->ppgtt, ring);
 		if (ret)
 			goto unpin_out;
+
+		/* Doing a PD load always reloads the page dirs */
+		clear_bit(ring->id, &to->ppgtt->pd_dirty_rings);
 	}
 
 	if (ring != &dev_priv->ring[RCS]) {
@@ -696,6 +708,8 @@ static int do_switch(struct intel_engine_cs *ring,
 
 	if (!to->legacy_hw_ctx.initialized || i915_gem_context_is_default(to))
 		hw_flags |= MI_RESTORE_INHIBIT;
+	else if (to->ppgtt && test_and_clear_bit(ring->id, &to->ppgtt->pd_dirty_rings))
+		hw_flags |= MI_FORCE_RESTORE;
 
 	ret = mi_set_context(ring, to, hw_flags);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index dc10bc4..3513fb9 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1187,6 +1187,13 @@ i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
 	if (ret)
 		goto error;
 
+	if (ctx->ppgtt)
+		WARN(ctx->ppgtt->pd_dirty_rings & (1<<ring->id),
+			"%s didn't clear reload\n", ring->name);
+	else if (dev_priv->mm.aliasing_ppgtt)
+		WARN(dev_priv->mm.aliasing_ppgtt->pd_dirty_rings &
+			(1<<ring->id), "%s didn't clear reload\n", ring->name);
+
 	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
 	instp_mask = I915_EXEC_CONSTANTS_MASK;
 	switch (instp_mode) {
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index c50b4ab..1758fd9 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1161,6 +1161,16 @@ static void gen6_ppgtt_unmap_pages(struct i915_hw_ppgtt *ppgtt)
 			       4096, PCI_DMA_BIDIRECTIONAL);
 }
 
+/* PDE TLBs are a pain to invalidate pre GEN8. It requires a context reload. If we
+ * are switching between contexts with the same LRCA, we also must do a force
+ * restore.
+ */
+static inline void mark_tlbs_dirty(struct i915_hw_ppgtt *ppgtt)
+{
+	/* If current vm != vm, */
+	ppgtt->pd_dirty_rings = INTEL_INFO(ppgtt->base.dev)->ring_mask;
+}
+
 static int gen6_alloc_va_range(struct i915_address_space *vm,
 			       uint64_t start, uint64_t length)
 {
@@ -1180,6 +1190,7 @@ static int gen6_alloc_va_range(struct i915_address_space *vm,
 				GEN6_PTES);
 	}
 
+	mark_tlbs_dirty(ppgtt);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index c30af62..074b1fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -300,6 +300,7 @@ struct i915_hw_ppgtt {
 	struct i915_address_space base;
 	struct kref ref;
 	struct drm_mm_node node;
+	unsigned long pd_dirty_rings;
 	unsigned num_pd_entries;
 	unsigned num_pd_pages; /* gen8+ */
 	union {
-- 
2.1.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 15+ messages in thread
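
The pivotal piece of this v7 is the flag selection at switch time: an
uninitialized or default context inhibits the restore, while an
initialized context whose bit is dirty for this ring forces one. A
condensed sketch of just that decision, with booleans standing in for
the driver's context state (the real code also clears the dirty bit as
a side effect, via test_and_clear_bit()):

#include <stdbool.h>
#include <stdint.h>

/* Placeholder bit values for this sketch only. */
#define MI_RESTORE_INHIBIT	(1u << 0)
#define MI_FORCE_RESTORE	(1u << 1)

static uint32_t choose_hw_flags(bool initialized, bool is_default,
				bool has_ppgtt, bool pd_dirty_on_ring)
{
	uint32_t hw_flags = 0;

	if (!initialized || is_default)
		hw_flags |= MI_RESTORE_INHIBIT;	/* fresh state: no restore */
	else if (has_ppgtt && pd_dirty_on_ring)
		hw_flags |= MI_FORCE_RESTORE;	/* PDEs changed: reload */

	return hw_flags;
}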

* Re: [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work
  2015-03-16 16:00 [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Michel Thierry
                   ` (4 preceding siblings ...)
  2015-03-16 16:00 ` [PATCH 5/5] drm/i915: Initialize all contexts Michel Thierry
@ 2015-03-19 13:32 ` Mika Kuoppala
  2015-03-19 16:21   ` Daniel Vetter
  5 siblings, 1 reply; 15+ messages in thread
From: Mika Kuoppala @ 2015-03-19 13:32 UTC (permalink / raw)
  To: Michel Thierry, intel-gfx

Michel Thierry <michel.thierry@intel.com> writes:

> Splitting this prep work should ease the code review and help identify
> problems (and also shrink the Gen8 patch series, which is of more interest).
>
> 4 patches have already been sent as part of the main patchset, only "page
> table generalizations" is brand new (suggested by Mika) and should help to
> clean some of the code.
>
> Ben Widawsky (4):
>   drm/i915: Extract context switch skip and add pd load logic
>   drm/i915: Track GEN6 page table usage
>   drm/i915: Track page table reload need
>   drm/i915: Initialize all contexts
>
> Michel Thierry (1):
>   drm/i915: page table generalizations
>

Patches:
1,2,3,4v7,5

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>


>  drivers/gpu/drm/i915/i915_gem.c            |   9 +
>  drivers/gpu/drm/i915/i915_gem_context.c    | 103 +++++++-
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  11 +
>  drivers/gpu/drm/i915/i915_gem_gtt.c        | 364 ++++++++++++++++++-----------
>  drivers/gpu/drm/i915/i915_gem_gtt.h        | 102 +++++++-
>  5 files changed, 434 insertions(+), 155 deletions(-)
>
> -- 
> 2.1.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/5] drm/i915: Track GEN6 page table usage
  2015-03-16 16:00 ` [PATCH 3/5] drm/i915: Track GEN6 page table usage Michel Thierry
@ 2015-03-19 16:08   ` Daniel Vetter
  2015-03-19 16:13   ` Daniel Vetter
  1 sibling, 0 replies; 15+ messages in thread
From: Daniel Vetter @ 2015-03-19 16:08 UTC (permalink / raw)
  To: Michel Thierry; +Cc: intel-gfx

On Mon, Mar 16, 2015 at 04:00:56PM +0000, Michel Thierry wrote:
> +	BUG_ON(length == 0);
> +	BUG_ON(offset_in_page(addr|length));

Broken record: I'm not a fan of BUG_ON at all, it's a massive pain if you
hit that in driver load in the initial modeset. Since these are just
consistency checks that won't result in immediate fireworks (afaics,
please correct) I've switched them to a WARN_ON.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/5] drm/i915: Track GEN6 page table usage
  2015-03-16 16:00 ` [PATCH 3/5] drm/i915: Track GEN6 page table usage Michel Thierry
  2015-03-19 16:08   ` Daniel Vetter
@ 2015-03-19 16:13   ` Daniel Vetter
  2015-03-19 17:16     ` Michel Thierry
  1 sibling, 1 reply; 15+ messages in thread
From: Daniel Vetter @ 2015-03-19 16:13 UTC (permalink / raw)
  To: Michel Thierry; +Cc: intel-gfx

On Mon, Mar 16, 2015 at 04:00:56PM +0000, Michel Thierry wrote:
> +static inline uint32_t i915_pte_count(uint64_t addr, size_t length,
> +					uint32_t pde_shift)
> +{
> +	const uint64_t mask = ~((1 << pde_shift) - 1);
> +	uint64_t end;
> +
> +	BUG_ON(length == 0);
> +	BUG_ON(offset_in_page(addr|length));
> +
> +	end = addr + length;
> +
> +	if ((addr & mask) != (end & mask))
> +		return NUM_PTE(pde_shift) - i915_pte_index(addr, pde_shift);
> +
> +	return i915_pte_index(end, pde_shift) - i915_pte_index(addr, pde_shift);
> +}

Also this is an impressively big function, I'm pretty sure too big to get
inlined reasonably. Can you please follow up with a patch to move this
into i915_gem_gtt.c?

General rule for static inline is that either
a) function body is obviously smaller when inlined in code-size
b) patch comes with solid justification in the form of hard performance
data that the inline is better

If neither applies, drop the static inline. Generally these
microoptimizations are premature and only result in binary code size
increase. Which tends to thrash instruction caches and actually reduces
performance.

Iirc there's more of that going on in this series, so perhaps do a
follow-up for all of these.
Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread
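
A sketch of the follow-up Daniel asks for here: the body moves verbatim
into i915_gem_gtt.c (with the BUG_ONs already downgraded to WARN_ON per
his previous mail) and only a declaration stays in the header. This is
a reconstruction from the code quoted above, not the actual follow-up
patch:

/* i915_gem_gtt.h: a declaration replaces the static inline */
uint32_t i915_pte_count(uint64_t addr, size_t length, uint32_t pde_shift);

/* i915_gem_gtt.c */
uint32_t i915_pte_count(uint64_t addr, size_t length, uint32_t pde_shift)
{
	const uint64_t mask = ~((1 << pde_shift) - 1);
	uint64_t end;

	WARN_ON(length == 0);
	WARN_ON(offset_in_page(addr | length));

	end = addr + length;

	if ((addr & mask) != (end & mask))
		return NUM_PTE(pde_shift) - i915_pte_index(addr, pde_shift);

	return i915_pte_index(end, pde_shift) - i915_pte_index(addr, pde_shift);
}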

* Re: [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work
  2015-03-19 13:32 ` [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Mika Kuoppala
@ 2015-03-19 16:21   ` Daniel Vetter
  0 siblings, 0 replies; 15+ messages in thread
From: Daniel Vetter @ 2015-03-19 16:21 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Thu, Mar 19, 2015 at 03:32:06PM +0200, Mika Kuoppala wrote:
> Michel Thierry <michel.thierry@intel.com> writes:
> 
> > Splitting this prep work should ease the code review and help identify
> > problems (and also shrink the Gen8 patch series, which is of more interest).
> >
> > 4 patches have already been sent as part of the main patchset, only "page
> > table generalizations" is brand new (suggested by Mika) and should help to
> > clean some of the code.
> >
> > Ben Widawsky (4):
> >   drm/i915: Extract context switch skip and add pd load logic
> >   drm/i915: Track GEN6 page table usage
> >   drm/i915: Track page table reload need
> >   drm/i915: Initialize all contexts
> >
> > Michel Thierry (1):
> >   drm/i915: page table generalizations
> >
> 
> Patches:
> 1,2,3,4v7,5
> 
> Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

All merged, thanks for patches&review.

Another thing that Chris just mentioned again on irc is the use of size_t
in i915_gem_gtt.c. That doesn't really make sense in hw driver code. At
the end of this ggtt endeavor we should throw a patch on top which
mass-replaces size_t with uint64_t and also audits the code for any kinds
of evil overflows. We definitely have bugs since 48bit gtt doesn't work on
32bit kernels. Not that this is an interesting feature, but because in C
being careful with integer conversions is always good practice - it can
blow up in rather interesting ways.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread
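
The size_t worry is easy to demonstrate: on a 32-bit kernel size_t is
32 bits wide, so any GTT range of 4GiB or more silently truncates long
before the hardware sees it. A runnable userspace illustration, with
uint32_t playing the role of the 32-bit size_t:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t length = 1ULL << 40;		/* 1TiB range in a 48b GTT */
	uint32_t narrowed = (uint32_t)length;	/* the silent truncation */

	printf("64-bit length:         %" PRIu64 "\n", length);
	printf("through 32-bit size_t: %" PRIu32 "\n", narrowed);
	/* second line prints 0: the whole range vanishes, exactly the
	 * class of overflow the mass-replace to uint64_t would catch */
	return 0;
}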

* Re: [PATCH 3/5] drm/i915: Track GEN6 page table usage
  2015-03-19 16:13   ` Daniel Vetter
@ 2015-03-19 17:16     ` Michel Thierry
  0 siblings, 0 replies; 15+ messages in thread
From: Michel Thierry @ 2015-03-19 17:16 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On 3/19/2015 4:13 PM, Daniel Vetter wrote:
> On Mon, Mar 16, 2015 at 04:00:56PM +0000, Michel Thierry wrote:
>> +static inline uint32_t i915_pte_count(uint64_t addr, size_t length,
>> +					uint32_t pde_shift)
>> +{
>> +	const uint64_t mask = ~((1 << pde_shift) - 1);
>> +	uint64_t end;
>> +
>> +	BUG_ON(length == 0);
>> +	BUG_ON(offset_in_page(addr|length));
>> +
>> +	end = addr + length;
>> +
>> +	if ((addr & mask) != (end & mask))
>> +		return NUM_PTE(pde_shift) - i915_pte_index(addr, pde_shift);
>> +
>> +	return i915_pte_index(end, pde_shift) - i915_pte_index(addr, pde_shift);
>> +}
> Also this is an impressively big function, I'm pretty sure too big to get
> inlined reasonable. Can you please follow-up with a patch to move this
> into i915_gem_gtt.c?
>
> General rule for static inline is that either
> a) function body is obviously smaller when inlined in code-size
> b) patch comes with solid justification in the form of hard performance
> data that the inline is better
>
> If neither applies drop the static inline. Generally these
> microoptimizations are premature and only result in binary code size
> increase. Which tends to thrash instruction caches and actually reduces
> performance.
>
> Iirc there's more of that going on in this series, so perhaps do a
> follow-up for all of these.
> Thanks, Daniel
That's the biggest one; the rest will be 2-3 lines long. I'll send a
follow-up patch for this one.

Thanks
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads: [~2015-03-19 17:17 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-16 16:00 [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Michel Thierry
2015-03-16 16:00 ` [PATCH 1/5] drm/i915: page table generalizations Michel Thierry
2015-03-16 16:00 ` [PATCH 2/5] drm/i915: Extract context switch skip and add pd load logic Michel Thierry
2015-03-16 16:00 ` [PATCH 3/5] drm/i915: Track GEN6 page table usage Michel Thierry
2015-03-19 16:08   ` Daniel Vetter
2015-03-19 16:13   ` Daniel Vetter
2015-03-19 17:16     ` Michel Thierry
2015-03-16 16:00 ` [PATCH 4/5] drm/i915: Track page table reload need Michel Thierry
2015-03-19  9:01   ` Mika Kuoppala
2015-03-19 11:26     ` Michel Thierry
2015-03-19 12:53   ` [PATCH] " Michel Thierry
2015-03-16 16:00 ` [PATCH 5/5] drm/i915: Initialize all contexts Michel Thierry
2015-03-16 21:39   ` shuang.he
2015-03-19 13:32 ` [PATCH 0/5] Gen6/7 PPGTT dynamic page alloc prep work Mika Kuoppala
2015-03-19 16:21   ` Daniel Vetter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox