* [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
@ 2017-06-15 12:38 Chris Wilson
2017-06-15 12:38 ` [passive aggressive RESEND 2/2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty Chris Wilson
` (3 more replies)
0 siblings, 4 replies; 14+ messages in thread
From: Chris Wilson @ 2017-06-15 12:38 UTC (permalink / raw)
To: intel-gfx; +Cc: Dongwon Kim
Currently, we only mark the CPU cache as dirty if we skip a clflush.
This leads to some confusion where we have to ask if the object is in
the write domain or missed a clflush. If we always mark the cache as
dirty, this becomes a much simpler question to answer.
The goal remains to do as few clflushes as required and to do them as
late as possible, in the hope of deferring the work to a kthread and not
block the caller (e.g. execbuf, flips).
v2: Always call clflush before GPU execution when the cache_dirty flag
is set. This may cause some extra work on llc systems that migrate dirty
buffers back and forth - but we do try to limit that by only setting
cache_dirty at the end of the gpu sequence.
v3: Always mark the cache as dirty upon a level change, as we need to
invalidate any stale cachelines due to external writes.
Reported-by: Dongwon Kim <dongwon.kim@intel.com>
Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Tested-by: Dongwon Kim <dongwon.kim@intel.com>
---
drivers/gpu/drm/i915/i915_gem.c | 76 ++++++++++++++----------
drivers/gpu/drm/i915/i915_gem_clflush.c | 15 +++--
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 21 +++----
drivers/gpu/drm/i915/i915_gem_internal.c | 3 +-
drivers/gpu/drm/i915/i915_gem_userptr.c | 5 +-
drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 +-
6 files changed, 67 insertions(+), 56 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 31cbe78171a9..b1504a829c6a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
{
- if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
+ if (obj->cache_dirty)
return false;
if (!i915_gem_object_is_coherent(obj))
@@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
return st;
}
+static void __start_cpu_write(struct drm_i915_gem_object *obj)
+{
+ obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+ obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+ if (cpu_write_needs_clflush(obj))
+ obj->cache_dirty = true;
+}
+
static void
__i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
struct sg_table *pages,
@@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
!i915_gem_object_is_coherent(obj))
drm_clflush_sg(pages);
- obj->base.read_domains = I915_GEM_DOMAIN_CPU;
- obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+ __start_cpu_write(obj);
}
static void
@@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
args->size, &args->handle);
}
+static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
+{
+ return !(obj->cache_level == I915_CACHE_NONE ||
+ obj->cache_level == I915_CACHE_WT);
+}
+
/**
* Creates a new mm object and returns a handle to it.
* @dev: drm device pointer
@@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
case I915_GEM_DOMAIN_CPU:
i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
break;
+
+ case I915_GEM_DOMAIN_RENDER:
+ if (gpu_write_needs_clflush(obj))
+ obj->cache_dirty = true;
+ break;
}
obj->base.write_domain = 0;
@@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
* optimizes for the case when the gpu will dirty the data
* anyway again before the next pread happens.
*/
- if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+ if (!obj->cache_dirty &&
+ !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
*needs_clflush = CLFLUSH_BEFORE;
out:
@@ -906,14 +925,16 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
* This optimizes for the case when the gpu will use the data
* right away and we therefore have to clflush anyway.
*/
- if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
+ if (!obj->cache_dirty) {
*needs_clflush |= CLFLUSH_AFTER;
- /* Same trick applies to invalidate partially written cachelines read
- * before writing.
- */
- if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
- *needs_clflush |= CLFLUSH_BEFORE;
+ /*
+ * Same trick applies to invalidate partially written
+ * cachelines read before writing.
+ */
+ if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+ *needs_clflush |= CLFLUSH_BEFORE;
+ }
out:
intel_fb_obj_invalidate(obj, ORIGIN_CPU);
@@ -3395,10 +3416,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
{
- if (obj->base.write_domain != I915_GEM_DOMAIN_CPU && !obj->cache_dirty)
- return;
-
- i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
+ /*
+ * We manually flush the CPU domain so that we can override and
+ * force the flush for the display, and perform it asyncrhonously.
+ */
+ flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
+ if (obj->cache_dirty)
+ i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
obj->base.write_domain = 0;
}
@@ -3657,13 +3681,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
}
}
- if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
- i915_gem_object_is_coherent(obj))
- obj->cache_dirty = true;
-
list_for_each_entry(vma, &obj->vma_list, obj_link)
vma->node.color = cache_level;
obj->cache_level = cache_level;
+ obj->cache_dirty = true; /* Always invalidate stale cachelines */
return 0;
}
@@ -3885,9 +3906,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
if (ret)
return ret;
- if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
- return 0;
-
flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
/* Flush the CPU cache if it's still invalid. */
@@ -3899,15 +3917,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
/* It should now be out of any other write domains, and we can update
* the domain values for our changes.
*/
- GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
+ GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
/* If we're writing through the CPU, then the GPU read domains will
* need to be invalidated at next use.
*/
- if (write) {
- obj->base.read_domains = I915_GEM_DOMAIN_CPU;
- obj->base.write_domain = I915_GEM_DOMAIN_CPU;
- }
+ if (write)
+ __start_cpu_write(obj);
return 0;
}
@@ -4328,6 +4344,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
} else
obj->cache_level = I915_CACHE_NONE;
+ obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+
trace_i915_gem_object_create(obj);
return obj;
@@ -4994,10 +5012,8 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
mutex_lock(&dev_priv->drm.struct_mutex);
for (p = phases; *p; p++) {
- list_for_each_entry(obj, *p, global_link) {
- obj->base.read_domains = I915_GEM_DOMAIN_CPU;
- obj->base.write_domain = I915_GEM_DOMAIN_CPU;
- }
+ list_for_each_entry(obj, *p, global_link)
+ __start_cpu_write(obj);
}
mutex_unlock(&dev_priv->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index ffac7a1f0caf..17b207e963c2 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -71,8 +71,6 @@ static const struct dma_fence_ops i915_clflush_ops = {
static void __i915_do_clflush(struct drm_i915_gem_object *obj)
{
drm_clflush_sg(obj->mm.pages);
- obj->cache_dirty = false;
-
intel_fb_obj_flush(obj, ORIGIN_CPU);
}
@@ -81,9 +79,6 @@ static void i915_clflush_work(struct work_struct *work)
struct clflush *clflush = container_of(work, typeof(*clflush), work);
struct drm_i915_gem_object *obj = clflush->obj;
- if (!obj->cache_dirty)
- goto out;
-
if (i915_gem_object_pin_pages(obj)) {
DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
goto out;
@@ -131,10 +126,10 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
* anything not backed by physical memory we consider to be always
* coherent and not need clflushing.
*/
- if (!i915_gem_object_has_struct_page(obj))
+ if (!i915_gem_object_has_struct_page(obj)) {
+ obj->cache_dirty = false;
return;
-
- obj->cache_dirty = true;
+ }
/* If the GPU is snooping the contents of the CPU cache,
* we do not need to manually clear the CPU cache lines. However,
@@ -153,6 +148,8 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
if (!(flags & I915_CLFLUSH_SYNC))
clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
if (clflush) {
+ GEM_BUG_ON(!obj->cache_dirty);
+
dma_fence_init(&clflush->dma,
&i915_clflush_ops,
&clflush_lock,
@@ -180,4 +177,6 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
} else {
GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
}
+
+ obj->cache_dirty = false;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 96705171e397..2a9aed5640e2 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -309,7 +309,7 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
return DBG_USE_CPU_RELOC > 0;
return (HAS_LLC(to_i915(obj->base.dev)) ||
- obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+ obj->cache_dirty ||
obj->cache_level != I915_CACHE_NONE);
}
@@ -1110,10 +1110,8 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
continue;
- if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
+ if (obj->cache_dirty)
i915_gem_clflush_object(obj, 0);
- obj->base.write_domain = 0;
- }
ret = i915_gem_request_await_object
(eb->request, obj, obj->base.pending_write_domain);
@@ -1248,12 +1246,6 @@ static int eb_select_context(struct i915_execbuffer *eb)
return 0;
}
-static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
-{
- return !(obj->cache_level == I915_CACHE_NONE ||
- obj->cache_level == I915_CACHE_WT);
-}
-
void i915_vma_move_to_active(struct i915_vma *vma,
struct drm_i915_gem_request *req,
unsigned int flags)
@@ -1277,15 +1269,16 @@ void i915_vma_move_to_active(struct i915_vma *vma,
i915_gem_active_set(&vma->last_read[idx], req);
list_move_tail(&vma->vm_link, &vma->vm->active_list);
+ obj->base.write_domain = 0;
if (flags & EXEC_OBJECT_WRITE) {
+ obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
+
if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
i915_gem_active_set(&obj->frontbuffer_write, req);
- /* update for the implicit flush after a batch */
- obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
- if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
- obj->cache_dirty = true;
+ obj->base.read_domains = 0;
}
+ obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
if (flags & EXEC_OBJECT_NEEDS_FENCE)
i915_gem_active_set(&vma->last_fence, req);
diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
index fc950abbe400..58e93e87d573 100644
--- a/drivers/gpu/drm/i915/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/i915_gem_internal.c
@@ -188,9 +188,10 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
drm_gem_private_object_init(&i915->drm, &obj->base, size);
i915_gem_object_init(obj, &i915_gem_object_internal_ops);
- obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+ obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+ obj->cache_dirty = !i915_gem_object_is_coherent(obj);
return obj;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 1a0ce1dc68f5..34461e1928bc 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -802,9 +802,10 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
drm_gem_private_object_init(dev, &obj->base, args->user_size);
i915_gem_object_init(obj, &i915_gem_userptr_ops);
- obj->cache_level = I915_CACHE_LLC;
- obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+ obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+ obj->cache_level = I915_CACHE_LLC;
+ obj->cache_dirty = !i915_gem_object_is_coherent(obj);
obj->userptr.ptr = args->user_ptr;
obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
index 4e681fc13be4..0ca867a877b6 100644
--- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
+++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
@@ -126,9 +126,10 @@ huge_gem_object(struct drm_i915_private *i915,
drm_gem_private_object_init(&i915->drm, &obj->base, dma_size);
i915_gem_object_init(obj, &huge_ops);
- obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+ obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+ obj->cache_dirty = !i915_gem_object_is_coherent(obj);
obj->scratch = phys_size;
return obj;
--
2.11.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 14+ messages in thread* [passive aggressive RESEND 2/2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty
2017-06-15 12:38 [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Chris Wilson
@ 2017-06-15 12:38 ` Chris Wilson
2017-06-15 14:41 ` Tvrtko Ursulin
2017-06-16 10:54 ` [PATCH v2] " Chris Wilson
2017-06-15 12:59 ` ✓ Fi.CI.BAT: success for series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Patchwork
` (2 subsequent siblings)
3 siblings, 2 replies; 14+ messages in thread
From: Chris Wilson @ 2017-06-15 12:38 UTC (permalink / raw)
To: intel-gfx; +Cc: Dongwon Kim
For ease of use (i.e. avoiding a few checks and function calls), store
the object's cache coherency next to the cache-is-dirty bit.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Tested-by: Dongwon Kim <dongwon.kim@intel.com>
---
drivers/gpu/drm/i915/i915_gem.c | 14 +++++++-------
drivers/gpu/drm/i915/i915_gem_clflush.c | 2 +-
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +-
drivers/gpu/drm/i915/i915_gem_internal.c | 3 ++-
drivers/gpu/drm/i915/i915_gem_object.h | 1 +
drivers/gpu/drm/i915/i915_gem_stolen.c | 1 +
drivers/gpu/drm/i915/i915_gem_userptr.c | 3 ++-
drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 ++-
8 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b1504a829c6a..4ae30f74c475 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -52,7 +52,7 @@ static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
if (obj->cache_dirty)
return false;
- if (!i915_gem_object_is_coherent(obj))
+ if (!obj->cache_coherent)
return true;
return obj->pin_display;
@@ -253,7 +253,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
if (needs_clflush &&
(obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
- !i915_gem_object_is_coherent(obj))
+ !obj->cache_coherent)
drm_clflush_sg(pages);
__start_cpu_write(obj);
@@ -856,8 +856,7 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
if (ret)
return ret;
- if (i915_gem_object_is_coherent(obj) ||
- !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+ if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
ret = i915_gem_object_set_to_cpu_domain(obj, false);
if (ret)
goto err_unpin;
@@ -909,8 +908,7 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
if (ret)
return ret;
- if (i915_gem_object_is_coherent(obj) ||
- !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+ if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
ret = i915_gem_object_set_to_cpu_domain(obj, true);
if (ret)
goto err_unpin;
@@ -3684,6 +3682,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
list_for_each_entry(vma, &obj->vma_list, obj_link)
vma->node.color = cache_level;
obj->cache_level = cache_level;
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
obj->cache_dirty = true; /* Always invalidate stale cachelines */
return 0;
@@ -4344,7 +4343,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
} else
obj->cache_level = I915_CACHE_NONE;
- obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
+ obj->cache_dirty = !obj->cache_coherent;
trace_i915_gem_object_create(obj);
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index 17b207e963c2..152f16c11878 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -139,7 +139,7 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
* snooping behaviour occurs naturally as the result of our domain
* tracking.
*/
- if (!(flags & I915_CLFLUSH_FORCE) && i915_gem_object_is_coherent(obj))
+ if (!(flags & I915_CLFLUSH_FORCE) && obj->cache_coherent)
return;
trace_i915_gem_object_clflush(obj);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 2a9aed5640e2..20933a15be46 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1110,7 +1110,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
continue;
- if (obj->cache_dirty)
+ if (obj->cache_dirty & ~obj->cache_coherent)
i915_gem_clflush_object(obj, 0);
ret = i915_gem_request_await_object
diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
index 58e93e87d573..568bf83af1f5 100644
--- a/drivers/gpu/drm/i915/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/i915_gem_internal.c
@@ -191,7 +191,8 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
- obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
+ obj->cache_dirty = !obj->cache_coherent;
return obj;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index 915057824284..adb482b00271 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -121,6 +121,7 @@ struct drm_i915_gem_object {
unsigned long gt_ro:1;
unsigned int cache_level:3;
unsigned int cache_dirty:1;
+ unsigned int cache_coherent:1;
atomic_t frontbuffer_bits;
unsigned int frontbuffer_ggtt_origin; /* write once */
diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
index 681db6083f4d..a817b3e0b17e 100644
--- a/drivers/gpu/drm/i915/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
@@ -590,6 +590,7 @@ _i915_gem_object_create_stolen(struct drm_i915_private *dev_priv,
obj->stolen = stolen;
obj->base.read_domains = I915_GEM_DOMAIN_CPU | I915_GEM_DOMAIN_GTT;
obj->cache_level = HAS_LLC(dev_priv) ? I915_CACHE_LLC : I915_CACHE_NONE;
+ obj->cache_coherent = true; /* assumptions! more like cache_oblivious */
if (i915_gem_object_pin_pages(obj))
goto cleanup;
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 34461e1928bc..05c36f663550 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -805,7 +805,8 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->cache_level = I915_CACHE_LLC;
- obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
+ obj->cache_dirty = !obj->cache_coherent;
obj->userptr.ptr = args->user_ptr;
obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
index 0ca867a877b6..caf76af36aba 100644
--- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
+++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
@@ -129,7 +129,8 @@ huge_gem_object(struct drm_i915_private *i915,
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
- obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
+ obj->cache_dirty = !obj->cache_coherent;
obj->scratch = phys_size;
return obj;
--
2.11.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [passive aggressive RESEND 2/2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty
2017-06-15 12:38 ` [passive aggressive RESEND 2/2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty Chris Wilson
@ 2017-06-15 14:41 ` Tvrtko Ursulin
2017-06-16 10:04 ` Chris Wilson
2017-06-16 10:54 ` [PATCH v2] " Chris Wilson
1 sibling, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2017-06-15 14:41 UTC (permalink / raw)
To: Chris Wilson, intel-gfx; +Cc: Dongwon Kim
On 15/06/2017 13:38, Chris Wilson wrote:
> For ease of use (i.e. avoiding a few checks and function calls), store
> the object's cache coherency next to the cache is dirty bit.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> Tested-by: Dongwon Kim <dongwon.kim@intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 14 +++++++-------
> drivers/gpu/drm/i915/i915_gem_clflush.c | 2 +-
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +-
> drivers/gpu/drm/i915/i915_gem_internal.c | 3 ++-
> drivers/gpu/drm/i915/i915_gem_object.h | 1 +
> drivers/gpu/drm/i915/i915_gem_stolen.c | 1 +
> drivers/gpu/drm/i915/i915_gem_userptr.c | 3 ++-
> drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 ++-
> 8 files changed, 17 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b1504a829c6a..4ae30f74c475 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -52,7 +52,7 @@ static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> if (obj->cache_dirty)
> return false;
>
> - if (!i915_gem_object_is_coherent(obj))
> + if (!obj->cache_coherent)
> return true;
>
> return obj->pin_display;
> @@ -253,7 +253,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>
> if (needs_clflush &&
> (obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
> - !i915_gem_object_is_coherent(obj))
> + !obj->cache_coherent)
> drm_clflush_sg(pages);
>
> __start_cpu_write(obj);
> @@ -856,8 +856,7 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
> if (ret)
> return ret;
>
> - if (i915_gem_object_is_coherent(obj) ||
> - !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> + if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> ret = i915_gem_object_set_to_cpu_domain(obj, false);
> if (ret)
> goto err_unpin;
> @@ -909,8 +908,7 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
> if (ret)
> return ret;
>
> - if (i915_gem_object_is_coherent(obj) ||
> - !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> + if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> ret = i915_gem_object_set_to_cpu_domain(obj, true);
> if (ret)
> goto err_unpin;
> @@ -3684,6 +3682,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> list_for_each_entry(vma, &obj->vma_list, obj_link)
> vma->node.color = cache_level;
> obj->cache_level = cache_level;
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> obj->cache_dirty = true; /* Always invalidate stale cachelines */
>
> return 0;
> @@ -4344,7 +4343,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
> } else
> obj->cache_level = I915_CACHE_NONE;
>
> - obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> + obj->cache_dirty = !obj->cache_coherent;
>
> trace_i915_gem_object_create(obj);
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
> index 17b207e963c2..152f16c11878 100644
> --- a/drivers/gpu/drm/i915/i915_gem_clflush.c
> +++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
> @@ -139,7 +139,7 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
> * snooping behaviour occurs naturally as the result of our domain
> * tracking.
> */
> - if (!(flags & I915_CLFLUSH_FORCE) && i915_gem_object_is_coherent(obj))
> + if (!(flags & I915_CLFLUSH_FORCE) && obj->cache_coherent)
> return;
>
> trace_i915_gem_object_clflush(obj);
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 2a9aed5640e2..20933a15be46 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1110,7 +1110,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
> if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
> continue;
>
> - if (obj->cache_dirty)
> + if (obj->cache_dirty & ~obj->cache_coherent)
What is the explanation for this change?
> i915_gem_clflush_object(obj, 0);
>
> ret = i915_gem_request_await_object
> diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
> index 58e93e87d573..568bf83af1f5 100644
> --- a/drivers/gpu/drm/i915/i915_gem_internal.c
> +++ b/drivers/gpu/drm/i915/i915_gem_internal.c
> @@ -191,7 +191,8 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
> obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> - obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> + obj->cache_dirty = !obj->cache_coherent;
>
> return obj;
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
> index 915057824284..adb482b00271 100644
> --- a/drivers/gpu/drm/i915/i915_gem_object.h
> +++ b/drivers/gpu/drm/i915/i915_gem_object.h
> @@ -121,6 +121,7 @@ struct drm_i915_gem_object {
> unsigned long gt_ro:1;
> unsigned int cache_level:3;
> unsigned int cache_dirty:1;
> + unsigned int cache_coherent:1;
>
> atomic_t frontbuffer_bits;
> unsigned int frontbuffer_ggtt_origin; /* write once */
> diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
> index 681db6083f4d..a817b3e0b17e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_stolen.c
> +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
> @@ -590,6 +590,7 @@ _i915_gem_object_create_stolen(struct drm_i915_private *dev_priv,
> obj->stolen = stolen;
> obj->base.read_domains = I915_GEM_DOMAIN_CPU | I915_GEM_DOMAIN_GTT;
> obj->cache_level = HAS_LLC(dev_priv) ? I915_CACHE_LLC : I915_CACHE_NONE;
> + obj->cache_coherent = true; /* assumptions! more like cache_oblivious */
>
> if (i915_gem_object_pin_pages(obj))
> goto cleanup;
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 34461e1928bc..05c36f663550 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -805,7 +805,8 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
> obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->cache_level = I915_CACHE_LLC;
> - obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> + obj->cache_dirty = !obj->cache_coherent;
>
> obj->userptr.ptr = args->user_ptr;
> obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
> diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> index 0ca867a877b6..caf76af36aba 100644
> --- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> +++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> @@ -129,7 +129,8 @@ huge_gem_object(struct drm_i915_private *i915,
> obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> - obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> + obj->cache_dirty = !obj->cache_coherent;
> obj->scratch = phys_size;
>
> return obj;
>
Option of converting i915_gem_object_is_coherent to just return
obj->cache_coherent for less churn? (And adding
i915_gem_object_set_coherent or something if enough call sites to justify?)
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [passive aggressive RESEND 2/2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty
2017-06-15 14:41 ` Tvrtko Ursulin
@ 2017-06-16 10:04 ` Chris Wilson
0 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2017-06-16 10:04 UTC (permalink / raw)
To: Tvrtko Ursulin, intel-gfx; +Cc: Dongwon Kim
Quoting Tvrtko Ursulin (2017-06-15 15:41:08)
>
> On 15/06/2017 13:38, Chris Wilson wrote:
> > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > index 2a9aed5640e2..20933a15be46 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > @@ -1110,7 +1110,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
> > if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
> > continue;
> >
> > - if (obj->cache_dirty)
> > + if (obj->cache_dirty & ~obj->cache_coherent)
>
> What is the explanation for this change?
From above:
> > For ease of use (i.e. avoiding a few checks and function calls), store
> > the object's cache coherency next to the cache is dirty bit.
It's a single load for cache_dirty + cache_coherent, but we add a new
reg mov and shift, a couple of instructions to save a function call and
a fresh load.
>
> > i915_gem_clflush_object(obj, 0);
> >
> > ret = i915_gem_request_await_object
> > diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> > index 0ca867a877b6..caf76af36aba 100644
> > --- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> > +++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> > @@ -129,7 +129,8 @@ huge_gem_object(struct drm_i915_private *i915,
> > obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> > obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> > obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> > - obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> > + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> > + obj->cache_dirty = !obj->cache_coherent;
> > obj->scratch = phys_size;
> >
> > return obj;
> >
>
> Option of converting i915_gem_object_is_coherent to just return
> obj->cache_coherent for less churn? (And adding
> i915_gem_object_set_coherent or something if enough call sites to justify?)
My starting point was doing the bitops in execbuf, so I haven't
considered that alternative. However,
static inline bool i915_gem_object_is_coherent(struct drm_i915_gem_object *obj)
{
return obj->cache_coherent;
}
might give Joonas a fit before his vacation :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty
2017-06-15 12:38 ` [passive aggressive RESEND 2/2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty Chris Wilson
2017-06-15 14:41 ` Tvrtko Ursulin
@ 2017-06-16 10:54 ` Chris Wilson
2017-06-16 11:10 ` Tvrtko Ursulin
1 sibling, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2017-06-16 10:54 UTC (permalink / raw)
To: intel-gfx; +Cc: Dongwon Kim
For ease of use (i.e. avoiding a few checks and function calls), store
the object's cache coherency next to the cache is dirty bit.
Specifically this patch aims to reduce the frequency of no-op calls to
i915_gem_object_clflush() to counter-act the increase of such calls for
GPU only objects in the previous patch.
v2: Replace cache_dirty & ~cache_coherent with cache_dirty &&
!cache_coherent as gcc generates much better code for the latter
(Tvrtko)
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Tested-by: Dongwon Kim <dongwon.kim@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
drivers/gpu/drm/i915/i915_gem.c | 14 +++++++-------
drivers/gpu/drm/i915/i915_gem_clflush.c | 2 +-
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +-
drivers/gpu/drm/i915/i915_gem_internal.c | 3 ++-
drivers/gpu/drm/i915/i915_gem_object.h | 1 +
drivers/gpu/drm/i915/i915_gem_stolen.c | 1 +
drivers/gpu/drm/i915/i915_gem_userptr.c | 3 ++-
drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 ++-
8 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d464bcf70127..2bd6d71b432e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -52,7 +52,7 @@ static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
if (obj->cache_dirty)
return false;
- if (!i915_gem_object_is_coherent(obj))
+ if (!obj->cache_coherent)
return true;
return obj->pin_display;
@@ -253,7 +253,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
if (needs_clflush &&
(obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
- !i915_gem_object_is_coherent(obj))
+ !obj->cache_coherent)
drm_clflush_sg(pages);
__start_cpu_write(obj);
@@ -870,8 +870,7 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
if (ret)
return ret;
- if (i915_gem_object_is_coherent(obj) ||
- !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+ if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
ret = i915_gem_object_set_to_cpu_domain(obj, false);
if (ret)
goto err_unpin;
@@ -923,8 +922,7 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
if (ret)
return ret;
- if (i915_gem_object_is_coherent(obj) ||
- !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+ if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
ret = i915_gem_object_set_to_cpu_domain(obj, true);
if (ret)
goto err_unpin;
@@ -3698,6 +3696,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
list_for_each_entry(vma, &obj->vma_list, obj_link)
vma->node.color = cache_level;
obj->cache_level = cache_level;
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
obj->cache_dirty = true; /* Always invalidate stale cachelines */
return 0;
@@ -4358,7 +4357,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
} else
obj->cache_level = I915_CACHE_NONE;
- obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
+ obj->cache_dirty = !obj->cache_coherent;
trace_i915_gem_object_create(obj);
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index 17b207e963c2..152f16c11878 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -139,7 +139,7 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
* snooping behaviour occurs naturally as the result of our domain
* tracking.
*/
- if (!(flags & I915_CLFLUSH_FORCE) && i915_gem_object_is_coherent(obj))
+ if (!(flags & I915_CLFLUSH_FORCE) && obj->cache_coherent)
return;
trace_i915_gem_object_clflush(obj);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 2a9aed5640e2..d6099d084748 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1110,7 +1110,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
continue;
- if (obj->cache_dirty)
+ if (unlikely(obj->cache_dirty && !obj->cache_coherent))
i915_gem_clflush_object(obj, 0);
ret = i915_gem_request_await_object
diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
index 58e93e87d573..568bf83af1f5 100644
--- a/drivers/gpu/drm/i915/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/i915_gem_internal.c
@@ -191,7 +191,8 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
- obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
+ obj->cache_dirty = !obj->cache_coherent;
return obj;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index 915057824284..adb482b00271 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -121,6 +121,7 @@ struct drm_i915_gem_object {
unsigned long gt_ro:1;
unsigned int cache_level:3;
unsigned int cache_dirty:1;
+ unsigned int cache_coherent:1;
atomic_t frontbuffer_bits;
unsigned int frontbuffer_ggtt_origin; /* write once */
diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
index 681db6083f4d..a817b3e0b17e 100644
--- a/drivers/gpu/drm/i915/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
@@ -590,6 +590,7 @@ _i915_gem_object_create_stolen(struct drm_i915_private *dev_priv,
obj->stolen = stolen;
obj->base.read_domains = I915_GEM_DOMAIN_CPU | I915_GEM_DOMAIN_GTT;
obj->cache_level = HAS_LLC(dev_priv) ? I915_CACHE_LLC : I915_CACHE_NONE;
+ obj->cache_coherent = true; /* assumptions! more like cache_oblivious */
if (i915_gem_object_pin_pages(obj))
goto cleanup;
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 34461e1928bc..05c36f663550 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -805,7 +805,8 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->cache_level = I915_CACHE_LLC;
- obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
+ obj->cache_dirty = !obj->cache_coherent;
obj->userptr.ptr = args->user_ptr;
obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
index 0ca867a877b6..caf76af36aba 100644
--- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
+++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
@@ -129,7 +129,8 @@ huge_gem_object(struct drm_i915_private *i915,
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
obj->base.write_domain = I915_GEM_DOMAIN_CPU;
obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
- obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+ obj->cache_coherent = i915_gem_object_is_coherent(obj);
+ obj->cache_dirty = !obj->cache_coherent;
obj->scratch = phys_size;
return obj;
--
2.11.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty
2017-06-16 10:54 ` [PATCH v2] " Chris Wilson
@ 2017-06-16 11:10 ` Tvrtko Ursulin
0 siblings, 0 replies; 14+ messages in thread
From: Tvrtko Ursulin @ 2017-06-16 11:10 UTC (permalink / raw)
To: Chris Wilson, intel-gfx; +Cc: Dongwon Kim
On 16/06/2017 11:54, Chris Wilson wrote:
> For ease of use (i.e. avoiding a few checks and function calls), store
> the object's cache coherency next to the cache is dirty bit.
>
> Specifically this patch aims to reduce the frequency of no-op calls to
> i915_gem_object_clflush() to counter-act the increase of such calls for
> GPU only objects in the previous patch.
>
> v2: Replace cache_dirty & ~cache_coherent with cache_dirty &&
> !cache_coherent as gcc generates much better code for the latter
> (Tvrtko)
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> Tested-by: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 14 +++++++-------
> drivers/gpu/drm/i915/i915_gem_clflush.c | 2 +-
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +-
> drivers/gpu/drm/i915/i915_gem_internal.c | 3 ++-
> drivers/gpu/drm/i915/i915_gem_object.h | 1 +
> drivers/gpu/drm/i915/i915_gem_stolen.c | 1 +
> drivers/gpu/drm/i915/i915_gem_userptr.c | 3 ++-
> drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 ++-
> 8 files changed, 17 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d464bcf70127..2bd6d71b432e 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -52,7 +52,7 @@ static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> if (obj->cache_dirty)
> return false;
>
> - if (!i915_gem_object_is_coherent(obj))
> + if (!obj->cache_coherent)
> return true;
>
> return obj->pin_display;
> @@ -253,7 +253,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>
> if (needs_clflush &&
> (obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
> - !i915_gem_object_is_coherent(obj))
> + !obj->cache_coherent)
> drm_clflush_sg(pages);
>
> __start_cpu_write(obj);
> @@ -870,8 +870,7 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
> if (ret)
> return ret;
>
> - if (i915_gem_object_is_coherent(obj) ||
> - !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> + if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> ret = i915_gem_object_set_to_cpu_domain(obj, false);
> if (ret)
> goto err_unpin;
> @@ -923,8 +922,7 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
> if (ret)
> return ret;
>
> - if (i915_gem_object_is_coherent(obj) ||
> - !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> + if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> ret = i915_gem_object_set_to_cpu_domain(obj, true);
> if (ret)
> goto err_unpin;
> @@ -3698,6 +3696,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> list_for_each_entry(vma, &obj->vma_list, obj_link)
> vma->node.color = cache_level;
> obj->cache_level = cache_level;
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> obj->cache_dirty = true; /* Always invalidate stale cachelines */
>
> return 0;
> @@ -4358,7 +4357,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
> } else
> obj->cache_level = I915_CACHE_NONE;
>
> - obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> + obj->cache_dirty = !obj->cache_coherent;
>
> trace_i915_gem_object_create(obj);
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
> index 17b207e963c2..152f16c11878 100644
> --- a/drivers/gpu/drm/i915/i915_gem_clflush.c
> +++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
> @@ -139,7 +139,7 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
> * snooping behaviour occurs naturally as the result of our domain
> * tracking.
> */
> - if (!(flags & I915_CLFLUSH_FORCE) && i915_gem_object_is_coherent(obj))
> + if (!(flags & I915_CLFLUSH_FORCE) && obj->cache_coherent)
> return;
>
> trace_i915_gem_object_clflush(obj);
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 2a9aed5640e2..d6099d084748 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1110,7 +1110,7 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
> if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
> continue;
>
> - if (obj->cache_dirty)
> + if (unlikely(obj->cache_dirty && !obj->cache_coherent))
> i915_gem_clflush_object(obj, 0);
>
> ret = i915_gem_request_await_object
> diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
> index 58e93e87d573..568bf83af1f5 100644
> --- a/drivers/gpu/drm/i915/i915_gem_internal.c
> +++ b/drivers/gpu/drm/i915/i915_gem_internal.c
> @@ -191,7 +191,8 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
> obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> - obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> + obj->cache_dirty = !obj->cache_coherent;
>
> return obj;
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
> index 915057824284..adb482b00271 100644
> --- a/drivers/gpu/drm/i915/i915_gem_object.h
> +++ b/drivers/gpu/drm/i915/i915_gem_object.h
> @@ -121,6 +121,7 @@ struct drm_i915_gem_object {
> unsigned long gt_ro:1;
> unsigned int cache_level:3;
> unsigned int cache_dirty:1;
> + unsigned int cache_coherent:1;
>
> atomic_t frontbuffer_bits;
> unsigned int frontbuffer_ggtt_origin; /* write once */
> diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
> index 681db6083f4d..a817b3e0b17e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_stolen.c
> +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
> @@ -590,6 +590,7 @@ _i915_gem_object_create_stolen(struct drm_i915_private *dev_priv,
> obj->stolen = stolen;
> obj->base.read_domains = I915_GEM_DOMAIN_CPU | I915_GEM_DOMAIN_GTT;
> obj->cache_level = HAS_LLC(dev_priv) ? I915_CACHE_LLC : I915_CACHE_NONE;
> + obj->cache_coherent = true; /* assumptions! more like cache_oblivious */
>
> if (i915_gem_object_pin_pages(obj))
> goto cleanup;
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 34461e1928bc..05c36f663550 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -805,7 +805,8 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
> obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->cache_level = I915_CACHE_LLC;
> - obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> + obj->cache_dirty = !obj->cache_coherent;
>
> obj->userptr.ptr = args->user_ptr;
> obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
> diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> index 0ca867a877b6..caf76af36aba 100644
> --- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> +++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> @@ -129,7 +129,8 @@ huge_gem_object(struct drm_i915_private *i915,
> obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> - obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> + obj->cache_coherent = i915_gem_object_is_coherent(obj);
> + obj->cache_dirty = !obj->cache_coherent;
> obj->scratch = phys_size;
>
> return obj;
>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 14+ messages in thread
* ✓ Fi.CI.BAT: success for series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
2017-06-15 12:38 [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Chris Wilson
2017-06-15 12:38 ` [passive aggressive RESEND 2/2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty Chris Wilson
@ 2017-06-15 12:59 ` Patchwork
2017-06-15 14:28 ` [passive aggressive RESEND 1/2] " Tvrtko Ursulin
2017-06-16 11:11 ` ✗ Fi.CI.BAT: warning for series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes (rev2) Patchwork
3 siblings, 0 replies; 14+ messages in thread
From: Patchwork @ 2017-06-15 12:59 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
URL : https://patchwork.freedesktop.org/series/25841/
State : success
== Summary ==
Series 25841v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/25841/revisions/1/mbox/
Test gem_exec_flush:
Subgroup basic-batch-kernel-default-uc:
fail -> PASS (fi-snb-2600) fdo#100007
Test gem_exec_suspend:
Subgroup basic-s4-devices:
dmesg-warn -> PASS (fi-kbl-7560u) fdo#100125
Test kms_cursor_legacy:
Subgroup basic-busy-flip-before-cursor-atomic:
pass -> FAIL (fi-snb-2600) fdo#100215 +1
fdo#100007 https://bugs.freedesktop.org/show_bug.cgi?id=100007
fdo#100125 https://bugs.freedesktop.org/show_bug.cgi?id=100125
fdo#100215 https://bugs.freedesktop.org/show_bug.cgi?id=100215
fi-bdw-5557u total:278 pass:267 dwarn:0 dfail:0 fail:0 skip:11 time:466s
fi-bdw-gvtdvm total:278 pass:256 dwarn:8 dfail:0 fail:0 skip:14 time:485s
fi-bsw-n3050 total:278 pass:242 dwarn:0 dfail:0 fail:0 skip:36 time:587s
fi-bxt-j4205 total:278 pass:259 dwarn:0 dfail:0 fail:0 skip:19 time:550s
fi-byt-j1900 total:278 pass:254 dwarn:0 dfail:0 fail:0 skip:24 time:492s
fi-byt-n2820 total:278 pass:250 dwarn:0 dfail:0 fail:0 skip:28 time:488s
fi-glk-2a total:278 pass:259 dwarn:0 dfail:0 fail:0 skip:19 time:590s
fi-hsw-4770 total:278 pass:262 dwarn:0 dfail:0 fail:0 skip:16 time:432s
fi-hsw-4770r total:278 pass:261 dwarn:0 dfail:0 fail:0 skip:16 time:420s
fi-ilk-650 total:278 pass:227 dwarn:0 dfail:0 fail:0 skip:50 time:462s
fi-ivb-3520m total:278 pass:259 dwarn:0 dfail:0 fail:0 skip:18 time:502s
fi-ivb-3770 total:278 pass:259 dwarn:0 dfail:0 fail:0 skip:18 time:514s
fi-kbl-7500u total:278 pass:260 dwarn:0 dfail:0 fail:0 skip:18 time:482s
fi-kbl-7560u total:278 pass:268 dwarn:0 dfail:0 fail:0 skip:10 time:572s
fi-kbl-r total:278 pass:259 dwarn:1 dfail:0 fail:0 skip:18 time:577s
fi-skl-6260u total:278 pass:268 dwarn:0 dfail:0 fail:0 skip:10 time:483s
fi-skl-6700hq total:278 pass:228 dwarn:1 dfail:0 fail:27 skip:22 time:447s
fi-skl-6700k total:278 pass:256 dwarn:4 dfail:0 fail:0 skip:18 time:511s
fi-skl-6770hq total:278 pass:268 dwarn:0 dfail:0 fail:0 skip:10 time:506s
fi-skl-gvtdvm total:278 pass:265 dwarn:0 dfail:0 fail:0 skip:13 time:510s
fi-snb-2520m total:278 pass:249 dwarn:0 dfail:0 fail:0 skip:28 time:626s
fi-snb-2600 total:278 pass:246 dwarn:0 dfail:0 fail:2 skip:29 time:404s
94b0761a394e3d61e51bc920d0201364c21690d6 drm-tip: 2017y-06m-15d-10h-05m-24s UTC integration manifest
502a2ef drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty
ad420fb drm/i915: Mark CPU cache as dirty on every transition for CPU writes
== Logs ==
For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4959/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
2017-06-15 12:38 [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Chris Wilson
2017-06-15 12:38 ` [passive aggressive RESEND 2/2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty Chris Wilson
2017-06-15 12:59 ` ✓ Fi.CI.BAT: success for series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Patchwork
@ 2017-06-15 14:28 ` Tvrtko Ursulin
2017-06-16 11:26 ` Chris Wilson
2017-06-16 11:11 ` ✗ Fi.CI.BAT: warning for series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes (rev2) Patchwork
3 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2017-06-15 14:28 UTC (permalink / raw)
To: Chris Wilson, intel-gfx; +Cc: Dongwon Kim
On 15/06/2017 13:38, Chris Wilson wrote:
> Currently, we only mark the CPU cache as dirty if we skip a clflush.
> This leads to some confusion where we have to ask if the object is in
> the write domain or missed a clflush. If we always mark the cache as
> dirty, this becomes a much simply question to answer.
>
> The goal remains to do as few clflushes as required and to do them as
> late as possible, in the hope of deferring the work to a kthread and not
> block the caller (e.g. execbuf, flips).
>
> v2: Always call clflush before GPU execution when the cache_dirty flag
> is set. This may cause some extra work on llc systems that migrate dirty
> buffers back and forth - but we do try to limit that by only settin
settin*g*
Subject sounded like reviewer is wanted so I thought to give it a try.
> cache_dirty at the end of the gpu sequence.
>
> v3: Always mark the cache as dirty upon a level change, as we need to
> invalidate any stale cachelines due to external writes.
>
> Reported-by: Dongwon Kim <dongwon.kim@intel.com>
> Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> Tested-by: Dongwon Kim <dongwon.kim@intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 76 ++++++++++++++----------
> drivers/gpu/drm/i915/i915_gem_clflush.c | 15 +++--
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 21 +++----
> drivers/gpu/drm/i915/i915_gem_internal.c | 3 +-
> drivers/gpu/drm/i915/i915_gem_userptr.c | 5 +-
> drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 +-
> 6 files changed, 67 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 31cbe78171a9..b1504a829c6a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
>
> static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> {
> - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> + if (obj->cache_dirty)
> > return false;
> >
> if (!i915_gem_object_is_coherent(obj))
> @@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
> return st;
> }
>
> +static void __start_cpu_write(struct drm_i915_gem_object *obj)
> +{
> + obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> + obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> + if (cpu_write_needs_clflush(obj))
> + obj->cache_dirty = true;
I find the logic here quite hard to understand because
cpu_write_needs_clflush is actually consulting obj->cache_dirty so it's
all a bit recursive.
If we start a cpu write on an object which has been clflushed, but
otherwise needs clflushing like pin_display - what is the correct thing
to do? Not set obj->cache_dirty to true?
> +}
> +
> static void
> __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
> struct sg_table *pages,
> @@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
> !i915_gem_object_is_coherent(obj))
> drm_clflush_sg(pages);
>
> - obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> + __start_cpu_write(obj);
> }
>
> static void
> @@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
> args->size, &args->handle);
> }
>
> +static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> +{
> + return !(obj->cache_level == I915_CACHE_NONE ||
> + obj->cache_level == I915_CACHE_WT);
Hm, is this reversed? On LLC I would assume we don't need explicit
clflush, but perhaps I am misunderstanding something.
> +}
> +
> /**
> * Creates a new mm object and returns a handle to it.
> * @dev: drm device pointer
> @@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
> case I915_GEM_DOMAIN_CPU:
> i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
> break;
> +
> + case I915_GEM_DOMAIN_RENDER:
> + if (gpu_write_needs_clflush(obj))
> + obj->cache_dirty = true;
> + break;
> }
>
> obj->base.write_domain = 0;
> @@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
> * optimizes for the case when the gpu will dirty the data
> * anyway again before the next pread happens.
> */
> - if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> + if (!obj->cache_dirty &&
> + !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> *needs_clflush = CLFLUSH_BEFORE;
Obviously I don't understand something critical here since once more I
was expecting the reverse here - if the cache is not dirty no need to
flush it.
>
> out:
> @@ -906,14 +925,16 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
> * This optimizes for the case when the gpu will use the data
> * right away and we therefore have to clflush anyway.
> */
> - if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
> + if (!obj->cache_dirty) {
> *needs_clflush |= CLFLUSH_AFTER;
>
> - /* Same trick applies to invalidate partially written cachelines read
> - * before writing.
> - */
> - if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> - *needs_clflush |= CLFLUSH_BEFORE;
> + /*
> + * Same trick applies to invalidate partially written
> + * cachelines read before writing.
> + */
> + if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> + *needs_clflush |= CLFLUSH_BEFORE;
> + }
>
> out:
> intel_fb_obj_invalidate(obj, ORIGIN_CPU);
> @@ -3395,10 +3416,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>
> static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
> {
> - if (obj->base.write_domain != I915_GEM_DOMAIN_CPU && !obj->cache_dirty)
> - return;
> -
> - i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
> + /*
> + * We manually flush the CPU domain so that we can override and
> + * force the flush for the display, and perform it asyncrhonously.
Typo in asynchronously.
> + */
> + flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
> + if (obj->cache_dirty)
> + i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
> obj->base.write_domain = 0;
> }
>
> @@ -3657,13 +3681,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> }
> }
>
> - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
> - i915_gem_object_is_coherent(obj))
> - obj->cache_dirty = true;
> -
> list_for_each_entry(vma, &obj->vma_list, obj_link)
> vma->node.color = cache_level;
> obj->cache_level = cache_level;
> + obj->cache_dirty = true; /* Always invalidate stale cachelines */
>
> return 0;
> }
> @@ -3885,9 +3906,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
> if (ret)
> return ret;
>
> - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> - return 0;
> -
> flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
>
> /* Flush the CPU cache if it's still invalid. */
> @@ -3899,15 +3917,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
> /* It should now be out of any other write domains, and we can update
> * the domain values for our changes.
> */
> - GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
> + GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
>
> /* If we're writing through the CPU, then the GPU read domains will
> * need to be invalidated at next use.
> */
> - if (write) {
> - obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> - }
> + if (write)
> + __start_cpu_write(obj);
>
> return 0;
> }
> @@ -4328,6 +4344,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
> } else
> obj->cache_level = I915_CACHE_NONE;
>
> + obj->cache_dirty = !i915_gem_object_is_coherent(obj);
Could this, here and elsewhere, together with setting of read and write
cpu domains, be consolidated to i915_gem_object_init?
> +
> trace_i915_gem_object_create(obj);
>
> return obj;
> @@ -4994,10 +5012,8 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
>
> mutex_lock(&dev_priv->drm.struct_mutex);
> for (p = phases; *p; p++) {
> - list_for_each_entry(obj, *p, global_link) {
> - obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> - }
> + list_for_each_entry(obj, *p, global_link)
> + __start_cpu_write(obj);
> }
> mutex_unlock(&dev_priv->drm.struct_mutex);
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
> index ffac7a1f0caf..17b207e963c2 100644
> --- a/drivers/gpu/drm/i915/i915_gem_clflush.c
> +++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
> @@ -71,8 +71,6 @@ static const struct dma_fence_ops i915_clflush_ops = {
> static void __i915_do_clflush(struct drm_i915_gem_object *obj)
> {
> drm_clflush_sg(obj->mm.pages);
> - obj->cache_dirty = false;
> -
> intel_fb_obj_flush(obj, ORIGIN_CPU);
> }
>
> @@ -81,9 +79,6 @@ static void i915_clflush_work(struct work_struct *work)
> struct clflush *clflush = container_of(work, typeof(*clflush), work);
> struct drm_i915_gem_object *obj = clflush->obj;
>
> - if (!obj->cache_dirty)
> - goto out;
> -
> if (i915_gem_object_pin_pages(obj)) {
> DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
> goto out;
> @@ -131,10 +126,10 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
> * anything not backed by physical memory we consider to be always
> * coherent and not need clflushing.
> */
> - if (!i915_gem_object_has_struct_page(obj))
> + if (!i915_gem_object_has_struct_page(obj)) {
> + obj->cache_dirty = false;
> return;
> -
> - obj->cache_dirty = true;
> + }
>
> /* If the GPU is snooping the contents of the CPU cache,
> * we do not need to manually clear the CPU cache lines. However,
> @@ -153,6 +148,8 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
> if (!(flags & I915_CLFLUSH_SYNC))
> clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
> if (clflush) {
> + GEM_BUG_ON(!obj->cache_dirty);
> +
> dma_fence_init(&clflush->dma,
> &i915_clflush_ops,
> &clflush_lock,
> @@ -180,4 +177,6 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
> } else {
> GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
> }
> +
> + obj->cache_dirty = false;
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 96705171e397..2a9aed5640e2 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -309,7 +309,7 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
> return DBG_USE_CPU_RELOC > 0;
>
> return (HAS_LLC(to_i915(obj->base.dev)) ||
> - obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> + obj->cache_dirty ||
> obj->cache_level != I915_CACHE_NONE);
> }
>
> @@ -1110,10 +1110,8 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
> if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
> continue;
>
> - if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
> + if (obj->cache_dirty)
> i915_gem_clflush_object(obj, 0);
> - obj->base.write_domain = 0;
> - }
>
> ret = i915_gem_request_await_object
> (eb->request, obj, obj->base.pending_write_domain);
> @@ -1248,12 +1246,6 @@ static int eb_select_context(struct i915_execbuffer *eb)
> return 0;
> }
>
> -static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> -{
> - return !(obj->cache_level == I915_CACHE_NONE ||
> - obj->cache_level == I915_CACHE_WT);
> -}
> -
> void i915_vma_move_to_active(struct i915_vma *vma,
> struct drm_i915_gem_request *req,
> unsigned int flags)
> @@ -1277,15 +1269,16 @@ void i915_vma_move_to_active(struct i915_vma *vma,
> i915_gem_active_set(&vma->last_read[idx], req);
> list_move_tail(&vma->vm_link, &vma->vm->active_list);
>
> + obj->base.write_domain = 0;
> if (flags & EXEC_OBJECT_WRITE) {
> + obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
> +
> if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
> i915_gem_active_set(&obj->frontbuffer_write, req);
>
> - /* update for the implicit flush after a batch */
> - obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
> - if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
> - obj->cache_dirty = true;
> + obj->base.read_domains = 0;
> }
> + obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
>
> if (flags & EXEC_OBJECT_NEEDS_FENCE)
> i915_gem_active_set(&vma->last_fence, req);
> diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
> index fc950abbe400..58e93e87d573 100644
> --- a/drivers/gpu/drm/i915/i915_gem_internal.c
> +++ b/drivers/gpu/drm/i915/i915_gem_internal.c
> @@ -188,9 +188,10 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
> drm_gem_private_object_init(&i915->drm, &obj->base, size);
> i915_gem_object_init(obj, &i915_gem_object_internal_ops);
>
> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> + obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> + obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>
> return obj;
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 1a0ce1dc68f5..34461e1928bc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -802,9 +802,10 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
>
> drm_gem_private_object_init(dev, &obj->base, args->user_size);
> i915_gem_object_init(obj, &i915_gem_userptr_ops);
> - obj->cache_level = I915_CACHE_LLC;
> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> + obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> + obj->cache_level = I915_CACHE_LLC;
> + obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>
> obj->userptr.ptr = args->user_ptr;
> obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
> diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> index 4e681fc13be4..0ca867a877b6 100644
> --- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> +++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> @@ -126,9 +126,10 @@ huge_gem_object(struct drm_i915_private *i915,
> drm_gem_private_object_init(&i915->drm, &obj->base, dma_size);
> i915_gem_object_init(obj, &huge_ops);
>
> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> + obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> + obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> obj->scratch = phys_size;
>
> return obj;
>
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
2017-06-15 14:28 ` [passive aggressive RESEND 1/2] " Tvrtko Ursulin
@ 2017-06-16 11:26 ` Chris Wilson
2017-06-16 12:05 ` Tvrtko Ursulin
0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2017-06-16 11:26 UTC (permalink / raw)
To: Tvrtko Ursulin, intel-gfx; +Cc: Dongwon Kim
Quoting Tvrtko Ursulin (2017-06-15 15:28:10)
>
> On 15/06/2017 13:38, Chris Wilson wrote:
> > Currently, we only mark the CPU cache as dirty if we skip a clflush.
> > This leads to some confusion where we have to ask if the object is in
> > the write domain or missed a clflush. If we always mark the cache as
> > dirty, this becomes a much simply question to answer.
> >
> > The goal remains to do as few clflushes as required and to do them as
> > late as possible, in the hope of deferring the work to a kthread and not
> > block the caller (e.g. execbuf, flips).
> >
> > v2: Always call clflush before GPU execution when the cache_dirty flag
> > is set. This may cause some extra work on llc systems that migrate dirty
> > buffers back and forth - but we do try to limit that by only settin
>
> settin*g*
>
> Subject sounded like reviewer is wanted so I thought to give it a try.
>
> > cache_dirty at the end of the gpu sequence.
> >
> > v3: Always mark the cache as dirty upon a level change, as we need to
> > invalidate any stale cachelines due to external writes.
> >
> > Reported-by: Dongwon Kim <dongwon.kim@intel.com>
> > Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Dongwon Kim <dongwon.kim@intel.com>
> > Cc: Matt Roper <matthew.d.roper@intel.com>
> > Tested-by: Dongwon Kim <dongwon.kim@intel.com>
> > ---
> > drivers/gpu/drm/i915/i915_gem.c | 76 ++++++++++++++----------
> > drivers/gpu/drm/i915/i915_gem_clflush.c | 15 +++--
> > drivers/gpu/drm/i915/i915_gem_execbuffer.c | 21 +++----
> > drivers/gpu/drm/i915/i915_gem_internal.c | 3 +-
> > drivers/gpu/drm/i915/i915_gem_userptr.c | 5 +-
> > drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 +-
> > 6 files changed, 67 insertions(+), 56 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 31cbe78171a9..b1504a829c6a 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
> >
> > static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> > {
> > - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> > + if (obj->cache_dirty)
> > return false;
> >
> > if (!i915_gem_object_is_coherent(obj))
> > @@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
> > return st;
> > }
> >
> > +static void __start_cpu_write(struct drm_i915_gem_object *obj)
> > +{
> > + obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> > + obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> > + if (cpu_write_needs_clflush(obj))
> > + obj->cache_dirty = true;
>
> I find the logic here quite hard to understand because
> cpu_write_needs_clflush is actually consulting obj->cache_dirty so it's
> all a bit recursive.
>
> If we start a cpu write on an object which has been clflushed, but
> otherwise needs clflushing like pin_display - what is the correct thing
> to do? Not set obj->cache_dirty to true?
No, we want cache_dirty to be true as we want a clflush after the write.
Really that obj->pin_display only exists there for the transition inside
set-cache-level for i915_gem_object_pin_to_display_plane. Outside of
set-cache-level obj->cache_coherent will be accurate. I can pencil in
making yet another change to limit use of obj->pin_display again.
> > +}
> > +
> > static void
> > __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
> > struct sg_table *pages,
> > @@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
> > !i915_gem_object_is_coherent(obj))
> > drm_clflush_sg(pages);
> >
> > - obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> > - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> > + __start_cpu_write(obj);
> > }
> >
> > static void
> > @@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
> > args->size, &args->handle);
> > }
> >
> > +static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> > +{
> > + return !(obj->cache_level == I915_CACHE_NONE ||
> > + obj->cache_level == I915_CACHE_WT);
>
> Hm, is this reversed? On LLC I would assume we don't need explicit
> clflush, but perhaps I am misunderstanding something.
It's accurate. We all shared the same misunderstanding for several
years. The gpu writes via the ring (and the shared llc) so if we
transition to accessing the memory directly from the cpu, we bypass the
shared llc and our data is stale. So after using LLC, be that either
with CPU writes or GPU writes, we need to clflush.
> > +}
> > +
> > /**
> > * Creates a new mm object and returns a handle to it.
> > * @dev: drm device pointer
> > @@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
> > case I915_GEM_DOMAIN_CPU:
> > i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
> > break;
> > +
> > + case I915_GEM_DOMAIN_RENDER:
> > + if (gpu_write_needs_clflush(obj))
> > + obj->cache_dirty = true;
> > + break;
> > }
> >
> > obj->base.write_domain = 0;
> > @@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
> > * optimizes for the case when the gpu will dirty the data
> > * anyway again before the next pread happens.
> > */
> > - if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> > + if (!obj->cache_dirty &&
> > + !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> > *needs_clflush = CLFLUSH_BEFORE;
>
> Obviously I don't understand something critical here since once more I
> was expecting the reverse here - if the cache is not dirty no need to
> flush it.
This is an invalidate. We need to make sure that any stray cacheline the
CPU pulled in are invalidated prior to accessing the physical address
which was written to by the GPU out of sight of the cpu.
> > + */
> > + flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
> > + if (obj->cache_dirty)
> > + i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
> > obj->base.write_domain = 0;
> > }
> >
> > @@ -3657,13 +3681,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> > }
> > }
> >
> > - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
> > - i915_gem_object_is_coherent(obj))
> > - obj->cache_dirty = true;
> > -
> > list_for_each_entry(vma, &obj->vma_list, obj_link)
> > vma->node.color = cache_level;
> > obj->cache_level = cache_level;
> > + obj->cache_dirty = true; /* Always invalidate stale cachelines */
> >
> > return 0;
> > }
> > @@ -3885,9 +3906,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
> > if (ret)
> > return ret;
> >
> > - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> > - return 0;
> > -
> > flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
> >
> > /* Flush the CPU cache if it's still invalid. */
> > @@ -3899,15 +3917,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
> > /* It should now be out of any other write domains, and we can update
> > * the domain values for our changes.
> > */
> > - GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
> > + GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
> >
> > /* If we're writing through the CPU, then the GPU read domains will
> > * need to be invalidated at next use.
> > */
> > - if (write) {
> > - obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> > - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> > - }
> > + if (write)
> > + __start_cpu_write(obj);
> >
> > return 0;
> > }
> > @@ -4328,6 +4344,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
> > } else
> > obj->cache_level = I915_CACHE_NONE;
> >
> > + obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>
> Could this, here and elsewhere, together with setting of read and write
> cpu domains, be consolidated to i915_gem_object_init?
Possibly, I did consider it, but the freedom in which callers mixed the
domains reduced the desire to refactor.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
2017-06-16 11:26 ` Chris Wilson
@ 2017-06-16 12:05 ` Tvrtko Ursulin
2017-06-16 12:13 ` Chris Wilson
0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2017-06-16 12:05 UTC (permalink / raw)
To: Chris Wilson, intel-gfx; +Cc: Dongwon Kim
On 16/06/2017 12:26, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-06-15 15:28:10)
>>
>> On 15/06/2017 13:38, Chris Wilson wrote:
>>> Currently, we only mark the CPU cache as dirty if we skip a clflush.
>>> This leads to some confusion where we have to ask if the object is in
>>> the write domain or missed a clflush. If we always mark the cache as
>>> dirty, this becomes a much simply question to answer.
>>>
>>> The goal remains to do as few clflushes as required and to do them as
>>> late as possible, in the hope of deferring the work to a kthread and not
>>> block the caller (e.g. execbuf, flips).
>>>
>>> v2: Always call clflush before GPU execution when the cache_dirty flag
>>> is set. This may cause some extra work on llc systems that migrate dirty
>>> buffers back and forth - but we do try to limit that by only settin
>>
>> settin*g*
>>
>> Subject sounded like reviewer is wanted so I thought to give it a try.
>>
>>> cache_dirty at the end of the gpu sequence.
>>>
>>> v3: Always mark the cache as dirty upon a level change, as we need to
>>> invalidate any stale cachelines due to external writes.
>>>
>>> Reported-by: Dongwon Kim <dongwon.kim@intel.com>
>>> Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Dongwon Kim <dongwon.kim@intel.com>
>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>> Tested-by: Dongwon Kim <dongwon.kim@intel.com>
>>> ---
>>> drivers/gpu/drm/i915/i915_gem.c | 76 ++++++++++++++----------
>>> drivers/gpu/drm/i915/i915_gem_clflush.c | 15 +++--
>>> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 21 +++----
>>> drivers/gpu/drm/i915/i915_gem_internal.c | 3 +-
>>> drivers/gpu/drm/i915/i915_gem_userptr.c | 5 +-
>>> drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 +-
>>> 6 files changed, 67 insertions(+), 56 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index 31cbe78171a9..b1504a829c6a 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
>>>
>>> static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
>>> {
>>> - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
>>> + if (obj->cache_dirty)
>>> return false;
>>>
>>> if (!i915_gem_object_is_coherent(obj))
>>> @@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
>>> return st;
>>> }
>>>
>>> +static void __start_cpu_write(struct drm_i915_gem_object *obj)
>>> +{
>>> + obj->base.read_domains = I915_GEM_DOMAIN_CPU;
>>> + obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>>> + if (cpu_write_needs_clflush(obj))
>>> + obj->cache_dirty = true;
>>
>> I find the logic here quite hard to understand because
>> cpu_write_needs_clflush is actually consulting obj->cache_dirty so it's
>> all a bit recursive.
>>
>> If we start a cpu write on an object which has been clflushed, but
>> otherwise needs clflushing like pin_display - what is the correct thing
>> to do? Not set obj->cache_dirty to true?
>
> No, we want cache_dirty to be true as we want a clflush after the write.
Right, but I think it will be false.
0. pin_display=true
1. clflush from somewhere: cache_dirty=false
2. __start_write: cpu_write_needs_clflush returns false ->
cache_dirty=false after __start_write
> Really that obj->pin_display only exists there for the transition inside
> set-cache-level for i915_gem_object_pin_to_display_plane. Outside of
> set-cache-level obj->cache_coherent will be accurate. I can pencil in
> making yet another change to limit use of obj->pin_display again.
Like modifying obj->cache_coherent on pin/unpin to display?
>>> +}
>>> +
>>> static void
>>> __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>>> struct sg_table *pages,
>>> @@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>>> !i915_gem_object_is_coherent(obj))
>>> drm_clflush_sg(pages);
>>>
>>> - obj->base.read_domains = I915_GEM_DOMAIN_CPU;
>>> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>>> + __start_cpu_write(obj);
>>> }
>>>
>>> static void
>>> @@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
>>> args->size, &args->handle);
>>> }
>>>
>>> +static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
>>> +{
>>> + return !(obj->cache_level == I915_CACHE_NONE ||
>>> + obj->cache_level == I915_CACHE_WT);
>>
>> Hm, is this reversed? On LLC I would assume we don't need explicit
>> clflush, but perhaps I am misunderstanding something.
>
> It's accurate. We all shared the same misunderstanding for several
> years. The gpu writes via the ring (and the shared llc) so if we
> transition to accessing the memory directly from the cpu, we bypass the
> shared llc and our data is stale. So after using LLC, be that either
> with CPU writes or GPU writes, we need to clflush.
Why does accessing the memory from the CPU bypasses the LLC? Or you mean
if we access in a way that bypasses the LLC and if so which ways are that?
>
>>> +}
>>> +
>>> /**
>>> * Creates a new mm object and returns a handle to it.
>>> * @dev: drm device pointer
>>> @@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
>>> case I915_GEM_DOMAIN_CPU:
>>> i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
>>> break;
>>> +
>>> + case I915_GEM_DOMAIN_RENDER:
>>> + if (gpu_write_needs_clflush(obj))
>>> + obj->cache_dirty = true;
>>> + break;
>>> }
>>>
>>> obj->base.write_domain = 0;
>>> @@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
>>> * optimizes for the case when the gpu will dirty the data
>>> * anyway again before the next pread happens.
>>> */
>>> - if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
>>> + if (!obj->cache_dirty &&
>>> + !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
>>> *needs_clflush = CLFLUSH_BEFORE;
>>
>> Obviously I don't understand something critical here since once more I
>> was expecting the reverse here - if the cache is not dirty no need to
>> flush it.
>
> This is an invalidate. We need to make sure that any stray cacheline the
> CPU pulled in are invalidated prior to accessing the physical address
> which was written to by the GPU out of sight of the cpu.
Why we don't need to do that if CPU cache is dirty?
>>> + */
>>> + flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
>>> + if (obj->cache_dirty)
>>> + i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
>>> obj->base.write_domain = 0;
>>> }
>>>
>>> @@ -3657,13 +3681,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>>> }
>>> }
>>>
>>> - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
>>> - i915_gem_object_is_coherent(obj))
>>> - obj->cache_dirty = true;
>>> -
>>> list_for_each_entry(vma, &obj->vma_list, obj_link)
>>> vma->node.color = cache_level;
>>> obj->cache_level = cache_level;
>>> + obj->cache_dirty = true; /* Always invalidate stale cachelines */
>>>
>>> return 0;
>>> }
>>> @@ -3885,9 +3906,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
>>> if (ret)
>>> return ret;
>>>
>>> - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
>>> - return 0;
>>> -
>>> flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
>>>
>>> /* Flush the CPU cache if it's still invalid. */
>>> @@ -3899,15 +3917,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
>>> /* It should now be out of any other write domains, and we can update
>>> * the domain values for our changes.
>>> */
>>> - GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
>>> + GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
>>>
>>> /* If we're writing through the CPU, then the GPU read domains will
>>> * need to be invalidated at next use.
>>> */
>>> - if (write) {
>>> - obj->base.read_domains = I915_GEM_DOMAIN_CPU;
>>> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>>> - }
>>> + if (write)
>>> + __start_cpu_write(obj);
>>>
>>> return 0;
>>> }
>>> @@ -4328,6 +4344,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
>>> } else
>>> obj->cache_level = I915_CACHE_NONE;
>>>
>>> + obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>>
>> Could this, here and elsewhere, together with setting of read and write
>> cpu domains, be consolidated to i915_gem_object_init?
>
> Possibly, I did consider it, but the freedom in which callers mixed the
> domains reduced the desire to refactor.
Ok.
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
2017-06-16 12:05 ` Tvrtko Ursulin
@ 2017-06-16 12:13 ` Chris Wilson
2017-06-16 12:59 ` Tvrtko Ursulin
0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2017-06-16 12:13 UTC (permalink / raw)
To: Tvrtko Ursulin, intel-gfx; +Cc: Dongwon Kim
Quoting Tvrtko Ursulin (2017-06-16 13:05:21)
>
> On 16/06/2017 12:26, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2017-06-15 15:28:10)
> >>
> >> On 15/06/2017 13:38, Chris Wilson wrote:
> >>> Currently, we only mark the CPU cache as dirty if we skip a clflush.
> >>> This leads to some confusion where we have to ask if the object is in
> >>> the write domain or missed a clflush. If we always mark the cache as
> >>> dirty, this becomes a much simply question to answer.
> >>>
> >>> The goal remains to do as few clflushes as required and to do them as
> >>> late as possible, in the hope of deferring the work to a kthread and not
> >>> block the caller (e.g. execbuf, flips).
> >>>
> >>> v2: Always call clflush before GPU execution when the cache_dirty flag
> >>> is set. This may cause some extra work on llc systems that migrate dirty
> >>> buffers back and forth - but we do try to limit that by only settin
> >>
> >> settin*g*
> >>
> >> Subject sounded like reviewer is wanted so I thought to give it a try.
> >>
> >>> cache_dirty at the end of the gpu sequence.
> >>>
> >>> v3: Always mark the cache as dirty upon a level change, as we need to
> >>> invalidate any stale cachelines due to external writes.
> >>>
> >>> Reported-by: Dongwon Kim <dongwon.kim@intel.com>
> >>> Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> Cc: Dongwon Kim <dongwon.kim@intel.com>
> >>> Cc: Matt Roper <matthew.d.roper@intel.com>
> >>> Tested-by: Dongwon Kim <dongwon.kim@intel.com>
> >>> ---
> >>> drivers/gpu/drm/i915/i915_gem.c | 76 ++++++++++++++----------
> >>> drivers/gpu/drm/i915/i915_gem_clflush.c | 15 +++--
> >>> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 21 +++----
> >>> drivers/gpu/drm/i915/i915_gem_internal.c | 3 +-
> >>> drivers/gpu/drm/i915/i915_gem_userptr.c | 5 +-
> >>> drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 +-
> >>> 6 files changed, 67 insertions(+), 56 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >>> index 31cbe78171a9..b1504a829c6a 100644
> >>> --- a/drivers/gpu/drm/i915/i915_gem.c
> >>> +++ b/drivers/gpu/drm/i915/i915_gem.c
> >>> @@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
> >>>
> >>> static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> >>> {
> >>> - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> >>> + if (obj->cache_dirty)
> >>> return false;
> >>>
> >>> if (!i915_gem_object_is_coherent(obj))
> >>> @@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
> >>> return st;
> >>> }
> >>>
> >>> +static void __start_cpu_write(struct drm_i915_gem_object *obj)
> >>> +{
> >>> + obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> >>> + obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> >>> + if (cpu_write_needs_clflush(obj))
> >>> + obj->cache_dirty = true;
> >>
> >> I find the logic here quite hard to understand because
> >> cpu_write_needs_clflush is actually consulting obj->cache_dirty so it's
> >> all a bit recursive.
> >>
> >> If we start a cpu write on an object which has been clflushed, but
> >> otherwise needs clflushing like pin_display - what is the correct thing
> >> to do? Not set obj->cache_dirty to true?
> >
> > No, we want cache_dirty to be true as we want a clflush after the write.
>
> Right, but I think it will be false.
>
> 0. pin_display=true
> 1. clflush from somewhere: cache_dirty=false
> 2. __start_write: cpu_write_needs_clflush returns false ->
> cache_dirty=false after __start_write
? cpu_write_needs_clflush() returns true if either the object is not
cache coherent or pin_display is set. It's that we return true for
pin_display being set that is the big hammer.
> > Really that obj->pin_display only exists there for the transition inside
> > set-cache-level for i915_gem_object_pin_to_display_plane. Outside of
> > set-cache-level obj->cache_coherent will be accurate. I can pencil in
> > making yet another change to limit use of obj->pin_display again.
>
> Like modifying obj->cache_coherent on pin/unpin to display?
Yes, we just have to consider when to mark the cache as dirty for the
clflush and for that we take pin_display into consideration (mostly for
paranoia).
> >>> +}
> >>> +
> >>> static void
> >>> __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
> >>> struct sg_table *pages,
> >>> @@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
> >>> !i915_gem_object_is_coherent(obj))
> >>> drm_clflush_sg(pages);
> >>>
> >>> - obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> >>> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> >>> + __start_cpu_write(obj);
> >>> }
> >>>
> >>> static void
> >>> @@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
> >>> args->size, &args->handle);
> >>> }
> >>>
> >>> +static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> >>> +{
> >>> + return !(obj->cache_level == I915_CACHE_NONE ||
> >>> + obj->cache_level == I915_CACHE_WT);
> >>
> >> Hm, is this reversed? On LLC I would assume we don't need explicit
> >> clflush, but perhaps I am misunderstanding something.
> >
> > It's accurate. We all shared the same misunderstanding for several
> > years. The gpu writes via the ring (and the shared llc) so if we
> > transition to accessing the memory directly from the cpu, we bypass the
> > shared llc and our data is stale. So after using LLC, be that either
> > with CPU writes or GPU writes, we need to clflush.
>
> Why does accessing the memory from the CPU bypasses the LLC? Or you mean
> if we access in a way that bypasses the LLC and if so which ways are that?
Any uncached access (WC from CPU or from scanout) bypass the LLC.
> >>> +}
> >>> +
> >>> /**
> >>> * Creates a new mm object and returns a handle to it.
> >>> * @dev: drm device pointer
> >>> @@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
> >>> case I915_GEM_DOMAIN_CPU:
> >>> i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
> >>> break;
> >>> +
> >>> + case I915_GEM_DOMAIN_RENDER:
> >>> + if (gpu_write_needs_clflush(obj))
> >>> + obj->cache_dirty = true;
> >>> + break;
> >>> }
> >>>
> >>> obj->base.write_domain = 0;
> >>> @@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
> >>> * optimizes for the case when the gpu will dirty the data
> >>> * anyway again before the next pread happens.
> >>> */
> >>> - if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> >>> + if (!obj->cache_dirty &&
> >>> + !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> >>> *needs_clflush = CLFLUSH_BEFORE;
> >>
> >> Obviously I don't understand something critical here since once more I
> >> was expecting the reverse here - if the cache is not dirty no need to
> >> flush it.
> >
> > This is an invalidate. We need to make sure that any stray cacheline the
> > CPU pulled in are invalidated prior to accessing the physical address
> > which was written to by the GPU out of sight of the cpu.
>
> Why we don't need to do that if CPU cache is dirty?
If the CPU cache is dirty it already means we have passed the invalidate
phase and own the cache. We will be doing the clflush at the completion
of the write phase.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
2017-06-16 12:13 ` Chris Wilson
@ 2017-06-16 12:59 ` Tvrtko Ursulin
2017-06-16 13:56 ` Chris Wilson
0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2017-06-16 12:59 UTC (permalink / raw)
To: Chris Wilson, intel-gfx; +Cc: Dongwon Kim
On 16/06/2017 13:13, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-06-16 13:05:21)
>>
>> On 16/06/2017 12:26, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2017-06-15 15:28:10)
>>>>
>>>> On 15/06/2017 13:38, Chris Wilson wrote:
>>>>> Currently, we only mark the CPU cache as dirty if we skip a clflush.
>>>>> This leads to some confusion where we have to ask if the object is in
>>>>> the write domain or missed a clflush. If we always mark the cache as
>>>>> dirty, this becomes a much simpler question to answer.
>>>>>
>>>>> The goal remains to do as few clflushes as required and to do them as
>>>>> late as possible, in the hope of deferring the work to a kthread and not
>>>>> block the caller (e.g. execbuf, flips).
>>>>>
>>>>> v2: Always call clflush before GPU execution when the cache_dirty flag
>>>>> is set. This may cause some extra work on llc systems that migrate dirty
>>>>> buffers back and forth - but we do try to limit that by only settin
>>>>
>>>> settin*g*
>>>>
>>>> Subject sounded like reviewer is wanted so I thought to give it a try.
>>>>
>>>>> cache_dirty at the end of the gpu sequence.
>>>>>
>>>>> v3: Always mark the cache as dirty upon a level change, as we need to
>>>>> invalidate any stale cachelines due to external writes.
>>>>>
>>>>> Reported-by: Dongwon Kim <dongwon.kim@intel.com>
>>>>> Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> Cc: Dongwon Kim <dongwon.kim@intel.com>
>>>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>>>> Tested-by: Dongwon Kim <dongwon.kim@intel.com>
>>>>> ---
>>>>> drivers/gpu/drm/i915/i915_gem.c | 76 ++++++++++++++----------
>>>>> drivers/gpu/drm/i915/i915_gem_clflush.c | 15 +++--
>>>>> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 21 +++----
>>>>> drivers/gpu/drm/i915/i915_gem_internal.c | 3 +-
>>>>> drivers/gpu/drm/i915/i915_gem_userptr.c | 5 +-
>>>>> drivers/gpu/drm/i915/selftests/huge_gem_object.c | 3 +-
>>>>> 6 files changed, 67 insertions(+), 56 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>>>> index 31cbe78171a9..b1504a829c6a 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>>>> @@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
>>>>>
>>>>> static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
>>>>> {
>>>>> - if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
>>>>> + if (obj->cache_dirty)
>>>>> return false;
>>>>>
>>>>> if (!i915_gem_object_is_coherent(obj))
>>>>> @@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
>>>>> return st;
>>>>> }
>>>>>
>>>>> +static void __start_cpu_write(struct drm_i915_gem_object *obj)
>>>>> +{
>>>>> + obj->base.read_domains = I915_GEM_DOMAIN_CPU;
>>>>> + obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>>>>> + if (cpu_write_needs_clflush(obj))
>>>>> + obj->cache_dirty = true;
>>>>
>>>> I find the logic here quite hard to understand because
>>>> cpu_write_needs_clflush is actually consulting obj->cache_dirty so it's
>>>> all a bit recursive.
>>>>
>>>> If we start a cpu write on an object which has been clflushed, but
>>>> otherwise needs clflushing like pin_display - what is the correct thing
>>>> to do? Not set obj->cache_dirty to true?
>>>
>>> No, we want cache_dirty to be true as we want a clflush after the write.
>>
>> Right, but I think it will be false.
>>
>> 0. pin_display=true
>> 1. clflush from somewhere: cache_dirty=false
>> 2. __start_write: cpu_write_needs_clflush returns false ->
>> cache_dirty=false after __start_write
>
> ? cpu_write_needs_clflush() returns true if either the object is not
> cache coherent or pin_display is set. It's that we return true for
> pin_display being set that is the big hammer.
My bad.. was imagining things. :(
>>> Really that obj->pin_display only exists there for the transition inside
>>> set-cache-level for i915_gem_object_pin_to_display_plane. Outside of
>>> set-cache-level obj->cache_coherent will be accurate. I can pencil in
>>> making yet another change to limit use of obj->pin_display again.
>>
>> Like modifying obj->cache_coherent on pin/unpin to display?
>
> Yes, we just have to consider when to mark the cache as dirty for the
> clflush and for that we take pin_display into consideration (mostly for
> paranoia).
>
>>>>> +}
>>>>> +
>>>>> static void
>>>>> __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>>>>> struct sg_table *pages,
>>>>> @@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>>>>> !i915_gem_object_is_coherent(obj))
>>>>> drm_clflush_sg(pages);
>>>>>
>>>>> - obj->base.read_domains = I915_GEM_DOMAIN_CPU;
>>>>> - obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>>>>> + __start_cpu_write(obj);
>>>>> }
>>>>>
>>>>> static void
>>>>> @@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
>>>>> args->size, &args->handle);
>>>>> }
>>>>>
>>>>> +static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
>>>>> +{
>>>>> + return !(obj->cache_level == I915_CACHE_NONE ||
>>>>> + obj->cache_level == I915_CACHE_WT);
>>>>
>>>> Hm, is this reversed? On LLC I would assume we don't need explicit
>>>> clflush, but perhaps I am misunderstanding something.
>>>
>>> It's accurate. We all shared the same misunderstanding for several
>>> years. The gpu writes via the ring (and the shared llc) so if we
>>> transition to accessing the memory directly from the cpu, we bypass the
>>> shared llc and our data is stale. So after using LLC, be that either
>>> with CPU writes or GPU writes, we need to clflush.
>>
>> Why does accessing the memory from the CPU bypasses the LLC? Or you mean
>> if we access in a way that bypasses the LLC and if so which ways are that?
>
> Any uncached access (WC from CPU or from scanout) bypasses the LLC.
Think I got it now, thanks.
>>>>> +}
>>>>> +
>>>>> /**
>>>>> * Creates a new mm object and returns a handle to it.
>>>>> * @dev: drm device pointer
>>>>> @@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
>>>>> case I915_GEM_DOMAIN_CPU:
>>>>> i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
>>>>> break;
>>>>> +
>>>>> + case I915_GEM_DOMAIN_RENDER:
>>>>> + if (gpu_write_needs_clflush(obj))
>>>>> + obj->cache_dirty = true;
>>>>> + break;
>>>>> }
>>>>>
>>>>> obj->base.write_domain = 0;
>>>>> @@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
>>>>> * optimizes for the case when the gpu will dirty the data
>>>>> * anyway again before the next pread happens.
>>>>> */
>>>>> - if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
>>>>> + if (!obj->cache_dirty &&
>>>>> + !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
>>>>> *needs_clflush = CLFLUSH_BEFORE;
>>>>
>>>> Obviously I don't understand something critical here since once more I
>>>> was expecting the reverse here - if the cache is not dirty no need to
>>>> flush it.
>>>
>>> This is an invalidate. We need to make sure that any stray cacheline the
>>> CPU pulled in are invalidated prior to accessing the physical address
>>> which was written to by the GPU out of sight of the cpu.
>>
>> Why we don't need to do that if CPU cache is dirty?
>
> If the CPU cache is dirty it already means we have passed the invalidate
> phase and own the cache. We will be doing the clflush at the completion
> of the write phase.
Okay, got this one as well. There is what looks to be a stale comment
talking about moving to gtt above this block btw.
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
2017-06-16 12:59 ` Tvrtko Ursulin
@ 2017-06-16 13:56 ` Chris Wilson
0 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2017-06-16 13:56 UTC (permalink / raw)
To: Tvrtko Ursulin, intel-gfx; +Cc: Dongwon Kim
Quoting Tvrtko Ursulin (2017-06-16 13:59:38)
> Okay, got this one as well. There is what looks to be a stale comment
> talking about moving to gtt above this block btw.
>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Thanks for taking the time to go through these. The code has been
very much developed empirically and it shows; many earlier hypotheses
about how the hw actually works have had to be thrown out when the bugs
bit. A fresh viewpoint is most welcome.
Pushed,
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 14+ messages in thread
* ✗ Fi.CI.BAT: warning for series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes (rev2)
2017-06-15 12:38 [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Chris Wilson
` (2 preceding siblings ...)
2017-06-15 14:28 ` [passive aggressive RESEND 1/2] " Tvrtko Ursulin
@ 2017-06-16 11:11 ` Patchwork
3 siblings, 0 replies; 14+ messages in thread
From: Patchwork @ 2017-06-16 11:11 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes (rev2)
URL : https://patchwork.freedesktop.org/series/25841/
State : warning
== Summary ==
Series 25841v2 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/25841/revisions/2/mbox/
Test gem_exec_suspend:
Subgroup basic-s4-devices:
dmesg-warn -> PASS (fi-kbl-7560u) fdo#100125
Test kms_busy:
Subgroup basic-flip-default-b:
fail -> DMESG-WARN (fi-skl-6700hq) fdo#101144
Test kms_cursor_legacy:
Subgroup basic-busy-flip-before-cursor-legacy:
fail -> PASS (fi-snb-2600) fdo#100215
Test kms_flip:
Subgroup basic-flip-vs-dpms:
pass -> DMESG-WARN (fi-bsw-n3050)
fdo#100125 https://bugs.freedesktop.org/show_bug.cgi?id=100125
fdo#101144 https://bugs.freedesktop.org/show_bug.cgi?id=101144
fdo#100215 https://bugs.freedesktop.org/show_bug.cgi?id=100215
fi-bdw-5557u total:278 pass:267 dwarn:0 dfail:0 fail:0 skip:11 time:464s
fi-bdw-gvtdvm total:278 pass:256 dwarn:8 dfail:0 fail:0 skip:14 time:491s
fi-bsw-n3050 total:278 pass:241 dwarn:1 dfail:0 fail:0 skip:36 time:596s
fi-bxt-j4205 total:278 pass:259 dwarn:0 dfail:0 fail:0 skip:19 time:549s
fi-byt-j1900 total:278 pass:254 dwarn:0 dfail:0 fail:0 skip:24 time:498s
fi-byt-n2820 total:278 pass:250 dwarn:0 dfail:0 fail:0 skip:28 time:494s
fi-glk-2a total:278 pass:259 dwarn:0 dfail:0 fail:0 skip:19 time:590s
fi-hsw-4770 total:278 pass:262 dwarn:0 dfail:0 fail:0 skip:16 time:429s
fi-hsw-4770r total:278 pass:261 dwarn:0 dfail:0 fail:0 skip:16 time:417s
fi-ilk-650 total:278 pass:227 dwarn:0 dfail:0 fail:0 skip:50 time:466s
fi-ivb-3520m total:278 pass:259 dwarn:0 dfail:0 fail:0 skip:18 time:494s
fi-ivb-3770 total:278 pass:259 dwarn:0 dfail:0 fail:0 skip:18 time:511s
fi-kbl-7500u total:278 pass:260 dwarn:0 dfail:0 fail:0 skip:18 time:475s
fi-kbl-7560u total:278 pass:268 dwarn:0 dfail:0 fail:0 skip:10 time:573s
fi-kbl-r total:278 pass:259 dwarn:1 dfail:0 fail:0 skip:18 time:583s
fi-skl-6260u total:278 pass:268 dwarn:0 dfail:0 fail:0 skip:10 time:487s
fi-skl-6700hq total:278 pass:228 dwarn:2 dfail:0 fail:26 skip:22 time:443s
fi-skl-6700k total:278 pass:256 dwarn:4 dfail:0 fail:0 skip:18 time:515s
fi-skl-6770hq total:278 pass:268 dwarn:0 dfail:0 fail:0 skip:10 time:506s
fi-skl-gvtdvm total:278 pass:265 dwarn:0 dfail:0 fail:0 skip:13 time:509s
fi-snb-2600 total:278 pass:247 dwarn:0 dfail:0 fail:1 skip:29 time:405s
fi-snb-2520m failed to connect after reboot
728e84dbddb4fc5f7a4e48dafcda215c527737ca drm-tip: 2017y-06m-16d-07h-10m-54s UTC integration manifest
5888c4d drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty
ce9e414 drm/i915: Mark CPU cache as dirty on every transition for CPU writes
== Logs ==
For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4967/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2017-06-16 13:56 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-06-15 12:38 [passive aggressive RESEND 1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Chris Wilson
2017-06-15 12:38 ` [passive aggressive RESEND 2/2] drm/i915: Store i915_gem_object_is_coherent() as a bit next to cache-dirty Chris Wilson
2017-06-15 14:41 ` Tvrtko Ursulin
2017-06-16 10:04 ` Chris Wilson
2017-06-16 10:54 ` [PATCH v2] " Chris Wilson
2017-06-16 11:10 ` Tvrtko Ursulin
2017-06-15 12:59 ` ✓ Fi.CI.BAT: success for series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Patchwork
2017-06-15 14:28 ` [passive aggressive RESEND 1/2] " Tvrtko Ursulin
2017-06-16 11:26 ` Chris Wilson
2017-06-16 12:05 ` Tvrtko Ursulin
2017-06-16 12:13 ` Chris Wilson
2017-06-16 12:59 ` Tvrtko Ursulin
2017-06-16 13:56 ` Chris Wilson
2017-06-16 11:11 ` ✗ Fi.CI.BAT: warning for series starting with [passive,aggressive,RESEND,1/2] drm/i915: Mark CPU cache as dirty on every transition for CPU writes (rev2) Patchwork
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox