All of lore.kernel.org
 help / color / mirror / Atom feed
* Updated fence patches
@ 2014-09-02 21:32 Jesse Barnes
  2014-09-02 21:32 ` [PATCH 1/2] drm/i915: Android sync points for i915 v2 Jesse Barnes
  2014-09-02 21:32 ` [PATCH 2/2] drm/i915: allow sync points within batches Jesse Barnes
  0 siblings, 2 replies; 13+ messages in thread
From: Jesse Barnes @ 2014-09-02 21:32 UTC (permalink / raw)
  To: intel-gfx

This set includes a sketch of how we might allow fences to be emitted
directly within a batch buffer.  This gets rid of the need for flushing
around fence operations, which can be a win, and lets userspace more
finely control things.

If it looks reasonable, we could drop the separate ioctl and just use
this instead...  Alternatively, I could still add the "return a fence from
execbuf" behavior to the main execbuf ioctl, to represent completion of
the whole execbuf.  And per the last comments, we still want a way to pass
an array of fences in with a given operation, so it can be properly
ordered by the scheduler.

Jesse

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/2] drm/i915: Android sync points for i915 v2
  2014-09-02 21:32 Updated fence patches Jesse Barnes
@ 2014-09-02 21:32 ` Jesse Barnes
  2014-09-03  7:09   ` Chris Wilson
  2014-09-02 21:32 ` [PATCH 2/2] drm/i915: allow sync points within batches Jesse Barnes
  1 sibling, 1 reply; 13+ messages in thread
From: Jesse Barnes @ 2014-09-02 21:32 UTC (permalink / raw)
  To: intel-gfx

Expose an ioctl to create Android fences based on the Android sync point
infrastructure (which in turn is based on DMA-buf fences).  Just a
sketch at this point, no testing has been done.

There are a couple of goals here:
  1) allow applications and libraries to create fences without an
     associated buffer
  2) re-use a common API so userspace doesn't have to bridge an impedance
     mismatch between different driver implementations
  3) allow applications and libraries to use explicit synchronization if
     they choose by exposing fences directly

v2: use struct fence directly using Maarten's new interface

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/gpu/drm/i915/Kconfig     |   2 +
 drivers/gpu/drm/i915/Makefile    |   1 +
 drivers/gpu/drm/i915/i915_dma.c  |   1 +
 drivers/gpu/drm/i915/i915_drv.h  |  10 ++
 drivers/gpu/drm/i915/i915_gem.c  |  15 +-
 drivers/gpu/drm/i915/i915_irq.c  |   4 +-
 drivers/gpu/drm/i915/i915_sync.c | 323 +++++++++++++++++++++++++++++++++++++++
 include/uapi/drm/i915_drm.h      |  23 +++
 8 files changed, 373 insertions(+), 6 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_sync.c

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 4e39ab3..cd0f2ec 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -6,6 +6,8 @@ config DRM_I915
 	select INTEL_GTT
 	select AGP_INTEL if AGP
 	select INTERVAL_TREE
+	select ANDROID
+	select SYNC
 	# we need shmfs for the swappable backing store, and in particular
 	# the shmem_readpage() which depends upon tmpfs
 	select SHMEM
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 91bd167..61a3eb5c 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -25,6 +25,7 @@ i915-y += i915_cmd_parser.o \
 	  i915_gem_execbuffer.o \
 	  i915_gem_gtt.o \
 	  i915_gem.o \
+	  i915_sync.o \
 	  i915_gem_stolen.o \
 	  i915_gem_tiling.o \
 	  i915_gem_userptr.o \
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 2e7f03a..84086e1 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -2043,6 +2043,7 @@ const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_REG_READ, i915_reg_read_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_GET_RESET_STATS, i915_get_reset_stats_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_GEM_USERPTR, i915_gem_userptr_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_FENCE, i915_sync_create_fence_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
 };
 
 int i915_max_ioctl = ARRAY_SIZE(i915_ioctls);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d604f4f..6eb119e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1388,6 +1388,8 @@ struct i915_frontbuffer_tracking {
 	unsigned flip_bits;
 };
 
+struct i915_sync_timeline;
+
 struct drm_i915_private {
 	struct drm_device *dev;
 	struct kmem_cache *slab;
@@ -1422,6 +1424,8 @@ struct drm_i915_private {
 	struct drm_i915_gem_object *semaphore_obj;
 	uint32_t last_seqno, next_seqno;
 
+	struct i915_sync_timeline *sync_tl[I915_NUM_RINGS];
+
 	drm_dma_handle_t *status_page_dmah;
 	struct resource mch_res;
 
@@ -2275,6 +2279,12 @@ void i915_init_vm(struct drm_i915_private *dev_priv,
 void i915_gem_free_object(struct drm_gem_object *obj);
 void i915_gem_vma_destroy(struct i915_vma *vma);
 
+/* i915_sync.c */
+int i915_sync_init(struct drm_i915_private *dev_priv);
+void i915_sync_fini(struct drm_i915_private *dev_priv);
+int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
+				 struct drm_file *file);
+
 #define PIN_MAPPABLE 0x1
 #define PIN_NONBLOCK 0x2
 #define PIN_GLOBAL 0x4
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index dcd8d7b..ace716e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1146,11 +1146,11 @@ static bool can_wait_boost(struct drm_i915_file_private *file_priv)
  * Returns 0 if the seqno was found within the alloted time. Else returns the
  * errno with remaining time filled in timeout argument.
  */
-static int __wait_seqno(struct intel_engine_cs *ring, u32 seqno,
-			unsigned reset_counter,
-			bool interruptible,
-			struct timespec *timeout,
-			struct drm_i915_file_private *file_priv)
+int __wait_seqno(struct intel_engine_cs *ring, u32 seqno,
+		 unsigned reset_counter,
+		 bool interruptible,
+		 struct timespec *timeout,
+		 struct drm_i915_file_private *file_priv)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -4775,6 +4775,9 @@ int i915_gem_init(struct drm_device *dev)
 		atomic_set_mask(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
 		ret = 0;
 	}
+
+	i915_sync_init(dev_priv);
+
 	mutex_unlock(&dev->struct_mutex);
 
 	/* Allow hardware batchbuffers unless told otherwise, but not for KMS. */
@@ -4970,6 +4973,8 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
 		request->file_priv = NULL;
 	}
 	spin_unlock(&file_priv->mm.lock);
+
+	i915_sync_fini(dev->dev_private);
 }
 
 static void
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 98abc22..149e083 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -33,6 +33,7 @@
 #include <linux/circ_buf.h>
 #include <drm/drmP.h>
 #include <drm/i915_drm.h>
+#include "../../../staging/android/sync.h"
 #include "i915_drv.h"
 #include "i915_trace.h"
 #include "intel_drv.h"
@@ -2617,8 +2618,9 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
 	 */
 
 	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
-	for_each_ring(ring, dev_priv, i)
+	for_each_ring(ring, dev_priv, i) {
 		wake_up_all(&ring->irq_queue);
+	}
 
 	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
 	wake_up_all(&dev_priv->pending_flip_queue);
diff --git a/drivers/gpu/drm/i915/i915_sync.c b/drivers/gpu/drm/i915/i915_sync.c
new file mode 100644
index 0000000..4938616
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_sync.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jesse Barnes <jbarnes@virtuousgeek.org>
+ *
+ */
+
+#include <drm/drmP.h>
+#include <drm/drm_vma_manager.h>
+#include <drm/i915_drm.h>
+#include "i915_drv.h"
+#include "i915_trace.h"
+#include "intel_drv.h"
+#include <linux/oom.h>
+#include <linux/shmem_fs.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/pci.h>
+#include <linux/dma-buf.h>
+#include "../../../staging/android/sync.h"
+
+/* Nothing really to protect here... */
+spinlock_t fence_lock;
+
+/*
+ * i915 fences on sync timelines
+ *
+ * We implement sync points in terms of i915 seqnos.  They're exposed
+ * through the new DRM_I915_GEM_FENCE ioctl, and can be mixed and matched
+ * with other Android timelines and aggregated into sync_fences, etc.
+ *
+ * TODO:
+ *   rebase on top of Chris's seqno/request stuff and use requests
+ *   allow non-RCS fences (need ring/context association)
+ */
+
+struct i915_fence {
+	struct fence base;
+	struct intel_engine_cs *ring;
+	struct intel_context *ctx;
+	wait_queue_t wait;
+	u32 seqno;
+};
+
+#define to_intel_fence(x) container_of(x, struct i915_fence, base)
+
+int __wait_seqno(struct intel_engine_cs *ring, u32 seqno,
+		 unsigned reset_counter,
+		 bool interruptible,
+		 struct timespec *timeout,
+		 struct drm_i915_file_private *file_priv);
+
+static const char *i915_fence_get_driver_name(struct fence *fence)
+{
+	return "i915";
+}
+
+static const char *i915_fence_get_timeline_name(struct fence *fence)
+{
+	struct i915_fence *intel_fence = to_intel_fence(fence);
+
+	return intel_fence->ring->name;
+}
+
+static int i915_fence_check(wait_queue_t *wait, unsigned mode, int flags,
+			    void *key)
+{
+	struct i915_fence *intel_fence = wait->private;
+	struct intel_engine_cs *ring = intel_fence->ring;
+
+	if (!i915_seqno_passed(ring->get_seqno(ring, false),
+			       intel_fence->seqno))
+		return 0;
+
+	fence_signal_locked(&intel_fence->base);
+
+	__remove_wait_queue(&ring->irq_queue, wait);
+	fence_put(&intel_fence->base);
+	ring->irq_put(ring);
+
+	return 0;
+}
+
+static bool i915_fence_enable_signaling(struct fence *fence)
+{
+	struct i915_fence *intel_fence = to_intel_fence(fence);
+	struct intel_engine_cs *ring = intel_fence->ring;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	wait_queue_t *wait = &intel_fence->wait;
+
+	/* queue fence wait queue on irq queue and get fence */
+	if (i915_seqno_passed(ring->get_seqno(ring, false),
+			      intel_fence->seqno) ||
+	    i915_terminally_wedged(&dev_priv->gpu_error))
+		return false;
+
+	if (!ring->irq_get(ring))
+		return false;
+
+	wait->flags = 0;
+	wait->private = intel_fence;
+	wait->func = i915_fence_check;
+
+	__add_wait_queue(&ring->irq_queue, wait);
+	fence_get(fence);
+
+	return true;
+}
+
+static bool i915_fence_signaled(struct fence *fence)
+{
+	struct i915_fence *intel_fence = to_intel_fence(fence);
+	struct intel_engine_cs *ring = intel_fence->ring;
+
+	return i915_seqno_passed(ring->get_seqno(ring, false),
+				 intel_fence->seqno);
+}
+
+static signed long i915_fence_wait(struct fence *fence, bool intr,
+				   signed long timeout)
+{
+	struct i915_fence *intel_fence = to_intel_fence(fence);
+	struct drm_i915_private *dev_priv = intel_fence->ring->dev->dev_private;
+	struct timespec ts;
+	int ret;
+
+	jiffies_to_timespec(timeout, &ts);
+
+	ret = __wait_seqno(intel_fence->ring, intel_fence->seqno,
+			   atomic_read(&dev_priv->gpu_error.reset_counter),
+			   intr, &ts, NULL);
+	if (ret == -ETIME)
+		return 0;	/* fence_ops.wait reports a timeout as 0 */
+	/* Signaled: return the jiffies left in ts, but never 0 (= timeout). */
+	return ret ? ret : max_t(signed long, 1, timespec_to_jiffies(&ts));
+}
+
+static int i915_fence_fill_driver_data(struct fence *fence, void *data,
+				      int size)
+{
+	struct i915_fence *intel_fence = to_intel_fence(fence);
+
+	if (size < sizeof(intel_fence->seqno))
+		return -ENOMEM;
+
+	memcpy(data, &intel_fence->seqno, sizeof(intel_fence->seqno));
+
+	return sizeof(intel_fence->seqno);
+}
+
+static void i915_fence_value_str(struct fence *fence, char *str, int size)
+{
+	struct i915_fence *intel_fence = to_intel_fence(fence);
+
+	snprintf(str, size, "%u", intel_fence->seqno);
+}
+
+static void i915_fence_timeline_value_str(struct fence *fence, char *str,
+					  int size)
+{
+	struct i915_fence *intel_fence = to_intel_fence(fence);
+	struct intel_engine_cs *ring = intel_fence->ring;
+
+	snprintf(str, size, "%u", ring->get_seqno(ring, false));
+}
+
+static struct fence_ops i915_fence_ops = {
+	.get_driver_name = 	i915_fence_get_driver_name,
+	.get_timeline_name =	i915_fence_get_timeline_name,
+	.enable_signaling =	i915_fence_enable_signaling,
+	.signaled =		i915_fence_signaled,
+	.wait =			i915_fence_wait,
+	.fill_driver_data =	i915_fence_fill_driver_data,
+	.fence_value_str =	i915_fence_value_str,
+	.timeline_value_str =	i915_fence_timeline_value_str,
+};
+
+static struct fence *i915_fence_create(struct intel_engine_cs *ring,
+				       struct intel_context *ctx)
+{
+	struct i915_fence *fence;
+	int ret;
+
+	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+	if (!fence)
+		return NULL;
+
+	ret = ring->add_request(ring);
+	if (ret) {
+		DRM_ERROR("add_request failed\n");
+		fence_free((struct fence *)fence);
+		return NULL;
+	}
+
+	fence->ring = ring;
+	fence->ctx = ctx;
+	fence->seqno = ring->outstanding_lazy_seqno;
+	fence_init(&fence->base, &i915_fence_ops, &fence_lock, ctx->user_handle,
+		   fence->seqno);
+
+	return &fence->base;
+}
+
+/**
+ * i915_sync_create_fence_ioctl - fence creation function
+ * @dev: drm device
+ * @data: ioctl data
+ * @file: file struct
+ *
+ * This function creates a fence given a context and ring, and returns
+ * it to the caller in the form of a file descriptor.
+ *
+ * The returned descriptor is a sync fence fd, and can be used with all
+ * the usual sync fence operations (poll, ioctl, etc).
+ *
+ * The process fd limit should prevent an overallocation of fence objects,
+ * which need to be destroyed manually with a close() call.
+ */
+int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
+				 struct drm_file *file)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_gem_fence *fdata = data;
+	struct fence *fence;
+	struct sync_fence *sfence;
+	struct intel_engine_cs *ring;
+	struct intel_context *ctx;
+	u32 ctx_id = fdata->ctx_id;
+	int fd, ret;
+
+	if (file == NULL) {
+		DRM_ERROR("no file priv?\n");
+		return -EINVAL;
+	}
+
+	/* Reserve the fd late so the early return above cannot leak it. */
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	ret = i915_mutex_lock_interruptible(dev);
+	if (ret) {
+		DRM_ERROR("mutex interrupted\n");
+		goto err_put_fd;
+	}
+
+	ctx = i915_gem_context_get(file->driver_priv, ctx_id);
+	if (ctx == NULL) {
+		DRM_ERROR("context lookup failed\n");
+		ret = -ENOENT;
+		goto err;
+	}
+
+	ring = &dev_priv->ring[RCS];
+	if (!intel_ring_initialized(ring)) {
+		DRM_ERROR("ring not ready\n");
+		ret = -EIO;
+		goto err;
+	}
+
+	ret = -ENOMEM;
+	fence = i915_fence_create(ring, ctx);
+	if (!fence)
+		goto err;
+
+	fdata->name[sizeof(fdata->name) - 1] = '\0';
+	sfence = sync_fence_create_dma(fdata->name, fence);
+	if (!sfence) {
+		fence_put(fence);	/* release the fence we just created */
+		goto err;
+	}
+
+	fdata->fd = fd;
+	sync_fence_install(sfence, fd);
+	mutex_unlock(&dev->struct_mutex);
+	return 0;
+
+err:
+	mutex_unlock(&dev->struct_mutex);
+err_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+int i915_sync_init(struct drm_i915_private *dev_priv)
+{
+	struct intel_engine_cs *ring;
+	int i, ret = 0;
+
+	for_each_ring(ring, dev_priv, i) {
+		/* FIXME: non-RCS fences */
+	}
+
+	return ret;
+}
+
+void i915_sync_fini(struct drm_i915_private *dev_priv)
+{
+	int i;
+
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+	}
+}
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index ff57f07..65bd271 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -224,6 +224,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_REG_READ		0x31
 #define DRM_I915_GET_RESET_STATS	0x32
 #define DRM_I915_GEM_USERPTR		0x33
+#define DRM_I915_GEM_FENCE		0x34
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -275,6 +276,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_REG_READ			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read)
 #define DRM_IOCTL_I915_GET_RESET_STATS		DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats)
 #define DRM_IOCTL_I915_GEM_USERPTR			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_USERPTR, struct drm_i915_gem_userptr)
+#define DRM_IOCTL_I915_GEM_FENCE			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_FENCE, struct drm_i915_gem_fence)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -1066,4 +1068,25 @@ struct drm_i915_gem_userptr {
 	__u32 handle;
 };
 
+/**
+ * drm_i915_gem_fence - create a fence
+ * @fd: fd for fence
+ * @ctx_id: context ID for fence
+ * @flags: flags for operation
+ *
+ * Creates a fence in @fd and returns it to the caller.  This fd can be
+ * passed around between processes as any other fd, and can be poll'd
+ * and read for status.
+ *
+ * RETURNS:
+ * A valid fd in the @fd field or an errno on error.
+ */
+struct drm_i915_gem_fence {
+	__s32 fd;
+	__u32 ctx_id;
+	__u32 flags;
+	__u32 pad;
+	char name[32];
+};
+
 #endif /* _UAPI_I915_DRM_H_ */
-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/2] drm/i915: allow sync points within batches
  2014-09-02 21:32 Updated fence patches Jesse Barnes
  2014-09-02 21:32 ` [PATCH 1/2] drm/i915: Android sync points for i915 v2 Jesse Barnes
@ 2014-09-02 21:32 ` Jesse Barnes
  2014-09-03  7:01   ` Chris Wilson
  1 sibling, 1 reply; 13+ messages in thread
From: Jesse Barnes @ 2014-09-02 21:32 UTC (permalink / raw)
  To: intel-gfx

Use a new reloc type to allow userspace to insert sync points within
batches before they're submitted.  The corresponding fence fds are
returned in the presumed_offset field of the matching relocation
entries, and can be operated on with the sync fence APIs.

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/gpu/drm/i915/i915_drv.h            |   4 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
 include/uapi/drm/i915_drm.h                |  11 ++-
 4 files changed, 167 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6eb119e..410eedf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2284,6 +2284,10 @@ int i915_sync_init(struct drm_i915_private *dev_priv);
 void i915_sync_fini(struct drm_i915_private *dev_priv);
 int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
 				 struct drm_file *file);
+int i915_sync_fence_create(struct intel_engine_cs *ring,
+			   struct intel_context *ctx,
+			   u32 seqno);
+
 
 #define PIN_MAPPABLE 0x1
 #define PIN_NONBLOCK 0x2
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 60998fc..32ec599 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
+#include "../../../staging/android/sync.h"
 
 #define  __EXEC_OBJECT_HAS_PIN (1<<31)
 #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -262,6 +263,67 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
 		!obj->map_and_fenceable ||
 		obj->cache_level != I915_CACHE_NONE);
 }
+static int
+emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
+		  struct drm_i915_gem_relocation_entry *reloc)
+{
+	uint32_t page_offset = offset_in_page(reloc->offset);
+	char *vaddr;
+	int ret;
+
+	ret = i915_gem_object_set_to_cpu_domain(obj, true);
+	if (ret)
+		return ret;
+
+	vaddr = kmap_atomic(i915_gem_object_get_page(obj,
+				reloc->offset >> PAGE_SHIFT));
+	*(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
+	*(uint32_t *)(vaddr + page_offset + 4) =
+		I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+	*(uint32_t *)(vaddr + page_offset + 8) =
+		obj->ring->outstanding_lazy_seqno;
+	*(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
+
+	kunmap_atomic(vaddr);
+
+	return 0;
+}
+
+/* GTT twin of emit_sync_obj_cpu(): same four dwords, via a WC io-mapping. */
+static int
+emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
+		  struct drm_i915_gem_relocation_entry *reloc)
+{
+	struct drm_device *dev = obj->base.dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t __iomem *reloc_entry;
+	void __iomem *reloc_page;
+	int ret;
+
+	ret = i915_gem_object_set_to_gtt_domain(obj, true);
+	if (ret)
+		return ret;
+
+	ret = i915_gem_object_put_fence(obj);
+	if (ret)
+		return ret;
+
+	/* Map the page containing the sync point we're going to emit. */
+	reloc->offset += i915_gem_obj_ggtt_offset(obj);
+	reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
+			reloc->offset & PAGE_MASK);
+	reloc_entry = (uint32_t __iomem *)
+		(reloc_page + offset_in_page(reloc->offset));
+	/* Each command dword gets its own slot; don't overwrite the first. */
+	iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
+	iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
+		  reloc_entry + 1);
+	iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry + 2);
+	iowrite32(MI_USER_INTERRUPT, reloc_entry + 3);
+	io_mapping_unmap_atomic(reloc_page);
+
+	return 0;
+}
 
 static int
 relocate_entry_cpu(struct drm_i915_gem_object *obj,
@@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 				   struct eb_vmas *eb,
-				   struct drm_i915_gem_relocation_entry *reloc)
+				   struct drm_i915_gem_relocation_entry *reloc,
+				   struct intel_context *ctx)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_gem_object *target_obj;
@@ -433,23 +496,39 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	if (obj->active && in_atomic())
 		return -EFAULT;
 
-	if (use_cpu_reloc(obj))
-		ret = relocate_entry_cpu(obj, reloc, target_offset);
-	else
-		ret = relocate_entry_gtt(obj, reloc, target_offset);
+	if (reloc->write_domain & I915_GEM_DOMAIN_SYNC_OBJ) {
+		int fd;
+
+		/* get a new seqno */
+		intel_ring_begin(obj->ring, 0);
+
+		if (use_cpu_reloc(obj))
+			ret = emit_sync_obj_cpu(obj, reloc);
+		else
+			ret = emit_sync_obj_gtt(obj, reloc);
+
+		fd = i915_sync_fence_create(obj->ring, ctx,
+					    obj->ring->outstanding_lazy_seqno);
+		reloc->presumed_offset = fd;
+	} else {
+		if (use_cpu_reloc(obj))
+			ret = relocate_entry_cpu(obj, reloc, target_offset);
+		else
+			ret = relocate_entry_gtt(obj, reloc, target_offset);
+		/* and update the user's relocation entry */
+		reloc->presumed_offset = target_offset;
+	}
 
 	if (ret)
 		return ret;
 
-	/* and update the user's relocation entry */
-	reloc->presumed_offset = target_offset;
-
 	return 0;
 }
 
 static int
 i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
-				 struct eb_vmas *eb)
+				 struct eb_vmas *eb,
+				 struct intel_context *ctx)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
 	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
@@ -473,7 +552,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 		do {
 			u64 offset = r->presumed_offset;
 
-			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r);
+			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, ctx);
 			if (ret)
 				return ret;
 
@@ -496,13 +575,14 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 static int
 i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
 				      struct eb_vmas *eb,
-				      struct drm_i915_gem_relocation_entry *relocs)
+				      struct drm_i915_gem_relocation_entry *relocs,
+				      struct intel_context *ctx)
 {
 	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
 	int i, ret;
 
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i]);
+		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], ctx);
 		if (ret)
 			return ret;
 	}
@@ -511,7 +591,7 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
 }
 
 static int
-i915_gem_execbuffer_relocate(struct eb_vmas *eb)
+i915_gem_execbuffer_relocate(struct eb_vmas *eb, struct intel_context *ctx)
 {
 	struct i915_vma *vma;
 	int ret = 0;
@@ -525,7 +605,7 @@ i915_gem_execbuffer_relocate(struct eb_vmas *eb)
 	 */
 	pagefault_disable();
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		ret = i915_gem_execbuffer_relocate_vma(vma, eb);
+		ret = i915_gem_execbuffer_relocate_vma(vma, eb, ctx);
 		if (ret)
 			break;
 	}
@@ -664,6 +744,13 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *ring,
 			obj->tiling_mode != I915_TILING_NONE;
 		need_mappable = need_fence || need_reloc_mappable(vma);
 
+		/*
+		 * If we're emitting a sync obj, we always need a reloc
+		 * pass to write the seqno.
+		 */
+		if (entry->flags & EXEC_OBJECT_SYNC_OBJ)
+			*need_relocs = true;
+
 		if (need_mappable)
 			list_move(&vma->exec_list, &ordered_vmas);
 		else
@@ -734,7 +821,8 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 				  struct drm_file *file,
 				  struct intel_engine_cs *ring,
 				  struct eb_vmas *eb,
-				  struct drm_i915_gem_exec_object2 *exec)
+				  struct drm_i915_gem_exec_object2 *exec,
+				  struct intel_context *ctx)
 {
 	struct drm_i915_gem_relocation_entry *reloc;
 	struct i915_address_space *vm;
@@ -830,7 +918,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
 		int offset = vma->exec_entry - exec;
 		ret = i915_gem_execbuffer_relocate_vma_slow(vma, eb,
-							    reloc + reloc_offset[offset]);
+							    reloc + reloc_offset[offset], ctx);
 		if (ret)
 			goto err;
 	}
@@ -1340,17 +1428,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 
 	/* Move the objects en-masse into the GTT, evicting if necessary. */
 	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+
 	ret = i915_gem_execbuffer_reserve(ring, &eb->vmas, &need_relocs);
 	if (ret)
 		goto err;
 
 	/* The objects are in their final locations, apply the relocations. */
 	if (need_relocs)
-		ret = i915_gem_execbuffer_relocate(eb);
+		ret = i915_gem_execbuffer_relocate(eb, ctx);
 	if (ret) {
 		if (ret == -EFAULT) {
 			ret = i915_gem_execbuffer_relocate_slow(dev, args, file, ring,
-								eb, exec);
+								eb, exec, ctx);
 			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 		}
 		if (ret)
diff --git a/drivers/gpu/drm/i915/i915_sync.c b/drivers/gpu/drm/i915/i915_sync.c
index 4938616..bd54fca 100644
--- a/drivers/gpu/drm/i915/i915_sync.c
+++ b/drivers/gpu/drm/i915/i915_sync.c
@@ -195,32 +195,72 @@ static struct fence_ops i915_fence_ops = {
 	.timeline_value_str =	i915_fence_timeline_value_str,
 };
 
-static struct fence *i915_fence_create(struct intel_engine_cs *ring,
-				       struct intel_context *ctx)
+/* Build an i915 fence for @seqno on @ring; NULL on allocation failure. */
+static struct i915_fence *__i915_fence_create(struct intel_engine_cs *ring,
+					      struct intel_context *ctx,
+					      u32 seqno)
 {
 	struct i915_fence *fence;
-	int ret;
 
 	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
 	if (!fence)
 		return NULL;
 
+	fence->ring = ring;
+	fence->ctx = ctx;
+	fence->seqno = seqno; /* caller's value, not the ring's lazy seqno */
+	fence_init(&fence->base, &i915_fence_ops, &fence_lock, ctx->user_handle,
+		   fence->seqno);
+	return fence;
+}
+
+static struct fence *i915_fence_create(struct intel_engine_cs *ring,
+				       struct intel_context *ctx)
+{
+	struct i915_fence *fence;
+	int ret;
+
 	ret = ring->add_request(ring);
 	if (ret) {
 		DRM_ERROR("add_request failed\n");
-		fence_free((struct fence *)fence);
 		return NULL;
 	}
 
-	fence->ring = ring;
-	fence->ctx = ctx;
-	fence->seqno = ring->outstanding_lazy_seqno;
-	fence_init(&fence->base, &i915_fence_ops, &fence_lock, ctx->user_handle,
-		   fence->seqno);
+	fence = __i915_fence_create(ring, ctx, ring->outstanding_lazy_seqno);
 
 	return &fence->base;
 }
 
+/* Create an i915 fence for @seqno and return a sync fence fd, or -errno. */
+int i915_sync_fence_create(struct intel_engine_cs *ring,
+			   struct intel_context *ctx,
+			   u32 seqno)
+{
+	struct i915_fence *fence;
+	struct sync_fence *sfence;
+	char name[64];
+	int fd = get_unused_fd_flags(O_CLOEXEC);
+
+	if (fd < 0)
+		return fd;
+
+	fence = __i915_fence_create(ring, ctx, seqno);
+	if (!fence)
+		goto err_put_fd;
+
+	snprintf(name, sizeof(name), "0x%08x:0x%08x", ctx->user_handle, seqno);
+	sfence = sync_fence_create_dma(name, &fence->base);
+	if (!sfence) {
+		fence_free((struct fence *)fence);
+		goto err_put_fd;
+	}
+	sync_fence_install(sfence, fd);
+	return fd;
+err_put_fd:
+	put_unused_fd(fd);	/* don't leak the reserved descriptor slot */
+	return -ENOMEM;
+}
+
 /**
  * i915_sync_create_fence_ioctl - fence creation function
  * @dev: drm device
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 65bd271..edadab2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -585,6 +585,8 @@ struct drm_i915_gem_relocation_entry {
 #define I915_GEM_DOMAIN_VERTEX		0x00000020
 /** GTT domain - aperture and scanout */
 #define I915_GEM_DOMAIN_GTT		0x00000040
+/** Sync object - special for inline fences */
+#define I915_GEM_DOMAIN_SYNC_OBJ	0x00000080
 /** @} */
 
 struct drm_i915_gem_exec_object {
@@ -661,10 +663,11 @@ struct drm_i915_gem_exec_object2 {
 	 */
 	__u64 offset;
 
-#define EXEC_OBJECT_NEEDS_FENCE (1<<0)
-#define EXEC_OBJECT_NEEDS_GTT	(1<<1)
-#define EXEC_OBJECT_WRITE	(1<<2)
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
+#define EXEC_OBJECT_NEEDS_FENCE (1<<0) /* requires fence register */
+#define EXEC_OBJECT_NEEDS_GTT	(1<<1) /* needs global GTT mapping */
+#define EXEC_OBJECT_WRITE	(1<<2) /* object will be written */
+#define EXEC_OBJECT_SYNC_OBJ	(1<<3) /* emit a sync obj instead */
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SYNC_OBJ<<1)
 	__u64 flags;
 
 	__u64 rsvd1;
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] drm/i915: allow sync points within batches
  2014-09-02 21:32 ` [PATCH 2/2] drm/i915: allow sync points within batches Jesse Barnes
@ 2014-09-03  7:01   ` Chris Wilson
  2014-09-03 15:41     ` Jesse Barnes
  0 siblings, 1 reply; 13+ messages in thread
From: Chris Wilson @ 2014-09-03  7:01 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: intel-gfx

On Tue, Sep 02, 2014 at 02:32:41PM -0700, Jesse Barnes wrote:
> Use a new reloc type to allow userspace to insert sync points within
> batches before they're submitted.  The corresponding fence fds are
> returned in the offset field of the returned reloc tree, and can be
> operated on with the sync fence APIs.
> 
> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
> ---
>  drivers/gpu/drm/i915/i915_drv.h            |   4 +
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
>  include/uapi/drm/i915_drm.h                |  11 ++-
>  4 files changed, 167 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 6eb119e..410eedf 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2284,6 +2284,10 @@ int i915_sync_init(struct drm_i915_private *dev_priv);
>  void i915_sync_fini(struct drm_i915_private *dev_priv);
>  int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
>  				 struct drm_file *file);
> +int i915_sync_fence_create(struct intel_engine_cs *ring,
> +			   struct intel_context *ctx,
> +			   u32 seqno);
> +
>  
>  #define PIN_MAPPABLE 0x1
>  #define PIN_NONBLOCK 0x2
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 60998fc..32ec599 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -32,6 +32,7 @@
>  #include "i915_trace.h"
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
> +#include "../../../staging/android/sync.h"
>  
>  #define  __EXEC_OBJECT_HAS_PIN (1<<31)
>  #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> @@ -262,6 +263,67 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  		!obj->map_and_fenceable ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
> +static int
> +emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
> +		  struct drm_i915_gem_relocation_entry *reloc)
> +{
> +	uint32_t page_offset = offset_in_page(reloc->offset);
> +	char *vaddr;
> +	int ret;
> +
> +	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> +	if (ret)
> +		return ret;
> +
> +	vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> +				reloc->offset >> PAGE_SHIFT));
> +	*(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
> +	*(uint32_t *)(vaddr + page_offset + 4) =
> +		I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
> +	*(uint32_t *)(vaddr + page_offset + 8) =
> +		obj->ring->outstanding_lazy_seqno;
> +	*(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
> +
> +	kunmap_atomic(vaddr);
> +
> +	return 0;
> +}
> +
> +static int
> +emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
> +		  struct drm_i915_gem_relocation_entry *reloc)
> +{
> +	struct drm_device *dev = obj->base.dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	uint32_t __iomem *reloc_entry;
> +	void __iomem *reloc_page;
> +	int ret;
> +
> +	ret = i915_gem_object_set_to_gtt_domain(obj, true);
> +	if (ret)
> +		return ret;
> +
> +	ret = i915_gem_object_put_fence(obj);
> +	if (ret)
> +		return ret;
> +
> +	/* Map the page containing the relocation we're going to perform.  */
> +	reloc->offset += i915_gem_obj_ggtt_offset(obj);
> +	reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
> +			reloc->offset & PAGE_MASK);
> +
> +	reloc_entry = (uint32_t __iomem *)
> +		(reloc_page + offset_in_page(reloc->offset));
> +	iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
> +	iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
> +		  reloc_entry);
> +	iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry);
> +	iowrite32(MI_USER_INTERRUPT, reloc_entry);
> +
> +	io_mapping_unmap_atomic(reloc_page);

These commands are illegal/invalid inside the object, only valid inside
the ring.

> +	return 0;
> +}
>  
>  static int
>  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
>  static int
>  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
>  				   struct eb_vmas *eb,
> -				   struct drm_i915_gem_relocation_entry *reloc)
> +				   struct drm_i915_gem_relocation_entry *reloc,
> +				   struct intel_context *ctx)

Hmm. That's a nuisance. But no, you only use it to automatically create
a fence not to patch the batch, so you can just use an object-flag.

This fits neatly into requests.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/2] drm/i915: Android sync points for i915 v2
  2014-09-02 21:32 ` [PATCH 1/2] drm/i915: Android sync points for i915 v2 Jesse Barnes
@ 2014-09-03  7:09   ` Chris Wilson
  2014-09-03 16:41     ` Ville Syrjälä
  0 siblings, 1 reply; 13+ messages in thread
From: Chris Wilson @ 2014-09-03  7:09 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: intel-gfx

On Tue, Sep 02, 2014 at 02:32:40PM -0700, Jesse Barnes wrote:
> +static int i915_fence_check(wait_queue_t *wait, unsigned mode, int flags,
> +			    void *key)
> +{
> +	struct i915_fence *intel_fence = wait->private;
> +	struct intel_engine_cs *ring = intel_fence->ring;
> +
> +	if (!i915_seqno_passed(ring->get_seqno(ring, false),
> +			       intel_fence->seqno))
> +		return 0;
> +
> +	fence_signal_locked(&intel_fence->base);
> +
> +	__remove_wait_queue(&ring->irq_queue, wait);
> +	fence_put(&intel_fence->base);
> +	ring->irq_put(ring);
> +
> +	return 0;
> +}
> +
> +static bool i915_fence_enable_signaling(struct fence *fence)
> +{
> +	struct i915_fence *intel_fence = to_intel_fence(fence);
> +	struct intel_engine_cs *ring = intel_fence->ring;
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	wait_queue_t *wait = &intel_fence->wait;
> +
> +	/* queue fence wait queue on irq queue and get fence */
> +	if (i915_seqno_passed(ring->get_seqno(ring, false),
> +			      intel_fence->seqno) ||
> +	    i915_terminally_wedged(&dev_priv->gpu_error))
> +		return false;
> +
> +	if (!ring->irq_get(ring))
> +		return false;
> +
> +	wait->flags = 0;
> +	wait->private = intel_fence;
> +	wait->func = i915_fence_check;
> +
> +	__add_wait_queue(&ring->irq_queue, wait);
> +	fence_get(fence);
> +
> +	return true;
> +}

This looks like it implements poll(). 

You should recheck i915_request_complete() after setting up the irq
waiter. Or does struct fence_ops handle that?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] drm/i915: allow sync points within batches
  2014-09-03  7:01   ` Chris Wilson
@ 2014-09-03 15:41     ` Jesse Barnes
  2014-09-03 16:08       ` Chris Wilson
  0 siblings, 1 reply; 13+ messages in thread
From: Jesse Barnes @ 2014-09-03 15:41 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, 3 Sep 2014 08:01:55 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> On Tue, Sep 02, 2014 at 02:32:41PM -0700, Jesse Barnes wrote:
> > Use a new reloc type to allow userspace to insert sync points within
> > batches before they're submitted.  The corresponding fence fds are
> > returned in the offset field of the returned reloc tree, and can be
> > operated on with the sync fence APIs.
> > 
> > Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h            |   4 +
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
> >  drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
> >  include/uapi/drm/i915_drm.h                |  11 ++-
> >  4 files changed, 167 insertions(+), 31 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 6eb119e..410eedf 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2284,6 +2284,10 @@ int i915_sync_init(struct drm_i915_private *dev_priv);
> >  void i915_sync_fini(struct drm_i915_private *dev_priv);
> >  int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
> >  				 struct drm_file *file);
> > +int i915_sync_fence_create(struct intel_engine_cs *ring,
> > +			   struct intel_context *ctx,
> > +			   u32 seqno);
> > +
> >  
> >  #define PIN_MAPPABLE 0x1
> >  #define PIN_NONBLOCK 0x2
> > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > index 60998fc..32ec599 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > @@ -32,6 +32,7 @@
> >  #include "i915_trace.h"
> >  #include "intel_drv.h"
> >  #include <linux/dma_remapping.h>
> > +#include "../../../staging/android/sync.h"
> >  
> >  #define  __EXEC_OBJECT_HAS_PIN (1<<31)
> >  #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> > @@ -262,6 +263,67 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
> >  		!obj->map_and_fenceable ||
> >  		obj->cache_level != I915_CACHE_NONE);
> >  }
> > +static int
> > +emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
> > +		  struct drm_i915_gem_relocation_entry *reloc)
> > +{
> > +	uint32_t page_offset = offset_in_page(reloc->offset);
> > +	char *vaddr;
> > +	int ret;
> > +
> > +	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> > +	if (ret)
> > +		return ret;
> > +
> > +	vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> > +				reloc->offset >> PAGE_SHIFT));
> > +	*(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
> > +	*(uint32_t *)(vaddr + page_offset + 4) =
> > +		I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
> > +	*(uint32_t *)(vaddr + page_offset + 8) =
> > +		obj->ring->outstanding_lazy_seqno;
> > +	*(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
> > +
> > +	kunmap_atomic(vaddr);
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
> > +		  struct drm_i915_gem_relocation_entry *reloc)
> > +{
> > +	struct drm_device *dev = obj->base.dev;
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +	uint32_t __iomem *reloc_entry;
> > +	void __iomem *reloc_page;
> > +	int ret;
> > +
> > +	ret = i915_gem_object_set_to_gtt_domain(obj, true);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = i915_gem_object_put_fence(obj);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* Map the page containing the relocation we're going to perform.  */
> > +	reloc->offset += i915_gem_obj_ggtt_offset(obj);
> > +	reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
> > +			reloc->offset & PAGE_MASK);
> > +
> > +	reloc_entry = (uint32_t __iomem *)
> > +		(reloc_page + offset_in_page(reloc->offset));
> > +	iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
> > +	iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
> > +		  reloc_entry);
> > +	iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry);
> > +	iowrite32(MI_USER_INTERRUPT, reloc_entry);
> > +
> > +	io_mapping_unmap_atomic(reloc_page);
> 
> These commands are illegal/invalid inside the object, only valid inside
> the ring.

Hm, we ought to be able to write to non-privileged space with
STORE_DWORD, but that does mean moving to context specific pages in
process space, or at least adding them to our existing scheme.

I haven't tried MI_USER_INTERRUPT from a batch, if we can't do it from
a non-privileged batch that nixes one of the other neat features we
could have (fine grained intra-batch userspace synchronization).

> > +	return 0;
> > +}
> >  
> >  static int
> >  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> > @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
> >  static int
> >  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> >  				   struct eb_vmas *eb,
> > -				   struct drm_i915_gem_relocation_entry *reloc)
> > +				   struct drm_i915_gem_relocation_entry *reloc,
> > +				   struct intel_context *ctx)
> 
> Hmm. That's a nuisance. But no, you only use it to automatically create
> a fence not to patch the batch, so you can just use an object-flag.
> 
> This fits neatly into requests.

Most definitely.  What do you think of the potential upside in the DDX
for this, assuming we get dword writes from batches working?

-- 
Jesse Barnes, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] drm/i915: allow sync points within batches
  2014-09-03 15:41     ` Jesse Barnes
@ 2014-09-03 16:08       ` Chris Wilson
  2014-09-03 19:01         ` Jesse Barnes
  0 siblings, 1 reply; 13+ messages in thread
From: Chris Wilson @ 2014-09-03 16:08 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: intel-gfx

On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
> On Wed, 3 Sep 2014 08:01:55 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> 
> > These commands are illegal/invalid inside the object, only valid inside
> > the ring.
> 
> Hm, we ought to be able to write to no privileged space with
> STORE_DWORD, but that does mean moving to context specific pages in
> process space, or at least adding them to our existing scheme.

The per-process context page also doesn't exist generically. I certainly
hope that userspace can't overwrite the hws! Imagine if we were using
that for interrupt status reads, or seqno tracking...
 
> I haven't tried MI_USER_INTERRUPT from a batch, if we can't do it from
> a non-privileged batch that nixes one of the other neat features we
> could have (fine grained intra-batch userspace synchronization).

I don't understand how writing the operation into the batch is
beneficial vs writing into the ring, unless you intended to use
something more fine grained than the batch seqno. You want to get
interrupts from inside batches? Rather than continue the existing scheme
of splitting up batches between fences?

I definitely think we should think twice before allowing userspace to
arbitrarily generate interrupts.

> > > +	return 0;
> > > +}
> > >  
> > >  static int
> > >  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> > > @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
> > >  static int
> > >  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> > >  				   struct eb_vmas *eb,
> > > -				   struct drm_i915_gem_relocation_entry *reloc)
> > > +				   struct drm_i915_gem_relocation_entry *reloc,
> > > +				   struct intel_context *ctx)
> > 
> > Hmm. That's a nuisance. But no, you only use it to automatically create
> > a fence not to patch the batch, so you can just use an object-flag.
> > 
> > This fits neatly into requests.
> 
> Most definitely.  What do you think of the potential upside in the DDX
> for this, assuming we get dword writes from batches working?

Negative. You now have relocation overhead, you still need to split
batches to keep the gpu busy and do ring switches, and context switching
between clients, so I don't feel a need for fences from inside a batch.

Getting seqno and a hws in the client would be nice, but if it continues
to require kernel polling, no thanks, I'll just stick to approximately
tracking the active state of surfaces with the heavier accurate queries
sparingly.

About the only thing I could see as being useful is that it would allow
you to reuse a batch buffer multiple times, rather than overallocate a
whole page and keep a pool of such pages.

Am I missing something?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/2] drm/i915: Android sync points for i915 v2
  2014-09-03  7:09   ` Chris Wilson
@ 2014-09-03 16:41     ` Ville Syrjälä
  2014-09-03 17:05       ` Chris Wilson
  0 siblings, 1 reply; 13+ messages in thread
From: Ville Syrjälä @ 2014-09-03 16:41 UTC (permalink / raw)
  To: Chris Wilson, Jesse Barnes, intel-gfx

On Wed, Sep 03, 2014 at 08:09:01AM +0100, Chris Wilson wrote:
> On Tue, Sep 02, 2014 at 02:32:40PM -0700, Jesse Barnes wrote:
> > +static int i915_fence_check(wait_queue_t *wait, unsigned mode, int flags,
> > +			    void *key)
> > +{
> > +	struct i915_fence *intel_fence = wait->private;
> > +	struct intel_engine_cs *ring = intel_fence->ring;
> > +
> > +	if (!i915_seqno_passed(ring->get_seqno(ring, false),
> > +			       intel_fence->seqno))
> > +		return 0;
> > +
> > +	fence_signal_locked(&intel_fence->base);
> > +
> > +	__remove_wait_queue(&ring->irq_queue, wait);
> > +	fence_put(&intel_fence->base);
> > +	ring->irq_put(ring);
> > +
> > +	return 0;
> > +}
> > +
> > +static bool i915_fence_enable_signaling(struct fence *fence)
> > +{
> > +	struct i915_fence *intel_fence = to_intel_fence(fence);
> > +	struct intel_engine_cs *ring = intel_fence->ring;
> > +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> > +	wait_queue_t *wait = &intel_fence->wait;
> > +
> > +	/* queue fence wait queue on irq queue and get fence */
> > +	if (i915_seqno_passed(ring->get_seqno(ring, false),
> > +			      intel_fence->seqno) ||
> > +	    i915_terminally_wedged(&dev_priv->gpu_error))
> > +		return false;
> > +
> > +	if (!ring->irq_get(ring))
> > +		return false;
> > +
> > +	wait->flags = 0;
> > +	wait->private = intel_fence;
> > +	wait->func = i915_fence_check;
> > +
> > +	__add_wait_queue(&ring->irq_queue, wait);
> > +	fence_get(fence);
> > +
> > +	return true;
> > +}
> 
> This looks like it implements poll(). 
> 
> You should recheck i915_request_complete() after setting up the irq
> waiter. Or does struct fence_ops handle that?

Also looks quite a bit like my ring notify doohicky from:
http://lists.freedesktop.org/archives/intel-gfx/2014-June/047623.html

Except I kept the list in the driver so you would need to do only one
get_seqno() per irq. Also if the list would be sorted (which it wasn't
in my patch) it would prevent signalling the fences out of order. But
maybe that's not really a problem for anyone.

Hmm, so if the out of order thing isn't a problem maybe use the wait
queue still but replace the wake_up() with __wake_up() so that the seqno
can be passed in as the key. That's assuming people care about
optimizing the seqno reads.

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/2] drm/i915: Android sync points for i915 v2
  2014-09-03 16:41     ` Ville Syrjälä
@ 2014-09-03 17:05       ` Chris Wilson
  2014-09-03 17:12         ` Chris Wilson
  0 siblings, 1 reply; 13+ messages in thread
From: Chris Wilson @ 2014-09-03 17:05 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: intel-gfx

On Wed, Sep 03, 2014 at 07:41:34PM +0300, Ville Syrjälä wrote:
> Also looks quite a bit like my ring notify doohicky from:
> http://lists.freedesktop.org/archives/intel-gfx/2014-June/047623.html

Indeed, fences look similar :)
 
> Except I kept the list in the driver so you would need to do only one
> get_seqno() per irq. Also if the list would be sorted (which it wasn't
> in my patch) it would prevent signalling the fences out of order. But
> maybe that's not really a problem for anyone.
> 
> Hmm, so if the out of order thing isn't a problem maybe use the wait
> queue still but replace the wake_up() with __wake_up() so that the seqno
> can be passed in as the key. That's assuming people care about
> optimizing the seqno reads.

No, we don't care about plain seqno reads - they're a cached read.

But having a kthread dedicated to internal fence signalling makes sense
if we have more than just mmioflips waiting. At the moment, I am quite
happy with using one of the system kthread pool for just mmioflips.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/2] drm/i915: Android sync points for i915 v2
  2014-09-03 17:05       ` Chris Wilson
@ 2014-09-03 17:12         ` Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2014-09-03 17:12 UTC (permalink / raw)
  To: Ville Syrjälä, Jesse Barnes, intel-gfx

On Wed, Sep 03, 2014 at 06:05:42PM +0100, Chris Wilson wrote:
> On Wed, Sep 03, 2014 at 07:41:34PM +0300, Ville Syrjälä wrote:
> > Also looks quite a bit like my ring notify doohicky from:
> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047623.html
> 
> Indeed, fences look similar :)
>  
> > Except I kept the list in the driver so you would need to do only one
> > get_seqno() per irq. Also if the list would be sorted (which it wasn't
> > in my patch) it would prevent signalling the fences out of order. But
> > maybe that's not really a problem for anyone.
> > 
> > Hmm, so if the out of order thing isn't a problem maybe use the wait
> > queue still but replace the wake_up() with __wake_up() so that the seqno
> > can be passed in as the key. That's assuming people care about
> > optimizing the seqno reads.
> 
> No, we don't care about plain seqno reads - they're a cached read.

I guess I should clarify - after a single irq coherency barrier, they're
cheap.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] drm/i915: allow sync points within batches
  2014-09-03 16:08       ` Chris Wilson
@ 2014-09-03 19:01         ` Jesse Barnes
  2014-09-03 19:41           ` Daniel Vetter
  0 siblings, 1 reply; 13+ messages in thread
From: Jesse Barnes @ 2014-09-03 19:01 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, 3 Sep 2014 17:08:53 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
> > On Wed, 3 Sep 2014 08:01:55 +0100
> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > 
> > > These commands are illegal/invalid inside the object, only valid inside
> > > the ring.
> > 
> > Hm, we ought to be able to write to no privileged space with
> > STORE_DWORD, but that does mean moving to context specific pages in
> > process space, or at least adding them to our existing scheme.
> 
> The per-process context page also doesn't exist generically. I certainly
> hope that userspace can't overwrite the hws! Imagine if we were using
> that for interrupt status reads, or seqno tracking...

Yeah I'm thinking of an additional hws that's per-context and userspace
mappable.  It could come in handy for userspace only sync stuff.

>  
> > I haven't tried MI_USER_INTERRUPT from a batch, if we can't do it from
> > a non-privileged batch that nixes one of the other neat features we
> > could have (fine grained intra-batch userspace synchronization).
> 
> I don't understand how writing the operation into the batch is
> beneficial vs writing into the ring, unless you instended to use
> something more fine grained than the batch seqno. You want to get
> interrupts from inside batches? Rather than continue the existing scheme
> of splitting up batches between fences?

Yeah, the whole idea here was to avoid flushing batches in order to
emit fences, both to avoid overhead and give userspace more rope.

> 
> I definitely think we should think twice before allowing userspace to
> arbitrarily generate interrupts.
> 
> > > > +	return 0;
> > > > +}
> > > >  
> > > >  static int
> > > >  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> > > > @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
> > > >  static int
> > > >  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> > > >  				   struct eb_vmas *eb,
> > > > -				   struct drm_i915_gem_relocation_entry *reloc)
> > > > +				   struct drm_i915_gem_relocation_entry *reloc,
> > > > +				   struct intel_context *ctx)
> > > 
> > > Hmm. That's a nuisance. But no, you only use it to automatically create
> > > a fence not to patch the batch, so you can just use an object-flag.
> > > 
> > > This fits neatly into requests.
> > 
> > Most definitely.  What do you think of the potential upside in the DDX
> > for this, assuming we get dword writes from batches working?
> 
> Negative. You now have relocation overhead, you still need to split
> batches to keep the gpu busy and do ring switches, and context switching
> between clients, so I don't feel a need for fences from inside a batch.
> 
> Getting seqno and a hws in the client would be nice, but if it continues
> to require kernel polling, no thanks, I'll just still to approximately
> tracking the active state of surfaces with the heavier accurate queries
> sparingly.
> 
> About the only thing I could see as being useful is that it would allow
> you to reuse a batch buffer multiple times, rather than overallocate a
> whole page and keep a pool of such pages.
> 
> I am missing something?

No I think that's about right.  The need for reloc processing is a
definite downside to this approach, but that could be solved with a new
interface, or by just allowing userspace to map/manage a hws.  The
downside there is that the resulting fences wouldn't be shareable.  But
requiring a flush for that is probably fine.

-- 
Jesse Barnes, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] drm/i915: allow sync points within batches
  2014-09-03 19:01         ` Jesse Barnes
@ 2014-09-03 19:41           ` Daniel Vetter
  2014-09-03 19:48             ` Jesse Barnes
  0 siblings, 1 reply; 13+ messages in thread
From: Daniel Vetter @ 2014-09-03 19:41 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: intel-gfx

On Wed, Sep 3, 2014 at 9:01 PM, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> On Wed, 3 Sep 2014 17:08:53 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
>> On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
>> > On Wed, 3 Sep 2014 08:01:55 +0100
>> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
>> >
>> > > These commands are illegal/invalid inside the object, only valid inside
>> > > the ring.
>> >
>> > Hm, we ought to be able to write to no privileged space with
>> > STORE_DWORD, but that does mean moving to context specific pages in
>> > process space, or at least adding them to our existing scheme.
>>
>> The per-process context page also doesn't exist generically. I certainly
>> hope that userspace can't overwrite the hws! Imagine if we were using
>> that for interrupt status reads, or seqno tracking...
>
> Yeah I'm thinking of an additional hws that's per-context and userspace
> mappable.  It could come in handy for userspace only sync stuff.

Userspace can already do seqno writes with MI_FLUSH_DW or PIPE_CONTROL
- lots of igt tests actually do that for correctness checks. So the
only thing really is interrupts, and I think for that we really want
the full request tracking machinery in the kernel (otherwise I fear
we'll have even more fun with lost/spurious interrupts since the hw
guys just seem to not be able to get that right). Which means a full
batch split.

I have no idea how that's supposed to work when userspace does direct
hardware submission. But that's kinda a good reason not to do that
anyway, and at least for now it looks like direct hw submission is for
opencl2 only with interop with other devices (where sync matters) not
a use-case. For interop with other processes the gpu can always do a
seqno write to some shared page. And busy-looping, but apparently
that's what people want for low-latency. Or at least what designers
seem to think people want ...
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] drm/i915: allow sync points within batches
  2014-09-03 19:41           ` Daniel Vetter
@ 2014-09-03 19:48             ` Jesse Barnes
  0 siblings, 0 replies; 13+ messages in thread
From: Jesse Barnes @ 2014-09-03 19:48 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Wed, 3 Sep 2014 21:41:02 +0200
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Wed, Sep 3, 2014 at 9:01 PM, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> > On Wed, 3 Sep 2014 17:08:53 +0100
> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >> On Wed, Sep 03, 2014 at 08:41:06AM -0700, Jesse Barnes wrote:
> >> > On Wed, 3 Sep 2014 08:01:55 +0100
> >> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >> >
> >> > > These commands are illegal/invalid inside the object, only valid inside
> >> > > the ring.
> >> >
> >> > Hm, we ought to be able to write to no privileged space with
> >> > STORE_DWORD, but that does mean moving to context specific pages in
> >> > process space, or at least adding them to our existing scheme.
> >>
> >> The per-process context page also doesn't exist generically. I certainly
> >> hope that userspace can't overwrite the hws! Imagine if we were using
> >> that for interrupt status reads, or seqno tracking...
> >
> > Yeah I'm thinking of an additional hws that's per-context and userspace
> > mappable.  It could come in handy for userspace only sync stuff.
> 
> Userspace can already do seqno writes with MI_FLUSH_DW or PIPE_CONTROL
> - lots of igt tests actually do that for correctness checks. So the
> only thing really is interrupts, and I think for that we really want
> the full request tracking machinery in the kernel (otherwise I fear
> we'll have even more fun with lost/spurious interrupts since the hw
> guys just seem to not be able to get that right). Which means a full
> batch split.
> 
> I have no idea how that's supposed to work when userspace does direct
> hardware submission. But that's kinda a good reason not to do that
> anyway, and at least for now it looks like direct hw submission is for
> opencl2 only with interop with other devices (where sync matters) not
> a use-case. For interop with other processes the gpu can always do a
> seqno write to some shared page. And busy-looping, but apparently
> that's what people want for low-latency. Or at least what designers
> seem to think people want ...

Yeah I haven't thought how direct submission will work in terms of
IPC.  It may just have to be done in userland with a custom cooperative
mechanism...

-- 
Jesse Barnes, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2014-09-03 19:48 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-02 21:32 Updated fence patches Jesse Barnes
2014-09-02 21:32 ` [PATCH 1/2] drm/i915: Android sync points for i915 v2 Jesse Barnes
2014-09-03  7:09   ` Chris Wilson
2014-09-03 16:41     ` Ville Syrjälä
2014-09-03 17:05       ` Chris Wilson
2014-09-03 17:12         ` Chris Wilson
2014-09-02 21:32 ` [PATCH 2/2] drm/i915: allow sync points within batches Jesse Barnes
2014-09-03  7:01   ` Chris Wilson
2014-09-03 15:41     ` Jesse Barnes
2014-09-03 16:08       ` Chris Wilson
2014-09-03 19:01         ` Jesse Barnes
2014-09-03 19:41           ` Daniel Vetter
2014-09-03 19:48             ` Jesse Barnes

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.