All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/radeon: Add support for userspace fence waits
@ 2012-01-31 16:59 Simon Farnsworth
  2012-01-31 17:56 ` Michel Dänzer
  2012-01-31 18:55 ` David Airlie
  0 siblings, 2 replies; 15+ messages in thread
From: Simon Farnsworth @ 2012-01-31 16:59 UTC (permalink / raw)
  To: dri-devel

Userspace currently busywaits for fences to complete; on my workload, this
busywait consumes 10% of the available CPU time.

Provide an ioctl so that userspace can wait for an EOP interrupt that
corresponds to a previous EVENT_WRITE_EOP.

Signed-off-by: Simon Farnsworth <simon.farnsworth@onelan.co.uk>
---
I've been working on top of Jerome's tiling patches, so this doesn't apply
directly on top of current upstream kernels. I can easily rebase to another
version upon request - just point me to a git tree.

My goal is to remove the sched_yield in Mesa's r600_fence_finish given up to
date enough kernel; I hope, though, that the interface is clean enough for
other users to extend it in the future (e.g. using compute rings).

 drivers/gpu/drm/radeon/radeon.h       |    3 +
 drivers/gpu/drm/radeon/radeon_drv.c   |    3 +-
 drivers/gpu/drm/radeon/radeon_fence.c |    2 +
 drivers/gpu/drm/radeon/radeon_gem.c   |   70 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/radeon/radeon_kms.c   |    1 +
 include/drm/radeon_drm.h              |   30 ++++++++++++++
 6 files changed, 108 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 2859406..00c187b 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -217,6 +217,7 @@ struct radeon_fence_driver {
 	unsigned long			last_jiffies;
 	unsigned long			last_timeout;
 	wait_queue_head_t		queue;
+	wait_queue_head_t		userspace_queue;
 	struct list_head		created;
 	struct list_head		emitted;
 	struct list_head		signaled;
@@ -1348,6 +1349,8 @@ int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *filp);
 int radeon_gem_get_tiling_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *filp);
+int radeon_gem_wait_user_fence_ioctl(struct drm_device *dev, void *data,
+				     struct drm_file *filp);
 
 /* VRAM scratch page for HDP bug, default vram page */
 struct r600_vram_scratch {
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index 4ae2e1d..9f82fa9 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -56,9 +56,10 @@
  *   2.12.0 - RADEON_CS_KEEP_TILING_FLAGS
  *   2.13.0 - virtual memory support
  *   2.14.0 - add evergreen tiling informations
+ *   2.15.0 - gem_wait_user_fence ioctl
  */
 #define KMS_DRIVER_MAJOR	2
-#define KMS_DRIVER_MINOR	14
+#define KMS_DRIVER_MINOR	15
 #define KMS_DRIVER_PATCHLEVEL	0
 int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags);
 int radeon_driver_unload_kms(struct drm_device *dev);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 64ea3dd..d86bc28 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -356,6 +356,7 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
 	if (wake) {
 		wake_up_all(&rdev->fence_drv[ring].queue);
 	}
+	wake_up_interruptible_all(&rdev->fence_drv[ring].userspace_queue);
 }
 
 int radeon_fence_count_emitted(struct radeon_device *rdev, int ring)
@@ -421,6 +422,7 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
 	INIT_LIST_HEAD(&rdev->fence_drv[ring].emitted);
 	INIT_LIST_HEAD(&rdev->fence_drv[ring].signaled);
 	init_waitqueue_head(&rdev->fence_drv[ring].queue);
+	init_waitqueue_head(&rdev->fence_drv[ring].userspace_queue);
 	rdev->fence_drv[ring].initialized = false;
 }
 
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 7337850..602274f 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -531,3 +531,73 @@ int radeon_mode_dumb_destroy(struct drm_file *file_priv,
 {
 	return drm_gem_handle_delete(file_priv, handle);
 }
+
+int radeon_gem_wait_user_fence_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *filp)
+{
+	struct drm_radeon_gem_wait_user_fence *args = data;
+	struct radeon_device *rdev = dev->dev_private;
+	struct drm_gem_object *gobj;
+	struct radeon_bo *robj;
+	void *buffer_data;
+	uint32_t *fence_data;
+	int r = 0;
+	long timeout;
+	int ring = RADEON_RING_TYPE_GFX_INDEX;
+
+	/* If you're implementing this for other rings, you'll want to share
+	   code with radeon_cs_get_ring in radeon_cs.c */
+	if (args->ring != RADEON_CS_RING_GFX) {
+		return -EINVAL;
+	}
+
+	gobj = drm_gem_object_lookup(dev, filp, args->handle);
+	if (gobj == NULL) {
+		return -ENOENT;
+	}
+	robj = gem_to_radeon_bo(gobj);
+
+	if (gobj->size < args->offset) {
+		r = -EINVAL;
+		goto unreference;
+	}
+
+	r = radeon_bo_reserve(robj, true);
+	if (r) {
+		goto unreference;
+	}
+
+	r = radeon_bo_pin(robj, RADEON_GEM_DOMAIN_GTT, NULL);
+	if (r) {
+		goto unreserve;
+	}
+
+	r = radeon_bo_kmap(robj, &buffer_data);
+	if (r) {
+		goto unpin;
+	}
+
+	radeon_irq_kms_sw_irq_get(rdev, ring);
+
+	fence_data = (uint32_t*)buffer_data;
+	timeout = wait_event_interruptible_timeout(rdev->fence_drv[ring].userspace_queue,
+						   fence_data[args->offset >> 2] != args->value,
+						   usecs_to_jiffies(args->timeout_usec));
+
+	radeon_irq_kms_sw_irq_put(rdev, ring);
+
+	if (timeout == 0)
+		r = -ETIMEDOUT;
+	else if (timeout < 0)
+		r = timeout;
+
+	radeon_bo_kunmap(robj);
+unpin:
+	radeon_bo_unpin(robj);
+unreserve:
+	radeon_bo_unreserve(robj);
+unreference:
+	drm_gem_object_unreference_unlocked(gobj);
+
+	return r;
+}
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
index d335288..0e552cc 100644
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -496,5 +496,6 @@ struct drm_ioctl_desc radeon_ioctls_kms[] = {
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_GET_TILING, radeon_gem_get_tiling_ioctl, DRM_AUTH|DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(RADEON_GEM_WAIT_USER_FENCE, radeon_gem_wait_user_fence_ioctl, DRM_AUTH|DRM_UNLOCKED),
 };
 int radeon_max_kms_ioctl = DRM_ARRAY_SIZE(radeon_ioctls_kms);
diff --git a/include/drm/radeon_drm.h b/include/drm/radeon_drm.h
index dd2e9cf..3d4ae93 100644
--- a/include/drm/radeon_drm.h
+++ b/include/drm/radeon_drm.h
@@ -510,6 +510,7 @@ typedef struct {
 #define DRM_RADEON_GEM_GET_TILING	0x29
 #define DRM_RADEON_GEM_BUSY		0x2a
 #define DRM_RADEON_GEM_VA		0x2b
+#define DRM_RADEON_GEM_WAIT_USER_FENCE  0x2c
 
 #define DRM_IOCTL_RADEON_CP_INIT    DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_INIT, drm_radeon_init_t)
 #define DRM_IOCTL_RADEON_CP_START   DRM_IO(  DRM_COMMAND_BASE + DRM_RADEON_CP_START)
@@ -552,6 +553,7 @@ typedef struct {
 #define DRM_IOCTL_RADEON_GEM_GET_TILING	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_GET_TILING, struct drm_radeon_gem_get_tiling)
 #define DRM_IOCTL_RADEON_GEM_BUSY	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy)
 #define DRM_IOCTL_RADEON_GEM_VA		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va)
+#define DRM_IOCTL_RADEON_GEM_WAIT_USER_FENCE   DRM_IOW(DRM_COMMAND_BASE + DRM_RADEON_GEM_WAIT_USER_FENCE, struct drm_radeon_gem_wait_user_fence)
 
 typedef struct drm_radeon_init {
 	enum {
@@ -967,4 +969,32 @@ struct drm_radeon_info {
 	uint64_t		value;
 };
 
+/**
+ * struct drm_radeon_gem_wait_user_fence - DRM_RADEON_GEM_WAIT_USER_FENCE ioctl param
+ *
+ * @handle: Handle for the object that the GPU is expected to write
+ * @ring: The ring on which the fence packet was issued
+ * @offset: Offset (in bytes) within that object where the GPU is expected
+ *          to write. Must be DWORD-aligned
+ * @value: The value expected if the GPU has not yet written to this location
+ * @timeout_usec: The maximum time to wait for the GPU, in microseconds
+ *
+ * The DRM_RADEON_GEM_WAIT_USER_FENCE ioctl is meant to allow userspace to
+ * avoid busy-waiting for a EVENT_WRITE_EOP packet to complete (e.g. for
+ * fence sync objects in OpenGL). It expects the EVENT_WRITE_EOP packet to
+ * have requested an interrupt on completion.
+ *
+ * The ioctl will return immediately if the value supplied is not the value
+ * found in the buffer at offset bytes in; otherwise, it will sleep for up
+ * to timeout_usec, waking up when an EVENT_WRITE_EOP packet causes an
+ * interrupt and the value in the buffer might have changed.
+ */
+struct drm_radeon_gem_wait_user_fence {
+	uint32_t                handle;
+	uint32_t                ring;
+	uint64_t                offset;
+	uint32_t                value;
+	uint64_t                timeout_usec;
+};
+
 #endif
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 15+ messages in thread
* r600g: Trying to remove the busywait for fence completion, but hitting inexplicable behaviour
@ 2012-01-31 13:16 Simon Farnsworth
  2012-01-31 13:18 ` [PATCH] drm/radeon: Add support for userspace fence waits Simon Farnsworth
  0 siblings, 1 reply; 15+ messages in thread
From: Simon Farnsworth @ 2012-01-31 13:16 UTC (permalink / raw)
  To: dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 3738 bytes --]

Hello,

When profiling my workload on an AMD E-350 (PALM GPU) to see why it still
wasn't performing well with Jerome's WIP macrotiling patches, I noticed that
r600_fence_finish was taking 10% of my CPU time. I determined experimentally
that changing from sched_yield() to os_time_sleep(10) fixed this and
resolved my last performance issue on AMD Fusion as compared to Intel Atom,
but felt that this was hacky.

I've therefore tried to use INT_SEL of 0b10 in the EVENT_WRITE_EOP in Mesa,
combined with a new ioctl to wait for a changed value, but it's not working
the way I would expect. I'll be sending patches as replies to this message,
so that you can see exactly what I've done, but in brief, I have an ioctl
that uses wait_event to wait for a chosen offset in a BO to change
value. I've added a suitable waitqueue, and made radeon_fence_process call
wake_up_all.

I'm seeing behaviour from this that I can't explain; as you'll see in the
patches, I've moved some IRQ prints from DRM_DEBUG to printk(KERN_INFO), and
I'm seeing that I don't get the EOP interrupt in a timely fashion - either
because memory is not as coherent between the GPU and CPU as I would like
(so I'm reading stale data when I call wait_event), or because the interrupt
is genuinely delayed.

From dmesg (with commentary):

X11 and my GL compositor start.

[   84.423567] IH: CP EOP
[   84.423600] Woke kernel fences
[   84.423606] Waking up all waiters

This looks like an EOP for a kernel-side fence.

[   84.651320] wait_user_fence offset 4 value 0 timeout -1
[   84.651332] Current data value 0

This is my compositor coming in via my new ioctl, to wait for EOP.

I get bored of waiting for the ioctl to complete, and send the compositor
SIGSTOP then SIGCONT.

[  149.970629] wait_user_fence offset 4 value 0 timeout -1
[  149.970635] Finished data value 1

My new ioctl completes, as the data has changed value. I was expecting an
EOP interrupt before this, which hasn't arrived - why?

[  150.224675] wait_user_fence offset 8 value 0 timeout -1
[  150.224692] Current data value 0

The compositor comes in again, waiting on a different fence.

[  150.250166] IH: CP EOP
[  150.250194] Woke kernel fences
[  150.250198] Waking up all waiters

This looks like an EOP for a kernel-side fence.

[  150.250212] IH: CP EOP
[  150.250216] Waking up all waiters

This looks like an EOP for the fence that completed at time 149.970 - why's
it been delayed?

[  150.250219] IH: CP EOP
[  150.250221] Waking up all waiters

And another EOP that I can't tie up to command buffers.

[  150.250327] IH: CP EOP
[  150.250335] Woke kernel fences
[  150.250337] Waking up all waiters

Kernel fence.

[  150.250559] IH: CP EOP
[  150.250567] Woke kernel fences
[  150.250570] Waking up all waiters

Another kernel fence.

[  150.250581] IH: CP EOP
[  150.250583] Waking up all waiters
[  150.250595] wait_user_fence offset 8 value 0 timeout -1
[  150.250604] IH: CP EOP
[  150.250608] Waking up all waiters
[  150.250615] Finished data value 1

Two user fence EOPs in a row, one of which woke up my process.

[  150.251462] IH: CP EOP
[  150.251477] Woke kernel fences
[  150.251480] Waking up all waiters

Kernel fence.

[  150.256806] wait_user_fence offset 0 value 0 timeout -1
[  150.256828] Current data value 0

Stalled again, waiting for EOP interrupt. Could be because the GPU and CPU
have different views of memory at this point.

There will be two patches in reply to this mail - one is the Mesa patch, one
the kernel patch; I would greatly appreciate help getting this going.
--
Simon Farnsworth
Software Engineer
ONELAN Limited
http://www.onelan.com/

[-- Attachment #1.2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2012-02-01 18:21 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-01-31 16:59 [PATCH] drm/radeon: Add support for userspace fence waits Simon Farnsworth
2012-01-31 17:56 ` Michel Dänzer
2012-01-31 18:56   ` Jerome Glisse
2012-01-31 21:08     ` Marek Olšák
2012-02-01  8:39       ` Michel Dänzer
2012-02-01 13:58         ` Marek Olšák
2012-01-31 18:55 ` David Airlie
2012-01-31 19:07   ` Jerome Glisse
2012-01-31 19:13     ` Alex Deucher
2012-01-31 19:36       ` Jerome Glisse
2012-01-31 23:07         ` Christian König
2012-02-01 11:31     ` Simon Farnsworth
2012-02-01 18:21       ` Christian König
  -- strict thread matches above, loose matches on Subject: below --
2012-01-31 13:16 r600g: Trying to remove the busywait for fence completion, but hitting inexplicable behaviour Simon Farnsworth
2012-01-31 13:18 ` [PATCH] drm/radeon: Add support for userspace fence waits Simon Farnsworth
2012-01-31 14:49   ` Simon Farnsworth

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.