All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/6] drm/radeon: add a way to get and set initial buffer domains
@ 2014-02-26 18:25 Marek Olšák
  2014-02-26 18:25 ` [PATCH 2/6] drm/radeon: track memory statistics about VRAM and GTT usage and buffer moves Marek Olšák
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Marek Olšák @ 2014-02-26 18:25 UTC (permalink / raw)
  To: dri-devel

From: Marek Olšák <marek.olsak@amd.com>

When passing buffers between processes, the receiving process needs to know
the original buffer domain, so that it doesn't accidentally move the buffer.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 drivers/gpu/drm/radeon/radeon.h        |  3 +++
 drivers/gpu/drm/radeon/radeon_drv.c    |  3 ++-
 drivers/gpu/drm/radeon/radeon_gem.c    | 36 ++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/radeon/radeon_kms.c    |  1 +
 drivers/gpu/drm/radeon/radeon_object.c |  3 +++
 include/uapi/drm/radeon_drm.h          | 12 ++++++++++++
 6 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index a415f8e..3f10782 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -454,6 +454,7 @@ struct radeon_bo {
 	/* Protected by gem.mutex */
 	struct list_head		list;
 	/* Protected by tbo.reserved */
+	u32				initial_domain;
 	u32				placements[3];
 	struct ttm_placement		placement;
 	struct ttm_buffer_object	tbo;
@@ -2114,6 +2115,8 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data,
 			      struct drm_file *filp);
 int radeon_gem_va_ioctl(struct drm_device *dev, void *data,
 			  struct drm_file *filp);
+int radeon_gem_op_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *filp);
 int radeon_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
 int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *filp);
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index 84a1bbb7..4392b7c 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -79,9 +79,10 @@
  *   2.35.0 - Add CIK macrotile mode array query
  *   2.36.0 - Fix CIK DCE tiling setup
  *   2.37.0 - allow GS ring setup on r6xx/r7xx
+ *   2.38.0 - RADEON_GEM_OP (GET_INITIAL_DOMAIN, SET_INITIAL_DOMAIN)
  */
 #define KMS_DRIVER_MAJOR	2
-#define KMS_DRIVER_MINOR	37
+#define KMS_DRIVER_MINOR	38
 #define KMS_DRIVER_PATCHLEVEL	0
 int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags);
 int radeon_driver_unload_kms(struct drm_device *dev);
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index b96c819..9863ca7 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -533,6 +533,42 @@ out:
 	return r;
 }
 
+int radeon_gem_op_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *filp)
+{
+	struct drm_radeon_gem_op *args = data;
+	struct drm_gem_object *gobj;
+	struct radeon_bo *robj;
+	int r;
+
+	gobj = drm_gem_object_lookup(dev, filp, args->handle);
+	if (gobj == NULL) {
+		return -ENOENT;
+	}
+	robj = gem_to_radeon_bo(gobj);
+	r = radeon_bo_reserve(robj, false);
+	if (unlikely(r))
+		goto out;
+
+	switch (args->op) {
+	case RADEON_GEM_OP_GET_INITIAL_DOMAIN:
+		args->value = robj->initial_domain;
+		break;
+	case RADEON_GEM_OP_SET_INITIAL_DOMAIN:
+		robj->initial_domain = args->value & (RADEON_GEM_DOMAIN_VRAM |
+						      RADEON_GEM_DOMAIN_GTT |
+						      RADEON_GEM_DOMAIN_CPU);
+		break;
+	default:
+		r = -EINVAL;
+	}
+
+	radeon_bo_unreserve(robj);
+out:
+	drm_gem_object_unreference_unlocked(gobj);
+	return r;
+}
+
 int radeon_mode_dumb_create(struct drm_file *file_priv,
 			    struct drm_device *dev,
 			    struct drm_mode_create_dumb *args)
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
index baff98b..0b631eb 100644
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -814,5 +814,6 @@ const struct drm_ioctl_desc radeon_ioctls_kms[] = {
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_GET_TILING, radeon_gem_get_tiling_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(RADEON_GEM_OP, radeon_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
 };
 int radeon_max_kms_ioctl = DRM_ARRAY_SIZE(radeon_ioctls_kms);
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index 08595cf..dd12bb4 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -145,6 +145,9 @@ int radeon_bo_create(struct radeon_device *rdev,
 	bo->surface_reg = -1;
 	INIT_LIST_HEAD(&bo->list);
 	INIT_LIST_HEAD(&bo->va);
+	bo->initial_domain = domain & (RADEON_GEM_DOMAIN_VRAM |
+	                               RADEON_GEM_DOMAIN_GTT |
+	                               RADEON_GEM_DOMAIN_CPU);
 	radeon_ttm_placement_from_domain(bo, domain);
 	/* Kernel allocation are uninterruptible */
 	down_read(&rdev->pm.mclk_lock);
diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h
index 1cf18b4..cb5c93a 100644
--- a/include/uapi/drm/radeon_drm.h
+++ b/include/uapi/drm/radeon_drm.h
@@ -510,6 +510,7 @@ typedef struct {
 #define DRM_RADEON_GEM_GET_TILING	0x29
 #define DRM_RADEON_GEM_BUSY		0x2a
 #define DRM_RADEON_GEM_VA		0x2b
+#define DRM_RADEON_GEM_OP		0x2c
 
 #define DRM_IOCTL_RADEON_CP_INIT    DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_INIT, drm_radeon_init_t)
 #define DRM_IOCTL_RADEON_CP_START   DRM_IO(  DRM_COMMAND_BASE + DRM_RADEON_CP_START)
@@ -552,6 +553,7 @@ typedef struct {
 #define DRM_IOCTL_RADEON_GEM_GET_TILING	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_GET_TILING, struct drm_radeon_gem_get_tiling)
 #define DRM_IOCTL_RADEON_GEM_BUSY	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy)
 #define DRM_IOCTL_RADEON_GEM_VA		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va)
+#define DRM_IOCTL_RADEON_GEM_OP		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_OP, struct drm_radeon_gem_op)
 
 typedef struct drm_radeon_init {
 	enum {
@@ -884,6 +886,16 @@ struct drm_radeon_gem_pwrite {
 	uint64_t data_ptr;
 };
 
+/* Sets or returns a value associated with a buffer. */
+struct drm_radeon_gem_op {
+	uint32_t	handle; /* buffer */
+	uint32_t	op;     /* RADEON_GEM_OP_* */
+	uint64_t	value;  /* input or return value */
+};
+
+#define RADEON_GEM_OP_GET_INITIAL_DOMAIN	0
+#define RADEON_GEM_OP_SET_INITIAL_DOMAIN	1
+
 #define RADEON_VA_MAP			1
 #define RADEON_VA_UNMAP			2
 
-- 
1.8.3.2

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 12+ messages in thread
* [PATCH 0/6] Radeon memory management improvements v3
@ 2014-03-01 23:56 Marek Olšák
  2014-03-01 23:56 ` [PATCH 4/6] drm/radeon: add buffers to the LRU list from smallest to largest Marek Olšák
  0 siblings, 1 reply; 12+ messages in thread
From: Marek Olšák @ 2014-03-01 23:56 UTC (permalink / raw)
  To: dri-devel

While updating "[PATCH 5/6] drm/radeon: validate relocations in the order determined by userspace" based on feedback, which should have been a harmless change, I discovered that the performance dropped. The problem was that list_add/move from one list to another reversed the list of relocations, and the only thing which restored performance was to sort the relocation list with a stable sort (one that preserves the ordering of buffers whose priorities are equal). The only explanation I've come up with is that the buffers which appear sooner in the list are likely to be used more often by an IB than buffers which appear later in the list. For example, a texture used in all draw commands will most probably be somewhere at the beginning. However, a texture which appears at the end is probably only used in the last few draw commands.

Also, the stats are changed to atomic64_t as discussed before.

Marek

^ permalink raw reply	[flat|nested] 12+ messages in thread
* [PATCH 0/6] Radeon memory management improvements
@ 2014-02-24 15:20 Marek Olšák
  2014-02-24 15:20 ` [PATCH 4/6] drm/radeon: add buffers to the LRU list from smallest to largest Marek Olšák
  0 siblings, 1 reply; 12+ messages in thread
From: Marek Olšák @ 2014-02-24 15:20 UTC (permalink / raw)
  To: dri-devel

This series improves performance for the cases when there is not enough VRAM for all buffers.

First of all, I'd like to mention that if you set both VRAM and GTT domains for a buffer, you pretty much say you don't care where the buffer ends up. It usually makes the performance even worse.

This work was largely benchmark-driven and I tried a lot of ideas before I found out which ones work. The patches describe what they do and they're quite simple, so I'll just share the results here.


Card: Evergreen Redwood (HD 5670), 512 MB of VRAM
Test: Unigine Heaven 4.0, High settings

1) 1280x720, 4x MSAA, need 525 MB of VRAM

Without patches: 16.6 FPS
With patches: 16.6 FPS
Improvement: 0 %

2) 1600x900, 4x MSAA, need 642 MB of VRAM

Without patches: 7.1 FPS
With patches: 9.7 FPS
Improvement: 36 %

3) 1920x1080, 4x MSAA, need 743 MB of VRAM

Without patches: 3.7 FPS
With patches: 5.6 FPS
Improvement: 51 %

4) 1600x900, 8x MSAA, need 838 MB of VRAM
Without patches: 2.9 FPS
With patches: 4.6 FPS
Improvement: 58 %

These results don't change if you run the benchmark several times, which proves the improvement is stable.


To conclude this, here are ideas for future work:

1) Add virtual memory support for VRAM. Our GPUs support virtual memory, which not only solves fragmentation issues, but it also allows each buffer to be partially in VRAM and partially in GTT, which becomes more important with large buffers like 100 MB. Moving whole buffers back and forth between VRAM and GTT is inefficient if you can do it at page granularity. Also, due to fragmentation, we can never really use all of VRAM, but only about 90-95%.

2) Add support for uncached GTT. I think it should improve performance for dGPUs under memory pressure, but some testing needs to be done to confirm that. Uncached GTT doesn't seem to work for me on Evergreen, but it's said to be working on some later chips.


The patches for Mesa will follow later today. Please review.

Marek

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-03-04 13:27 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-02-26 18:25 [PATCH 1/6] drm/radeon: add a way to get and set initial buffer domains Marek Olšák
2014-02-26 18:25 ` [PATCH 2/6] drm/radeon: track memory statistics about VRAM and GTT usage and buffer moves Marek Olšák
2014-02-26 18:25 ` [PATCH 3/6] drm/radeon: deduplicate code in radeon_gem_busy_ioctl Marek Olšák
2014-02-26 18:25 ` [PATCH 4/6] drm/radeon: add buffers to the LRU list from smallest to largest Marek Olšák
2014-02-27  1:22   ` Michel Dänzer
2014-03-01 21:57     ` Marek Olšák
2014-03-04 13:27       ` Thomas Hellstrom
2014-02-26 18:25 ` [PATCH 5/6] drm/radeon: validate relocations in the order determined by userspace Marek Olšák
2014-02-27  9:29   ` Christian König
2014-02-26 18:25 ` [PATCH 6/6] drm/radeon: limit how much memory TTM can move per IB according to VRAM usage Marek Olšák
  -- strict thread matches above, loose matches on Subject: below --
2014-03-01 23:56 [PATCH 0/6] Radeon memory management improvements v3 Marek Olšák
2014-03-01 23:56 ` [PATCH 4/6] drm/radeon: add buffers to the LRU list from smallest to largest Marek Olšák
2014-02-24 15:20 [PATCH 0/6] Radeon memory management improvements Marek Olšák
2014-02-24 15:20 ` [PATCH 4/6] drm/radeon: add buffers to the LRU list from smallest to largest Marek Olšák

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.