* [PATCH 0/7] A few drm_syncobj optimisations
@ 2025-03-18 15:54 Tvrtko Ursulin
2025-03-18 15:54 ` [PATCH 1/7] drm/syncobj: Remove unhelpful helper Tvrtko Ursulin
` (7 more replies)
0 siblings, 8 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-18 15:54 UTC (permalink / raw)
To: dri-devel; +Cc: kernel-dev, Tvrtko Ursulin
A small set of drm_syncobj optimisations which should make things a tiny bit
more efficient on the CPU side of things.
Improvement seems to be around 1.5%* more FPS if observed with "vkgears
-present-mailbox" on a Steam Deck Plasma desktop, but I am reluctant to make a
definitive claim on the numbers since there is some run to run variance. But, as
suggested by Michel Dänzer, I did do a five ~100 second runs on the each kernel
to be able to show the ministat analysis.
x before
+ after
+------------------------------------------------------------+
| x + |
| x x + |
| x xx ++++ |
| x x xx x ++++ |
| x xx x xx x+ ++++ |
| xxxxx xxxxxx+ ++++ + + |
| xxxxxxx xxxxxx+x ++++ +++ |
| x xxxxxxxxxxx*xx+* x++++++++ ++ |
| x x xxxxxxxxxxxx**x*+*+*++++++++ ++++ + |
| xx x xxxxxxxxxx*x****+***+**+++++ ++++++ |
|x xxx x xxxxx*x****x***********+*++**+++++++ + + +|
| |_______A______| |
| |______A_______| |
+------------------------------------------------------------+
N Min Max Median Avg Stddev
x 135 21697.58 22809.467 22321.396 22307.707 198.75011
+ 118 22200.746 23277.09 22661.4 22671.442 192.10609
Difference at 95.0% confidence
363.735 +/- 48.3345
1.63054% +/- 0.216672%
(Student's t, pooled s = 195.681)
Tvrtko Ursulin (7):
drm/syncobj: Remove unhelpful helper
drm/syncobj: Do not allocate an array to store zeros when waiting
drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find
drm/syncobj: Use put_user in drm_syncobj_query_ioctl
drm/syncobj: Avoid temporary allocation in
drm_syncobj_timeline_signal_ioctl
drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout
drm/syncobj: Add a fast path to drm_syncobj_array_find
drivers/gpu/drm/drm_syncobj.c | 281 ++++++++++++++++++----------------
1 file changed, 147 insertions(+), 134 deletions(-)
--
2.48.0
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH 1/7] drm/syncobj: Remove unhelpful helper
2025-03-18 15:54 [PATCH 0/7] A few drm_syncobj optimisations Tvrtko Ursulin
@ 2025-03-18 15:54 ` Tvrtko Ursulin
2025-03-24 21:38 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 2/7] drm/syncobj: Do not allocate an array to store zeros when waiting Tvrtko Ursulin
` (6 subsequent siblings)
7 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-18 15:54 UTC (permalink / raw)
To: dri-devel; +Cc: kernel-dev, Tvrtko Ursulin
Helper which fails to consolidate the code and instead just forks into two
copies of the code based on a boolean parameter is not very helpful or
readable. Lets just remove it and proof in the pudding is the net smaller
code.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
drivers/gpu/drm/drm_syncobj.c | 98 ++++++++++++++++-------------------
1 file changed, 44 insertions(+), 54 deletions(-)
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index 4f2ab8a7b50f..d0d60c331df8 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1221,42 +1221,6 @@ signed long drm_timeout_abs_to_jiffies(int64_t timeout_nsec)
}
EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
-static int drm_syncobj_array_wait(struct drm_device *dev,
- struct drm_file *file_private,
- struct drm_syncobj_wait *wait,
- struct drm_syncobj_timeline_wait *timeline_wait,
- struct drm_syncobj **syncobjs, bool timeline,
- ktime_t *deadline)
-{
- signed long timeout = 0;
- uint32_t first = ~0;
-
- if (!timeline) {
- timeout = drm_timeout_abs_to_jiffies(wait->timeout_nsec);
- timeout = drm_syncobj_array_wait_timeout(syncobjs,
- NULL,
- wait->count_handles,
- wait->flags,
- timeout, &first,
- deadline);
- if (timeout < 0)
- return timeout;
- wait->first_signaled = first;
- } else {
- timeout = drm_timeout_abs_to_jiffies(timeline_wait->timeout_nsec);
- timeout = drm_syncobj_array_wait_timeout(syncobjs,
- u64_to_user_ptr(timeline_wait->points),
- timeline_wait->count_handles,
- timeline_wait->flags,
- timeout, &first,
- deadline);
- if (timeout < 0)
- return timeout;
- timeline_wait->first_signaled = first;
- }
- return 0;
-}
-
static int drm_syncobj_array_find(struct drm_file *file_private,
void __user *user_handles,
uint32_t count_handles,
@@ -1319,9 +1283,12 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_private)
{
struct drm_syncobj_wait *args = data;
+ ktime_t deadline, *pdeadline = NULL;
+ u32 count = args->count_handles;
struct drm_syncobj **syncobjs;
unsigned int possible_flags;
- ktime_t t, *tp = NULL;
+ u32 first = ~0;
+ long timeout;
int ret = 0;
if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ))
@@ -1334,27 +1301,37 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
if (args->flags & ~possible_flags)
return -EINVAL;
- if (args->count_handles == 0)
+ if (count == 0)
return 0;
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
- args->count_handles,
+ count,
&syncobjs);
if (ret < 0)
return ret;
if (args->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE) {
- t = ns_to_ktime(args->deadline_nsec);
- tp = &t;
+ deadline = ns_to_ktime(args->deadline_nsec);
+ pdeadline = &deadline;
}
- ret = drm_syncobj_array_wait(dev, file_private,
- args, NULL, syncobjs, false, tp);
+ timeout = drm_syncobj_array_wait_timeout(syncobjs,
+ NULL,
+ count,
+ args->flags,
+ drm_timeout_abs_to_jiffies(args->timeout_nsec),
+ &first,
+ pdeadline);
- drm_syncobj_array_free(syncobjs, args->count_handles);
+ drm_syncobj_array_free(syncobjs, count);
- return ret;
+ if (timeout < 0)
+ return timeout;
+
+ args->first_signaled = first;
+
+ return 0;
}
int
@@ -1362,9 +1339,12 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_private)
{
struct drm_syncobj_timeline_wait *args = data;
+ ktime_t deadline, *pdeadline = NULL;
+ u32 count = args->count_handles;
struct drm_syncobj **syncobjs;
unsigned int possible_flags;
- ktime_t t, *tp = NULL;
+ u32 first = ~0;
+ long timeout;
int ret = 0;
if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE))
@@ -1378,27 +1358,37 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
if (args->flags & ~possible_flags)
return -EINVAL;
- if (args->count_handles == 0)
+ if (count == 0)
return 0;
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
- args->count_handles,
+ count,
&syncobjs);
if (ret < 0)
return ret;
if (args->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE) {
- t = ns_to_ktime(args->deadline_nsec);
- tp = &t;
+ deadline = ns_to_ktime(args->deadline_nsec);
+ pdeadline = &deadline;
}
- ret = drm_syncobj_array_wait(dev, file_private,
- NULL, args, syncobjs, true, tp);
+ timeout = drm_syncobj_array_wait_timeout(syncobjs,
+ u64_to_user_ptr(args->points),
+ count,
+ args->flags,
+ drm_timeout_abs_to_jiffies(args->timeout_nsec),
+ &first,
+ pdeadline);
- drm_syncobj_array_free(syncobjs, args->count_handles);
+ drm_syncobj_array_free(syncobjs, count);
- return ret;
+ if (timeout < 0)
+ return timeout;
+
+ args->first_signaled = first;
+
+ return 0;
}
static void syncobj_eventfd_entry_fence_func(struct dma_fence *fence,
--
2.48.0
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 2/7] drm/syncobj: Do not allocate an array to store zeros when waiting
2025-03-18 15:54 [PATCH 0/7] A few drm_syncobj optimisations Tvrtko Ursulin
2025-03-18 15:54 ` [PATCH 1/7] drm/syncobj: Remove unhelpful helper Tvrtko Ursulin
@ 2025-03-18 15:54 ` Tvrtko Ursulin
2025-03-24 21:55 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 3/7] drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find Tvrtko Ursulin
` (5 subsequent siblings)
7 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-18 15:54 UTC (permalink / raw)
To: dri-devel; +Cc: kernel-dev, Tvrtko Ursulin
When waiting on syncobjs the current code allocates a temporary array only
to fill it up with all zeros.
We can avoid that by relying on the allocated entry array already being
zero allocated.
For the timeline mode we can fetch the timeline point values as we
populate the entries array so also do not need this additional temporary
allocation.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
drivers/gpu/drm/drm_syncobj.c | 39 ++++++++++++++---------------------
1 file changed, 15 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index d0d60c331df8..fd5ba6c89666 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1028,7 +1028,7 @@ static void syncobj_wait_syncobj_func(struct drm_syncobj *syncobj,
}
static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
- void __user *user_points,
+ u64 __user *user_points,
uint32_t count,
uint32_t flags,
signed long timeout,
@@ -1036,9 +1036,8 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
ktime_t *deadline)
{
struct syncobj_wait_entry *entries;
- struct dma_fence *fence;
- uint64_t *points;
uint32_t signaled_count, i;
+ struct dma_fence *fence;
if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT |
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) {
@@ -1046,24 +1045,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
lockdep_assert_none_held_once();
}
- points = kmalloc_array(count, sizeof(*points), GFP_KERNEL);
- if (points == NULL)
- return -ENOMEM;
-
- if (!user_points) {
- memset(points, 0, count * sizeof(uint64_t));
-
- } else if (copy_from_user(points, user_points,
- sizeof(uint64_t) * count)) {
- timeout = -EFAULT;
- goto err_free_points;
- }
+ if (user_points &&
+ !access_ok(user_points, count * sizeof(*user_points)))
+ return -EFAULT;
entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
- if (!entries) {
- timeout = -ENOMEM;
- goto err_free_points;
- }
+ if (!entries)
+ return -ENOMEM;
+
/* Walk the list of sync objects and initialize entries. We do
* this up-front so that we can properly return -EINVAL if there is
* a syncobj with a missing fence and then never have the chance of
@@ -1074,9 +1063,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
struct dma_fence *fence;
entries[i].task = current;
- entries[i].point = points[i];
+ if (user_points &&
+ __get_user(entries[i].point, user_points++)) {
+ timeout = -EFAULT;
+ goto cleanup_entries;
+ }
fence = drm_syncobj_fence_get(syncobjs[i]);
- if (!fence || dma_fence_chain_find_seqno(&fence, points[i])) {
+ if (!fence ||
+ dma_fence_chain_find_seqno(&fence, entries[i].point)) {
dma_fence_put(fence);
if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT |
DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) {
@@ -1182,9 +1176,6 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
}
kfree(entries);
-err_free_points:
- kfree(points);
-
return timeout;
}
--
2.48.0
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 3/7] drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find
2025-03-18 15:54 [PATCH 0/7] A few drm_syncobj optimisations Tvrtko Ursulin
2025-03-18 15:54 ` [PATCH 1/7] drm/syncobj: Remove unhelpful helper Tvrtko Ursulin
2025-03-18 15:54 ` [PATCH 2/7] drm/syncobj: Do not allocate an array to store zeros when waiting Tvrtko Ursulin
@ 2025-03-18 15:54 ` Tvrtko Ursulin
2025-03-24 22:29 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 4/7] drm/syncobj: Use put_user in drm_syncobj_query_ioctl Tvrtko Ursulin
` (4 subsequent siblings)
7 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-18 15:54 UTC (permalink / raw)
To: dri-devel; +Cc: kernel-dev, Tvrtko Ursulin
Drm_syncobj_array_find() helper is used from many userspace ioctl entry
points with the task of looking up userspace handles to internal objects.
We can easily avoid one temporary allocation by making it read the handles
as it is looking them up.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
drivers/gpu/drm/drm_syncobj.c | 44 +++++++++++++++++------------------
1 file changed, 21 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index fd5ba6c89666..cdda2df06bec 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1213,48 +1213,46 @@ signed long drm_timeout_abs_to_jiffies(int64_t timeout_nsec)
EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
static int drm_syncobj_array_find(struct drm_file *file_private,
- void __user *user_handles,
- uint32_t count_handles,
+ u32 __user *handles,
+ uint32_t count,
struct drm_syncobj ***syncobjs_out)
{
- uint32_t i, *handles;
struct drm_syncobj **syncobjs;
+ uint32_t i;
int ret;
- handles = kmalloc_array(count_handles, sizeof(*handles), GFP_KERNEL);
- if (handles == NULL)
+ if (!access_ok(handles, count * sizeof(*handles)))
+ return -EFAULT;
+
+ syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
+ if (!syncobjs)
return -ENOMEM;
- if (copy_from_user(handles, user_handles,
- sizeof(uint32_t) * count_handles)) {
- ret = -EFAULT;
- goto err_free_handles;
- }
+ for (i = 0; i < count; i++) {
+ u64 handle;
- syncobjs = kmalloc_array(count_handles, sizeof(*syncobjs), GFP_KERNEL);
- if (syncobjs == NULL) {
- ret = -ENOMEM;
- goto err_free_handles;
- }
-
- for (i = 0; i < count_handles; i++) {
- syncobjs[i] = drm_syncobj_find(file_private, handles[i]);
+ if (__get_user(handle, handles++)) {
+ ret = -EFAULT;
+ syncobjs[i] = NULL;
+ goto err_put_syncobjs;
+ }
+ syncobjs[i] = drm_syncobj_find(file_private, handle);
if (!syncobjs[i]) {
ret = -ENOENT;
goto err_put_syncobjs;
}
}
- kfree(handles);
*syncobjs_out = syncobjs;
return 0;
err_put_syncobjs:
- while (i-- > 0)
- drm_syncobj_put(syncobjs[i]);
+ while (i > 0) {
+ if (syncobjs[i])
+ drm_syncobj_put(syncobjs[i]);
+ i--;
+ }
kfree(syncobjs);
-err_free_handles:
- kfree(handles);
return ret;
}
--
2.48.0
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 4/7] drm/syncobj: Use put_user in drm_syncobj_query_ioctl
2025-03-18 15:54 [PATCH 0/7] A few drm_syncobj optimisations Tvrtko Ursulin
` (2 preceding siblings ...)
2025-03-18 15:54 ` [PATCH 3/7] drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find Tvrtko Ursulin
@ 2025-03-18 15:54 ` Tvrtko Ursulin
2025-03-24 22:32 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 5/7] drm/syncobj: Avoid temporary allocation in drm_syncobj_timeline_signal_ioctl Tvrtko Ursulin
` (3 subsequent siblings)
7 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-18 15:54 UTC (permalink / raw)
To: dri-devel; +Cc: kernel-dev, Tvrtko Ursulin
Since the query loop is using copy_to_user() to write out a single u64 at
a time it feels more natural (and is a tiny bit more compact) to replace
it with put_user().
Access_ok() check is added to the input checking for an early bailout in
case of a bad buffer passed in.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
drivers/gpu/drm/drm_syncobj.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index cdda2df06bec..74d1dc0d1f8b 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1643,6 +1643,9 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
if (args->count_handles == 0)
return -EINVAL;
+ if (!access_ok(points, args->count_handles * sizeof(*points)))
+ return -EFAULT;
+
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
args->count_handles,
@@ -1684,10 +1687,10 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
point = 0;
}
dma_fence_put(fence);
- ret = copy_to_user(&points[i], &point, sizeof(uint64_t));
- ret = ret ? -EFAULT : 0;
- if (ret)
+ if (__put_user(point, points++)) {
+ ret = -EFAULT;
break;
+ }
}
drm_syncobj_array_free(syncobjs, args->count_handles);
--
2.48.0
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 5/7] drm/syncobj: Avoid temporary allocation in drm_syncobj_timeline_signal_ioctl
2025-03-18 15:54 [PATCH 0/7] A few drm_syncobj optimisations Tvrtko Ursulin
` (3 preceding siblings ...)
2025-03-18 15:54 ` [PATCH 4/7] drm/syncobj: Use put_user in drm_syncobj_query_ioctl Tvrtko Ursulin
@ 2025-03-18 15:54 ` Tvrtko Ursulin
2025-03-24 22:50 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 6/7] drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout Tvrtko Ursulin
` (2 subsequent siblings)
7 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-18 15:54 UTC (permalink / raw)
To: dri-devel; +Cc: kernel-dev, Tvrtko Ursulin
We can avoid one of the two temporary allocations if we read the userspace
supplied timeline points as we go along.
The only new complication is to unwind unused fence chains on the error
path, but even that code was already present in the function.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
drivers/gpu/drm/drm_syncobj.c | 46 +++++++++++++++--------------------
1 file changed, 20 insertions(+), 26 deletions(-)
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index 74d1dc0d1f8b..b4563c696056 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1557,10 +1557,10 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_private)
{
struct drm_syncobj_timeline_array *args = data;
+ uint64_t __user *points = u64_to_user_ptr(args->points);
+ uint32_t i, j, count = args->count_handles;
struct drm_syncobj **syncobjs;
struct dma_fence_chain **chains;
- uint64_t *points;
- uint32_t i, j;
int ret;
if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE))
@@ -1572,33 +1572,22 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
if (args->count_handles == 0)
return -EINVAL;
+ if (!access_ok(points, count * sizeof(*points)))
+ return -EFAULT;
+
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
- args->count_handles,
+ count,
&syncobjs);
if (ret < 0)
return ret;
- points = kmalloc_array(args->count_handles, sizeof(*points),
- GFP_KERNEL);
- if (!points) {
- ret = -ENOMEM;
- goto out;
- }
- if (!u64_to_user_ptr(args->points)) {
- memset(points, 0, args->count_handles * sizeof(uint64_t));
- } else if (copy_from_user(points, u64_to_user_ptr(args->points),
- sizeof(uint64_t) * args->count_handles)) {
- ret = -EFAULT;
- goto err_points;
- }
-
- chains = kmalloc_array(args->count_handles, sizeof(void *), GFP_KERNEL);
+ chains = kmalloc_array(count, sizeof(void *), GFP_KERNEL);
if (!chains) {
ret = -ENOMEM;
- goto err_points;
+ goto out;
}
- for (i = 0; i < args->count_handles; i++) {
+ for (i = 0; i < count; i++) {
chains[i] = dma_fence_chain_alloc();
if (!chains[i]) {
for (j = 0; j < i; j++)
@@ -1608,19 +1597,24 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
}
}
- for (i = 0; i < args->count_handles; i++) {
+ for (i = 0; i < count; i++) {
struct dma_fence *fence = dma_fence_get_stub();
+ u64 point = 0;
- drm_syncobj_add_point(syncobjs[i], chains[i],
- fence, points[i]);
+ if (points && __get_user(point, points++)) {
+ ret = -EFAULT;
+ for (j = i; j < count; j++)
+ dma_fence_chain_free(chains[j]);
+ goto err_chains;
+ }
+
+ drm_syncobj_add_point(syncobjs[i], chains[i], fence, point);
dma_fence_put(fence);
}
err_chains:
kfree(chains);
-err_points:
- kfree(points);
out:
- drm_syncobj_array_free(syncobjs, args->count_handles);
+ drm_syncobj_array_free(syncobjs, count);
return ret;
}
--
2.48.0
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 6/7] drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout
2025-03-18 15:54 [PATCH 0/7] A few drm_syncobj optimisations Tvrtko Ursulin
` (4 preceding siblings ...)
2025-03-18 15:54 ` [PATCH 5/7] drm/syncobj: Avoid temporary allocation in drm_syncobj_timeline_signal_ioctl Tvrtko Ursulin
@ 2025-03-18 15:54 ` Tvrtko Ursulin
2025-03-24 23:00 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find Tvrtko Ursulin
2025-03-24 23:17 ` [PATCH 0/7] A few drm_syncobj optimisations Maíra Canal
7 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-18 15:54 UTC (permalink / raw)
To: dri-devel; +Cc: kernel-dev, Tvrtko Ursulin
Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
sycobjs is relatively hot, but the 96% of the calls are for a single
object. (~4% for two points, and never more than three points. While
a more trivial workload like vkmark under Plasma is even more skewed
to single point waits.)
Therefore lets add a fast path to bypass the kcalloc/kfree and use a pre-
allocated stack array for those cases.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index b4563c696056..94932b89298f 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1035,6 +1035,7 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
uint32_t *idx,
ktime_t *deadline)
{
+ struct syncobj_wait_entry stack_entries[4];
struct syncobj_wait_entry *entries;
uint32_t signaled_count, i;
struct dma_fence *fence;
@@ -1049,9 +1050,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
!access_ok(user_points, count * sizeof(*user_points)))
return -EFAULT;
- entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
- if (!entries)
- return -ENOMEM;
+ if (count > ARRAY_SIZE(stack_entries)) {
+ entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+ } else {
+ memset(stack_entries, 0, sizeof(stack_entries));
+ entries = stack_entries;
+ }
/* Walk the list of sync objects and initialize entries. We do
* this up-front so that we can properly return -EINVAL if there is
@@ -1174,7 +1180,9 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
&entries[i].fence_cb);
dma_fence_put(entries[i].fence);
}
- kfree(entries);
+
+ if (entries != stack_entries)
+ kfree(entries);
return timeout;
}
--
2.48.0
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find
2025-03-18 15:54 [PATCH 0/7] A few drm_syncobj optimisations Tvrtko Ursulin
` (5 preceding siblings ...)
2025-03-18 15:54 ` [PATCH 6/7] drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout Tvrtko Ursulin
@ 2025-03-18 15:54 ` Tvrtko Ursulin
2025-03-24 23:06 ` Maíra Canal
2025-03-24 23:17 ` [PATCH 0/7] A few drm_syncobj optimisations Maíra Canal
7 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-18 15:54 UTC (permalink / raw)
To: dri-devel; +Cc: kernel-dev, Tvrtko Ursulin
Running the Cyberpunk 2077 benchmark we can observe that the lookup helper
is relatively hot, but the 97% of the calls are for a single object. (~3%
for two points, and never more than three points. While a more trivial
workload like vkmark under Plasma is even more skewed to single point
lookups.)
Therefore lets add a fast path to bypass the kmalloc_array/kfree and use a
pre-allocated stack array for those cases.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
---
drivers/gpu/drm/drm_syncobj.c | 53 +++++++++++++++++++++++++++--------
1 file changed, 41 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index 94932b89298f..233bdef53c87 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1223,6 +1223,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
static int drm_syncobj_array_find(struct drm_file *file_private,
u32 __user *handles,
uint32_t count,
+ struct drm_syncobj **stack_syncobjs,
+ u32 stack_count,
struct drm_syncobj ***syncobjs_out)
{
struct drm_syncobj **syncobjs;
@@ -1232,9 +1234,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
if (!access_ok(handles, count * sizeof(*handles)))
return -EFAULT;
- syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
- if (!syncobjs)
- return -ENOMEM;
+ if (count > stack_count) {
+ syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
+ if (!syncobjs)
+ return -ENOMEM;
+ } else {
+ syncobjs = stack_syncobjs;
+ }
for (i = 0; i < count; i++) {
u64 handle;
@@ -1260,25 +1266,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
drm_syncobj_put(syncobjs[i]);
i--;
}
- kfree(syncobjs);
+
+ if (syncobjs != stack_syncobjs)
+ kfree(syncobjs);
return ret;
}
static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
- uint32_t count)
+ uint32_t count,
+ struct drm_syncobj **stack_syncobjs)
{
uint32_t i;
for (i = 0; i < count; i++)
drm_syncobj_put(syncobjs[i]);
- kfree(syncobjs);
+
+ if (syncobjs != stack_syncobjs)
+ kfree(syncobjs);
}
int
drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_private)
{
+ struct drm_syncobj *stack_syncobjs[4];
struct drm_syncobj_wait *args = data;
ktime_t deadline, *pdeadline = NULL;
u32 count = args->count_handles;
@@ -1304,6 +1316,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
count,
+ stack_syncobjs,
+ ARRAY_SIZE(stack_syncobjs),
&syncobjs);
if (ret < 0)
return ret;
@@ -1321,7 +1335,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
&first,
pdeadline);
- drm_syncobj_array_free(syncobjs, count);
+ drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
if (timeout < 0)
return timeout;
@@ -1336,6 +1350,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_private)
{
struct drm_syncobj_timeline_wait *args = data;
+ struct drm_syncobj *stack_syncobjs[4];
ktime_t deadline, *pdeadline = NULL;
u32 count = args->count_handles;
struct drm_syncobj **syncobjs;
@@ -1361,6 +1376,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
count,
+ stack_syncobjs,
+ ARRAY_SIZE(stack_syncobjs),
&syncobjs);
if (ret < 0)
return ret;
@@ -1378,7 +1395,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
&first,
pdeadline);
- drm_syncobj_array_free(syncobjs, count);
+ drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
if (timeout < 0)
return timeout;
@@ -1496,6 +1513,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_private)
{
struct drm_syncobj_array *args = data;
+ struct drm_syncobj *stack_syncobjs[4];
struct drm_syncobj **syncobjs;
uint32_t i;
int ret;
@@ -1512,6 +1530,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
args->count_handles,
+ stack_syncobjs,
+ ARRAY_SIZE(stack_syncobjs),
&syncobjs);
if (ret < 0)
return ret;
@@ -1519,7 +1539,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
for (i = 0; i < args->count_handles; i++)
drm_syncobj_replace_fence(syncobjs[i], NULL);
- drm_syncobj_array_free(syncobjs, args->count_handles);
+ drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs);
return 0;
}
@@ -1529,6 +1549,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_private)
{
struct drm_syncobj_array *args = data;
+ struct drm_syncobj *stack_syncobjs[4];
struct drm_syncobj **syncobjs;
uint32_t i;
int ret;
@@ -1545,6 +1566,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
args->count_handles,
+ stack_syncobjs,
+ ARRAY_SIZE(stack_syncobjs),
&syncobjs);
if (ret < 0)
return ret;
@@ -1555,7 +1578,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
break;
}
- drm_syncobj_array_free(syncobjs, args->count_handles);
+ drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs);
return ret;
}
@@ -1567,6 +1590,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
struct drm_syncobj_timeline_array *args = data;
uint64_t __user *points = u64_to_user_ptr(args->points);
uint32_t i, j, count = args->count_handles;
+ struct drm_syncobj *stack_syncobjs[4];
struct drm_syncobj **syncobjs;
struct dma_fence_chain **chains;
int ret;
@@ -1586,6 +1610,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
count,
+ stack_syncobjs,
+ ARRAY_SIZE(stack_syncobjs),
&syncobjs);
if (ret < 0)
return ret;
@@ -1622,7 +1648,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
err_chains:
kfree(chains);
out:
- drm_syncobj_array_free(syncobjs, count);
+ drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
return ret;
}
@@ -1631,6 +1657,7 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_private)
{
struct drm_syncobj_timeline_array *args = data;
+ struct drm_syncobj *stack_syncobjs[4];
struct drm_syncobj **syncobjs;
uint64_t __user *points = u64_to_user_ptr(args->points);
uint32_t i;
@@ -1651,6 +1678,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
ret = drm_syncobj_array_find(file_private,
u64_to_user_ptr(args->handles),
args->count_handles,
+ stack_syncobjs,
+ ARRAY_SIZE(stack_syncobjs),
&syncobjs);
if (ret < 0)
return ret;
@@ -1694,7 +1723,7 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
break;
}
}
- drm_syncobj_array_free(syncobjs, args->count_handles);
+ drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs);
return ret;
}
--
2.48.0
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH 1/7] drm/syncobj: Remove unhelpful helper
2025-03-18 15:54 ` [PATCH 1/7] drm/syncobj: Remove unhelpful helper Tvrtko Ursulin
@ 2025-03-24 21:38 ` Maíra Canal
2025-03-25 9:12 ` Tvrtko Ursulin
0 siblings, 1 reply; 24+ messages in thread
From: Maíra Canal @ 2025-03-24 21:38 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
Some nits inline, but feel free to ignore them if you don't think they
are reasonable. Apart from that,
Reviewed-by: Maíra Canal <mcanal@igalia.com>
On 18/03/25 12:54, Tvrtko Ursulin wrote:
> Helper which fails to consolidate the code and instead just forks into two
> copies of the code based on a boolean parameter is not very helpful or
> readable. Lets just remove it and proof in the pudding is the net smaller
> code.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> ---
> drivers/gpu/drm/drm_syncobj.c | 98 ++++++++++++++++-------------------
> 1 file changed, 44 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index 4f2ab8a7b50f..d0d60c331df8 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -1221,42 +1221,6 @@ signed long drm_timeout_abs_to_jiffies(int64_t timeout_nsec)
> }
> EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
>
> -static int drm_syncobj_array_wait(struct drm_device *dev,
> - struct drm_file *file_private,
> - struct drm_syncobj_wait *wait,
> - struct drm_syncobj_timeline_wait *timeline_wait,
> - struct drm_syncobj **syncobjs, bool timeline,
> - ktime_t *deadline)
> -{
> - signed long timeout = 0;
> - uint32_t first = ~0;
> -
> - if (!timeline) {
> - timeout = drm_timeout_abs_to_jiffies(wait->timeout_nsec);
> - timeout = drm_syncobj_array_wait_timeout(syncobjs,
> - NULL,
> - wait->count_handles,
> - wait->flags,
> - timeout, &first,
> - deadline);
> - if (timeout < 0)
> - return timeout;
> - wait->first_signaled = first;
> - } else {
> - timeout = drm_timeout_abs_to_jiffies(timeline_wait->timeout_nsec);
> - timeout = drm_syncobj_array_wait_timeout(syncobjs,
> - u64_to_user_ptr(timeline_wait->points),
> - timeline_wait->count_handles,
> - timeline_wait->flags,
> - timeout, &first,
> - deadline);
> - if (timeout < 0)
> - return timeout;
> - timeline_wait->first_signaled = first;
> - }
> - return 0;
> -}
> -
> static int drm_syncobj_array_find(struct drm_file *file_private,
> void __user *user_handles,
> uint32_t count_handles,
> @@ -1319,9 +1283,12 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> struct drm_syncobj_wait *args = data;
> + ktime_t deadline, *pdeadline = NULL;
> + u32 count = args->count_handles;
From my point of view, this variable didn't make the code more readable.
I'd prefer to keep using `args->count_handles` in the code.
> struct drm_syncobj **syncobjs;
> unsigned int possible_flags;
> - ktime_t t, *tp = NULL;
> + u32 first = ~0;
> + long timeout;
> int ret = 0;
>
> if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ))
> @@ -1334,27 +1301,37 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
> if (args->flags & ~possible_flags)
> return -EINVAL;
>
> - if (args->count_handles == 0)
> + if (count == 0)
> return 0;
>
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> - args->count_handles,
> + count,
> &syncobjs);
> if (ret < 0)
> return ret;
>
> if (args->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE) {
> - t = ns_to_ktime(args->deadline_nsec);
> - tp = &t;
> + deadline = ns_to_ktime(args->deadline_nsec);
> + pdeadline = &deadline;
> }
>
> - ret = drm_syncobj_array_wait(dev, file_private,
> - args, NULL, syncobjs, false, tp);
> + timeout = drm_syncobj_array_wait_timeout(syncobjs,
> + NULL,
> + count,
> + args->flags,
> + drm_timeout_abs_to_jiffies(args->timeout_nsec),
Could you create a variable for the timeout instead of adding the
function as a parameter?
Same comments for `drm_syncobj_timeline_wait_ioctl()`.
Best Regards,
- Maíra
> + &first,
> + pdeadline);
>
> - drm_syncobj_array_free(syncobjs, args->count_handles);
> + drm_syncobj_array_free(syncobjs, count);
>
> - return ret;
> + if (timeout < 0)
> + return timeout;
> +
> + args->first_signaled = first;
> +
> + return 0;
> }
>
> int
> @@ -1362,9 +1339,12 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> struct drm_syncobj_timeline_wait *args = data;
> + ktime_t deadline, *pdeadline = NULL;
> + u32 count = args->count_handles;
> struct drm_syncobj **syncobjs;
> unsigned int possible_flags;
> - ktime_t t, *tp = NULL;
> + u32 first = ~0;
> + long timeout;
> int ret = 0;
>
> if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE))
> @@ -1378,27 +1358,37 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
> if (args->flags & ~possible_flags)
> return -EINVAL;
>
> - if (args->count_handles == 0)
> + if (count == 0)
> return 0;
>
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> - args->count_handles,
> + count,
> &syncobjs);
> if (ret < 0)
> return ret;
>
> if (args->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE) {
> - t = ns_to_ktime(args->deadline_nsec);
> - tp = &t;
> + deadline = ns_to_ktime(args->deadline_nsec);
> + pdeadline = &deadline;
> }
>
> - ret = drm_syncobj_array_wait(dev, file_private,
> - NULL, args, syncobjs, true, tp);
> + timeout = drm_syncobj_array_wait_timeout(syncobjs,
> + u64_to_user_ptr(args->points),
> + count,
> + args->flags,
> + drm_timeout_abs_to_jiffies(args->timeout_nsec),
> + &first,
> + pdeadline);
>
> - drm_syncobj_array_free(syncobjs, args->count_handles);
> + drm_syncobj_array_free(syncobjs, count);
>
> - return ret;
> + if (timeout < 0)
> + return timeout;
> +
> + args->first_signaled = first;
> +
> + return 0;
> }
>
> static void syncobj_eventfd_entry_fence_func(struct dma_fence *fence,
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/7] drm/syncobj: Do not allocate an array to store zeros when waiting
2025-03-18 15:54 ` [PATCH 2/7] drm/syncobj: Do not allocate an array to store zeros when waiting Tvrtko Ursulin
@ 2025-03-24 21:55 ` Maíra Canal
0 siblings, 0 replies; 24+ messages in thread
From: Maíra Canal @ 2025-03-24 21:55 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
On 18/03/25 12:54, Tvrtko Ursulin wrote:
> When waiting on syncobjs the current code allocates a temporary array only
> to fill it up with all zeros.
>
> We can avoid that by relying on the allocated entry array already being
> zero allocated.
>
> For the timeline mode we can fetch the timeline point values as we
> populate the entries array so also do not need this additional temporary
> allocation.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Reviewed-by: Maíra Canal <mcanal@igalia.com>
Best Regards,
- Maíra
> ---
> drivers/gpu/drm/drm_syncobj.c | 39 ++++++++++++++---------------------
> 1 file changed, 15 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index d0d60c331df8..fd5ba6c89666 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -1028,7 +1028,7 @@ static void syncobj_wait_syncobj_func(struct drm_syncobj *syncobj,
> }
>
> static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> - void __user *user_points,
> + u64 __user *user_points,
> uint32_t count,
> uint32_t flags,
> signed long timeout,
> @@ -1036,9 +1036,8 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> ktime_t *deadline)
> {
> struct syncobj_wait_entry *entries;
> - struct dma_fence *fence;
> - uint64_t *points;
> uint32_t signaled_count, i;
> + struct dma_fence *fence;
>
> if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT |
> DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) {
> @@ -1046,24 +1045,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> lockdep_assert_none_held_once();
> }
>
> - points = kmalloc_array(count, sizeof(*points), GFP_KERNEL);
> - if (points == NULL)
> - return -ENOMEM;
> -
> - if (!user_points) {
> - memset(points, 0, count * sizeof(uint64_t));
> -
> - } else if (copy_from_user(points, user_points,
> - sizeof(uint64_t) * count)) {
> - timeout = -EFAULT;
> - goto err_free_points;
> - }
> + if (user_points &&
> + !access_ok(user_points, count * sizeof(*user_points)))
> + return -EFAULT;
>
> entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
> - if (!entries) {
> - timeout = -ENOMEM;
> - goto err_free_points;
> - }
> + if (!entries)
> + return -ENOMEM;
> +
> /* Walk the list of sync objects and initialize entries. We do
> * this up-front so that we can properly return -EINVAL if there is
> * a syncobj with a missing fence and then never have the chance of
> @@ -1074,9 +1063,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> struct dma_fence *fence;
>
> entries[i].task = current;
> - entries[i].point = points[i];
> + if (user_points &&
> + __get_user(entries[i].point, user_points++)) {
> + timeout = -EFAULT;
> + goto cleanup_entries;
> + }
> fence = drm_syncobj_fence_get(syncobjs[i]);
> - if (!fence || dma_fence_chain_find_seqno(&fence, points[i])) {
> + if (!fence ||
> + dma_fence_chain_find_seqno(&fence, entries[i].point)) {
> dma_fence_put(fence);
> if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT |
> DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) {
> @@ -1182,9 +1176,6 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> }
> kfree(entries);
>
> -err_free_points:
> - kfree(points);
> -
> return timeout;
> }
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 3/7] drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find
2025-03-18 15:54 ` [PATCH 3/7] drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find Tvrtko Ursulin
@ 2025-03-24 22:29 ` Maíra Canal
2025-03-25 9:26 ` Tvrtko Ursulin
0 siblings, 1 reply; 24+ messages in thread
From: Maíra Canal @ 2025-03-24 22:29 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
On 18/03/25 12:54, Tvrtko Ursulin wrote:
> Drm_syncobj_array_find() helper is used from many userspace ioctl entry
> points with the task of looking up userspace handles to internal objects.
>
> We can easily avoid one temporary allocation by making it read the handles
> as it is looking them up.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> ---
> drivers/gpu/drm/drm_syncobj.c | 44 +++++++++++++++++------------------
> 1 file changed, 21 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index fd5ba6c89666..cdda2df06bec 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -1213,48 +1213,46 @@ signed long drm_timeout_abs_to_jiffies(int64_t timeout_nsec)
> EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
>
> static int drm_syncobj_array_find(struct drm_file *file_private,
> - void __user *user_handles,
> - uint32_t count_handles,
> + u32 __user *handles,
> + uint32_t count,
> struct drm_syncobj ***syncobjs_out)
> {
> - uint32_t i, *handles;
> struct drm_syncobj **syncobjs;
> + uint32_t i;
> int ret;
>
> - handles = kmalloc_array(count_handles, sizeof(*handles), GFP_KERNEL);
> - if (handles == NULL)
> + if (!access_ok(handles, count * sizeof(*handles)))
> + return -EFAULT;
> +
> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
> + if (!syncobjs)
> return -ENOMEM;
>
> - if (copy_from_user(handles, user_handles,
> - sizeof(uint32_t) * count_handles)) {
> - ret = -EFAULT;
> - goto err_free_handles;
> - }
> + for (i = 0; i < count; i++) {
> + u64 handle;
I believe this is a u32.
>
> - syncobjs = kmalloc_array(count_handles, sizeof(*syncobjs), GFP_KERNEL);
> - if (syncobjs == NULL) {
> - ret = -ENOMEM;
> - goto err_free_handles;
> - }
> -
> - for (i = 0; i < count_handles; i++) {
> - syncobjs[i] = drm_syncobj_find(file_private, handles[i]);
> + if (__get_user(handle, handles++)) {
> + ret = -EFAULT;
> + syncobjs[i] = NULL;
Do we need to assign syncobjs[i] to NULL? Can we just go to
err_put_syncobjs?
> + goto err_put_syncobjs;
> + }
> + syncobjs[i] = drm_syncobj_find(file_private, handle);
> if (!syncobjs[i]) {
> ret = -ENOENT;
> goto err_put_syncobjs;
> }
> }
>
> - kfree(handles);
> *syncobjs_out = syncobjs;
> return 0;
>
> err_put_syncobjs:
> - while (i-- > 0)
> - drm_syncobj_put(syncobjs[i]);
> + while (i > 0) {
> + if (syncobjs[i])
I'm not sure if I understand which scenario this would be needed. For
the first syncobj that we don't find, we go to err_free_handles and
AFAIU all previous syncobjs exist, so there wouldn't be a need to check
if the syncobj != NULL.
Best Regards,
- Maíra
> + drm_syncobj_put(syncobjs[i]);
> + i--;
> + }
> kfree(syncobjs);
> -err_free_handles:
> - kfree(handles);
>
> return ret;
> }
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 4/7] drm/syncobj: Use put_user in drm_syncobj_query_ioctl
2025-03-18 15:54 ` [PATCH 4/7] drm/syncobj: Use put_user in drm_syncobj_query_ioctl Tvrtko Ursulin
@ 2025-03-24 22:32 ` Maíra Canal
0 siblings, 0 replies; 24+ messages in thread
From: Maíra Canal @ 2025-03-24 22:32 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
On 18/03/25 12:54, Tvrtko Ursulin wrote:
> Since the query loop is using copy_to_user() to write out a single u64 at
> a time it feels more natural (and is a tiny bit more compact) to replace
> it with put_user().
>
> Access_ok() check is added to the input checking for an early bailout in
> case of a bad buffer passed in.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Reviewed-by: Maíra Canal <mcanal@igalia.com>
Best Regards,
- Maíra
> ---
> drivers/gpu/drm/drm_syncobj.c | 9 ++++++---
> 1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index cdda2df06bec..74d1dc0d1f8b 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -1643,6 +1643,9 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
> if (args->count_handles == 0)
> return -EINVAL;
>
> + if (!access_ok(points, args->count_handles * sizeof(*points)))
> + return -EFAULT;
> +
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> args->count_handles,
> @@ -1684,10 +1687,10 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
> point = 0;
> }
> dma_fence_put(fence);
> - ret = copy_to_user(&points[i], &point, sizeof(uint64_t));
> - ret = ret ? -EFAULT : 0;
> - if (ret)
> + if (__put_user(point, points++)) {
> + ret = -EFAULT;
> break;
> + }
> }
> drm_syncobj_array_free(syncobjs, args->count_handles);
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 5/7] drm/syncobj: Avoid temporary allocation in drm_syncobj_timeline_signal_ioctl
2025-03-18 15:54 ` [PATCH 5/7] drm/syncobj: Avoid temporary allocation in drm_syncobj_timeline_signal_ioctl Tvrtko Ursulin
@ 2025-03-24 22:50 ` Maíra Canal
0 siblings, 0 replies; 24+ messages in thread
From: Maíra Canal @ 2025-03-24 22:50 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
On 18/03/25 12:54, Tvrtko Ursulin wrote:
> We can avoid one of the two temporary allocations if we read the userspace
> supplied timeline points as we go along.
>
> The only new complication is to unwind unused fence chains on the error
> path, but even that code was already present in the function.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Reviewed-by: Maíra Canal <mcanal@igalia.com>
Best Regards,
- Maíra
> ---
> drivers/gpu/drm/drm_syncobj.c | 46 +++++++++++++++--------------------
> 1 file changed, 20 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index 74d1dc0d1f8b..b4563c696056 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -1557,10 +1557,10 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> struct drm_syncobj_timeline_array *args = data;
> + uint64_t __user *points = u64_to_user_ptr(args->points);
> + uint32_t i, j, count = args->count_handles;
> struct drm_syncobj **syncobjs;
> struct dma_fence_chain **chains;
> - uint64_t *points;
> - uint32_t i, j;
> int ret;
>
> if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE))
> @@ -1572,33 +1572,22 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> if (args->count_handles == 0)
> return -EINVAL;
>
> + if (!access_ok(points, count * sizeof(*points)))
> + return -EFAULT;
> +
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> - args->count_handles,
> + count,
> &syncobjs);
> if (ret < 0)
> return ret;
>
> - points = kmalloc_array(args->count_handles, sizeof(*points),
> - GFP_KERNEL);
> - if (!points) {
> - ret = -ENOMEM;
> - goto out;
> - }
> - if (!u64_to_user_ptr(args->points)) {
> - memset(points, 0, args->count_handles * sizeof(uint64_t));
> - } else if (copy_from_user(points, u64_to_user_ptr(args->points),
> - sizeof(uint64_t) * args->count_handles)) {
> - ret = -EFAULT;
> - goto err_points;
> - }
> -
> - chains = kmalloc_array(args->count_handles, sizeof(void *), GFP_KERNEL);
> + chains = kmalloc_array(count, sizeof(void *), GFP_KERNEL);
> if (!chains) {
> ret = -ENOMEM;
> - goto err_points;
> + goto out;
> }
> - for (i = 0; i < args->count_handles; i++) {
> + for (i = 0; i < count; i++) {
> chains[i] = dma_fence_chain_alloc();
> if (!chains[i]) {
> for (j = 0; j < i; j++)
> @@ -1608,19 +1597,24 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> }
> }
>
> - for (i = 0; i < args->count_handles; i++) {
> + for (i = 0; i < count; i++) {
> struct dma_fence *fence = dma_fence_get_stub();
> + u64 point = 0;
>
> - drm_syncobj_add_point(syncobjs[i], chains[i],
> - fence, points[i]);
> + if (points && __get_user(point, points++)) {
> + ret = -EFAULT;
> + for (j = i; j < count; j++)
> + dma_fence_chain_free(chains[j]);
> + goto err_chains;
> + }
> +
> + drm_syncobj_add_point(syncobjs[i], chains[i], fence, point);
> dma_fence_put(fence);> }
> err_chains:
> kfree(chains);
> -err_points:
> - kfree(points);
> out:
> - drm_syncobj_array_free(syncobjs, args->count_handles);
> + drm_syncobj_array_free(syncobjs, count);
>
> return ret;
> }
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 6/7] drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout
2025-03-18 15:54 ` [PATCH 6/7] drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout Tvrtko Ursulin
@ 2025-03-24 23:00 ` Maíra Canal
2025-03-25 9:33 ` Tvrtko Ursulin
0 siblings, 1 reply; 24+ messages in thread
From: Maíra Canal @ 2025-03-24 23:00 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
On 18/03/25 12:54, Tvrtko Ursulin wrote:
> Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
> sycobjs is relatively hot, but the 96% of the calls are for a single
> object. (~4% for two points, and never more than three points. While
> a more trivial workload like vkmark under Plasma is even more skewed
> to single point waits.)
>
> Therefore lets add a fast path to bypass the kcalloc/kfree and use a pre-
> allocated stack array for those cases.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> ---
> drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++----
> 1 file changed, 12 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index b4563c696056..94932b89298f 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -1035,6 +1035,7 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> uint32_t *idx,
> ktime_t *deadline)
> {
> + struct syncobj_wait_entry stack_entries[4];
Can't you initialize as
struct syncobj_wait_entry stack_entries[4] = {0};
?
> struct syncobj_wait_entry *entries;
> uint32_t signaled_count, i;
> struct dma_fence *fence;
> @@ -1049,9 +1050,14 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> !access_ok(user_points, count * sizeof(*user_points)))
> return -EFAULT;
>
> - entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
> - if (!entries)
> - return -ENOMEM;
> + if (count > ARRAY_SIZE(stack_entries)) {
> + entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
> + if (!entries)
> + return -ENOMEM;
> + } else {
> + memset(stack_entries, 0, sizeof(stack_entries));
Then, you can avoid this step.
> + entries = stack_entries;
> + }
>
> /* Walk the list of sync objects and initialize entries. We do
> * this up-front so that we can properly return -EINVAL if there is
> @@ -1174,7 +1180,9 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> &entries[i].fence_cb);
> dma_fence_put(entries[i].fence);
> }
> - kfree(entries);
> +
> + if (entries != stack_entries)
You can initialize `entries = NULL` and avoid this step.
Anyway,
Reviewed-by: Maíra Canal <mcanal@igalia.com>
Best Regards,
- Maíra
> + kfree(entries);
>
> return timeout;
> }
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find
2025-03-18 15:54 ` [PATCH 7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find Tvrtko Ursulin
@ 2025-03-24 23:06 ` Maíra Canal
2025-03-25 9:54 ` Tvrtko Ursulin
0 siblings, 1 reply; 24+ messages in thread
From: Maíra Canal @ 2025-03-24 23:06 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
Some nits inline, mostly personal comments. In any case,
Reviewed-by: Maíra Canal <mcanal@igalia.com>
On 18/03/25 12:54, Tvrtko Ursulin wrote:
> Running the Cyberpunk 2077 benchmark we can observe that the lookup helper
> is relatively hot, but the 97% of the calls are for a single object. (~3%
> for two points, and never more than three points. While a more trivial
> workload like vkmark under Plasma is even more skewed to single point
> lookups.)
>
> Therefore lets add a fast path to bypass the kmalloc_array/kfree and use a
> pre-allocated stack array for those cases.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> ---
> drivers/gpu/drm/drm_syncobj.c | 53 +++++++++++++++++++++++++++--------
> 1 file changed, 41 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index 94932b89298f..233bdef53c87 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -1223,6 +1223,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
> static int drm_syncobj_array_find(struct drm_file *file_private,
> u32 __user *handles,
> uint32_t count,
> + struct drm_syncobj **stack_syncobjs,
> + u32 stack_count,
> struct drm_syncobj ***syncobjs_out)
> {
> struct drm_syncobj **syncobjs;
> @@ -1232,9 +1234,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
> if (!access_ok(handles, count * sizeof(*handles)))
> return -EFAULT;
>
> - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
> - if (!syncobjs)
> - return -ENOMEM;
> + if (count > stack_count) {
I believe it's worth adding a comment mentioning that using the stack
syncobj is a fast-path that covers most cases.
> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
> + if (!syncobjs)
> + return -ENOMEM;
> + } else {
> + syncobjs = stack_syncobjs;
> + }
>
> for (i = 0; i < count; i++) {
> u64 handle;
> @@ -1260,25 +1266,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
> drm_syncobj_put(syncobjs[i]);
> i--;
> }
> - kfree(syncobjs);
> +
> + if (syncobjs != stack_syncobjs)
Again, I have a slight preference to make `syncobjs = NULL` and avoid
this if condition. But it's just a personal preference.
> + kfree(syncobjs);
>
> return ret;
> }
>
> static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
> - uint32_t count)
> + uint32_t count,
> + struct drm_syncobj **stack_syncobjs)
IMO, I think the order `syncobjs, stack_syncobjs, count` is a bit more
intuitive.
> {
> uint32_t i;
>
> for (i = 0; i < count; i++)
> drm_syncobj_put(syncobjs[i]);
> - kfree(syncobjs);
> +
> + if (syncobjs != stack_syncobjs)
> + kfree(syncobjs);
> }
>
> int
> drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> + struct drm_syncobj *stack_syncobjs[4];
> struct drm_syncobj_wait *args = data;
> ktime_t deadline, *pdeadline = NULL;
> u32 count = args->count_handles;
> @@ -1304,6 +1316,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> count,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1321,7 +1335,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
> &first,
> pdeadline);
>
> - drm_syncobj_array_free(syncobjs, count);
> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
>
> if (timeout < 0)
> return timeout;
> @@ -1336,6 +1350,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> struct drm_syncobj_timeline_wait *args = data;
> + struct drm_syncobj *stack_syncobjs[4];
Zero initialize it?
Best Regards,
- Maíra
> ktime_t deadline, *pdeadline = NULL;
> u32 count = args->count_handles;
> struct drm_syncobj **syncobjs;
> @@ -1361,6 +1376,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> count,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1378,7 +1395,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
> &first,
> pdeadline);
>
> - drm_syncobj_array_free(syncobjs, count);
> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
>
> if (timeout < 0)
> return timeout;
> @@ -1496,6 +1513,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> struct drm_syncobj_array *args = data;
> + struct drm_syncobj *stack_syncobjs[4];
> struct drm_syncobj **syncobjs;
> uint32_t i;
> int ret;
> @@ -1512,6 +1530,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> args->count_handles,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1519,7 +1539,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
> for (i = 0; i < args->count_handles; i++)
> drm_syncobj_replace_fence(syncobjs[i], NULL);
>
> - drm_syncobj_array_free(syncobjs, args->count_handles);
> + drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs);
>
> return 0;
> }
> @@ -1529,6 +1549,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> struct drm_syncobj_array *args = data;
> + struct drm_syncobj *stack_syncobjs[4];
> struct drm_syncobj **syncobjs;
> uint32_t i;
> int ret;
> @@ -1545,6 +1566,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> args->count_handles,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1555,7 +1578,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
> break;
> }
>
> - drm_syncobj_array_free(syncobjs, args->count_handles);
> + drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs);
>
> return ret;
> }
> @@ -1567,6 +1590,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> struct drm_syncobj_timeline_array *args = data;
> uint64_t __user *points = u64_to_user_ptr(args->points);
> uint32_t i, j, count = args->count_handles;
> + struct drm_syncobj *stack_syncobjs[4];
> struct drm_syncobj **syncobjs;
> struct dma_fence_chain **chains;
> int ret;
> @@ -1586,6 +1610,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> count,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1622,7 +1648,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> err_chains:
> kfree(chains);
> out:
> - drm_syncobj_array_free(syncobjs, count);
> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
>
> return ret;
> }
> @@ -1631,6 +1657,7 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> struct drm_syncobj_timeline_array *args = data;
> + struct drm_syncobj *stack_syncobjs[4];
> struct drm_syncobj **syncobjs;
> uint64_t __user *points = u64_to_user_ptr(args->points);
> uint32_t i;
> @@ -1651,6 +1678,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> args->count_handles,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1694,7 +1723,7 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
> break;
> }
> }
> - drm_syncobj_array_free(syncobjs, args->count_handles);
> + drm_syncobj_array_free(syncobjs, args->count_handles, stack_syncobjs);
>
> return ret;
> }
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 0/7] A few drm_syncobj optimisations
2025-03-18 15:54 [PATCH 0/7] A few drm_syncobj optimisations Tvrtko Ursulin
` (6 preceding siblings ...)
2025-03-18 15:54 ` [PATCH 7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find Tvrtko Ursulin
@ 2025-03-24 23:17 ` Maíra Canal
2025-03-25 9:57 ` Tvrtko Ursulin
7 siblings, 1 reply; 24+ messages in thread
From: Maíra Canal @ 2025-03-24 23:17 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
Thanks for this patchset! I applied this patchset to the RPi downstream
kernel 6.13.7 [1] and saw an FPS improvement of approximately 5.85%
with "vkgears -present-mailbox" on the RPi 5.
I did five 100 seconds runs on each kernel and here are my results:
### 6.13.7
| Run | Min FPS | Max FPS | Avg FPS |
|----------|-------------|-------------|-------------|
| Run #1 | 6646.52 | 6874.77 | 6739.313 |
| Run #2 | 5387.04 | 6723.274 | 6046.773 |
| Run #3 | 6230.49 | 6823.47 | 6423.923 |
| Run #4 | 5269.678 | 5870.59 | 5501.858 |
| Run #5 | 5504.54 | 6285.91 | 5859.724 |
* Overall Avg FPS: 6114.318 FPS
### 6.13.7 + DRM Syncobj optimisations
| Run | Min FPS | Max FPS | Avg FPS |
|----------|-------------|-------------|-------------|
| Run #1 | 6089.05 | 7296.27 | 6859.724 |
| Run #2 | 6022.48 | 7264 | 6818.518 |
| Run #3 | 5987.68 | 6188.77 | 6041.365 |
| Run #4 | 5699.27 | 6448.99 | 6190.374 |
| Run #5 | 6199.27 | 6791.15 | 6450.900 |
* Overall Avg FPS: 6472.176 FPS
[1] https://github.com/raspberrypi/linux/tree/rpi-6.13.y
Best Regards,
- Maíra
On 18/03/25 12:54, Tvrtko Ursulin wrote:
> A small set of drm_syncobj optimisations which should make things a tiny bit
> more efficient on the CPU side of things.
>
> Improvement seems to be around 1.5%* more FPS if observed with "vkgears
> -present-mailbox" on a Steam Deck Plasma desktop, but I am reluctant to make a
> definitive claim on the numbers since there is some run to run variance. But, as
> suggested by Michel Dänzer, I did do a five ~100 second runs on the each kernel
> to be able to show the ministat analysis.
>
> x before
> + after
> +------------------------------------------------------------+
> | x + |
> | x x + |
> | x xx ++++ |
> | x x xx x ++++ |
> | x xx x xx x+ ++++ |
> | xxxxx xxxxxx+ ++++ + + |
> | xxxxxxx xxxxxx+x ++++ +++ |
> | x xxxxxxxxxxx*xx+* x++++++++ ++ |
> | x x xxxxxxxxxxxx**x*+*+*++++++++ ++++ + |
> | xx x xxxxxxxxxx*x****+***+**+++++ ++++++ |
> |x xxx x xxxxx*x****x***********+*++**+++++++ + + +|
> | |_______A______| |
> | |______A_______| |
> +------------------------------------------------------------+
> N Min Max Median Avg Stddev
> x 135 21697.58 22809.467 22321.396 22307.707 198.75011
> + 118 22200.746 23277.09 22661.4 22671.442 192.10609
> Difference at 95.0% confidence
> 363.735 +/- 48.3345
> 1.63054% +/- 0.216672%
> (Student's t, pooled s = 195.681)
>
> Tvrtko Ursulin (7):
> drm/syncobj: Remove unhelpful helper
> drm/syncobj: Do not allocate an array to store zeros when waiting
> drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find
> drm/syncobj: Use put_user in drm_syncobj_query_ioctl
> drm/syncobj: Avoid temporary allocation in
> drm_syncobj_timeline_signal_ioctl
> drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout
> drm/syncobj: Add a fast path to drm_syncobj_array_find
>
> drivers/gpu/drm/drm_syncobj.c | 281 ++++++++++++++++++----------------
> 1 file changed, 147 insertions(+), 134 deletions(-)
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 1/7] drm/syncobj: Remove unhelpful helper
2025-03-24 21:38 ` Maíra Canal
@ 2025-03-25 9:12 ` Tvrtko Ursulin
0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-25 9:12 UTC (permalink / raw)
To: Maíra Canal, dri-devel; +Cc: kernel-dev
On 24/03/2025 21:38, Maíra Canal wrote:
> Hi Tvrtko,
>
> Some nits inline, but feel free to ignore them if you don't think they
> are reasonable. Apart from that,
>
> Reviewed-by: Maíra Canal <mcanal@igalia.com>
>
> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>> Helper which fails to consolidate the code and instead just forks into
>> two
>> copies of the code based on a boolean parameter is not very helpful or
>> readable. Lets just remove it and proof in the pudding is the net smaller
>> code.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>> ---
>> drivers/gpu/drm/drm_syncobj.c | 98 ++++++++++++++++-------------------
>> 1 file changed, 44 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/
>> drm_syncobj.c
>> index 4f2ab8a7b50f..d0d60c331df8 100644
>> --- a/drivers/gpu/drm/drm_syncobj.c
>> +++ b/drivers/gpu/drm/drm_syncobj.c
>> @@ -1221,42 +1221,6 @@ signed long drm_timeout_abs_to_jiffies(int64_t
>> timeout_nsec)
>> }
>> EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
>> -static int drm_syncobj_array_wait(struct drm_device *dev,
>> - struct drm_file *file_private,
>> - struct drm_syncobj_wait *wait,
>> - struct drm_syncobj_timeline_wait *timeline_wait,
>> - struct drm_syncobj **syncobjs, bool timeline,
>> - ktime_t *deadline)
>> -{
>> - signed long timeout = 0;
>> - uint32_t first = ~0;
>> -
>> - if (!timeline) {
>> - timeout = drm_timeout_abs_to_jiffies(wait->timeout_nsec);
>> - timeout = drm_syncobj_array_wait_timeout(syncobjs,
>> - NULL,
>> - wait->count_handles,
>> - wait->flags,
>> - timeout, &first,
>> - deadline);
>> - if (timeout < 0)
>> - return timeout;
>> - wait->first_signaled = first;
>> - } else {
>> - timeout = drm_timeout_abs_to_jiffies(timeline_wait-
>> >timeout_nsec);
>> - timeout = drm_syncobj_array_wait_timeout(syncobjs,
>> - u64_to_user_ptr(timeline_wait->points),
>> - timeline_wait->count_handles,
>> - timeline_wait->flags,
>> - timeout, &first,
>> - deadline);
>> - if (timeout < 0)
>> - return timeout;
>> - timeline_wait->first_signaled = first;
>> - }
>> - return 0;
>> -}
>> -
>> static int drm_syncobj_array_find(struct drm_file *file_private,
>> void __user *user_handles,
>> uint32_t count_handles,
>> @@ -1319,9 +1283,12 @@ drm_syncobj_wait_ioctl(struct drm_device *dev,
>> void *data,
>> struct drm_file *file_private)
>> {
>> struct drm_syncobj_wait *args = data;
>> + ktime_t deadline, *pdeadline = NULL;
>> + u32 count = args->count_handles;
>
> From my point of view, this variable didn't make the code more readable.
> I'd prefer to keep using `args->count_handles` in the code.
I think this one is 50-50. I made it a local since it used four times in
both functions. Since you are not strongly against it I opted to keep it
as is.
>> struct drm_syncobj **syncobjs;
>> unsigned int possible_flags;
>> - ktime_t t, *tp = NULL;
>> + u32 first = ~0;
>> + long timeout;
>> int ret = 0;
>> if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ))
>> @@ -1334,27 +1301,37 @@ drm_syncobj_wait_ioctl(struct drm_device *dev,
>> void *data,
>> if (args->flags & ~possible_flags)
>> return -EINVAL;
>> - if (args->count_handles == 0)
>> + if (count == 0)
>> return 0;
>> ret = drm_syncobj_array_find(file_private,
>> u64_to_user_ptr(args->handles),
>> - args->count_handles,
>> + count,
>> &syncobjs);
>> if (ret < 0)
>> return ret;
>> if (args->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE) {
>> - t = ns_to_ktime(args->deadline_nsec);
>> - tp = &t;
>> + deadline = ns_to_ktime(args->deadline_nsec);
>> + pdeadline = &deadline;
>> }
>> - ret = drm_syncobj_array_wait(dev, file_private,
>> - args, NULL, syncobjs, false, tp);
>> + timeout = drm_syncobj_array_wait_timeout(syncobjs,
>> + NULL,
>> + count,
>> + args->flags,
>> + drm_timeout_abs_to_jiffies(args->timeout_nsec),
>
> Could you create a variable for the timeout instead of adding the
> function as a parameter?
>
> Same comments for `drm_syncobj_timeline_wait_ioctl()`.
Good suggestion, done.
Regards,
Tvrtko
>> + &first,
>> + pdeadline);
>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>> + drm_syncobj_array_free(syncobjs, count);
>> - return ret;
>> + if (timeout < 0)
>> + return timeout;
>> +
>> + args->first_signaled = first;
>> +
>> + return 0;
>> }
>> int
>> @@ -1362,9 +1339,12 @@ drm_syncobj_timeline_wait_ioctl(struct
>> drm_device *dev, void *data,
>> struct drm_file *file_private)
>> {
>> struct drm_syncobj_timeline_wait *args = data;
>> + ktime_t deadline, *pdeadline = NULL;
>> + u32 count = args->count_handles;
>> struct drm_syncobj **syncobjs;
>> unsigned int possible_flags;
>> - ktime_t t, *tp = NULL;
>> + u32 first = ~0;
>> + long timeout;
>> int ret = 0;
>> if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE))
>> @@ -1378,27 +1358,37 @@ drm_syncobj_timeline_wait_ioctl(struct
>> drm_device *dev, void *data,
>> if (args->flags & ~possible_flags)
>> return -EINVAL;
>> - if (args->count_handles == 0)
>> + if (count == 0)
>> return 0;
>> ret = drm_syncobj_array_find(file_private,
>> u64_to_user_ptr(args->handles),
>> - args->count_handles,
>> + count,
>> &syncobjs);
>> if (ret < 0)
>> return ret;
>> if (args->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE) {
>> - t = ns_to_ktime(args->deadline_nsec);
>> - tp = &t;
>> + deadline = ns_to_ktime(args->deadline_nsec);
>> + pdeadline = &deadline;
>> }
>> - ret = drm_syncobj_array_wait(dev, file_private,
>> - NULL, args, syncobjs, true, tp);
>> + timeout = drm_syncobj_array_wait_timeout(syncobjs,
>> + u64_to_user_ptr(args->points),
>> + count,
>> + args->flags,
>> + drm_timeout_abs_to_jiffies(args->timeout_nsec),
>> + &first,
>> + pdeadline);
>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>> + drm_syncobj_array_free(syncobjs, count);
>> - return ret;
>> + if (timeout < 0)
>> + return timeout;
>> +
>> + args->first_signaled = first;
>> +
>> + return 0;
>> }
>> static void syncobj_eventfd_entry_fence_func(struct dma_fence *fence,
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 3/7] drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find
2025-03-24 22:29 ` Maíra Canal
@ 2025-03-25 9:26 ` Tvrtko Ursulin
0 siblings, 0 replies; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-25 9:26 UTC (permalink / raw)
To: Maíra Canal, dri-devel; +Cc: kernel-dev
On 24/03/2025 22:29, Maíra Canal wrote:
> Hi Tvrtko,
>
> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>> Drm_syncobj_array_find() helper is used from many userspace ioctl entry
>> points with the task of looking up userspace handles to internal objects.
>>
>> We can easily avoid one temporary allocation by making it read the
>> handles
>> as it is looking them up.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>> ---
>> drivers/gpu/drm/drm_syncobj.c | 44 +++++++++++++++++------------------
>> 1 file changed, 21 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/
>> drm_syncobj.c
>> index fd5ba6c89666..cdda2df06bec 100644
>> --- a/drivers/gpu/drm/drm_syncobj.c
>> +++ b/drivers/gpu/drm/drm_syncobj.c
>> @@ -1213,48 +1213,46 @@ signed long drm_timeout_abs_to_jiffies(int64_t
>> timeout_nsec)
>> EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
>> static int drm_syncobj_array_find(struct drm_file *file_private,
>> - void __user *user_handles,
>> - uint32_t count_handles,
>> + u32 __user *handles,
>> + uint32_t count,
>> struct drm_syncobj ***syncobjs_out)
>> {
>> - uint32_t i, *handles;
>> struct drm_syncobj **syncobjs;
>> + uint32_t i;
>> int ret;
>> - handles = kmalloc_array(count_handles, sizeof(*handles),
>> GFP_KERNEL);
>> - if (handles == NULL)
>> + if (!access_ok(handles, count * sizeof(*handles)))
>> + return -EFAULT;
>> +
>> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
>> + if (!syncobjs)
>> return -ENOMEM;
>> - if (copy_from_user(handles, user_handles,
>> - sizeof(uint32_t) * count_handles)) {
>> - ret = -EFAULT;
>> - goto err_free_handles;
>> - }
>> + for (i = 0; i < count; i++) {
>> + u64 handle;
>
> I believe this is a u32.
Well spotted.
>> - syncobjs = kmalloc_array(count_handles, sizeof(*syncobjs),
>> GFP_KERNEL);
>> - if (syncobjs == NULL) {
>> - ret = -ENOMEM;
>> - goto err_free_handles;
>> - }
>> -
>> - for (i = 0; i < count_handles; i++) {
>> - syncobjs[i] = drm_syncobj_find(file_private, handles[i]);
>> + if (__get_user(handle, handles++)) {
>> + ret = -EFAULT;
>> + syncobjs[i] = NULL;
>
> Do we need to assign syncobjs[i] to NULL? Can we just go to
> err_put_syncobjs?
>
>> + goto err_put_syncobjs;
>> + }
>> + syncobjs[i] = drm_syncobj_find(file_private, handle);
>> if (!syncobjs[i]) {
>> ret = -ENOENT;
>> goto err_put_syncobjs;
>> }
>> }
>> - kfree(handles);
>> *syncobjs_out = syncobjs;
>> return 0;
>> err_put_syncobjs:
>> - while (i-- > 0)
>> - drm_syncobj_put(syncobjs[i]);
>> + while (i > 0) {
>> + if (syncobjs[i])
>
> I'm not sure if I understand which scenario this would be needed. For
> the first syncobj that we don't find, we go to err_free_handles and
> AFAIU all previous syncobjs exist, so there wouldn't be a need to check
> if the syncobj != NULL.
Well spotted again. I don't remember why I thought I needed to change
this since it looks the old unwind works fine with added __get_user
failure path.
Regards,
Tvrtko
>> + drm_syncobj_put(syncobjs[i]);
>> + i--;
>> + }
>> kfree(syncobjs);
>> -err_free_handles:
>> - kfree(handles);
>> return ret;
>> }
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 6/7] drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout
2025-03-24 23:00 ` Maíra Canal
@ 2025-03-25 9:33 ` Tvrtko Ursulin
2025-03-25 19:56 ` Maíra Canal
0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-25 9:33 UTC (permalink / raw)
To: Maíra Canal, dri-devel; +Cc: kernel-dev
On 24/03/2025 23:00, Maíra Canal wrote:
> Hi Tvrtko,
>
> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>> Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
>> sycobjs is relatively hot, but the 96% of the calls are for a single
>> object. (~4% for two points, and never more than three points. While
>> a more trivial workload like vkmark under Plasma is even more skewed
>> to single point waits.)
>>
>> Therefore lets add a fast path to bypass the kcalloc/kfree and use a pre-
>> allocated stack array for those cases.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>> ---
>> drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++----
>> 1 file changed, 12 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/
>> drm_syncobj.c
>> index b4563c696056..94932b89298f 100644
>> --- a/drivers/gpu/drm/drm_syncobj.c
>> +++ b/drivers/gpu/drm/drm_syncobj.c
>> @@ -1035,6 +1035,7 @@ static signed long
>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>> uint32_t *idx,
>> ktime_t *deadline)
>> {
>> + struct syncobj_wait_entry stack_entries[4];
>
> Can't you initialize as
>
> struct syncobj_wait_entry stack_entries[4] = {0};
>
> ?
Could do but I preferred to only do it when it is used.
>
>> struct syncobj_wait_entry *entries;
>> uint32_t signaled_count, i;
>> struct dma_fence *fence;
>> @@ -1049,9 +1050,14 @@ static signed long
>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>> !access_ok(user_points, count * sizeof(*user_points)))
>> return -EFAULT;
>> - entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
>> - if (!entries)
>> - return -ENOMEM;
>> + if (count > ARRAY_SIZE(stack_entries)) {
>> + entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
>> + if (!entries)
>> + return -ENOMEM;
>> + } else {
>> + memset(stack_entries, 0, sizeof(stack_entries));
>
> Then, you can avoid this step.
>
>> + entries = stack_entries;
>> + }
>> /* Walk the list of sync objects and initialize entries. We do
>> * this up-front so that we can properly return -EINVAL if there is
>> @@ -1174,7 +1180,9 @@ static signed long
>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>> &entries[i].fence_cb);
>> dma_fence_put(entries[i].fence);
>> }
>> - kfree(entries);
>> +
>> + if (entries != stack_entries)
>
> You can initialize `entries = NULL` and avoid this step.
Hmm where? You mean like:
if (entries == stack_entries)
entries = NULL;
kfree(entries);
Or something different?
Regards,
Tvrtko
>
> Anyway,
>
> Reviewed-by: Maíra Canal <mcanal@igalia.com>
>
> Best Regards,
> - Maíra
>
>> + kfree(entries);
>> return timeout;
>> }
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find
2025-03-24 23:06 ` Maíra Canal
@ 2025-03-25 9:54 ` Tvrtko Ursulin
2025-03-25 20:12 ` Maíra Canal
0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-25 9:54 UTC (permalink / raw)
To: Maíra Canal, dri-devel; +Cc: kernel-dev
On 24/03/2025 23:06, Maíra Canal wrote:
> Hi Tvrtko,
>
> Some nits inline, mostly personal comments. In any case,
>
> Reviewed-by: Maíra Canal <mcanal@igalia.com>
>
>
> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>> Running the Cyberpunk 2077 benchmark we can observe that the lookup
>> helper
>> is relatively hot, but the 97% of the calls are for a single object. (~3%
>> for two points, and never more than three points. While a more trivial
>> workload like vkmark under Plasma is even more skewed to single point
>> lookups.)
>>
>> Therefore lets add a fast path to bypass the kmalloc_array/kfree and
>> use a
>> pre-allocated stack array for those cases.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>> ---
>> drivers/gpu/drm/drm_syncobj.c | 53 +++++++++++++++++++++++++++--------
>> 1 file changed, 41 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/
>> drm_syncobj.c
>> index 94932b89298f..233bdef53c87 100644
>> --- a/drivers/gpu/drm/drm_syncobj.c
>> +++ b/drivers/gpu/drm/drm_syncobj.c
>> @@ -1223,6 +1223,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
>> static int drm_syncobj_array_find(struct drm_file *file_private,
>> u32 __user *handles,
>> uint32_t count,
>> + struct drm_syncobj **stack_syncobjs,
>> + u32 stack_count,
>> struct drm_syncobj ***syncobjs_out)
>> {
>> struct drm_syncobj **syncobjs;
>> @@ -1232,9 +1234,13 @@ static int drm_syncobj_array_find(struct
>> drm_file *file_private,
>> if (!access_ok(handles, count * sizeof(*handles)))
>> return -EFAULT;
>> - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
>> - if (!syncobjs)
>> - return -ENOMEM;
>> + if (count > stack_count) {
>
> I believe it's worth adding a comment mentioning that using the stack
> syncobj is a fast-path that covers most cases.
Yep. But it didn't feel like here is the place so I added comments to
where callers size the arrays. That however means there are two
duplicated comments. Okay with you?
>> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
>> + if (!syncobjs)
>> + return -ENOMEM;
>> + } else {
>> + syncobjs = stack_syncobjs;
>> + }
>> for (i = 0; i < count; i++) {
>> u64 handle;
>> @@ -1260,25 +1266,31 @@ static int drm_syncobj_array_find(struct
>> drm_file *file_private,
>> drm_syncobj_put(syncobjs[i]);
>> i--;
>> }
>> - kfree(syncobjs);
>> +
>> + if (syncobjs != stack_syncobjs)
>
> Again, I have a slight preference to make `syncobjs = NULL` and avoid
> this if condition. But it's just a personal preference.
Pending clarifications from the other patch.
>
>> + kfree(syncobjs);
>> return ret;
>> }
>> static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
>> - uint32_t count)
>> + uint32_t count,
>> + struct drm_syncobj **stack_syncobjs)
>
> IMO, I think the order `syncobjs, stack_syncobjs, count` is a bit more
> intuitive.
But count is not directly related to the size of the stack array in this
function. I could make it a boolean perhaps like this:
static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
uint32_t count,
bool free_array)
And then in the callers:
drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
Would that be clearer?
>
>> {
>> uint32_t i;
>> for (i = 0; i < count; i++)
>> drm_syncobj_put(syncobjs[i]);
>> - kfree(syncobjs);
>> +
>> + if (syncobjs != stack_syncobjs)
>> + kfree(syncobjs);
>> }
>> int
>> drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>> struct drm_file *file_private)
>> {
>> + struct drm_syncobj *stack_syncobjs[4];
>> struct drm_syncobj_wait *args = data;
>> ktime_t deadline, *pdeadline = NULL;
>> u32 count = args->count_handles;
>> @@ -1304,6 +1316,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev,
>> void *data,
>> ret = drm_syncobj_array_find(file_private,
>> u64_to_user_ptr(args->handles),
>> count,
>> + stack_syncobjs,
>> + ARRAY_SIZE(stack_syncobjs),
>> &syncobjs);
>> if (ret < 0)
>> return ret;
>> @@ -1321,7 +1335,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev,
>> void *data,
>> &first,
>> pdeadline);
>> - drm_syncobj_array_free(syncobjs, count);
>> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
>> if (timeout < 0)
>> return timeout;
>> @@ -1336,6 +1350,7 @@ drm_syncobj_timeline_wait_ioctl(struct
>> drm_device *dev, void *data,
>> struct drm_file *file_private)
>> {
>> struct drm_syncobj_timeline_wait *args = data;
>> + struct drm_syncobj *stack_syncobjs[4];
>
> Zero initialize it?
Do you see it is required?
Regards,
Tvrtko
>> ktime_t deadline, *pdeadline = NULL;
>> u32 count = args->count_handles;
>> struct drm_syncobj **syncobjs;
>> @@ -1361,6 +1376,8 @@ drm_syncobj_timeline_wait_ioctl(struct
>> drm_device *dev, void *data,
>> ret = drm_syncobj_array_find(file_private,
>> u64_to_user_ptr(args->handles),
>> count,
>> + stack_syncobjs,
>> + ARRAY_SIZE(stack_syncobjs),
>> &syncobjs);
>> if (ret < 0)
>> return ret;
>> @@ -1378,7 +1395,7 @@ drm_syncobj_timeline_wait_ioctl(struct
>> drm_device *dev, void *data,
>> &first,
>> pdeadline);
>> - drm_syncobj_array_free(syncobjs, count);
>> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
>> if (timeout < 0)
>> return timeout;
>> @@ -1496,6 +1513,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev,
>> void *data,
>> struct drm_file *file_private)
>> {
>> struct drm_syncobj_array *args = data;
>> + struct drm_syncobj *stack_syncobjs[4];
>> struct drm_syncobj **syncobjs;
>> uint32_t i;
>> int ret;
>> @@ -1512,6 +1530,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev,
>> void *data,
>> ret = drm_syncobj_array_find(file_private,
>> u64_to_user_ptr(args->handles),
>> args->count_handles,
>> + stack_syncobjs,
>> + ARRAY_SIZE(stack_syncobjs),
>> &syncobjs);
>> if (ret < 0)
>> return ret;
>> @@ -1519,7 +1539,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev,
>> void *data,
>> for (i = 0; i < args->count_handles; i++)
>> drm_syncobj_replace_fence(syncobjs[i], NULL);
>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>> + drm_syncobj_array_free(syncobjs, args->count_handles,
>> stack_syncobjs);
>> return 0;
>> }
>> @@ -1529,6 +1549,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev,
>> void *data,
>> struct drm_file *file_private)
>> {
>> struct drm_syncobj_array *args = data;
>> + struct drm_syncobj *stack_syncobjs[4];
>> struct drm_syncobj **syncobjs;
>> uint32_t i;
>> int ret;
>> @@ -1545,6 +1566,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev,
>> void *data,
>> ret = drm_syncobj_array_find(file_private,
>> u64_to_user_ptr(args->handles),
>> args->count_handles,
>> + stack_syncobjs,
>> + ARRAY_SIZE(stack_syncobjs),
>> &syncobjs);
>> if (ret < 0)
>> return ret;
>> @@ -1555,7 +1578,7 @@ drm_syncobj_signal_ioctl(struct drm_device *dev,
>> void *data,
>> break;
>> }
>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>> + drm_syncobj_array_free(syncobjs, args->count_handles,
>> stack_syncobjs);
>> return ret;
>> }
>> @@ -1567,6 +1590,7 @@ drm_syncobj_timeline_signal_ioctl(struct
>> drm_device *dev, void *data,
>> struct drm_syncobj_timeline_array *args = data;
>> uint64_t __user *points = u64_to_user_ptr(args->points);
>> uint32_t i, j, count = args->count_handles;
>> + struct drm_syncobj *stack_syncobjs[4];
>> struct drm_syncobj **syncobjs;
>> struct dma_fence_chain **chains;
>> int ret;
>> @@ -1586,6 +1610,8 @@ drm_syncobj_timeline_signal_ioctl(struct
>> drm_device *dev, void *data,
>> ret = drm_syncobj_array_find(file_private,
>> u64_to_user_ptr(args->handles),
>> count,
>> + stack_syncobjs,
>> + ARRAY_SIZE(stack_syncobjs),
>> &syncobjs);
>> if (ret < 0)
>> return ret;
>> @@ -1622,7 +1648,7 @@ drm_syncobj_timeline_signal_ioctl(struct
>> drm_device *dev, void *data,
>> err_chains:
>> kfree(chains);
>> out:
>> - drm_syncobj_array_free(syncobjs, count);
>> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
>> return ret;
>> }
>> @@ -1631,6 +1657,7 @@ int drm_syncobj_query_ioctl(struct drm_device
>> *dev, void *data,
>> struct drm_file *file_private)
>> {
>> struct drm_syncobj_timeline_array *args = data;
>> + struct drm_syncobj *stack_syncobjs[4];
>> struct drm_syncobj **syncobjs;
>> uint64_t __user *points = u64_to_user_ptr(args->points);
>> uint32_t i;
>> @@ -1651,6 +1678,8 @@ int drm_syncobj_query_ioctl(struct drm_device
>> *dev, void *data,
>> ret = drm_syncobj_array_find(file_private,
>> u64_to_user_ptr(args->handles),
>> args->count_handles,
>> + stack_syncobjs,
>> + ARRAY_SIZE(stack_syncobjs),
>> &syncobjs);
>> if (ret < 0)
>> return ret;
>> @@ -1694,7 +1723,7 @@ int drm_syncobj_query_ioctl(struct drm_device
>> *dev, void *data,
>> break;
>> }
>> }
>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>> + drm_syncobj_array_free(syncobjs, args->count_handles,
>> stack_syncobjs);
>> return ret;
>> }
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 0/7] A few drm_syncobj optimisations
2025-03-24 23:17 ` [PATCH 0/7] A few drm_syncobj optimisations Maíra Canal
@ 2025-03-25 9:57 ` Tvrtko Ursulin
2025-03-25 21:10 ` Maíra Canal
0 siblings, 1 reply; 24+ messages in thread
From: Tvrtko Ursulin @ 2025-03-25 9:57 UTC (permalink / raw)
To: Maíra Canal, dri-devel; +Cc: kernel-dev
On 24/03/2025 23:17, Maíra Canal wrote:
> Hi Tvrtko,
>
> Thanks for this patchset! I applied this patchset to the RPi downstream
> kernel 6.13.7 [1] and saw an FPS improvement of approximately 5.85%
> with "vkgears -present-mailbox" on the RPi 5.
>
> I did five 100 seconds runs on each kernel and here are my results:
>
> ### 6.13.7
>
> | Run | Min FPS | Max FPS | Avg FPS |
> |----------|-------------|-------------|-------------|
> | Run #1 | 6646.52 | 6874.77 | 6739.313 |
> | Run #2 | 5387.04 | 6723.274 | 6046.773 |
> | Run #3 | 6230.49 | 6823.47 | 6423.923 |
> | Run #4 | 5269.678 | 5870.59 | 5501.858 |
> | Run #5 | 5504.54 | 6285.91 | 5859.724 |
>
> * Overall Avg FPS: 6114.318 FPS
>
>
> ### 6.13.7 + DRM Syncobj optimisations
>
> | Run | Min FPS | Max FPS | Avg FPS |
> |----------|-------------|-------------|-------------|
> | Run #1 | 6089.05 | 7296.27 | 6859.724 |
> | Run #2 | 6022.48 | 7264 | 6818.518 |
> | Run #3 | 5987.68 | 6188.77 | 6041.365 |
> | Run #4 | 5699.27 | 6448.99 | 6190.374 |
> | Run #5 | 6199.27 | 6791.15 | 6450.900 |
>
> * Overall Avg FPS: 6472.176 FPS
Neat, thanks for testing! I am not surprised a slower CPU benefits more.
Btw if you have the raw data it would be nice to feed it to ministat too.
Regards,
Tvrtko
> [1] https://github.com/raspberrypi/linux/tree/rpi-6.13.y
>
> Best Regards,
> - Maíra
>
> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>> A small set of drm_syncobj optimisations which should make things a
>> tiny bit
>> more efficient on the CPU side of things.
>>
>> Improvement seems to be around 1.5%* more FPS if observed with "vkgears
>> -present-mailbox" on a Steam Deck Plasma desktop, but I am reluctant
>> to make a
>> definitive claim on the numbers since there is some run to run
>> variance. But, as
>> suggested by Michel Dänzer, I did do a five ~100 second runs on the
>> each kernel
>> to be able to show the ministat analysis.
>>
>> x before
>> + after
>> +------------------------------------------------------------+
>> | x + |
>> | x x + |
>> | x xx ++++ |
>> | x x xx x ++++ |
>> | x xx x xx x+ ++++ |
>> | xxxxx xxxxxx+ ++++ + + |
>> | xxxxxxx xxxxxx+x ++++ +++ |
>> | x xxxxxxxxxxx*xx+* x++++++++ ++ |
>> | x x xxxxxxxxxxxx**x*+*+*++++++++ ++++ + |
>> | xx x xxxxxxxxxx*x****+***+**+++++ ++++++ |
>> |x xxx x xxxxx*x****x***********+*++**+++++++ + + +|
>> | |_______A______| |
>> | |______A_______| |
>> +------------------------------------------------------------+
>> N Min Max Median Avg
>> Stddev
>> x 135 21697.58 22809.467 22321.396 22307.707
>> 198.75011
>> + 118 22200.746 23277.09 22661.4 22671.442
>> 192.10609
>> Difference at 95.0% confidence
>> 363.735 +/- 48.3345
>> 1.63054% +/- 0.216672%
>> (Student's t, pooled s = 195.681)
>>
>> Tvrtko Ursulin (7):
>> drm/syncobj: Remove unhelpful helper
>> drm/syncobj: Do not allocate an array to store zeros when waiting
>> drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find
>> drm/syncobj: Use put_user in drm_syncobj_query_ioctl
>> drm/syncobj: Avoid temporary allocation in
>> drm_syncobj_timeline_signal_ioctl
>> drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout
>> drm/syncobj: Add a fast path to drm_syncobj_array_find
>>
>> drivers/gpu/drm/drm_syncobj.c | 281 ++++++++++++++++++----------------
>> 1 file changed, 147 insertions(+), 134 deletions(-)
>>
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 6/7] drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout
2025-03-25 9:33 ` Tvrtko Ursulin
@ 2025-03-25 19:56 ` Maíra Canal
0 siblings, 0 replies; 24+ messages in thread
From: Maíra Canal @ 2025-03-25 19:56 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
On 25/03/25 06:33, Tvrtko Ursulin wrote:
>
> On 24/03/2025 23:00, Maíra Canal wrote:
>> Hi Tvrtko,
>>
>> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>>> Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
>>> sycobjs is relatively hot, but the 96% of the calls are for a single
>>> object. (~4% for two points, and never more than three points. While
>>> a more trivial workload like vkmark under Plasma is even more skewed
>>> to single point waits.)
>>>
>>> Therefore lets add a fast path to bypass the kcalloc/kfree and use a
>>> pre-
>>> allocated stack array for those cases.
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>>> ---
>>> drivers/gpu/drm/drm_syncobj.c | 16 ++++++++++++----
>>> 1 file changed, 12 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/
>>> drm_syncobj.c
>>> index b4563c696056..94932b89298f 100644
>>> --- a/drivers/gpu/drm/drm_syncobj.c
>>> +++ b/drivers/gpu/drm/drm_syncobj.c
>>> @@ -1035,6 +1035,7 @@ static signed long
>>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>>> uint32_t *idx,
>>> ktime_t *deadline)
>>> {
>>> + struct syncobj_wait_entry stack_entries[4];
>>
>> Can't you initialize as
>>
>> struct syncobj_wait_entry stack_entries[4] = {0};
>>
>> ?
>
> Could do but I preferred to only do it when it is used.
Np.
>
>>
>>> struct syncobj_wait_entry *entries;
>>> uint32_t signaled_count, i;>>> struct dma_fence *fence;
>>> @@ -1049,9 +1050,14 @@ static signed long
>>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>>> !access_ok(user_points, count * sizeof(*user_points)))
>>> return -EFAULT;
>>> - entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
>>> - if (!entries)
>>> - return -ENOMEM;
>>> + if (count > ARRAY_SIZE(stack_entries)) {
>>> + entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
>>> + if (!entries)
>>> + return -ENOMEM;
>>> + } else {
>>> + memset(stack_entries, 0, sizeof(stack_entries));
>>
>> Then, you can avoid this step.
>>
>>> + entries = stack_entries;
>>> + }
>>> /* Walk the list of sync objects and initialize entries. We do
>>> * this up-front so that we can properly return -EINVAL if
>>> there is
>>> @@ -1174,7 +1180,9 @@ static signed long
>>> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>>> &entries[i].fence_cb);
>>> dma_fence_put(entries[i].fence);
>>> }
>>> - kfree(entries);
>>> +
>>> + if (entries != stack_entries)
>>
>> You can initialize `entries = NULL` and avoid this step.
>
> Hmm where? You mean like:
>
> if (entries == stack_entries)
> entries = NULL;
> kfree(entries);
>
> Or something different?
Nvm, what I had thought initially was wrong. Sorry for the noise! Again,
feel free to you add my R-b.
Best Regards,
- Maíra
>
> Regards,
>
> Tvrtko
>
>>
>> Anyway,
>>
>> Reviewed-by: Maíra Canal <mcanal@igalia.com>
>>
>> Best Regards,
>> - Maíra
>>
>>> + kfree(entries);
>>> return timeout;
>>> }
>>
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find
2025-03-25 9:54 ` Tvrtko Ursulin
@ 2025-03-25 20:12 ` Maíra Canal
0 siblings, 0 replies; 24+ messages in thread
From: Maíra Canal @ 2025-03-25 20:12 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
On 25/03/25 06:54, Tvrtko Ursulin wrote:
>
> On 24/03/2025 23:06, Maíra Canal wrote:
>> Hi Tvrtko,
>>
>> Some nits inline, mostly personal comments. In any case,
>>
>> Reviewed-by: Maíra Canal <mcanal@igalia.com>
>>
>>
>> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>>> Running the Cyberpunk 2077 benchmark we can observe that the lookup
>>> helper
>>> is relatively hot, but the 97% of the calls are for a single object.
>>> (~3%
>>> for two points, and never more than three points. While a more trivial
>>> workload like vkmark under Plasma is even more skewed to single point
>>> lookups.)
>>>
>>> Therefore lets add a fast path to bypass the kmalloc_array/kfree and
>>> use a
>>> pre-allocated stack array for those cases.
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
>>> ---
>>> drivers/gpu/drm/drm_syncobj.c | 53 +++++++++++++++++++++++++++--------
>>> 1 file changed, 41 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/
>>> drm_syncobj.c
>>> index 94932b89298f..233bdef53c87 100644
>>> --- a/drivers/gpu/drm/drm_syncobj.c
>>> +++ b/drivers/gpu/drm/drm_syncobj.c
>>> @@ -1223,6 +1223,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
>>> static int drm_syncobj_array_find(struct drm_file *file_private,
>>> u32 __user *handles,
>>> uint32_t count,
>>> + struct drm_syncobj **stack_syncobjs,
>>> + u32 stack_count,
>>> struct drm_syncobj ***syncobjs_out)
>>> {
>>> struct drm_syncobj **syncobjs;
>>> @@ -1232,9 +1234,13 @@ static int drm_syncobj_array_find(struct
>>> drm_file *file_private,
>>> if (!access_ok(handles, count * sizeof(*handles)))
>>> return -EFAULT;
>>> - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
>>> - if (!syncobjs)
>>> - return -ENOMEM;
>>> + if (count > stack_count) {
>>
>> I believe it's worth adding a comment mentioning that using the stack
>> syncobj is a fast-path that covers most cases.
>
> Yep. But it didn't feel like here is the place so I added comments to
> where callers size the arrays. That however means there are two
> duplicated comments. Okay with you?
Sure.
>
>>> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
>>> + if (!syncobjs)
>>> + return -ENOMEM;
>>> + } else {
>>> + syncobjs = stack_syncobjs;
>>> + }
>>> for (i = 0; i < count; i++) {
>>> u64 handle;
>>> @@ -1260,25 +1266,31 @@ static int drm_syncobj_array_find(struct
>>> drm_file *file_private,
>>> drm_syncobj_put(syncobjs[i]);
>>> i--;
>>> }
>>> - kfree(syncobjs);
>>> +
>>> + if (syncobjs != stack_syncobjs)
>>
>> Again, I have a slight preference to make `syncobjs = NULL` and avoid
>> this if condition. But it's just a personal preference.
>
> Pending clarifications from the other patch.
Nvm, it wasn't a good idea.
>
>>
>>> + kfree(syncobjs);
>>> return ret;
>>> }
>>> static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
>>> - uint32_t count)
>>> + uint32_t count,
>>> + struct drm_syncobj **stack_syncobjs)
>>
>> IMO, I think the order `syncobjs, stack_syncobjs, count` is a bit more
>> intuitive.
>
> But count is not directly related to the size of the stack array in this
> function. I could make it a boolean perhaps like this:>
> static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
> uint32_t count,
> bool free_array)
>
> And then in the callers:
>
> drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>
> Would that be clearer?
Yeah, it does.
>
>>
>>> {
>>> uint32_t i;
>>> for (i = 0; i < count; i++)
>>> drm_syncobj_put(syncobjs[i]);
>>> - kfree(syncobjs);
>>> +
>>> + if (syncobjs != stack_syncobjs)
>>> + kfree(syncobjs);
>>> }
>>> int
>>> drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>>> struct drm_file *file_private)
>>> {
>>> + struct drm_syncobj *stack_syncobjs[4];
>>> struct drm_syncobj_wait *args = data;
>>> ktime_t deadline, *pdeadline = NULL;
>>> u32 count = args->count_handles;
>>> @@ -1304,6 +1316,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev,
>>> void *data,
>>> ret = drm_syncobj_array_find(file_private,
>>> u64_to_user_ptr(args->handles),
>>> count,
>>> + stack_syncobjs,
>>> + ARRAY_SIZE(stack_syncobjs),
>>> &syncobjs);
>>> if (ret < 0)
>>> return ret;
>>> @@ -1321,7 +1335,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev,
>>> void *data,
>>> &first,
>>> pdeadline);
>>> - drm_syncobj_array_free(syncobjs, count);
>>> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
>>> if (timeout < 0)
>>> return timeout;
>>> @@ -1336,6 +1350,7 @@ drm_syncobj_timeline_wait_ioctl(struct
>>> drm_device *dev, void *data,
>>> struct drm_file *file_private)
>>> {
>>> struct drm_syncobj_timeline_wait *args = data;
>>> + struct drm_syncobj *stack_syncobjs[4];
>>
>> Zero initialize it?
>
> Do you see it is required?
Not required, I was just suggesting to double-check if it wasn't needed
indeed.
Best Regards,
- Maíra
>
> Regards,
>
> Tvrtko
>
>>> ktime_t deadline, *pdeadline = NULL;
>>> u32 count = args->count_handles;
>>> struct drm_syncobj **syncobjs;
>>> @@ -1361,6 +1376,8 @@ drm_syncobj_timeline_wait_ioctl(struct
>>> drm_device *dev, void *data,
>>> ret = drm_syncobj_array_find(file_private,
>>> u64_to_user_ptr(args->handles),
>>> count,
>>> + stack_syncobjs,
>>> + ARRAY_SIZE(stack_syncobjs),
>>> &syncobjs);
>>> if (ret < 0)
>>> return ret;
>>> @@ -1378,7 +1395,7 @@ drm_syncobj_timeline_wait_ioctl(struct
>>> drm_device *dev, void *data,
>>> &first,
>>> pdeadline);
>>> - drm_syncobj_array_free(syncobjs, count);
>>> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
>>> if (timeout < 0)
>>> return timeout;
>>> @@ -1496,6 +1513,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev,
>>> void *data,
>>> struct drm_file *file_private)
>>> {
>>> struct drm_syncobj_array *args = data;
>>> + struct drm_syncobj *stack_syncobjs[4];
>>> struct drm_syncobj **syncobjs;
>>> uint32_t i;
>>> int ret;
>>> @@ -1512,6 +1530,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev,
>>> void *data,
>>> ret = drm_syncobj_array_find(file_private,
>>> u64_to_user_ptr(args->handles),
>>> args->count_handles,
>>> + stack_syncobjs,
>>> + ARRAY_SIZE(stack_syncobjs),
>>> &syncobjs);
>>> if (ret < 0)
>>> return ret;
>>> @@ -1519,7 +1539,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev,
>>> void *data,
>>> for (i = 0; i < args->count_handles; i++)
>>> drm_syncobj_replace_fence(syncobjs[i], NULL);
>>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>>> + drm_syncobj_array_free(syncobjs, args->count_handles,
>>> stack_syncobjs);
>>> return 0;
>>> }
>>> @@ -1529,6 +1549,7 @@ drm_syncobj_signal_ioctl(struct drm_device
>>> *dev, void *data,
>>> struct drm_file *file_private)
>>> {
>>> struct drm_syncobj_array *args = data;
>>> + struct drm_syncobj *stack_syncobjs[4];
>>> struct drm_syncobj **syncobjs;
>>> uint32_t i;
>>> int ret;
>>> @@ -1545,6 +1566,8 @@ drm_syncobj_signal_ioctl(struct drm_device
>>> *dev, void *data,
>>> ret = drm_syncobj_array_find(file_private,
>>> u64_to_user_ptr(args->handles),
>>> args->count_handles,
>>> + stack_syncobjs,
>>> + ARRAY_SIZE(stack_syncobjs),
>>> &syncobjs);
>>> if (ret < 0)
>>> return ret;
>>> @@ -1555,7 +1578,7 @@ drm_syncobj_signal_ioctl(struct drm_device
>>> *dev, void *data,
>>> break;
>>> }
>>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>>> + drm_syncobj_array_free(syncobjs, args->count_handles,
>>> stack_syncobjs);
>>> return ret;
>>> }
>>> @@ -1567,6 +1590,7 @@ drm_syncobj_timeline_signal_ioctl(struct
>>> drm_device *dev, void *data,
>>> struct drm_syncobj_timeline_array *args = data;
>>> uint64_t __user *points = u64_to_user_ptr(args->points);
>>> uint32_t i, j, count = args->count_handles;
>>> + struct drm_syncobj *stack_syncobjs[4];
>>> struct drm_syncobj **syncobjs;
>>> struct dma_fence_chain **chains;
>>> int ret;
>>> @@ -1586,6 +1610,8 @@ drm_syncobj_timeline_signal_ioctl(struct
>>> drm_device *dev, void *data,
>>> ret = drm_syncobj_array_find(file_private,
>>> u64_to_user_ptr(args->handles),
>>> count,
>>> + stack_syncobjs,
>>> + ARRAY_SIZE(stack_syncobjs),
>>> &syncobjs);
>>> if (ret < 0)
>>> return ret;
>>> @@ -1622,7 +1648,7 @@ drm_syncobj_timeline_signal_ioctl(struct
>>> drm_device *dev, void *data,
>>> err_chains:
>>> kfree(chains);
>>> out:
>>> - drm_syncobj_array_free(syncobjs, count);
>>> + drm_syncobj_array_free(syncobjs, count, stack_syncobjs);
>>> return ret;
>>> }
>>> @@ -1631,6 +1657,7 @@ int drm_syncobj_query_ioctl(struct drm_device
>>> *dev, void *data,
>>> struct drm_file *file_private)
>>> {
>>> struct drm_syncobj_timeline_array *args = data;
>>> + struct drm_syncobj *stack_syncobjs[4];
>>> struct drm_syncobj **syncobjs;
>>> uint64_t __user *points = u64_to_user_ptr(args->points);
>>> uint32_t i;
>>> @@ -1651,6 +1678,8 @@ int drm_syncobj_query_ioctl(struct drm_device
>>> *dev, void *data,
>>> ret = drm_syncobj_array_find(file_private,
>>> u64_to_user_ptr(args->handles),
>>> args->count_handles,
>>> + stack_syncobjs,
>>> + ARRAY_SIZE(stack_syncobjs),
>>> &syncobjs);
>>> if (ret < 0)
>>> return ret;
>>> @@ -1694,7 +1723,7 @@ int drm_syncobj_query_ioctl(struct drm_device
>>> *dev, void *data,
>>> break;
>>> }
>>> }
>>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>>> + drm_syncobj_array_free(syncobjs, args->count_handles,
>>> stack_syncobjs);
>>> return ret;
>>> }
>>
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 0/7] A few drm_syncobj optimisations
2025-03-25 9:57 ` Tvrtko Ursulin
@ 2025-03-25 21:10 ` Maíra Canal
0 siblings, 0 replies; 24+ messages in thread
From: Maíra Canal @ 2025-03-25 21:10 UTC (permalink / raw)
To: Tvrtko Ursulin, dri-devel; +Cc: kernel-dev
Hi Tvrtko,
On 25/03/25 06:57, Tvrtko Ursulin wrote:
>
> On 24/03/2025 23:17, Maíra Canal wrote:
>> Hi Tvrtko,
>>
>> Thanks for this patchset! I applied this patchset to the RPi downstream
>> kernel 6.13.7 [1] and saw an FPS improvement of approximately 5.85%
>> with "vkgears -present-mailbox" on the RPi 5.
>>
>> I did five 100 seconds runs on each kernel and here are my results:
>>
>> ### 6.13.7
>>
>> | Run | Min FPS | Max FPS | Avg FPS |
>> |----------|-------------|-------------|-------------|
>> | Run #1 | 6646.52 | 6874.77 | 6739.313 |
>> | Run #2 | 5387.04 | 6723.274 | 6046.773 |
>> | Run #3 | 6230.49 | 6823.47 | 6423.923 |
>> | Run #4 | 5269.678 | 5870.59 | 5501.858 |
>> | Run #5 | 5504.54 | 6285.91 | 5859.724 |
>>
>> * Overall Avg FPS: 6114.318 FPS
>>
>>
>> ### 6.13.7 + DRM Syncobj optimisations
>>
>> | Run | Min FPS | Max FPS | Avg FPS |
>> |----------|-------------|-------------|-------------|
>> | Run #1 | 6089.05 | 7296.27 | 6859.724 |
>> | Run #2 | 6022.48 | 7264 | 6818.518 |
>> | Run #3 | 5987.68 | 6188.77 | 6041.365 |
>> | Run #4 | 5699.27 | 6448.99 | 6190.374 |
>> | Run #5 | 6199.27 | 6791.15 | 6450.900 |
>>
>> * Overall Avg FPS: 6472.176 FPS
>
> Neat, thanks for testing! I am not surprised a slower CPU benefits more.
>
> Btw if you have the raw data it would be nice to feed it to ministat too.
I ran again and collected the raw data. Here is the ministat:
x no-optimizations.txt
+ syncobjs-optimizations.txt
+---------------------------------------------------------------------------+
| + +
|
| + + ++
|
| x + + ++
|
| xx * ++x ++
|
| * xx +*+ +*x++ ++
|
| x ++x *+xxx +*+ x+*x+*x x ++ x
|
|x xxx ++xxxx *+xxx +*+ x***+** x + ++ ** + + x++
|
|xxxxx x +***x*x*+**x xxxx* xx+** *******x* x + +++x**x+*+ + **++x
xxx x|
| |__________|______A_M____MA__________|___|
|
+---------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 95 5660.033 7371.548 6413.172 6383.4326 431.10036
+ 95 5914.994 7209.361 6538.192 6568.3293 345.7754
Difference at 95.0% confidence
184.897 +/- 111.131
2.89651% +/- 1.74093%
(Student's t, pooled s = 390.774)
Best Regards,
- Maíra
>
> Regards,
>
> Tvrtko
>
>> [1] https://github.com/raspberrypi/linux/tree/rpi-6.13.y
>>
>> Best Regards,
>> - Maíra
>>
>> On 18/03/25 12:54, Tvrtko Ursulin wrote:
>>> A small set of drm_syncobj optimisations which should make things a
>>> tiny bit
>>> more efficient on the CPU side of things.
>>>
>>> Improvement seems to be around 1.5%* more FPS if observed with "vkgears
>>> -present-mailbox" on a Steam Deck Plasma desktop, but I am reluctant
>>> to make a
>>> definitive claim on the numbers since there is some run to run
>>> variance. But, as
>>> suggested by Michel Dänzer, I did do a five ~100 second runs on the
>>> each kernel
>>> to be able to show the ministat analysis.
>>>
>>> x before
>>> + after
>>> +------------------------------------------------------------+
>>> | x + |
>>> | x x + |
>>> | x xx ++++ |
>>> | x x xx x ++++ |
>>> | x xx x xx x+ ++++ |
>>> | xxxxx xxxxxx+ ++++ + + |
>>> | xxxxxxx xxxxxx+x ++++ +++ |
>>> | x xxxxxxxxxxx*xx+* x++++++++ ++ |
>>> | x x xxxxxxxxxxxx**x*+*+*++++++++ ++++ + |
>>> | xx x xxxxxxxxxx*x****+***+**+++++ ++++++ |
>>> |x xxx x xxxxx*x****x***********+*++**+++++++ + + +|
>>> | |_______A______| |
>>> | |______A_______| |
>>> +------------------------------------------------------------+
>>> N Min Max Median Avg Stddev
>>> x 135 21697.58 22809.467 22321.396 22307.707 198.75011
>>> + 118 22200.746 23277.09 22661.4 22671.442 192.10609
>>> Difference at 95.0% confidence
>>> 363.735 +/- 48.3345
>>> 1.63054% +/- 0.216672%
>>> (Student's t, pooled s = 195.681)
>>>
>>> Tvrtko Ursulin (7):
>>> drm/syncobj: Remove unhelpful helper
>>> drm/syncobj: Do not allocate an array to store zeros when waiting
>>> drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find
>>> drm/syncobj: Use put_user in drm_syncobj_query_ioctl
>>> drm/syncobj: Avoid temporary allocation in
>>> drm_syncobj_timeline_signal_ioctl
>>> drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout
>>> drm/syncobj: Add a fast path to drm_syncobj_array_find
>>>
>>> drivers/gpu/drm/drm_syncobj.c | 281 ++++++++++++++++++----------------
>>> 1 file changed, 147 insertions(+), 134 deletions(-)
>>>
>>
>
^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2025-03-25 21:10 UTC | newest]
Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-18 15:54 [PATCH 0/7] A few drm_syncobj optimisations Tvrtko Ursulin
2025-03-18 15:54 ` [PATCH 1/7] drm/syncobj: Remove unhelpful helper Tvrtko Ursulin
2025-03-24 21:38 ` Maíra Canal
2025-03-25 9:12 ` Tvrtko Ursulin
2025-03-18 15:54 ` [PATCH 2/7] drm/syncobj: Do not allocate an array to store zeros when waiting Tvrtko Ursulin
2025-03-24 21:55 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 3/7] drm/syncobj: Avoid one temporary allocation in drm_syncobj_array_find Tvrtko Ursulin
2025-03-24 22:29 ` Maíra Canal
2025-03-25 9:26 ` Tvrtko Ursulin
2025-03-18 15:54 ` [PATCH 4/7] drm/syncobj: Use put_user in drm_syncobj_query_ioctl Tvrtko Ursulin
2025-03-24 22:32 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 5/7] drm/syncobj: Avoid temporary allocation in drm_syncobj_timeline_signal_ioctl Tvrtko Ursulin
2025-03-24 22:50 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 6/7] drm/syncobj: Add a fast path to drm_syncobj_array_wait_timeout Tvrtko Ursulin
2025-03-24 23:00 ` Maíra Canal
2025-03-25 9:33 ` Tvrtko Ursulin
2025-03-25 19:56 ` Maíra Canal
2025-03-18 15:54 ` [PATCH 7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find Tvrtko Ursulin
2025-03-24 23:06 ` Maíra Canal
2025-03-25 9:54 ` Tvrtko Ursulin
2025-03-25 20:12 ` Maíra Canal
2025-03-24 23:17 ` [PATCH 0/7] A few drm_syncobj optimisations Maíra Canal
2025-03-25 9:57 ` Tvrtko Ursulin
2025-03-25 21:10 ` Maíra Canal
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).