* [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3
@ 2026-04-22 10:30 Christian König
2026-04-22 10:30 ` [PATCH 2/2] dma-buf/dma_fence_array: optimize handling Christian König
2026-04-22 10:49 ` [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Tvrtko Ursulin
0 siblings, 2 replies; 7+ messages in thread
From: Christian König @ 2026-04-22 10:30 UTC (permalink / raw)
To: tursulin, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig
Amdgpu was the only user of the signal-on-any feature, and we dropped
that use case recently, so we can remove this functionality.
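To illustrate the caller-visible change (a sketch based on the hunks below,
not additional code):

	/* before */
	array = dma_fence_array_create(count, fences,
				       dma_fence_context_alloc(1), 1, false);

	/* after */
	array = dma_fence_array_create(count, fences,
				       dma_fence_context_alloc(1), 1);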
v2: update num_pending only after the fence is signaled
v3: separate out simplifying dma_fence_array implementation
Signed-off-by: Christian König <christian.koenig@amd.com>
---
drivers/dma-buf/dma-fence-array.c | 13 ++++---------
drivers/dma-buf/dma-fence-unwrap.c | 3 +--
drivers/dma-buf/dma-resv.c | 3 +--
drivers/dma-buf/st-dma-fence-unwrap.c | 2 +-
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 3 +--
drivers/gpu/drm/xe/xe_sync.c | 2 +-
drivers/gpu/drm/xe/xe_vm.c | 4 ++--
include/linux/dma-fence-array.h | 6 ++----
8 files changed, 13 insertions(+), 23 deletions(-)
diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
index 089f69469524..5e10e8df372f 100644
--- a/drivers/dma-buf/dma-fence-array.c
+++ b/drivers/dma-buf/dma-fence-array.c
@@ -190,15 +190,13 @@ EXPORT_SYMBOL(dma_fence_array_alloc);
* @fences: [in] array containing the fences
* @context: [in] fence context to use
* @seqno: [in] sequence number to use
- * @signal_on_any: [in] signal on any fence in the array
*
* Implementation of @dma_fence_array_create without allocation. Useful to init
* a preallocated dma fence array in the path of reclaim or dma fence signaling.
*/
void dma_fence_array_init(struct dma_fence_array *array,
int num_fences, struct dma_fence **fences,
- u64 context, unsigned seqno,
- bool signal_on_any)
+ u64 context, unsigned seqno)
{
static struct lock_class_key dma_fence_array_lock_key;
@@ -222,7 +220,7 @@ void dma_fence_array_init(struct dma_fence_array *array,
*/
lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
- atomic_set(&array->num_pending, signal_on_any ? 1 : num_fences);
+ atomic_set(&array->num_pending, num_fences);
array->fences = fences;
array->base.error = PENDING_ERROR;
@@ -249,7 +247,6 @@ EXPORT_SYMBOL(dma_fence_array_init);
* @fences: [in] array containing the fences
* @context: [in] fence context to use
* @seqno: [in] sequence number to use
- * @signal_on_any: [in] signal on any fence in the array
*
* Allocate a dma_fence_array object and initialize the base fence with
* dma_fence_init().
@@ -264,8 +261,7 @@ EXPORT_SYMBOL(dma_fence_array_init);
*/
struct dma_fence_array *dma_fence_array_create(int num_fences,
struct dma_fence **fences,
- u64 context, unsigned seqno,
- bool signal_on_any)
+ u64 context, unsigned seqno)
{
struct dma_fence_array *array;
@@ -273,8 +269,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
if (!array)
return NULL;
- dma_fence_array_init(array, num_fences, fences,
- context, seqno, signal_on_any);
+ dma_fence_array_init(array, num_fences, fences, context, seqno);
return array;
}
diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c
index 07fe9bf45aea..53bb40e70b27 100644
--- a/drivers/dma-buf/dma-fence-unwrap.c
+++ b/drivers/dma-buf/dma-fence-unwrap.c
@@ -180,8 +180,7 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
if (count > 1) {
result = dma_fence_array_create(count, array,
- dma_fence_context_alloc(1),
- 1, false);
+ dma_fence_context_alloc(1), 1);
if (!result) {
for (i = 0; i < count; i++)
dma_fence_put(array[i]);
diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index ce9e6c04897f..39a92d9f2413 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -648,8 +648,7 @@ int dma_resv_get_singleton(struct dma_resv *obj, enum dma_resv_usage usage,
}
array = dma_fence_array_create(count, fences,
- dma_fence_context_alloc(1),
- 1, false);
+ dma_fence_context_alloc(1), 1);
if (!array) {
while (count--)
dma_fence_put(fences[count]);
diff --git a/drivers/dma-buf/st-dma-fence-unwrap.c b/drivers/dma-buf/st-dma-fence-unwrap.c
index 51c87869b7b8..4e7ee25372ba 100644
--- a/drivers/dma-buf/st-dma-fence-unwrap.c
+++ b/drivers/dma-buf/st-dma-fence-unwrap.c
@@ -64,7 +64,7 @@ static struct dma_fence *mock_array(unsigned int num_fences, ...)
array = dma_fence_array_create(num_fences, fences,
dma_fence_context_alloc(1),
- 1, false);
+ 1);
if (!array)
goto error_free;
return &array->base;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 942f4eed817f..4a1a9031f9db 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -3205,8 +3205,7 @@ eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
fence_array = dma_fence_array_create(eb->num_batches,
fences,
eb->context->parallel.fence_context,
- eb->context->parallel.seqno++,
- false);
+ eb->context->parallel.seqno++);
if (!fence_array) {
kfree(fences);
return ERR_PTR(-ENOMEM);
diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index 24d6d9af20d6..37866768d64c 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -376,7 +376,7 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
xe_assert(vm->xe, current_fence == num_fence);
cf = dma_fence_array_create(num_fence, fences,
dma_fence_context_alloc(1),
- 1, false);
+ 1);
if (!cf)
goto err_out;
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 56e2db50bb36..8f472911469d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3370,7 +3370,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
goto err_trace;
}
- cf = dma_fence_array_alloc(n_fence);
+ cf = dma_fence_array_alloc();
if (!cf) {
fence = ERR_PTR(-ENOMEM);
goto err_out;
@@ -3414,7 +3414,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
xe_assert(vm->xe, current_fence == n_fence);
dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
- 1, false);
+ 1);
fence = &cf->base;
for_each_tile(tile, vm->xe, id) {
diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
index 370b3d2bba37..1b1d87579c38 100644
--- a/include/linux/dma-fence-array.h
+++ b/include/linux/dma-fence-array.h
@@ -81,13 +81,11 @@ to_dma_fence_array(struct dma_fence *fence)
struct dma_fence_array *dma_fence_array_alloc(int num_fences);
void dma_fence_array_init(struct dma_fence_array *array,
int num_fences, struct dma_fence **fences,
- u64 context, unsigned seqno,
- bool signal_on_any);
+ u64 context, unsigned seqno);
struct dma_fence_array *dma_fence_array_create(int num_fences,
struct dma_fence **fences,
- u64 context, unsigned seqno,
- bool signal_on_any);
+ u64 context, unsigned seqno);
bool dma_fence_match_context(struct dma_fence *fence, u64 context);
--
2.43.0
* [PATCH 2/2] dma-buf/dma_fence_array: optimize handling
2026-04-22 10:30 [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Christian König
@ 2026-04-22 10:30 ` Christian König
2026-04-22 11:37 ` Tvrtko Ursulin
2026-04-22 10:49 ` [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Tvrtko Ursulin
1 sibling, 1 reply; 7+ messages in thread
From: Christian König @ 2026-04-22 10:30 UTC (permalink / raw)
To: tursulin, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig
Removing the signal-on-any feature allows us to simplify the dma_fence_array
code a lot and saves us from the need to install a callback on all fences
at the same time.
This results in less memory and CPU overhead.
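The memory saving comes from the change in the dma_fence_array layout,
visible in the header hunk below (sketch of the relevant fields only):

	/* before: one embedded callback helper per fence */
	struct dma_fence_array_cb callbacks[] __counted_by(num_fences);

	/* after: a single callback that is re-armed on the next pending fence */
	struct dma_fence_cb callback;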
Signed-off-by: Christian König <christian.koenig@amd.com>
---
drivers/dma-buf/dma-fence-array.c | 130 +++++++++++++-----------------
include/linux/dma-fence-array.h | 22 ++---
2 files changed, 59 insertions(+), 93 deletions(-)
diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
index 5e10e8df372f..f1b4b3296c87 100644
--- a/drivers/dma-buf/dma-fence-array.c
+++ b/drivers/dma-buf/dma-fence-array.c
@@ -42,97 +42,80 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
cmpxchg(&array->base.error, PENDING_ERROR, 0);
}
-static void irq_dma_fence_array_work(struct irq_work *wrk)
+static void dma_fence_array_cb_func(struct dma_fence *f,
+ struct dma_fence_cb *cb)
{
- struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
+ struct dma_fence_array *array =
+ container_of(cb, struct dma_fence_array, callback);
- dma_fence_array_clear_pending_error(array);
+ irq_work_queue(&array->work);
+}
+
+static void dma_fence_array_arm_cb(struct dma_fence_array *array)
+{
+ while (array->num_pending) {
+ struct dma_fence *f = array->fences[array->num_pending - 1];
+
+ if (!dma_fence_add_callback(f, &array->callback,
+ dma_fence_array_cb_func))
+ return;
+
+ dma_fence_array_set_pending_error(array, f->error);
+ WRITE_ONCE(array->num_pending, array->num_pending - 1);
+ }
dma_fence_signal(&array->base);
dma_fence_put(&array->base);
}
-static void dma_fence_array_cb_func(struct dma_fence *f,
- struct dma_fence_cb *cb)
+static void dma_fence_array_irq_work(struct irq_work *wrk)
{
- struct dma_fence_array_cb *array_cb =
- container_of(cb, struct dma_fence_array_cb, cb);
- struct dma_fence_array *array = array_cb->array;
-
- dma_fence_array_set_pending_error(array, f->error);
+ struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
- if (atomic_dec_and_test(&array->num_pending))
- irq_work_queue(&array->work);
- else
- dma_fence_put(&array->base);
+ WRITE_ONCE(array->num_pending, array->num_pending - 1);
+ dma_fence_array_arm_cb(array);
}
static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
{
struct dma_fence_array *array = to_dma_fence_array(fence);
- struct dma_fence_array_cb *cb = array->callbacks;
- unsigned i;
-
- for (i = 0; i < array->num_fences; ++i) {
- cb[i].array = array;
- /*
- * As we may report that the fence is signaled before all
- * callbacks are complete, we need to take an additional
- * reference count on the array so that we do not free it too
- * early. The core fence handling will only hold the reference
- * until we signal the array as complete (but that is now
- * insufficient).
- */
- dma_fence_get(&array->base);
- if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
- dma_fence_array_cb_func)) {
- int error = array->fences[i]->error;
-
- dma_fence_array_set_pending_error(array, error);
- dma_fence_put(&array->base);
- if (atomic_dec_and_test(&array->num_pending)) {
- dma_fence_array_clear_pending_error(array);
- return false;
- }
- }
- }
+ /*
+ * As we may report that the fence is signaled before all
+ * callbacks are complete, we need to take an additional
+ * reference count on the array so that we do not free it too
+ * early. The core fence handling will only hold the reference
+ * until we signal the array as complete (but that is now
+ * insufficient).
+ */
+ dma_fence_get(&array->base);
+ dma_fence_array_arm_cb(array);
return true;
}
static bool dma_fence_array_signaled(struct dma_fence *fence)
{
struct dma_fence_array *array = to_dma_fence_array(fence);
- int num_pending;
+ int num_pending, error = 0;
unsigned int i;
/*
- * We need to read num_pending before checking the enable_signal bit
- * to avoid racing with the enable_signaling() implementation, which
- * might decrement the counter, and cause a partial check.
- * atomic_read_acquire() pairs with atomic_dec_and_test() in
- * dma_fence_array_enable_signaling()
- *
- * The !--num_pending check is here to account for the any_signaled case
- * if we race with enable_signaling(), that means the !num_pending check
- * in the is_signalling_enabled branch might be outdated (num_pending
- * might have been decremented), but that's fine. The user will get the
- * right value when testing again later.
+ * Reading num_pending without a memory barrier here is correct since
+ * that is only for optimization, it is perfectly acceptable to have a
+ * stale value for it. In all other cases num_pending is accessed by a
+ * single call chain.
*/
- num_pending = atomic_read_acquire(&array->num_pending);
- if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
- if (num_pending <= 0)
- goto signal;
- return false;
- }
+ num_pending = READ_ONCE(array->num_pending);
+ for (i = 0; i < num_pending; ++i) {
+ struct dma_fence *f = array->fences[i];
- for (i = 0; i < array->num_fences; ++i) {
- if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
- goto signal;
- }
- return false;
+ if (!dma_fence_is_signaled(f))
+ return false;
-signal:
+ if (!error)
+ error = f->error;
+ }
+ dma_fence_array_set_pending_error(array, error);
dma_fence_array_clear_pending_error(array);
return true;
}
@@ -171,15 +154,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
/**
* dma_fence_array_alloc - Allocate a custom fence array
- * @num_fences: [in] number of fences to add in the array
*
* Return dma fence array on success, NULL on failure
*/
-struct dma_fence_array *dma_fence_array_alloc(int num_fences)
+struct dma_fence_array *dma_fence_array_alloc(void)
{
- struct dma_fence_array *array;
-
- return kzalloc_flex(*array, callbacks, num_fences);
+ return kzalloc_obj(struct dma_fence_array);
}
EXPORT_SYMBOL(dma_fence_array_alloc);
@@ -203,10 +183,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
WARN_ON(!num_fences || !fences);
array->num_fences = num_fences;
+ array->num_pending = num_fences;
+ array->fences = fences;
+ array->base.error = PENDING_ERROR;
dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
seqno);
- init_irq_work(&array->work, irq_dma_fence_array_work);
+ init_irq_work(&array->work, dma_fence_array_irq_work);
/*
* dma_fence_array_enable_signaling() is invoked while holding
@@ -220,11 +203,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
*/
lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
- atomic_set(&array->num_pending, num_fences);
- array->fences = fences;
-
- array->base.error = PENDING_ERROR;
-
/*
* dma_fence_array objects should never contain any other fence
* containers or otherwise we run into recursion and potential kernel
@@ -265,7 +243,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
{
struct dma_fence_array *array;
- array = dma_fence_array_alloc(num_fences);
+ array = dma_fence_array_alloc();
if (!array)
return NULL;
diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
index 1b1d87579c38..3ee55c0e2fa4 100644
--- a/include/linux/dma-fence-array.h
+++ b/include/linux/dma-fence-array.h
@@ -15,16 +15,6 @@
#include <linux/dma-fence.h>
#include <linux/irq_work.h>
-/**
- * struct dma_fence_array_cb - callback helper for fence array
- * @cb: fence callback structure for signaling
- * @array: reference to the parent fence array object
- */
-struct dma_fence_array_cb {
- struct dma_fence_cb cb;
- struct dma_fence_array *array;
-};
-
/**
* struct dma_fence_array - fence to represent an array of fences
* @base: fence base class
@@ -33,18 +23,17 @@ struct dma_fence_array_cb {
* @num_pending: fences in the array still pending
* @fences: array of the fences
* @work: internal irq_work function
- * @callbacks: array of callback helpers
+ * @callback: callback structure for signaling
*/
struct dma_fence_array {
struct dma_fence base;
- unsigned num_fences;
- atomic_t num_pending;
+ unsigned int num_fences;
+ unsigned int num_pending;
struct dma_fence **fences;
struct irq_work work;
-
- struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
+ struct dma_fence_cb callback;
};
/**
@@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
for (index = 0, fence = dma_fence_array_first(head); fence; \
++(index), fence = dma_fence_array_next(head, index))
-struct dma_fence_array *dma_fence_array_alloc(int num_fences);
+struct dma_fence_array *dma_fence_array_alloc(void);
void dma_fence_array_init(struct dma_fence_array *array,
int num_fences, struct dma_fence **fences,
u64 context, unsigned seqno);
-
struct dma_fence_array *dma_fence_array_create(int num_fences,
struct dma_fence **fences,
u64 context, unsigned seqno);
--
2.43.0
* Re: [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3
2026-04-22 10:30 [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Christian König
2026-04-22 10:30 ` [PATCH 2/2] dma-buf/dma_fence_array: optimize handling Christian König
@ 2026-04-22 10:49 ` Tvrtko Ursulin
2026-05-04 14:46 ` Christian König
1 sibling, 1 reply; 7+ messages in thread
From: Tvrtko Ursulin @ 2026-04-22 10:49 UTC (permalink / raw)
To: Christian König, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig
On 22/04/2026 11:30, Christian König wrote:
> Amdgpu was the only user of the signal on any feature and we dropped
> that use case recently, so we can remove that functionality.
>
> v2: update num_pending only after the fence is signaled
> v3: separate out simplifying dma_fence_array implementation
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
> drivers/dma-buf/dma-fence-array.c | 13 ++++---------
> drivers/dma-buf/dma-fence-unwrap.c | 3 +--
> drivers/dma-buf/dma-resv.c | 3 +--
> drivers/dma-buf/st-dma-fence-unwrap.c | 2 +-
> drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 3 +--
> drivers/gpu/drm/xe/xe_sync.c | 2 +-
> drivers/gpu/drm/xe/xe_vm.c | 4 ++--
> include/linux/dma-fence-array.h | 6 ++----
> 8 files changed, 13 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
> index 089f69469524..5e10e8df372f 100644
> --- a/drivers/dma-buf/dma-fence-array.c
> +++ b/drivers/dma-buf/dma-fence-array.c
> @@ -190,15 +190,13 @@ EXPORT_SYMBOL(dma_fence_array_alloc);
> * @fences: [in] array containing the fences
> * @context: [in] fence context to use
> * @seqno: [in] sequence number to use
> - * @signal_on_any: [in] signal on any fence in the array
> *
> * Implementation of @dma_fence_array_create without allocation. Useful to init
> * a preallocated dma fence array in the path of reclaim or dma fence signaling.
> */
> void dma_fence_array_init(struct dma_fence_array *array,
> int num_fences, struct dma_fence **fences,
> - u64 context, unsigned seqno,
> - bool signal_on_any)
> + u64 context, unsigned seqno)
> {
> static struct lock_class_key dma_fence_array_lock_key;
>
> @@ -222,7 +220,7 @@ void dma_fence_array_init(struct dma_fence_array *array,
> */
> lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>
> - atomic_set(&array->num_pending, signal_on_any ? 1 : num_fences);
> + atomic_set(&array->num_pending, num_fences);
> array->fences = fences;
>
> array->base.error = PENDING_ERROR;
> @@ -249,7 +247,6 @@ EXPORT_SYMBOL(dma_fence_array_init);
> * @fences: [in] array containing the fences
> * @context: [in] fence context to use
> * @seqno: [in] sequence number to use
> - * @signal_on_any: [in] signal on any fence in the array
> *
> * Allocate a dma_fence_array object and initialize the base fence with
> * dma_fence_init().
> @@ -264,8 +261,7 @@ EXPORT_SYMBOL(dma_fence_array_init);
> */
> struct dma_fence_array *dma_fence_array_create(int num_fences,
> struct dma_fence **fences,
> - u64 context, unsigned seqno,
> - bool signal_on_any)
> + u64 context, unsigned seqno)
> {
> struct dma_fence_array *array;
>
> @@ -273,8 +269,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
> if (!array)
> return NULL;
>
> - dma_fence_array_init(array, num_fences, fences,
> - context, seqno, signal_on_any);
> + dma_fence_array_init(array, num_fences, fences, context, seqno);
>
> return array;
> }
> diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c
> index 07fe9bf45aea..53bb40e70b27 100644
> --- a/drivers/dma-buf/dma-fence-unwrap.c
> +++ b/drivers/dma-buf/dma-fence-unwrap.c
> @@ -180,8 +180,7 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
>
> if (count > 1) {
> result = dma_fence_array_create(count, array,
> - dma_fence_context_alloc(1),
> - 1, false);
> + dma_fence_context_alloc(1), 1);
> if (!result) {
> for (i = 0; i < count; i++)
> dma_fence_put(array[i]);
> diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
> index ce9e6c04897f..39a92d9f2413 100644
> --- a/drivers/dma-buf/dma-resv.c
> +++ b/drivers/dma-buf/dma-resv.c
> @@ -648,8 +648,7 @@ int dma_resv_get_singleton(struct dma_resv *obj, enum dma_resv_usage usage,
> }
>
> array = dma_fence_array_create(count, fences,
> - dma_fence_context_alloc(1),
> - 1, false);
> + dma_fence_context_alloc(1), 1);
> if (!array) {
> while (count--)
> dma_fence_put(fences[count]);
> diff --git a/drivers/dma-buf/st-dma-fence-unwrap.c b/drivers/dma-buf/st-dma-fence-unwrap.c
> index 51c87869b7b8..4e7ee25372ba 100644
> --- a/drivers/dma-buf/st-dma-fence-unwrap.c
> +++ b/drivers/dma-buf/st-dma-fence-unwrap.c
> @@ -64,7 +64,7 @@ static struct dma_fence *mock_array(unsigned int num_fences, ...)
>
> array = dma_fence_array_create(num_fences, fences,
> dma_fence_context_alloc(1),
> - 1, false);
> + 1);
> if (!array)
> goto error_free;
> return &array->base;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 942f4eed817f..4a1a9031f9db 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -3205,8 +3205,7 @@ eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
> fence_array = dma_fence_array_create(eb->num_batches,
> fences,
> eb->context->parallel.fence_context,
> - eb->context->parallel.seqno++,
> - false);
> + eb->context->parallel.seqno++);
> if (!fence_array) {
> kfree(fences);
> return ERR_PTR(-ENOMEM);
> diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
> index 24d6d9af20d6..37866768d64c 100644
> --- a/drivers/gpu/drm/xe/xe_sync.c
> +++ b/drivers/gpu/drm/xe/xe_sync.c
> @@ -376,7 +376,7 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
> xe_assert(vm->xe, current_fence == num_fence);
> cf = dma_fence_array_create(num_fence, fences,
> dma_fence_context_alloc(1),
> - 1, false);
> + 1);
> if (!cf)
> goto err_out;
>
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 56e2db50bb36..8f472911469d 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3370,7 +3370,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
> goto err_trace;
> }
>
> - cf = dma_fence_array_alloc(n_fence);
> + cf = dma_fence_array_alloc();
Patch splitting mistake here.
The rest LGTM. So with this hunk dropped:
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Regards,
Tvrtko
> if (!cf) {
> fence = ERR_PTR(-ENOMEM);
> goto err_out;
> @@ -3414,7 +3414,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>
> xe_assert(vm->xe, current_fence == n_fence);
> dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
> - 1, false);
> + 1);
> fence = &cf->base;
>
> for_each_tile(tile, vm->xe, id) {
> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
> index 370b3d2bba37..1b1d87579c38 100644
> --- a/include/linux/dma-fence-array.h
> +++ b/include/linux/dma-fence-array.h
> @@ -81,13 +81,11 @@ to_dma_fence_array(struct dma_fence *fence)
> struct dma_fence_array *dma_fence_array_alloc(int num_fences);
> void dma_fence_array_init(struct dma_fence_array *array,
> int num_fences, struct dma_fence **fences,
> - u64 context, unsigned seqno,
> - bool signal_on_any);
> + u64 context, unsigned seqno);
>
> struct dma_fence_array *dma_fence_array_create(int num_fences,
> struct dma_fence **fences,
> - u64 context, unsigned seqno,
> - bool signal_on_any);
> + u64 context, unsigned seqno);
>
> bool dma_fence_match_context(struct dma_fence *fence, u64 context);
>
* Re: [PATCH 2/2] dma-buf/dma_fence_array: optimize handling
2026-04-22 10:30 ` [PATCH 2/2] dma-buf/dma_fence_array: optimize handling Christian König
@ 2026-04-22 11:37 ` Tvrtko Ursulin
2026-05-04 14:55 ` Christian König
0 siblings, 1 reply; 7+ messages in thread
From: Tvrtko Ursulin @ 2026-04-22 11:37 UTC (permalink / raw)
To: Christian König, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig
On 22/04/2026 11:30, Christian König wrote:
> Removing the signal on any feature allows to simplfy the dma_fence_array
> code a lot and saves us from the need to install a callback on all fences
> at the same time.
>
> This results in less memory and CPU overhead.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
> drivers/dma-buf/dma-fence-array.c | 130 +++++++++++++-----------------
> include/linux/dma-fence-array.h | 22 ++---
> 2 files changed, 59 insertions(+), 93 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
> index 5e10e8df372f..f1b4b3296c87 100644
> --- a/drivers/dma-buf/dma-fence-array.c
> +++ b/drivers/dma-buf/dma-fence-array.c
> @@ -42,97 +42,80 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
> cmpxchg(&array->base.error, PENDING_ERROR, 0);
> }
>
> -static void irq_dma_fence_array_work(struct irq_work *wrk)
> +static void dma_fence_array_cb_func(struct dma_fence *f,
> + struct dma_fence_cb *cb)
> {
> - struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
> + struct dma_fence_array *array =
> + container_of(cb, struct dma_fence_array, callback);
>
> - dma_fence_array_clear_pending_error(array);
> + irq_work_queue(&array->work);
> +}
> +
> +static void dma_fence_array_arm_cb(struct dma_fence_array *array)
> +{
> + while (array->num_pending) {
> + struct dma_fence *f = array->fences[array->num_pending - 1];
> +
> + if (!dma_fence_add_callback(f, &array->callback,
> + dma_fence_array_cb_func))
> + return;
> +
> + dma_fence_array_set_pending_error(array, f->error);
> + WRITE_ONCE(array->num_pending, array->num_pending - 1);
Do you think the WRITE_ONCEs are needed? The loop will restart with an
un-annotated read anyway, and beyond that I don't think it can be
compiled away in the kernel with this usage pattern. Maybe I am mistaken.
> + }
>
> dma_fence_signal(&array->base);
> dma_fence_put(&array->base);
> }
>
> -static void dma_fence_array_cb_func(struct dma_fence *f,
> - struct dma_fence_cb *cb)
> +static void dma_fence_array_irq_work(struct irq_work *wrk)
> {
> - struct dma_fence_array_cb *array_cb =
> - container_of(cb, struct dma_fence_array_cb, cb);
> - struct dma_fence_array *array = array_cb->array;
> -
> - dma_fence_array_set_pending_error(array, f->error);
> + struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>
> - if (atomic_dec_and_test(&array->num_pending))
> - irq_work_queue(&array->work);
> - else
> - dma_fence_put(&array->base);
> + WRITE_ONCE(array->num_pending, array->num_pending - 1);
> + dma_fence_array_arm_cb(array);
So for x86, going from one irqwork latency to num_fences latencies is
probably passable, but I am not sure how other architectures fare.
> }
>
> static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
> {
> struct dma_fence_array *array = to_dma_fence_array(fence);
> - struct dma_fence_array_cb *cb = array->callbacks;
> - unsigned i;
> -
> - for (i = 0; i < array->num_fences; ++i) {
> - cb[i].array = array;
> - /*
> - * As we may report that the fence is signaled before all
> - * callbacks are complete, we need to take an additional
> - * reference count on the array so that we do not free it too
> - * early. The core fence handling will only hold the reference
> - * until we signal the array as complete (but that is now
> - * insufficient).
> - */
> - dma_fence_get(&array->base);
> - if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
> - dma_fence_array_cb_func)) {
> - int error = array->fences[i]->error;
> -
> - dma_fence_array_set_pending_error(array, error);
> - dma_fence_put(&array->base);
> - if (atomic_dec_and_test(&array->num_pending)) {
> - dma_fence_array_clear_pending_error(array);
> - return false;
> - }
> - }
> - }
>
> + /*
> + * As we may report that the fence is signaled before all
> + * callbacks are complete, we need to take an additional
> + * reference count on the array so that we do not free it too
> + * early. The core fence handling will only hold the reference
> + * until we signal the array as complete (but that is now
> + * insufficient).
> + */
> + dma_fence_get(&array->base);
> + dma_fence_array_arm_cb(array);
> return true;
Are you sure it is safe to always return true?
Regards,
Tvrtko
> }
>
> static bool dma_fence_array_signaled(struct dma_fence *fence)
> {
> struct dma_fence_array *array = to_dma_fence_array(fence);
> - int num_pending;
> + int num_pending, error = 0;
> unsigned int i;
>
> /*
> - * We need to read num_pending before checking the enable_signal bit
> - * to avoid racing with the enable_signaling() implementation, which
> - * might decrement the counter, and cause a partial check.
> - * atomic_read_acquire() pairs with atomic_dec_and_test() in
> - * dma_fence_array_enable_signaling()
> - *
> - * The !--num_pending check is here to account for the any_signaled case
> - * if we race with enable_signaling(), that means the !num_pending check
> - * in the is_signalling_enabled branch might be outdated (num_pending
> - * might have been decremented), but that's fine. The user will get the
> - * right value when testing again later.
> + * Reading num_pending without a memory barrier here is correct since
> + * that is only for optimization, it is perfectly acceptable to have a
> + * stale value for it. In all other cases num_pending is accessed by a
> + * single call chain.
> */
> - num_pending = atomic_read_acquire(&array->num_pending);
> - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
> - if (num_pending <= 0)
> - goto signal;
> - return false;
> - }
> + num_pending = READ_ONCE(array->num_pending);
> + for (i = 0; i < num_pending; ++i) {
> + struct dma_fence *f = array->fences[i];
>
> - for (i = 0; i < array->num_fences; ++i) {
> - if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
> - goto signal;
> - }
> - return false;
> + if (!dma_fence_is_signaled(f))
> + return false;
>
> -signal:
> + if (!error)
> + error = f->error;
> + }
> + dma_fence_array_set_pending_error(array, error);
> dma_fence_array_clear_pending_error(array);
> return true;
> }
> @@ -171,15 +154,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
>
> /**
> * dma_fence_array_alloc - Allocate a custom fence array
> - * @num_fences: [in] number of fences to add in the array
> *
> * Return dma fence array on success, NULL on failure
> */
> -struct dma_fence_array *dma_fence_array_alloc(int num_fences)
> +struct dma_fence_array *dma_fence_array_alloc(void)
> {
> - struct dma_fence_array *array;
> -
> - return kzalloc_flex(*array, callbacks, num_fences);
> + return kzalloc_obj(struct dma_fence_array);
> }
> EXPORT_SYMBOL(dma_fence_array_alloc);
>
> @@ -203,10 +183,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
> WARN_ON(!num_fences || !fences);
>
> array->num_fences = num_fences;
> + array->num_pending = num_fences;
> + array->fences = fences;
> + array->base.error = PENDING_ERROR;
>
> dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
> seqno);
> - init_irq_work(&array->work, irq_dma_fence_array_work);
> + init_irq_work(&array->work, dma_fence_array_irq_work);
>
> /*
> * dma_fence_array_enable_signaling() is invoked while holding
> @@ -220,11 +203,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
> */
> lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>
> - atomic_set(&array->num_pending, num_fences);
> - array->fences = fences;
> -
> - array->base.error = PENDING_ERROR;
> -
> /*
> * dma_fence_array objects should never contain any other fence
> * containers or otherwise we run into recursion and potential kernel
> @@ -265,7 +243,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
> {
> struct dma_fence_array *array;
>
> - array = dma_fence_array_alloc(num_fences);
> + array = dma_fence_array_alloc();
> if (!array)
> return NULL;
>
> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
> index 1b1d87579c38..3ee55c0e2fa4 100644
> --- a/include/linux/dma-fence-array.h
> +++ b/include/linux/dma-fence-array.h
> @@ -15,16 +15,6 @@
> #include <linux/dma-fence.h>
> #include <linux/irq_work.h>
>
> -/**
> - * struct dma_fence_array_cb - callback helper for fence array
> - * @cb: fence callback structure for signaling
> - * @array: reference to the parent fence array object
> - */
> -struct dma_fence_array_cb {
> - struct dma_fence_cb cb;
> - struct dma_fence_array *array;
> -};
> -
> /**
> * struct dma_fence_array - fence to represent an array of fences
> * @base: fence base class
> @@ -33,18 +23,17 @@ struct dma_fence_array_cb {
> * @num_pending: fences in the array still pending
> * @fences: array of the fences
> * @work: internal irq_work function
> - * @callbacks: array of callback helpers
> + * @callback: callback structure for signaling
> */
> struct dma_fence_array {
> struct dma_fence base;
>
> - unsigned num_fences;
> - atomic_t num_pending;
> + unsigned int num_fences;
> + unsigned int num_pending;
> struct dma_fence **fences;
>
> struct irq_work work;
> -
> - struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
> + struct dma_fence_cb callback;
> };
>
> /**
> @@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
> for (index = 0, fence = dma_fence_array_first(head); fence; \
> ++(index), fence = dma_fence_array_next(head, index))
>
> -struct dma_fence_array *dma_fence_array_alloc(int num_fences);
> +struct dma_fence_array *dma_fence_array_alloc(void);
> void dma_fence_array_init(struct dma_fence_array *array,
> int num_fences, struct dma_fence **fences,
> u64 context, unsigned seqno);
> -
> struct dma_fence_array *dma_fence_array_create(int num_fences,
> struct dma_fence **fences,
> u64 context, unsigned seqno);
* Re: [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3
2026-04-22 10:49 ` [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Tvrtko Ursulin
@ 2026-05-04 14:46 ` Christian König
0 siblings, 0 replies; 7+ messages in thread
From: Christian König @ 2026-05-04 14:46 UTC (permalink / raw)
To: Tvrtko Ursulin, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig
On 4/22/26 12:49, Tvrtko Ursulin wrote:
>
> On 22/04/2026 11:30, Christian König wrote:
...
>> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
>> index 56e2db50bb36..8f472911469d 100644
>> --- a/drivers/gpu/drm/xe/xe_vm.c
>> +++ b/drivers/gpu/drm/xe/xe_vm.c
>> @@ -3370,7 +3370,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>> goto err_trace;
>> }
>> - cf = dma_fence_array_alloc(n_fence);
>> + cf = dma_fence_array_alloc();
>
> Patch splitting mistake here.
Ah, thanks for pointing this out.
> The rest LGTM. So with this hunk dropped:
>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
I double checked the patch once more, compile tested it and then pushed the result.
Thanks,
Christian.
>
> Regards,
>
> Tvrtko
>
>> if (!cf) {
>> fence = ERR_PTR(-ENOMEM);
>> goto err_out;
>> @@ -3414,7 +3414,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>> xe_assert(vm->xe, current_fence == n_fence);
>> dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
>> - 1, false);
>> + 1);
>> fence = &cf->base;
>> for_each_tile(tile, vm->xe, id) {
>> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
>> index 370b3d2bba37..1b1d87579c38 100644
>> --- a/include/linux/dma-fence-array.h
>> +++ b/include/linux/dma-fence-array.h
>> @@ -81,13 +81,11 @@ to_dma_fence_array(struct dma_fence *fence)
>> struct dma_fence_array *dma_fence_array_alloc(int num_fences);
>> void dma_fence_array_init(struct dma_fence_array *array,
>> int num_fences, struct dma_fence **fences,
>> - u64 context, unsigned seqno,
>> - bool signal_on_any);
>> + u64 context, unsigned seqno);
>> struct dma_fence_array *dma_fence_array_create(int num_fences,
>> struct dma_fence **fences,
>> - u64 context, unsigned seqno,
>> - bool signal_on_any);
>> + u64 context, unsigned seqno);
>> bool dma_fence_match_context(struct dma_fence *fence, u64 context);
>>
>
* Re: [PATCH 2/2] dma-buf/dma_fence_array: optimize handling
2026-04-22 11:37 ` Tvrtko Ursulin
@ 2026-05-04 14:55 ` Christian König
2026-05-04 15:55 ` Tvrtko Ursulin
0 siblings, 1 reply; 7+ messages in thread
From: Christian König @ 2026-05-04 14:55 UTC (permalink / raw)
To: Tvrtko Ursulin, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig
On 4/22/26 13:37, Tvrtko Ursulin wrote:
>
> On 22/04/2026 11:30, Christian König wrote:
>> Removing the signal on any feature allows to simplfy the dma_fence_array
>> code a lot and saves us from the need to install a callback on all fences
>> at the same time.
>>
>> This results in less memory and CPU overhead.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>> drivers/dma-buf/dma-fence-array.c | 130 +++++++++++++-----------------
>> include/linux/dma-fence-array.h | 22 ++---
>> 2 files changed, 59 insertions(+), 93 deletions(-)
>>
>> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
>> index 5e10e8df372f..f1b4b3296c87 100644
>> --- a/drivers/dma-buf/dma-fence-array.c
>> +++ b/drivers/dma-buf/dma-fence-array.c
>> @@ -42,97 +42,80 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
>> cmpxchg(&array->base.error, PENDING_ERROR, 0);
>> }
>> -static void irq_dma_fence_array_work(struct irq_work *wrk)
>> +static void dma_fence_array_cb_func(struct dma_fence *f,
>> + struct dma_fence_cb *cb)
>> {
>> - struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>> + struct dma_fence_array *array =
>> + container_of(cb, struct dma_fence_array, callback);
>> - dma_fence_array_clear_pending_error(array);
>> + irq_work_queue(&array->work);
>> +}
>> +
>> +static void dma_fence_array_arm_cb(struct dma_fence_array *array)
>> +{
>> + while (array->num_pending) {
>> + struct dma_fence *f = array->fences[array->num_pending - 1];
>> +
>> + if (!dma_fence_add_callback(f, &array->callback,
>> + dma_fence_array_cb_func))
>> + return;
>> +
>> + dma_fence_array_set_pending_error(array, f->error);
>> + WRITE_ONCE(array->num_pending, array->num_pending - 1);
>
> Do you think the WRITE_ONCEs are needed? As the loop will restart with un-annotated read anyway, but not just that, I don't think it can be compiled away in the kernel with this usage pattern. Maybe I am mistaken.
I also think the WRITE_ONCEs are superfluous, but I wanted to be on the safe side so that the compiler doesn't re-order the write before the dma_fence_add_callback(), and it shouldn't matter for performance.
>
>> + }
>> dma_fence_signal(&array->base);
>> dma_fence_put(&array->base);
>> }
>> -static void dma_fence_array_cb_func(struct dma_fence *f,
>> - struct dma_fence_cb *cb)
>> +static void dma_fence_array_irq_work(struct irq_work *wrk)
>> {
>> - struct dma_fence_array_cb *array_cb =
>> - container_of(cb, struct dma_fence_array_cb, cb);
>> - struct dma_fence_array *array = array_cb->array;
>> -
>> - dma_fence_array_set_pending_error(array, f->error);
>> + struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>> - if (atomic_dec_and_test(&array->num_pending))
>> - irq_work_queue(&array->work);
>> - else
>> - dma_fence_put(&array->base);
>> + WRITE_ONCE(array->num_pending, array->num_pending - 1);
>> + dma_fence_array_arm_cb(array);
>
> So for x86 going from one irqwork latency to num_fences latencies is probably passable but I am not sure how other architectures fare.
Mhm, what do you mean?
Previously we started one irqwork handler for each unsignaled fence when enable_signaling was called, but now we reduce that to starting only one for each unsignaled fence once the previous fence has finished.
As far as I can see that is always better or at least the same overhead.
>
>> }
>> static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
>> {
>> struct dma_fence_array *array = to_dma_fence_array(fence);
>> - struct dma_fence_array_cb *cb = array->callbacks;
>> - unsigned i;
>> -
>> - for (i = 0; i < array->num_fences; ++i) {
>> - cb[i].array = array;
>> - /*
>> - * As we may report that the fence is signaled before all
>> - * callbacks are complete, we need to take an additional
>> - * reference count on the array so that we do not free it too
>> - * early. The core fence handling will only hold the reference
>> - * until we signal the array as complete (but that is now
>> - * insufficient).
>> - */
>> - dma_fence_get(&array->base);
>> - if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
>> - dma_fence_array_cb_func)) {
>> - int error = array->fences[i]->error;
>> -
>> - dma_fence_array_set_pending_error(array, error);
>> - dma_fence_put(&array->base);
>> - if (atomic_dec_and_test(&array->num_pending)) {
>> - dma_fence_array_clear_pending_error(array);
>> - return false;
>> - }
>> - }
>> - }
>> + /*
>> + * As we may report that the fence is signaled before all
>> + * callbacks are complete, we need to take an additional
>> + * reference count on the array so that we do not free it too
>> + * early. The core fence handling will only hold the reference
>> + * until we signal the array as complete (but that is now
>> + * insufficient).
>> + */
>> + dma_fence_get(&array->base);
>> + dma_fence_array_arm_cb(array);
>> return true;
>
> Are you sure it is safe to always return true?
Oh, good point!
It is safe to return true here, but it is not safe to call dma_fence_array_arm_cb() because that could signal the fence and result in double locking!
Going to fix that, thanks.
Regards,
Christian.
>
> Regards,
>
> Tvrtko
>
>> }
>> static bool dma_fence_array_signaled(struct dma_fence *fence)
>> {
>> struct dma_fence_array *array = to_dma_fence_array(fence);
>> - int num_pending;
>> + int num_pending, error = 0;
>> unsigned int i;
>> /*
>> - * We need to read num_pending before checking the enable_signal bit
>> - * to avoid racing with the enable_signaling() implementation, which
>> - * might decrement the counter, and cause a partial check.
>> - * atomic_read_acquire() pairs with atomic_dec_and_test() in
>> - * dma_fence_array_enable_signaling()
>> - *
>> - * The !--num_pending check is here to account for the any_signaled case
>> - * if we race with enable_signaling(), that means the !num_pending check
>> - * in the is_signalling_enabled branch might be outdated (num_pending
>> - * might have been decremented), but that's fine. The user will get the
>> - * right value when testing again later.
>> + * Reading num_pending without a memory barrier here is correct since
>> + * that is only for optimization, it is perfectly acceptable to have a
>> + * stale value for it. In all other cases num_pending is accessed by a
>> + * single call chain.
>> */
>> - num_pending = atomic_read_acquire(&array->num_pending);
>> - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
>> - if (num_pending <= 0)
>> - goto signal;
>> - return false;
>> - }
>> + num_pending = READ_ONCE(array->num_pending);
>> + for (i = 0; i < num_pending; ++i) {
>> + struct dma_fence *f = array->fences[i];
>> - for (i = 0; i < array->num_fences; ++i) {
>> - if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
>> - goto signal;
>> - }
>> - return false;
>> + if (!dma_fence_is_signaled(f))
>> + return false;
>> -signal:
>> + if (!error)
>> + error = f->error;
>> + }
>> + dma_fence_array_set_pending_error(array, error);
>> dma_fence_array_clear_pending_error(array);
>> return true;
>> }
>> @@ -171,15 +154,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
>> /**
>> * dma_fence_array_alloc - Allocate a custom fence array
>> - * @num_fences: [in] number of fences to add in the array
>> *
>> * Return dma fence array on success, NULL on failure
>> */
>> -struct dma_fence_array *dma_fence_array_alloc(int num_fences)
>> +struct dma_fence_array *dma_fence_array_alloc(void)
>> {
>> - struct dma_fence_array *array;
>> -
>> - return kzalloc_flex(*array, callbacks, num_fences);
>> + return kzalloc_obj(struct dma_fence_array);
>> }
>> EXPORT_SYMBOL(dma_fence_array_alloc);
>> @@ -203,10 +183,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
>> WARN_ON(!num_fences || !fences);
>> array->num_fences = num_fences;
>> + array->num_pending = num_fences;
>> + array->fences = fences;
>> + array->base.error = PENDING_ERROR;
>> dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
>> seqno);
>> - init_irq_work(&array->work, irq_dma_fence_array_work);
>> + init_irq_work(&array->work, dma_fence_array_irq_work);
>> /*
>> * dma_fence_array_enable_signaling() is invoked while holding
>> @@ -220,11 +203,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
>> */
>> lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>> - atomic_set(&array->num_pending, num_fences);
>> - array->fences = fences;
>> -
>> - array->base.error = PENDING_ERROR;
>> -
>> /*
>> * dma_fence_array objects should never contain any other fence
>> * containers or otherwise we run into recursion and potential kernel
>> @@ -265,7 +243,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
>> {
>> struct dma_fence_array *array;
>> - array = dma_fence_array_alloc(num_fences);
>> + array = dma_fence_array_alloc();
>> if (!array)
>> return NULL;
>> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
>> index 1b1d87579c38..3ee55c0e2fa4 100644
>> --- a/include/linux/dma-fence-array.h
>> +++ b/include/linux/dma-fence-array.h
>> @@ -15,16 +15,6 @@
>> #include <linux/dma-fence.h>
>> #include <linux/irq_work.h>
>> -/**
>> - * struct dma_fence_array_cb - callback helper for fence array
>> - * @cb: fence callback structure for signaling
>> - * @array: reference to the parent fence array object
>> - */
>> -struct dma_fence_array_cb {
>> - struct dma_fence_cb cb;
>> - struct dma_fence_array *array;
>> -};
>> -
>> /**
>> * struct dma_fence_array - fence to represent an array of fences
>> * @base: fence base class
>> @@ -33,18 +23,17 @@ struct dma_fence_array_cb {
>> * @num_pending: fences in the array still pending
>> * @fences: array of the fences
>> * @work: internal irq_work function
>> - * @callbacks: array of callback helpers
>> + * @callback: callback structure for signaling
>> */
>> struct dma_fence_array {
>> struct dma_fence base;
>> - unsigned num_fences;
>> - atomic_t num_pending;
>> + unsigned int num_fences;
>> + unsigned int num_pending;
>> struct dma_fence **fences;
>> struct irq_work work;
>> -
>> - struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
>> + struct dma_fence_cb callback;
>> };
>> /**
>> @@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
>> for (index = 0, fence = dma_fence_array_first(head); fence; \
>> ++(index), fence = dma_fence_array_next(head, index))
>> -struct dma_fence_array *dma_fence_array_alloc(int num_fences);
>> +struct dma_fence_array *dma_fence_array_alloc(void);
>> void dma_fence_array_init(struct dma_fence_array *array,
>> int num_fences, struct dma_fence **fences,
>> u64 context, unsigned seqno);
>> -
>> struct dma_fence_array *dma_fence_array_create(int num_fences,
>> struct dma_fence **fences,
>> u64 context, unsigned seqno);
>
* Re: [PATCH 2/2] dma-buf/dma_fence_array: optimize handling
2026-05-04 14:55 ` Christian König
@ 2026-05-04 15:55 ` Tvrtko Ursulin
0 siblings, 0 replies; 7+ messages in thread
From: Tvrtko Ursulin @ 2026-05-04 15:55 UTC (permalink / raw)
To: Christian König, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig
On 04/05/2026 15:55, Christian König wrote:
> On 4/22/26 13:37, Tvrtko Ursulin wrote:
>>
>> On 22/04/2026 11:30, Christian König wrote:
>>> Removing the signal on any feature allows to simplfy the dma_fence_array
>>> code a lot and saves us from the need to install a callback on all fences
>>> at the same time.
>>>
>>> This results in less memory and CPU overhead.
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> ---
>>> drivers/dma-buf/dma-fence-array.c | 130 +++++++++++++-----------------
>>> include/linux/dma-fence-array.h | 22 ++---
>>> 2 files changed, 59 insertions(+), 93 deletions(-)
>>>
>>> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
>>> index 5e10e8df372f..f1b4b3296c87 100644
>>> --- a/drivers/dma-buf/dma-fence-array.c
>>> +++ b/drivers/dma-buf/dma-fence-array.c
>>> @@ -42,97 +42,80 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
>>> cmpxchg(&array->base.error, PENDING_ERROR, 0);
>>> }
>>> -static void irq_dma_fence_array_work(struct irq_work *wrk)
>>> +static void dma_fence_array_cb_func(struct dma_fence *f,
>>> + struct dma_fence_cb *cb)
>>> {
>>> - struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>>> + struct dma_fence_array *array =
>>> + container_of(cb, struct dma_fence_array, callback);
>>> - dma_fence_array_clear_pending_error(array);
>>> + irq_work_queue(&array->work);
>>> +}
>>> +
>>> +static void dma_fence_array_arm_cb(struct dma_fence_array *array)
>>> +{
>>> + while (array->num_pending) {
>>> + struct dma_fence *f = array->fences[array->num_pending - 1];
>>> +
>>> + if (!dma_fence_add_callback(f, &array->callback,
>>> + dma_fence_array_cb_func))
>>> + return;
>>> +
>>> + dma_fence_array_set_pending_error(array, f->error);
>>> + WRITE_ONCE(array->num_pending, array->num_pending - 1);
>>
>> Do you think the WRITE_ONCEs are needed? As the loop will restart with un-annotated read anyway, but not just that, I don't think it can be compiled away in the kernel with this usage pattern. Maybe I am mistaken.
>
> I also think the WRITE_ONCEs are superfluous. But I wanted to be on the save side, not that the compiler re-orders the write before the dma_fence_add_callback() and it shouldn't matter for performance.
Hm okay, but I am on the fence here; if they stay they may mislead the
reader into thinking they are required.
>
>>
>>> + }
>>> dma_fence_signal(&array->base);
>>> dma_fence_put(&array->base);
>>> }
>>> -static void dma_fence_array_cb_func(struct dma_fence *f,
>>> - struct dma_fence_cb *cb)
>>> +static void dma_fence_array_irq_work(struct irq_work *wrk)
>>> {
>>> - struct dma_fence_array_cb *array_cb =
>>> - container_of(cb, struct dma_fence_array_cb, cb);
>>> - struct dma_fence_array *array = array_cb->array;
>>> -
>>> - dma_fence_array_set_pending_error(array, f->error);
>>> + struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>>> - if (atomic_dec_and_test(&array->num_pending))
>>> - irq_work_queue(&array->work);
>>> - else
>>> - dma_fence_put(&array->base);
>>> + WRITE_ONCE(array->num_pending, array->num_pending - 1);
>>> + dma_fence_array_arm_cb(array);
>>
>> So for x86 going from one irqwork latency to num_fences latencies is probably passable but I am not sure how other architectures fare.
>
> Mhm, what do you mean?
>
> Previously we started one irqwork handler for each not signaled fence when enable_signaling was called, but now we reduce that to only starting one for each not signaled fence when the previous fence has finished.
>
> As far as I can see that is always better or at least the same overhead.
Currently irq work latency is once per array:
static void dma_fence_array_cb_func(struct dma_fence *f,
struct dma_fence_cb *cb)
{
...
if (atomic_dec_and_test(&array->num_pending))
irq_work_queue(&array->work);
So only when the last one signals.
With this patch (AFAIR from when I reviewed it; the diff is a bit
difficult to read so I applied it), it is a "chain" of irq work and
callbacks. One by one, as you say: the irq work installs the callback on
the next fence, which, when signalled, queues the irq work again.
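Roughly, the worst-case flow after this patch looks like this (my own
simplified sketch, assuming N initially unsignaled fences):

	fence[N-1] signals -> dma_fence_array_cb_func() -> irq_work_queue()
	irq work runs      -> num_pending--, callback armed on fence[N-2]
	...
	fence[0] signals   -> irq work runs -> dma_fence_signal(&array->base)

So the array signal can end up behind up to num_fences chained irq work
invocations instead of a single one at the end.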
Regards,
Tvrtko
>>
>>> }
>>> static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
>>> {
>>> struct dma_fence_array *array = to_dma_fence_array(fence);
>>> - struct dma_fence_array_cb *cb = array->callbacks;
>>> - unsigned i;
>>> -
>>> - for (i = 0; i < array->num_fences; ++i) {
>>> - cb[i].array = array;
>>> - /*
>>> - * As we may report that the fence is signaled before all
>>> - * callbacks are complete, we need to take an additional
>>> - * reference count on the array so that we do not free it too
>>> - * early. The core fence handling will only hold the reference
>>> - * until we signal the array as complete (but that is now
>>> - * insufficient).
>>> - */
>>> - dma_fence_get(&array->base);
>>> - if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
>>> - dma_fence_array_cb_func)) {
>>> - int error = array->fences[i]->error;
>>> -
>>> - dma_fence_array_set_pending_error(array, error);
>>> - dma_fence_put(&array->base);
>>> - if (atomic_dec_and_test(&array->num_pending)) {
>>> - dma_fence_array_clear_pending_error(array);
>>> - return false;
>>> - }
>>> - }
>>> - }
>>> + /*
>>> + * As we may report that the fence is signaled before all
>>> + * callbacks are complete, we need to take an additional
>>> + * reference count on the array so that we do not free it too
>>> + * early. The core fence handling will only hold the reference
>>> + * until we signal the array as complete (but that is now
>>> + * insufficient).
>>> + */
>>> + dma_fence_get(&array->base);
>>> + dma_fence_array_arm_cb(array);
>>> return true;
>>
>> Are you sure it is safe to always return true?
>
> Oh, good point!
>
> It is safe to return true here, but it is not save to call dma_fence_array_arm_cb() because that could signal the fence and result in double locking!
>
> Going to fix that, thanks.
>
> Regards,
> Christian.
>
>>
>> Regards,
>>
>> Tvrtko
>>
>>> }
>>> static bool dma_fence_array_signaled(struct dma_fence *fence)
>>> {
>>> struct dma_fence_array *array = to_dma_fence_array(fence);
>>> - int num_pending;
>>> + int num_pending, error = 0;
>>> unsigned int i;
>>> /*
>>> - * We need to read num_pending before checking the enable_signal bit
>>> - * to avoid racing with the enable_signaling() implementation, which
>>> - * might decrement the counter, and cause a partial check.
>>> - * atomic_read_acquire() pairs with atomic_dec_and_test() in
>>> - * dma_fence_array_enable_signaling()
>>> - *
>>> - * The !--num_pending check is here to account for the any_signaled case
>>> - * if we race with enable_signaling(), that means the !num_pending check
>>> - * in the is_signalling_enabled branch might be outdated (num_pending
>>> - * might have been decremented), but that's fine. The user will get the
>>> - * right value when testing again later.
>>> + * Reading num_pending without a memory barrier here is correct since
>>> + * that is only for optimization, it is perfectly acceptable to have a
>>> + * stale value for it. In all other cases num_pending is accessed by a
>>> + * single call chain.
>>> */
>>> - num_pending = atomic_read_acquire(&array->num_pending);
>>> - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
>>> - if (num_pending <= 0)
>>> - goto signal;
>>> - return false;
>>> - }
>>> + num_pending = READ_ONCE(array->num_pending);
>>> + for (i = 0; i < num_pending; ++i) {
>>> + struct dma_fence *f = array->fences[i];
>>> - for (i = 0; i < array->num_fences; ++i) {
>>> - if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
>>> - goto signal;
>>> - }
>>> - return false;
>>> + if (!dma_fence_is_signaled(f))
>>> + return false;
>>> -signal:
>>> + if (!error)
>>> + error = f->error;
>>> + }
>>> + dma_fence_array_set_pending_error(array, error);
>>> dma_fence_array_clear_pending_error(array);
>>> return true;
>>> }
>>> @@ -171,15 +154,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
>>> /**
>>> * dma_fence_array_alloc - Allocate a custom fence array
>>> - * @num_fences: [in] number of fences to add in the array
>>> *
>>> * Return dma fence array on success, NULL on failure
>>> */
>>> -struct dma_fence_array *dma_fence_array_alloc(int num_fences)
>>> +struct dma_fence_array *dma_fence_array_alloc(void)
>>> {
>>> - struct dma_fence_array *array;
>>> -
>>> - return kzalloc_flex(*array, callbacks, num_fences);
>>> + return kzalloc_obj(struct dma_fence_array);
>>> }
>>> EXPORT_SYMBOL(dma_fence_array_alloc);
>>> @@ -203,10 +183,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
>>> WARN_ON(!num_fences || !fences);
>>> array->num_fences = num_fences;
>>> + array->num_pending = num_fences;
>>> + array->fences = fences;
>>> + array->base.error = PENDING_ERROR;
>>> dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
>>> seqno);
>>> - init_irq_work(&array->work, irq_dma_fence_array_work);
>>> + init_irq_work(&array->work, dma_fence_array_irq_work);
>>> /*
>>> * dma_fence_array_enable_signaling() is invoked while holding
>>> @@ -220,11 +203,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
>>> */
>>> lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>>> - atomic_set(&array->num_pending, num_fences);
>>> - array->fences = fences;
>>> -
>>> - array->base.error = PENDING_ERROR;
>>> -
>>> /*
>>> * dma_fence_array objects should never contain any other fence
>>> * containers or otherwise we run into recursion and potential kernel
>>> @@ -265,7 +243,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
>>> {
>>> struct dma_fence_array *array;
>>> - array = dma_fence_array_alloc(num_fences);
>>> + array = dma_fence_array_alloc();
>>> if (!array)
>>> return NULL;
>>> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
>>> index 1b1d87579c38..3ee55c0e2fa4 100644
>>> --- a/include/linux/dma-fence-array.h
>>> +++ b/include/linux/dma-fence-array.h
>>> @@ -15,16 +15,6 @@
>>> #include <linux/dma-fence.h>
>>> #include <linux/irq_work.h>
>>> -/**
>>> - * struct dma_fence_array_cb - callback helper for fence array
>>> - * @cb: fence callback structure for signaling
>>> - * @array: reference to the parent fence array object
>>> - */
>>> -struct dma_fence_array_cb {
>>> - struct dma_fence_cb cb;
>>> - struct dma_fence_array *array;
>>> -};
>>> -
>>> /**
>>> * struct dma_fence_array - fence to represent an array of fences
>>> * @base: fence base class
>>> @@ -33,18 +23,17 @@ struct dma_fence_array_cb {
>>> * @num_pending: fences in the array still pending
>>> * @fences: array of the fences
>>> * @work: internal irq_work function
>>> - * @callbacks: array of callback helpers
>>> + * @callback: callback structure for signaling
>>> */
>>> struct dma_fence_array {
>>> struct dma_fence base;
>>> - unsigned num_fences;
>>> - atomic_t num_pending;
>>> + unsigned int num_fences;
>>> + unsigned int num_pending;
>>> struct dma_fence **fences;
>>> struct irq_work work;
>>> -
>>> - struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
>>> + struct dma_fence_cb callback;
>>> };
>>> /**
>>> @@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
>>> for (index = 0, fence = dma_fence_array_first(head); fence; \
>>> ++(index), fence = dma_fence_array_next(head, index))
>>> -struct dma_fence_array *dma_fence_array_alloc(int num_fences);
>>> +struct dma_fence_array *dma_fence_array_alloc(void);
>>> void dma_fence_array_init(struct dma_fence_array *array,
>>> int num_fences, struct dma_fence **fences,
>>> u64 context, unsigned seqno);
>>> -
>>> struct dma_fence_array *dma_fence_array_create(int num_fences,
>>> struct dma_fence **fences,
>>> u64 context, unsigned seqno);
>>
>