public inbox for linux-media@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3
@ 2026-04-22 10:30 Christian König
  2026-04-22 10:30 ` [PATCH 2/2] dma-buf/dma_fence_array: optimize handling Christian König
  2026-04-22 10:49 ` [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Tvrtko Ursulin
  0 siblings, 2 replies; 7+ messages in thread
From: Christian König @ 2026-04-22 10:30 UTC (permalink / raw)
  To: tursulin, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig

Amdgpu was the only user of the signal on any feature and we dropped
that use case recently, so we can remove that functionality.

v2: update num_pending only after the fence is signaled
v3: separate out simplifying dma_fence_array implementation

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-fence-array.c              | 13 ++++---------
 drivers/dma-buf/dma-fence-unwrap.c             |  3 +--
 drivers/dma-buf/dma-resv.c                     |  3 +--
 drivers/dma-buf/st-dma-fence-unwrap.c          |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |  3 +--
 drivers/gpu/drm/xe/xe_sync.c                   |  2 +-
 drivers/gpu/drm/xe/xe_vm.c                     |  4 ++--
 include/linux/dma-fence-array.h                |  6 ++----
 8 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
index 089f69469524..5e10e8df372f 100644
--- a/drivers/dma-buf/dma-fence-array.c
+++ b/drivers/dma-buf/dma-fence-array.c
@@ -190,15 +190,13 @@ EXPORT_SYMBOL(dma_fence_array_alloc);
  * @fences:		[in]	array containing the fences
  * @context:		[in]	fence context to use
  * @seqno:		[in]	sequence number to use
- * @signal_on_any:	[in]	signal on any fence in the array
  *
  * Implementation of @dma_fence_array_create without allocation. Useful to init
  * a preallocated dma fence array in the path of reclaim or dma fence signaling.
  */
 void dma_fence_array_init(struct dma_fence_array *array,
 			  int num_fences, struct dma_fence **fences,
-			  u64 context, unsigned seqno,
-			  bool signal_on_any)
+			  u64 context, unsigned seqno)
 {
 	static struct lock_class_key dma_fence_array_lock_key;
 
@@ -222,7 +220,7 @@ void dma_fence_array_init(struct dma_fence_array *array,
 	 */
 	lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
 
-	atomic_set(&array->num_pending, signal_on_any ? 1 : num_fences);
+	atomic_set(&array->num_pending, num_fences);
 	array->fences = fences;
 
 	array->base.error = PENDING_ERROR;
@@ -249,7 +247,6 @@ EXPORT_SYMBOL(dma_fence_array_init);
  * @fences:		[in]	array containing the fences
  * @context:		[in]	fence context to use
  * @seqno:		[in]	sequence number to use
- * @signal_on_any:	[in]	signal on any fence in the array
  *
  * Allocate a dma_fence_array object and initialize the base fence with
  * dma_fence_init().
@@ -264,8 +261,7 @@ EXPORT_SYMBOL(dma_fence_array_init);
  */
 struct dma_fence_array *dma_fence_array_create(int num_fences,
 					       struct dma_fence **fences,
-					       u64 context, unsigned seqno,
-					       bool signal_on_any)
+					       u64 context, unsigned seqno)
 {
 	struct dma_fence_array *array;
 
@@ -273,8 +269,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
 	if (!array)
 		return NULL;
 
-	dma_fence_array_init(array, num_fences, fences,
-			     context, seqno, signal_on_any);
+	dma_fence_array_init(array, num_fences, fences, context, seqno);
 
 	return array;
 }
diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c
index 07fe9bf45aea..53bb40e70b27 100644
--- a/drivers/dma-buf/dma-fence-unwrap.c
+++ b/drivers/dma-buf/dma-fence-unwrap.c
@@ -180,8 +180,7 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
 
 	if (count > 1) {
 		result = dma_fence_array_create(count, array,
-						dma_fence_context_alloc(1),
-						1, false);
+						dma_fence_context_alloc(1), 1);
 		if (!result) {
 			for (i = 0; i < count; i++)
 				dma_fence_put(array[i]);
diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index ce9e6c04897f..39a92d9f2413 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -648,8 +648,7 @@ int dma_resv_get_singleton(struct dma_resv *obj, enum dma_resv_usage usage,
 	}
 
 	array = dma_fence_array_create(count, fences,
-				       dma_fence_context_alloc(1),
-				       1, false);
+				       dma_fence_context_alloc(1), 1);
 	if (!array) {
 		while (count--)
 			dma_fence_put(fences[count]);
diff --git a/drivers/dma-buf/st-dma-fence-unwrap.c b/drivers/dma-buf/st-dma-fence-unwrap.c
index 51c87869b7b8..4e7ee25372ba 100644
--- a/drivers/dma-buf/st-dma-fence-unwrap.c
+++ b/drivers/dma-buf/st-dma-fence-unwrap.c
@@ -64,7 +64,7 @@ static struct dma_fence *mock_array(unsigned int num_fences, ...)
 
 	array = dma_fence_array_create(num_fences, fences,
 				       dma_fence_context_alloc(1),
-				       1, false);
+				       1);
 	if (!array)
 		goto error_free;
 	return &array->base;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 942f4eed817f..4a1a9031f9db 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -3205,8 +3205,7 @@ eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
 	fence_array = dma_fence_array_create(eb->num_batches,
 					     fences,
 					     eb->context->parallel.fence_context,
-					     eb->context->parallel.seqno++,
-					     false);
+					     eb->context->parallel.seqno++);
 	if (!fence_array) {
 		kfree(fences);
 		return ERR_PTR(-ENOMEM);
diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index 24d6d9af20d6..37866768d64c 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -376,7 +376,7 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
 		xe_assert(vm->xe, current_fence == num_fence);
 		cf = dma_fence_array_create(num_fence, fences,
 					    dma_fence_context_alloc(1),
-					    1, false);
+					    1);
 		if (!cf)
 			goto err_out;
 
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 56e2db50bb36..8f472911469d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3370,7 +3370,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
 		goto err_trace;
 	}
 
-	cf = dma_fence_array_alloc(n_fence);
+	cf = dma_fence_array_alloc();
 	if (!cf) {
 		fence = ERR_PTR(-ENOMEM);
 		goto err_out;
@@ -3414,7 +3414,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
 
 	xe_assert(vm->xe, current_fence == n_fence);
 	dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
-			     1, false);
+			     1);
 	fence = &cf->base;
 
 	for_each_tile(tile, vm->xe, id) {
diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
index 370b3d2bba37..1b1d87579c38 100644
--- a/include/linux/dma-fence-array.h
+++ b/include/linux/dma-fence-array.h
@@ -81,13 +81,11 @@ to_dma_fence_array(struct dma_fence *fence)
 struct dma_fence_array *dma_fence_array_alloc(int num_fences);
 void dma_fence_array_init(struct dma_fence_array *array,
 			  int num_fences, struct dma_fence **fences,
-			  u64 context, unsigned seqno,
-			  bool signal_on_any);
+			  u64 context, unsigned seqno);
 
 struct dma_fence_array *dma_fence_array_create(int num_fences,
 					       struct dma_fence **fences,
-					       u64 context, unsigned seqno,
-					       bool signal_on_any);
+					       u64 context, unsigned seqno);
 
 bool dma_fence_match_context(struct dma_fence *fence, u64 context);
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/2] dma-buf/dma_fence_array: optimize handling
  2026-04-22 10:30 [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Christian König
@ 2026-04-22 10:30 ` Christian König
  2026-04-22 11:37   ` Tvrtko Ursulin
  2026-04-22 10:49 ` [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Tvrtko Ursulin
  1 sibling, 1 reply; 7+ messages in thread
From: Christian König @ 2026-04-22 10:30 UTC (permalink / raw)
  To: tursulin, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig

Removing the signal on any feature allows us to simplify the dma_fence_array
code a lot and saves us from the need to install a callback on all fences
at the same time.

This results in less memory and CPU overhead.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-fence-array.c | 130 +++++++++++++-----------------
 include/linux/dma-fence-array.h   |  22 ++---
 2 files changed, 59 insertions(+), 93 deletions(-)

diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
index 5e10e8df372f..f1b4b3296c87 100644
--- a/drivers/dma-buf/dma-fence-array.c
+++ b/drivers/dma-buf/dma-fence-array.c
@@ -42,97 +42,80 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
 	cmpxchg(&array->base.error, PENDING_ERROR, 0);
 }
 
-static void irq_dma_fence_array_work(struct irq_work *wrk)
+static void dma_fence_array_cb_func(struct dma_fence *f,
+				    struct dma_fence_cb *cb)
 {
-	struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
+	struct dma_fence_array *array =
+		container_of(cb, struct dma_fence_array, callback);
 
-	dma_fence_array_clear_pending_error(array);
+	irq_work_queue(&array->work);
+}
+
+static void dma_fence_array_arm_cb(struct dma_fence_array *array)
+{
+	while (array->num_pending) {
+		struct dma_fence *f = array->fences[array->num_pending - 1];
+
+		if (!dma_fence_add_callback(f, &array->callback,
+					    dma_fence_array_cb_func))
+			return;
+
+		dma_fence_array_set_pending_error(array, f->error);
+		WRITE_ONCE(array->num_pending, array->num_pending - 1);
+	}
 
 	dma_fence_signal(&array->base);
 	dma_fence_put(&array->base);
 }
 
-static void dma_fence_array_cb_func(struct dma_fence *f,
-				    struct dma_fence_cb *cb)
+static void dma_fence_array_irq_work(struct irq_work *wrk)
 {
-	struct dma_fence_array_cb *array_cb =
-		container_of(cb, struct dma_fence_array_cb, cb);
-	struct dma_fence_array *array = array_cb->array;
-
-	dma_fence_array_set_pending_error(array, f->error);
+	struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
 
-	if (atomic_dec_and_test(&array->num_pending))
-		irq_work_queue(&array->work);
-	else
-		dma_fence_put(&array->base);
+	WRITE_ONCE(array->num_pending, array->num_pending - 1);
+	dma_fence_array_arm_cb(array);
 }
 
 static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
 {
 	struct dma_fence_array *array = to_dma_fence_array(fence);
-	struct dma_fence_array_cb *cb = array->callbacks;
-	unsigned i;
-
-	for (i = 0; i < array->num_fences; ++i) {
-		cb[i].array = array;
-		/*
-		 * As we may report that the fence is signaled before all
-		 * callbacks are complete, we need to take an additional
-		 * reference count on the array so that we do not free it too
-		 * early. The core fence handling will only hold the reference
-		 * until we signal the array as complete (but that is now
-		 * insufficient).
-		 */
-		dma_fence_get(&array->base);
-		if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
-					   dma_fence_array_cb_func)) {
-			int error = array->fences[i]->error;
-
-			dma_fence_array_set_pending_error(array, error);
-			dma_fence_put(&array->base);
-			if (atomic_dec_and_test(&array->num_pending)) {
-				dma_fence_array_clear_pending_error(array);
-				return false;
-			}
-		}
-	}
 
+	/*
+	 * As we may report that the fence is signaled before all
+	 * callbacks are complete, we need to take an additional
+	 * reference count on the array so that we do not free it too
+	 * early. The core fence handling will only hold the reference
+	 * until we signal the array as complete (but that is now
+	 * insufficient).
+	 */
+	dma_fence_get(&array->base);
+	dma_fence_array_arm_cb(array);
 	return true;
 }
 
 static bool dma_fence_array_signaled(struct dma_fence *fence)
 {
 	struct dma_fence_array *array = to_dma_fence_array(fence);
-	int num_pending;
+	int num_pending, error = 0;
 	unsigned int i;
 
 	/*
-	 * We need to read num_pending before checking the enable_signal bit
-	 * to avoid racing with the enable_signaling() implementation, which
-	 * might decrement the counter, and cause a partial check.
-	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
-	 * dma_fence_array_enable_signaling()
-	 *
-	 * The !--num_pending check is here to account for the any_signaled case
-	 * if we race with enable_signaling(), that means the !num_pending check
-	 * in the is_signalling_enabled branch might be outdated (num_pending
-	 * might have been decremented), but that's fine. The user will get the
-	 * right value when testing again later.
+	 * Reading num_pending without a memory barrier here is correct since
+	 * that is only for optimization, it is perfectly acceptable to have a
+	 * stale value for it. In all other cases num_pending is accessed by a
+	 * single call chain.
 	 */
-	num_pending = atomic_read_acquire(&array->num_pending);
-	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
-		if (num_pending <= 0)
-			goto signal;
-		return false;
-	}
+	num_pending = READ_ONCE(array->num_pending);
+	for (i = 0; i < num_pending; ++i) {
+		struct dma_fence *f = array->fences[i];
 
-	for (i = 0; i < array->num_fences; ++i) {
-		if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
-			goto signal;
-	}
-	return false;
+		if (!dma_fence_is_signaled(f))
+			return false;
 
-signal:
+		if (!error)
+			error = f->error;
+	}
+	dma_fence_array_set_pending_error(array, error);
 	dma_fence_array_clear_pending_error(array);
 	return true;
 }
@@ -171,15 +154,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
 
 /**
  * dma_fence_array_alloc - Allocate a custom fence array
- * @num_fences:		[in]	number of fences to add in the array
  *
  * Return dma fence array on success, NULL on failure
  */
-struct dma_fence_array *dma_fence_array_alloc(int num_fences)
+struct dma_fence_array *dma_fence_array_alloc(void)
 {
-	struct dma_fence_array *array;
-
-	return kzalloc_flex(*array, callbacks, num_fences);
+	return kzalloc_obj(struct dma_fence_array);
 }
 EXPORT_SYMBOL(dma_fence_array_alloc);
 
@@ -203,10 +183,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
 	WARN_ON(!num_fences || !fences);
 
 	array->num_fences = num_fences;
+	array->num_pending = num_fences;
+	array->fences = fences;
+	array->base.error = PENDING_ERROR;
 
 	dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
 		       seqno);
-	init_irq_work(&array->work, irq_dma_fence_array_work);
+	init_irq_work(&array->work, dma_fence_array_irq_work);
 
 	/*
 	 * dma_fence_array_enable_signaling() is invoked while holding
@@ -220,11 +203,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
 	 */
 	lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
 
-	atomic_set(&array->num_pending, num_fences);
-	array->fences = fences;
-
-	array->base.error = PENDING_ERROR;
-
 	/*
 	 * dma_fence_array objects should never contain any other fence
 	 * containers or otherwise we run into recursion and potential kernel
@@ -265,7 +243,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
 {
 	struct dma_fence_array *array;
 
-	array = dma_fence_array_alloc(num_fences);
+	array = dma_fence_array_alloc();
 	if (!array)
 		return NULL;
 
diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
index 1b1d87579c38..3ee55c0e2fa4 100644
--- a/include/linux/dma-fence-array.h
+++ b/include/linux/dma-fence-array.h
@@ -15,16 +15,6 @@
 #include <linux/dma-fence.h>
 #include <linux/irq_work.h>
 
-/**
- * struct dma_fence_array_cb - callback helper for fence array
- * @cb: fence callback structure for signaling
- * @array: reference to the parent fence array object
- */
-struct dma_fence_array_cb {
-	struct dma_fence_cb cb;
-	struct dma_fence_array *array;
-};
-
 /**
  * struct dma_fence_array - fence to represent an array of fences
  * @base: fence base class
@@ -33,18 +23,17 @@ struct dma_fence_array_cb {
  * @num_pending: fences in the array still pending
  * @fences: array of the fences
  * @work: internal irq_work function
- * @callbacks: array of callback helpers
+ * @callback: callback structure for signaling
  */
 struct dma_fence_array {
 	struct dma_fence base;
 
-	unsigned num_fences;
-	atomic_t num_pending;
+	unsigned int num_fences;
+	unsigned int num_pending;
 	struct dma_fence **fences;
 
 	struct irq_work work;
-
-	struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
+	struct dma_fence_cb callback;
 };
 
 /**
@@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
 	for (index = 0, fence = dma_fence_array_first(head); fence;	\
 	     ++(index), fence = dma_fence_array_next(head, index))
 
-struct dma_fence_array *dma_fence_array_alloc(int num_fences);
+struct dma_fence_array *dma_fence_array_alloc(void);
 void dma_fence_array_init(struct dma_fence_array *array,
 			  int num_fences, struct dma_fence **fences,
 			  u64 context, unsigned seqno);
-
 struct dma_fence_array *dma_fence_array_create(int num_fences,
 					       struct dma_fence **fences,
 					       u64 context, unsigned seqno);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3
  2026-04-22 10:30 [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Christian König
  2026-04-22 10:30 ` [PATCH 2/2] dma-buf/dma_fence_array: optimize handling Christian König
@ 2026-04-22 10:49 ` Tvrtko Ursulin
  2026-05-04 14:46   ` Christian König
  1 sibling, 1 reply; 7+ messages in thread
From: Tvrtko Ursulin @ 2026-04-22 10:49 UTC (permalink / raw)
  To: Christian König, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig


On 22/04/2026 11:30, Christian König wrote:
> Amdgpu was the only user of the signal on any feature and we dropped
> that use case recently, so we can remove that functionality.
> 
> v2: update num_pending only after the fence is signaled
> v3: separate out simplifying dma_fence_array implementation
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/dma-buf/dma-fence-array.c              | 13 ++++---------
>   drivers/dma-buf/dma-fence-unwrap.c             |  3 +--
>   drivers/dma-buf/dma-resv.c                     |  3 +--
>   drivers/dma-buf/st-dma-fence-unwrap.c          |  2 +-
>   drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |  3 +--
>   drivers/gpu/drm/xe/xe_sync.c                   |  2 +-
>   drivers/gpu/drm/xe/xe_vm.c                     |  4 ++--
>   include/linux/dma-fence-array.h                |  6 ++----
>   8 files changed, 13 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
> index 089f69469524..5e10e8df372f 100644
> --- a/drivers/dma-buf/dma-fence-array.c
> +++ b/drivers/dma-buf/dma-fence-array.c
> @@ -190,15 +190,13 @@ EXPORT_SYMBOL(dma_fence_array_alloc);
>    * @fences:		[in]	array containing the fences
>    * @context:		[in]	fence context to use
>    * @seqno:		[in]	sequence number to use
> - * @signal_on_any:	[in]	signal on any fence in the array
>    *
>    * Implementation of @dma_fence_array_create without allocation. Useful to init
>    * a preallocated dma fence array in the path of reclaim or dma fence signaling.
>    */
>   void dma_fence_array_init(struct dma_fence_array *array,
>   			  int num_fences, struct dma_fence **fences,
> -			  u64 context, unsigned seqno,
> -			  bool signal_on_any)
> +			  u64 context, unsigned seqno)
>   {
>   	static struct lock_class_key dma_fence_array_lock_key;
>   
> @@ -222,7 +220,7 @@ void dma_fence_array_init(struct dma_fence_array *array,
>   	 */
>   	lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>   
> -	atomic_set(&array->num_pending, signal_on_any ? 1 : num_fences);
> +	atomic_set(&array->num_pending, num_fences);
>   	array->fences = fences;
>   
>   	array->base.error = PENDING_ERROR;
> @@ -249,7 +247,6 @@ EXPORT_SYMBOL(dma_fence_array_init);
>    * @fences:		[in]	array containing the fences
>    * @context:		[in]	fence context to use
>    * @seqno:		[in]	sequence number to use
> - * @signal_on_any:	[in]	signal on any fence in the array
>    *
>    * Allocate a dma_fence_array object and initialize the base fence with
>    * dma_fence_init().
> @@ -264,8 +261,7 @@ EXPORT_SYMBOL(dma_fence_array_init);
>    */
>   struct dma_fence_array *dma_fence_array_create(int num_fences,
>   					       struct dma_fence **fences,
> -					       u64 context, unsigned seqno,
> -					       bool signal_on_any)
> +					       u64 context, unsigned seqno)
>   {
>   	struct dma_fence_array *array;
>   
> @@ -273,8 +269,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
>   	if (!array)
>   		return NULL;
>   
> -	dma_fence_array_init(array, num_fences, fences,
> -			     context, seqno, signal_on_any);
> +	dma_fence_array_init(array, num_fences, fences, context, seqno);
>   
>   	return array;
>   }
> diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c
> index 07fe9bf45aea..53bb40e70b27 100644
> --- a/drivers/dma-buf/dma-fence-unwrap.c
> +++ b/drivers/dma-buf/dma-fence-unwrap.c
> @@ -180,8 +180,7 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
>   
>   	if (count > 1) {
>   		result = dma_fence_array_create(count, array,
> -						dma_fence_context_alloc(1),
> -						1, false);
> +						dma_fence_context_alloc(1), 1);
>   		if (!result) {
>   			for (i = 0; i < count; i++)
>   				dma_fence_put(array[i]);
> diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
> index ce9e6c04897f..39a92d9f2413 100644
> --- a/drivers/dma-buf/dma-resv.c
> +++ b/drivers/dma-buf/dma-resv.c
> @@ -648,8 +648,7 @@ int dma_resv_get_singleton(struct dma_resv *obj, enum dma_resv_usage usage,
>   	}
>   
>   	array = dma_fence_array_create(count, fences,
> -				       dma_fence_context_alloc(1),
> -				       1, false);
> +				       dma_fence_context_alloc(1), 1);
>   	if (!array) {
>   		while (count--)
>   			dma_fence_put(fences[count]);
> diff --git a/drivers/dma-buf/st-dma-fence-unwrap.c b/drivers/dma-buf/st-dma-fence-unwrap.c
> index 51c87869b7b8..4e7ee25372ba 100644
> --- a/drivers/dma-buf/st-dma-fence-unwrap.c
> +++ b/drivers/dma-buf/st-dma-fence-unwrap.c
> @@ -64,7 +64,7 @@ static struct dma_fence *mock_array(unsigned int num_fences, ...)
>   
>   	array = dma_fence_array_create(num_fences, fences,
>   				       dma_fence_context_alloc(1),
> -				       1, false);
> +				       1);
>   	if (!array)
>   		goto error_free;
>   	return &array->base;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 942f4eed817f..4a1a9031f9db 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -3205,8 +3205,7 @@ eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd)
>   	fence_array = dma_fence_array_create(eb->num_batches,
>   					     fences,
>   					     eb->context->parallel.fence_context,
> -					     eb->context->parallel.seqno++,
> -					     false);
> +					     eb->context->parallel.seqno++);
>   	if (!fence_array) {
>   		kfree(fences);
>   		return ERR_PTR(-ENOMEM);
> diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
> index 24d6d9af20d6..37866768d64c 100644
> --- a/drivers/gpu/drm/xe/xe_sync.c
> +++ b/drivers/gpu/drm/xe/xe_sync.c
> @@ -376,7 +376,7 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
>   		xe_assert(vm->xe, current_fence == num_fence);
>   		cf = dma_fence_array_create(num_fence, fences,
>   					    dma_fence_context_alloc(1),
> -					    1, false);
> +					    1);
>   		if (!cf)
>   			goto err_out;
>   
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 56e2db50bb36..8f472911469d 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3370,7 +3370,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>   		goto err_trace;
>   	}
>   
> -	cf = dma_fence_array_alloc(n_fence);
> +	cf = dma_fence_array_alloc();

Patch splitting mistake here.

The rest LGTM. So with this hunk dropped:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>

Regards,

Tvrtko

>   	if (!cf) {
>   		fence = ERR_PTR(-ENOMEM);
>   		goto err_out;
> @@ -3414,7 +3414,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>   
>   	xe_assert(vm->xe, current_fence == n_fence);
>   	dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
> -			     1, false);
> +			     1);
>   	fence = &cf->base;
>   
>   	for_each_tile(tile, vm->xe, id) {
> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
> index 370b3d2bba37..1b1d87579c38 100644
> --- a/include/linux/dma-fence-array.h
> +++ b/include/linux/dma-fence-array.h
> @@ -81,13 +81,11 @@ to_dma_fence_array(struct dma_fence *fence)
>   struct dma_fence_array *dma_fence_array_alloc(int num_fences);
>   void dma_fence_array_init(struct dma_fence_array *array,
>   			  int num_fences, struct dma_fence **fences,
> -			  u64 context, unsigned seqno,
> -			  bool signal_on_any);
> +			  u64 context, unsigned seqno);
>   
>   struct dma_fence_array *dma_fence_array_create(int num_fences,
>   					       struct dma_fence **fences,
> -					       u64 context, unsigned seqno,
> -					       bool signal_on_any);
> +					       u64 context, unsigned seqno);
>   
>   bool dma_fence_match_context(struct dma_fence *fence, u64 context);
>   


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] dma-buf/dma_fence_array: optimize handling
  2026-04-22 10:30 ` [PATCH 2/2] dma-buf/dma_fence_array: optimize handling Christian König
@ 2026-04-22 11:37   ` Tvrtko Ursulin
  2026-05-04 14:55     ` Christian König
  0 siblings, 1 reply; 7+ messages in thread
From: Tvrtko Ursulin @ 2026-04-22 11:37 UTC (permalink / raw)
  To: Christian König, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig


On 22/04/2026 11:30, Christian König wrote:
> Removing the signal on any feature allows us to simplify the dma_fence_array
> code a lot and saves us from the need to install a callback on all fences
> at the same time.
> 
> This results in less memory and CPU overhead.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/dma-buf/dma-fence-array.c | 130 +++++++++++++-----------------
>   include/linux/dma-fence-array.h   |  22 ++---
>   2 files changed, 59 insertions(+), 93 deletions(-)
> 
> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
> index 5e10e8df372f..f1b4b3296c87 100644
> --- a/drivers/dma-buf/dma-fence-array.c
> +++ b/drivers/dma-buf/dma-fence-array.c
> @@ -42,97 +42,80 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
>   	cmpxchg(&array->base.error, PENDING_ERROR, 0);
>   }
>   
> -static void irq_dma_fence_array_work(struct irq_work *wrk)
> +static void dma_fence_array_cb_func(struct dma_fence *f,
> +				    struct dma_fence_cb *cb)
>   {
> -	struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
> +	struct dma_fence_array *array =
> +		container_of(cb, struct dma_fence_array, callback);
>   
> -	dma_fence_array_clear_pending_error(array);
> +	irq_work_queue(&array->work);
> +}
> +
> +static void dma_fence_array_arm_cb(struct dma_fence_array *array)
> +{
> +	while (array->num_pending) {
> +		struct dma_fence *f = array->fences[array->num_pending - 1];
> +
> +		if (!dma_fence_add_callback(f, &array->callback,
> +					    dma_fence_array_cb_func))
> +			return;
> +
> +		dma_fence_array_set_pending_error(array, f->error);
> +		WRITE_ONCE(array->num_pending, array->num_pending - 1);

Do you think the WRITE_ONCEs are needed? As the loop will restart with 
un-annotated read anyway, but not just that, I don't think it can be 
compiled away in the kernel with this usage pattern. Maybe I am mistaken.

> +	}
>   
>   	dma_fence_signal(&array->base);
>   	dma_fence_put(&array->base);
>   }
>   
> -static void dma_fence_array_cb_func(struct dma_fence *f,
> -				    struct dma_fence_cb *cb)
> +static void dma_fence_array_irq_work(struct irq_work *wrk)
>   {
> -	struct dma_fence_array_cb *array_cb =
> -		container_of(cb, struct dma_fence_array_cb, cb);
> -	struct dma_fence_array *array = array_cb->array;
> -
> -	dma_fence_array_set_pending_error(array, f->error);
> +	struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>   
> -	if (atomic_dec_and_test(&array->num_pending))
> -		irq_work_queue(&array->work);
> -	else
> -		dma_fence_put(&array->base);
> +	WRITE_ONCE(array->num_pending, array->num_pending - 1);
> +	dma_fence_array_arm_cb(array);

So for x86 going from one irqwork latency to num_fences latencies is 
probably passable but I am not sure how other architectures fare.

>   }
>   
>   static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
>   {
>   	struct dma_fence_array *array = to_dma_fence_array(fence);
> -	struct dma_fence_array_cb *cb = array->callbacks;
> -	unsigned i;
> -
> -	for (i = 0; i < array->num_fences; ++i) {
> -		cb[i].array = array;
> -		/*
> -		 * As we may report that the fence is signaled before all
> -		 * callbacks are complete, we need to take an additional
> -		 * reference count on the array so that we do not free it too
> -		 * early. The core fence handling will only hold the reference
> -		 * until we signal the array as complete (but that is now
> -		 * insufficient).
> -		 */
> -		dma_fence_get(&array->base);
> -		if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
> -					   dma_fence_array_cb_func)) {
> -			int error = array->fences[i]->error;
> -
> -			dma_fence_array_set_pending_error(array, error);
> -			dma_fence_put(&array->base);
> -			if (atomic_dec_and_test(&array->num_pending)) {
> -				dma_fence_array_clear_pending_error(array);
> -				return false;
> -			}
> -		}
> -	}
>   
> +	/*
> +	 * As we may report that the fence is signaled before all
> +	 * callbacks are complete, we need to take an additional
> +	 * reference count on the array so that we do not free it too
> +	 * early. The core fence handling will only hold the reference
> +	 * until we signal the array as complete (but that is now
> +	 * insufficient).
> +	 */
> +	dma_fence_get(&array->base);
> +	dma_fence_array_arm_cb(array);
>   	return true;

Are you sure it is safe to always return true?

Regards,

Tvrtko

>   }
>   
>   static bool dma_fence_array_signaled(struct dma_fence *fence)
>   {
>   	struct dma_fence_array *array = to_dma_fence_array(fence);
> -	int num_pending;
> +	int num_pending, error = 0;
>   	unsigned int i;
>   
>   	/*
> -	 * We need to read num_pending before checking the enable_signal bit
> -	 * to avoid racing with the enable_signaling() implementation, which
> -	 * might decrement the counter, and cause a partial check.
> -	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
> -	 * dma_fence_array_enable_signaling()
> -	 *
> -	 * The !--num_pending check is here to account for the any_signaled case
> -	 * if we race with enable_signaling(), that means the !num_pending check
> -	 * in the is_signalling_enabled branch might be outdated (num_pending
> -	 * might have been decremented), but that's fine. The user will get the
> -	 * right value when testing again later.
> +	 * Reading num_pending without a memory barrier here is correct since
> +	 * that is only for optimization, it is perfectly acceptable to have a
> +	 * stale value for it. In all other cases num_pending is accessed by a
> +	 * single call chain.
>   	 */
> -	num_pending = atomic_read_acquire(&array->num_pending);
> -	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
> -		if (num_pending <= 0)
> -			goto signal;
> -		return false;
> -	}
> +	num_pending = READ_ONCE(array->num_pending);
> +	for (i = 0; i < num_pending; ++i) {
> +		struct dma_fence *f = array->fences[i];
>   
> -	for (i = 0; i < array->num_fences; ++i) {
> -		if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
> -			goto signal;
> -	}
> -	return false;
> +		if (!dma_fence_is_signaled(f))
> +			return false;
>   
> -signal:
> +		if (!error)
> +			error = f->error;
> +	}
> +	dma_fence_array_set_pending_error(array, error);
>   	dma_fence_array_clear_pending_error(array);
>   	return true;
>   }
> @@ -171,15 +154,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
>   
>   /**
>    * dma_fence_array_alloc - Allocate a custom fence array
> - * @num_fences:		[in]	number of fences to add in the array
>    *
>    * Return dma fence array on success, NULL on failure
>    */
> -struct dma_fence_array *dma_fence_array_alloc(int num_fences)
> +struct dma_fence_array *dma_fence_array_alloc(void)
>   {
> -	struct dma_fence_array *array;
> -
> -	return kzalloc_flex(*array, callbacks, num_fences);
> +	return kzalloc_obj(struct dma_fence_array);
>   }
>   EXPORT_SYMBOL(dma_fence_array_alloc);
>   
> @@ -203,10 +183,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
>   	WARN_ON(!num_fences || !fences);
>   
>   	array->num_fences = num_fences;
> +	array->num_pending = num_fences;
> +	array->fences = fences;
> +	array->base.error = PENDING_ERROR;
>   
>   	dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
>   		       seqno);
> -	init_irq_work(&array->work, irq_dma_fence_array_work);
> +	init_irq_work(&array->work, dma_fence_array_irq_work);
>   
>   	/*
>   	 * dma_fence_array_enable_signaling() is invoked while holding
> @@ -220,11 +203,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
>   	 */
>   	lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>   
> -	atomic_set(&array->num_pending, num_fences);
> -	array->fences = fences;
> -
> -	array->base.error = PENDING_ERROR;
> -
>   	/*
>   	 * dma_fence_array objects should never contain any other fence
>   	 * containers or otherwise we run into recursion and potential kernel
> @@ -265,7 +243,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
>   {
>   	struct dma_fence_array *array;
>   
> -	array = dma_fence_array_alloc(num_fences);
> +	array = dma_fence_array_alloc();
>   	if (!array)
>   		return NULL;
>   
> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
> index 1b1d87579c38..3ee55c0e2fa4 100644
> --- a/include/linux/dma-fence-array.h
> +++ b/include/linux/dma-fence-array.h
> @@ -15,16 +15,6 @@
>   #include <linux/dma-fence.h>
>   #include <linux/irq_work.h>
>   
> -/**
> - * struct dma_fence_array_cb - callback helper for fence array
> - * @cb: fence callback structure for signaling
> - * @array: reference to the parent fence array object
> - */
> -struct dma_fence_array_cb {
> -	struct dma_fence_cb cb;
> -	struct dma_fence_array *array;
> -};
> -
>   /**
>    * struct dma_fence_array - fence to represent an array of fences
>    * @base: fence base class
> @@ -33,18 +23,17 @@ struct dma_fence_array_cb {
>    * @num_pending: fences in the array still pending
>    * @fences: array of the fences
>    * @work: internal irq_work function
> - * @callbacks: array of callback helpers
> + * @callback: callback structure for signaling
>    */
>   struct dma_fence_array {
>   	struct dma_fence base;
>   
> -	unsigned num_fences;
> -	atomic_t num_pending;
> +	unsigned int num_fences;
> +	unsigned int num_pending;
>   	struct dma_fence **fences;
>   
>   	struct irq_work work;
> -
> -	struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
> +	struct dma_fence_cb callback;
>   };
>   
>   /**
> @@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
>   	for (index = 0, fence = dma_fence_array_first(head); fence;	\
>   	     ++(index), fence = dma_fence_array_next(head, index))
>   
> -struct dma_fence_array *dma_fence_array_alloc(int num_fences);
> +struct dma_fence_array *dma_fence_array_alloc(void);
>   void dma_fence_array_init(struct dma_fence_array *array,
>   			  int num_fences, struct dma_fence **fences,
>   			  u64 context, unsigned seqno);
> -
>   struct dma_fence_array *dma_fence_array_create(int num_fences,
>   					       struct dma_fence **fences,
>   					       u64 context, unsigned seqno);


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3
  2026-04-22 10:49 ` [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Tvrtko Ursulin
@ 2026-05-04 14:46   ` Christian König
  0 siblings, 0 replies; 7+ messages in thread
From: Christian König @ 2026-05-04 14:46 UTC (permalink / raw)
  To: Tvrtko Ursulin, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig

On 4/22/26 12:49, Tvrtko Ursulin wrote:
> 
> On 22/04/2026 11:30, Christian König wrote:
...
>>   diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
>> index 56e2db50bb36..8f472911469d 100644
>> --- a/drivers/gpu/drm/xe/xe_vm.c
>> +++ b/drivers/gpu/drm/xe/xe_vm.c
>> @@ -3370,7 +3370,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>>           goto err_trace;
>>       }
>>   -    cf = dma_fence_array_alloc(n_fence);
>> +    cf = dma_fence_array_alloc();
> 
> Patch splitting mistake here.

Ah, thanks for pointing this out.

> The rest LGTM. So with this hunk dropped:
> 
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>

I double checked the patch once more, compile tested it and then pushed the result.

Thanks,
Christian.

> 
> Regards,
> 
> Tvrtko
> 
>>       if (!cf) {
>>           fence = ERR_PTR(-ENOMEM);
>>           goto err_out;
>> @@ -3414,7 +3414,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>>         xe_assert(vm->xe, current_fence == n_fence);
>>       dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
>> -                 1, false);
>> +                 1);
>>       fence = &cf->base;
>>         for_each_tile(tile, vm->xe, id) {
>> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
>> index 370b3d2bba37..1b1d87579c38 100644
>> --- a/include/linux/dma-fence-array.h
>> +++ b/include/linux/dma-fence-array.h
>> @@ -81,13 +81,11 @@ to_dma_fence_array(struct dma_fence *fence)
>>   struct dma_fence_array *dma_fence_array_alloc(int num_fences);
>>   void dma_fence_array_init(struct dma_fence_array *array,
>>                 int num_fences, struct dma_fence **fences,
>> -              u64 context, unsigned seqno,
>> -              bool signal_on_any);
>> +              u64 context, unsigned seqno);
>>     struct dma_fence_array *dma_fence_array_create(int num_fences,
>>                              struct dma_fence **fences,
>> -                           u64 context, unsigned seqno,
>> -                           bool signal_on_any);
>> +                           u64 context, unsigned seqno);
>>     bool dma_fence_match_context(struct dma_fence *fence, u64 context);
>>   
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] dma-buf/dma_fence_array: optimize handling
  2026-04-22 11:37   ` Tvrtko Ursulin
@ 2026-05-04 14:55     ` Christian König
  2026-05-04 15:55       ` Tvrtko Ursulin
  0 siblings, 1 reply; 7+ messages in thread
From: Christian König @ 2026-05-04 14:55 UTC (permalink / raw)
  To: Tvrtko Ursulin, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig

On 4/22/26 13:37, Tvrtko Ursulin wrote:
> 
> On 22/04/2026 11:30, Christian König wrote:
>> Removing the signal on any feature allows to simplify the dma_fence_array
>> code a lot and saves us from the need to install a callback on all fences
>> at the same time.
>>
>> This results in less memory and CPU overhead.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/dma-buf/dma-fence-array.c | 130 +++++++++++++-----------------
>>   include/linux/dma-fence-array.h   |  22 ++---
>>   2 files changed, 59 insertions(+), 93 deletions(-)
>>
>> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
>> index 5e10e8df372f..f1b4b3296c87 100644
>> --- a/drivers/dma-buf/dma-fence-array.c
>> +++ b/drivers/dma-buf/dma-fence-array.c
>> @@ -42,97 +42,80 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
>>       cmpxchg(&array->base.error, PENDING_ERROR, 0);
>>   }
>>   -static void irq_dma_fence_array_work(struct irq_work *wrk)
>> +static void dma_fence_array_cb_func(struct dma_fence *f,
>> +                    struct dma_fence_cb *cb)
>>   {
>> -    struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>> +    struct dma_fence_array *array =
>> +        container_of(cb, struct dma_fence_array, callback);
>>   -    dma_fence_array_clear_pending_error(array);
>> +    irq_work_queue(&array->work);
>> +}
>> +
>> +static void dma_fence_array_arm_cb(struct dma_fence_array *array)
>> +{
>> +    while (array->num_pending) {
>> +        struct dma_fence *f = array->fences[array->num_pending - 1];
>> +
>> +        if (!dma_fence_add_callback(f, &array->callback,
>> +                        dma_fence_array_cb_func))
>> +            return;
>> +
>> +        dma_fence_array_set_pending_error(array, f->error);
>> +        WRITE_ONCE(array->num_pending, array->num_pending - 1);
> 
> Do you think the WRITE_ONCEs are needed? As the loop will restart with un-annotated read anyway, but not just that, I don't think it can be compiled away in the kernel with this usage pattern. Maybe I am mistaken.

I also think the WRITE_ONCEs are superfluous. But I wanted to be on the safe side, so that the compiler doesn't re-order the write before the dma_fence_add_callback(), and it shouldn't matter for performance.

> 
>> +    }
>>         dma_fence_signal(&array->base);
>>       dma_fence_put(&array->base);
>>   }
>>   -static void dma_fence_array_cb_func(struct dma_fence *f,
>> -                    struct dma_fence_cb *cb)
>> +static void dma_fence_array_irq_work(struct irq_work *wrk)
>>   {
>> -    struct dma_fence_array_cb *array_cb =
>> -        container_of(cb, struct dma_fence_array_cb, cb);
>> -    struct dma_fence_array *array = array_cb->array;
>> -
>> -    dma_fence_array_set_pending_error(array, f->error);
>> +    struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>>   -    if (atomic_dec_and_test(&array->num_pending))
>> -        irq_work_queue(&array->work);
>> -    else
>> -        dma_fence_put(&array->base);
>> +    WRITE_ONCE(array->num_pending, array->num_pending - 1);
>> +    dma_fence_array_arm_cb(array);
> 
> So for x86 going from one irqwork latency to num_fences latencies is probably passable but I am not sure how other architectures fare.

Mhm, what do you mean?

Previously we started one irqwork handler for each not signaled fence when enable_signaling was called, but now we reduce that to only starting one for each not signaled fence when the previous fence has finished.

As far as I can see that is always better or at least the same overhead.

> 
>>   }
>>     static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
>>   {
>>       struct dma_fence_array *array = to_dma_fence_array(fence);
>> -    struct dma_fence_array_cb *cb = array->callbacks;
>> -    unsigned i;
>> -
>> -    for (i = 0; i < array->num_fences; ++i) {
>> -        cb[i].array = array;
>> -        /*
>> -         * As we may report that the fence is signaled before all
>> -         * callbacks are complete, we need to take an additional
>> -         * reference count on the array so that we do not free it too
>> -         * early. The core fence handling will only hold the reference
>> -         * until we signal the array as complete (but that is now
>> -         * insufficient).
>> -         */
>> -        dma_fence_get(&array->base);
>> -        if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
>> -                       dma_fence_array_cb_func)) {
>> -            int error = array->fences[i]->error;
>> -
>> -            dma_fence_array_set_pending_error(array, error);
>> -            dma_fence_put(&array->base);
>> -            if (atomic_dec_and_test(&array->num_pending)) {
>> -                dma_fence_array_clear_pending_error(array);
>> -                return false;
>> -            }
>> -        }
>> -    }
>>   +    /*
>> +     * As we may report that the fence is signaled before all
>> +     * callbacks are complete, we need to take an additional
>> +     * reference count on the array so that we do not free it too
>> +     * early. The core fence handling will only hold the reference
>> +     * until we signal the array as complete (but that is now
>> +     * insufficient).
>> +     */
>> +    dma_fence_get(&array->base);
>> +    dma_fence_array_arm_cb(array);
>>       return true;
> 
> Are you sure it is safe to always return true?

Oh, good point!

It is safe to return true here, but it is not safe to call dma_fence_array_arm_cb() because that could signal the fence and result in double locking!

Going to fix that, thanks.

Regards,
Christian.

> 
> Regards,
> 
> Tvrtko
> 
>>   }
>>     static bool dma_fence_array_signaled(struct dma_fence *fence)
>>   {
>>       struct dma_fence_array *array = to_dma_fence_array(fence);
>> -    int num_pending;
>> +    int num_pending, error = 0;
>>       unsigned int i;
>>         /*
>> -     * We need to read num_pending before checking the enable_signal bit
>> -     * to avoid racing with the enable_signaling() implementation, which
>> -     * might decrement the counter, and cause a partial check.
>> -     * atomic_read_acquire() pairs with atomic_dec_and_test() in
>> -     * dma_fence_array_enable_signaling()
>> -     *
>> -     * The !--num_pending check is here to account for the any_signaled case
>> -     * if we race with enable_signaling(), that means the !num_pending check
>> -     * in the is_signalling_enabled branch might be outdated (num_pending
>> -     * might have been decremented), but that's fine. The user will get the
>> -     * right value when testing again later.
>> +     * Reading num_pending without a memory barrier here is correct since
>> +     * that is only for optimization, it is perfectly acceptable to have a
>> +     * stale value for it. In all other cases num_pending is accessed by a
>> +     * single call chain.
>>        */
>> -    num_pending = atomic_read_acquire(&array->num_pending);
>> -    if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
>> -        if (num_pending <= 0)
>> -            goto signal;
>> -        return false;
>> -    }
>> +    num_pending = READ_ONCE(array->num_pending);
>> +    for (i = 0; i < num_pending; ++i) {
>> +        struct dma_fence *f = array->fences[i];
>>   -    for (i = 0; i < array->num_fences; ++i) {
>> -        if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
>> -            goto signal;
>> -    }
>> -    return false;
>> +        if (!dma_fence_is_signaled(f))
>> +            return false;
>>   -signal:
>> +        if (!error)
>> +            error = f->error;
>> +    }
>> +    dma_fence_array_set_pending_error(array, error);
>>       dma_fence_array_clear_pending_error(array);
>>       return true;
>>   }
>> @@ -171,15 +154,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
>>     /**
>>    * dma_fence_array_alloc - Allocate a custom fence array
>> - * @num_fences:        [in]    number of fences to add in the array
>>    *
>>    * Return dma fence array on success, NULL on failure
>>    */
>> -struct dma_fence_array *dma_fence_array_alloc(int num_fences)
>> +struct dma_fence_array *dma_fence_array_alloc(void)
>>   {
>> -    struct dma_fence_array *array;
>> -
>> -    return kzalloc_flex(*array, callbacks, num_fences);
>> +    return kzalloc_obj(struct dma_fence_array);
>>   }
>>   EXPORT_SYMBOL(dma_fence_array_alloc);
>>   @@ -203,10 +183,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
>>       WARN_ON(!num_fences || !fences);
>>         array->num_fences = num_fences;
>> +    array->num_pending = num_fences;
>> +    array->fences = fences;
>> +    array->base.error = PENDING_ERROR;
>>         dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
>>                  seqno);
>> -    init_irq_work(&array->work, irq_dma_fence_array_work);
>> +    init_irq_work(&array->work, dma_fence_array_irq_work);
>>         /*
>>        * dma_fence_array_enable_signaling() is invoked while holding
>> @@ -220,11 +203,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
>>        */
>>       lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>>   -    atomic_set(&array->num_pending, num_fences);
>> -    array->fences = fences;
>> -
>> -    array->base.error = PENDING_ERROR;
>> -
>>       /*
>>        * dma_fence_array objects should never contain any other fence
>>        * containers or otherwise we run into recursion and potential kernel
>> @@ -265,7 +243,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
>>   {
>>       struct dma_fence_array *array;
>>   -    array = dma_fence_array_alloc(num_fences);
>> +    array = dma_fence_array_alloc();
>>       if (!array)
>>           return NULL;
>>   diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
>> index 1b1d87579c38..3ee55c0e2fa4 100644
>> --- a/include/linux/dma-fence-array.h
>> +++ b/include/linux/dma-fence-array.h
>> @@ -15,16 +15,6 @@
>>   #include <linux/dma-fence.h>
>>   #include <linux/irq_work.h>
>>   -/**
>> - * struct dma_fence_array_cb - callback helper for fence array
>> - * @cb: fence callback structure for signaling
>> - * @array: reference to the parent fence array object
>> - */
>> -struct dma_fence_array_cb {
>> -    struct dma_fence_cb cb;
>> -    struct dma_fence_array *array;
>> -};
>> -
>>   /**
>>    * struct dma_fence_array - fence to represent an array of fences
>>    * @base: fence base class
>> @@ -33,18 +23,17 @@ struct dma_fence_array_cb {
>>    * @num_pending: fences in the array still pending
>>    * @fences: array of the fences
>>    * @work: internal irq_work function
>> - * @callbacks: array of callback helpers
>> + * @callback: callback structure for signaling
>>    */
>>   struct dma_fence_array {
>>       struct dma_fence base;
>>   -    unsigned num_fences;
>> -    atomic_t num_pending;
>> +    unsigned int num_fences;
>> +    unsigned int num_pending;
>>       struct dma_fence **fences;
>>         struct irq_work work;
>> -
>> -    struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
>> +    struct dma_fence_cb callback;
>>   };
>>     /**
>> @@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
>>       for (index = 0, fence = dma_fence_array_first(head); fence;    \
>>            ++(index), fence = dma_fence_array_next(head, index))
>>   -struct dma_fence_array *dma_fence_array_alloc(int num_fences);
>> +struct dma_fence_array *dma_fence_array_alloc(void);
>>   void dma_fence_array_init(struct dma_fence_array *array,
>>                 int num_fences, struct dma_fence **fences,
>>                 u64 context, unsigned seqno);
>> -
>>   struct dma_fence_array *dma_fence_array_create(int num_fences,
>>                              struct dma_fence **fences,
>>                              u64 context, unsigned seqno);
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] dma-buf/dma_fence_array: optimize handling
  2026-05-04 14:55     ` Christian König
@ 2026-05-04 15:55       ` Tvrtko Ursulin
  0 siblings, 0 replies; 7+ messages in thread
From: Tvrtko Ursulin @ 2026-05-04 15:55 UTC (permalink / raw)
  To: Christian König, sumit.semwal; +Cc: dri-devel, linux-media, linaro-mm-sig


On 04/05/2026 15:55, Christian König wrote:
> On 4/22/26 13:37, Tvrtko Ursulin wrote:
>>
>> On 22/04/2026 11:30, Christian König wrote:
>>> Removing the signal on any feature allows to simplify the dma_fence_array
>>> code a lot and saves us from the need to install a callback on all fences
>>> at the same time.
>>>
>>> This results in less memory and CPU overhead.
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> ---
>>>    drivers/dma-buf/dma-fence-array.c | 130 +++++++++++++-----------------
>>>    include/linux/dma-fence-array.h   |  22 ++---
>>>    2 files changed, 59 insertions(+), 93 deletions(-)
>>>
>>> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
>>> index 5e10e8df372f..f1b4b3296c87 100644
>>> --- a/drivers/dma-buf/dma-fence-array.c
>>> +++ b/drivers/dma-buf/dma-fence-array.c
>>> @@ -42,97 +42,80 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
>>>        cmpxchg(&array->base.error, PENDING_ERROR, 0);
>>>    }
>>>    -static void irq_dma_fence_array_work(struct irq_work *wrk)
>>> +static void dma_fence_array_cb_func(struct dma_fence *f,
>>> +                    struct dma_fence_cb *cb)
>>>    {
>>> -    struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>>> +    struct dma_fence_array *array =
>>> +        container_of(cb, struct dma_fence_array, callback);
>>>    -    dma_fence_array_clear_pending_error(array);
>>> +    irq_work_queue(&array->work);
>>> +}
>>> +
>>> +static void dma_fence_array_arm_cb(struct dma_fence_array *array)
>>> +{
>>> +    while (array->num_pending) {
>>> +        struct dma_fence *f = array->fences[array->num_pending - 1];
>>> +
>>> +        if (!dma_fence_add_callback(f, &array->callback,
>>> +                        dma_fence_array_cb_func))
>>> +            return;
>>> +
>>> +        dma_fence_array_set_pending_error(array, f->error);
>>> +        WRITE_ONCE(array->num_pending, array->num_pending - 1);
>>
>> Do you think the WRITE_ONCEs are needed? As the loop will restart with un-annotated read anyway, but not just that, I don't think it can be compiled away in the kernel with this usage pattern. Maybe I am mistaken.
> 
> I also think the WRITE_ONCEs are superfluous. But I wanted to be on the save side, not that the compiler re-orders the write before the dma_fence_add_callback() and it shouldn't matter for performance.

Hm okay, but I am on the fence here; if they stay they may mislead the 
reader into thinking they are required.

> 
>>
>>> +    }
>>>          dma_fence_signal(&array->base);
>>>        dma_fence_put(&array->base);
>>>    }
>>>    -static void dma_fence_array_cb_func(struct dma_fence *f,
>>> -                    struct dma_fence_cb *cb)
>>> +static void dma_fence_array_irq_work(struct irq_work *wrk)
>>>    {
>>> -    struct dma_fence_array_cb *array_cb =
>>> -        container_of(cb, struct dma_fence_array_cb, cb);
>>> -    struct dma_fence_array *array = array_cb->array;
>>> -
>>> -    dma_fence_array_set_pending_error(array, f->error);
>>> +    struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>>>    -    if (atomic_dec_and_test(&array->num_pending))
>>> -        irq_work_queue(&array->work);
>>> -    else
>>> -        dma_fence_put(&array->base);
>>> +    WRITE_ONCE(array->num_pending, array->num_pending - 1);
>>> +    dma_fence_array_arm_cb(array);
>>
>> So for x86 going from one irqwork latency to num_fences latencies is probably passable but I am not sure how other architectures fare.
> 
> Mhm, what do you mean?
> 
> Previously we started one irqwork handler for each not signaled fence when enable_signaling was called, but now we reduce that to only starting one for each not signaled fence when the previous fence has finished.
> 
> As far as I can see that is always better or at least the same overhead.

Currently irq work latency is once per array:

static void dma_fence_array_cb_func(struct dma_fence *f,
				    struct dma_fence_cb *cb)
{
...

	if (atomic_dec_and_test(&array->num_pending))
		irq_work_queue(&array->work);

So only when last one signals.

With this patch, AFAIR from when I reviewed it because the diff is a bit 
difficult to read so I applied it, it is a "chain" of irq work and 
callbacks. One by one as you say - irq work installs the callback on the 
next fence, which when signalled queues irq work.

Regards,

Tvrtko

>>
>>>    }
>>>      static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
>>>    {
>>>        struct dma_fence_array *array = to_dma_fence_array(fence);
>>> -    struct dma_fence_array_cb *cb = array->callbacks;
>>> -    unsigned i;
>>> -
>>> -    for (i = 0; i < array->num_fences; ++i) {
>>> -        cb[i].array = array;
>>> -        /*
>>> -         * As we may report that the fence is signaled before all
>>> -         * callbacks are complete, we need to take an additional
>>> -         * reference count on the array so that we do not free it too
>>> -         * early. The core fence handling will only hold the reference
>>> -         * until we signal the array as complete (but that is now
>>> -         * insufficient).
>>> -         */
>>> -        dma_fence_get(&array->base);
>>> -        if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
>>> -                       dma_fence_array_cb_func)) {
>>> -            int error = array->fences[i]->error;
>>> -
>>> -            dma_fence_array_set_pending_error(array, error);
>>> -            dma_fence_put(&array->base);
>>> -            if (atomic_dec_and_test(&array->num_pending)) {
>>> -                dma_fence_array_clear_pending_error(array);
>>> -                return false;
>>> -            }
>>> -        }
>>> -    }
>>>    +    /*
>>> +     * As we may report that the fence is signaled before all
>>> +     * callbacks are complete, we need to take an additional
>>> +     * reference count on the array so that we do not free it too
>>> +     * early. The core fence handling will only hold the reference
>>> +     * until we signal the array as complete (but that is now
>>> +     * insufficient).
>>> +     */
>>> +    dma_fence_get(&array->base);
>>> +    dma_fence_array_arm_cb(array);
>>>        return true;
>>
>> Are you sure it is safe to always return true?
> 
> Oh, good point!
> 
> It is safe to return true here, but it is not save to call dma_fence_array_arm_cb() because that could signal the fence and result in double locking!
> 
> Going to fix that, thanks.
> 
> Regards,
> Christian.
> 
>>
>> Regards,
>>
>> Tvrtko
>>
>>>    }
>>>      static bool dma_fence_array_signaled(struct dma_fence *fence)
>>>    {
>>>        struct dma_fence_array *array = to_dma_fence_array(fence);
>>> -    int num_pending;
>>> +    int num_pending, error = 0;
>>>        unsigned int i;
>>>          /*
>>> -     * We need to read num_pending before checking the enable_signal bit
>>> -     * to avoid racing with the enable_signaling() implementation, which
>>> -     * might decrement the counter, and cause a partial check.
>>> -     * atomic_read_acquire() pairs with atomic_dec_and_test() in
>>> -     * dma_fence_array_enable_signaling()
>>> -     *
>>> -     * The !--num_pending check is here to account for the any_signaled case
>>> -     * if we race with enable_signaling(), that means the !num_pending check
>>> -     * in the is_signalling_enabled branch might be outdated (num_pending
>>> -     * might have been decremented), but that's fine. The user will get the
>>> -     * right value when testing again later.
>>> +     * Reading num_pending without a memory barrier here is correct since
>>> +     * that is only for optimization, it is perfectly acceptable to have a
>>> +     * stale value for it. In all other cases num_pending is accessed by a
>>> +     * single call chain.
>>>         */
>>> -    num_pending = atomic_read_acquire(&array->num_pending);
>>> -    if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
>>> -        if (num_pending <= 0)
>>> -            goto signal;
>>> -        return false;
>>> -    }
>>> +    num_pending = READ_ONCE(array->num_pending);
>>> +    for (i = 0; i < num_pending; ++i) {
>>> +        struct dma_fence *f = array->fences[i];
>>>    -    for (i = 0; i < array->num_fences; ++i) {
>>> -        if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
>>> -            goto signal;
>>> -    }
>>> -    return false;
>>> +        if (!dma_fence_is_signaled(f))
>>> +            return false;
>>>    -signal:
>>> +        if (!error)
>>> +            error = f->error;
>>> +    }
>>> +    dma_fence_array_set_pending_error(array, error);
>>>        dma_fence_array_clear_pending_error(array);
>>>        return true;
>>>    }
>>> @@ -171,15 +154,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
>>>      /**
>>>     * dma_fence_array_alloc - Allocate a custom fence array
>>> - * @num_fences:        [in]    number of fences to add in the array
>>>     *
>>>     * Return dma fence array on success, NULL on failure
>>>     */
>>> -struct dma_fence_array *dma_fence_array_alloc(int num_fences)
>>> +struct dma_fence_array *dma_fence_array_alloc(void)
>>>    {
>>> -    struct dma_fence_array *array;
>>> -
>>> -    return kzalloc_flex(*array, callbacks, num_fences);
>>> +    return kzalloc_obj(struct dma_fence_array);
>>>    }
>>>    EXPORT_SYMBOL(dma_fence_array_alloc);
>>>    @@ -203,10 +183,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
>>>        WARN_ON(!num_fences || !fences);
>>>          array->num_fences = num_fences;
>>> +    array->num_pending = num_fences;
>>> +    array->fences = fences;
>>> +    array->base.error = PENDING_ERROR;
>>>          dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
>>>                   seqno);
>>> -    init_irq_work(&array->work, irq_dma_fence_array_work);
>>> +    init_irq_work(&array->work, dma_fence_array_irq_work);
>>>          /*
>>>         * dma_fence_array_enable_signaling() is invoked while holding
>>> @@ -220,11 +203,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
>>>         */
>>>        lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>>>    -    atomic_set(&array->num_pending, num_fences);
>>> -    array->fences = fences;
>>> -
>>> -    array->base.error = PENDING_ERROR;
>>> -
>>>        /*
>>>         * dma_fence_array objects should never contain any other fence
>>>         * containers or otherwise we run into recursion and potential kernel
>>> @@ -265,7 +243,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
>>>    {
>>>        struct dma_fence_array *array;
>>>    -    array = dma_fence_array_alloc(num_fences);
>>> +    array = dma_fence_array_alloc();
>>>        if (!array)
>>>            return NULL;
>>>    diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
>>> index 1b1d87579c38..3ee55c0e2fa4 100644
>>> --- a/include/linux/dma-fence-array.h
>>> +++ b/include/linux/dma-fence-array.h
>>> @@ -15,16 +15,6 @@
>>>    #include <linux/dma-fence.h>
>>>    #include <linux/irq_work.h>
>>>    -/**
>>> - * struct dma_fence_array_cb - callback helper for fence array
>>> - * @cb: fence callback structure for signaling
>>> - * @array: reference to the parent fence array object
>>> - */
>>> -struct dma_fence_array_cb {
>>> -    struct dma_fence_cb cb;
>>> -    struct dma_fence_array *array;
>>> -};
>>> -
>>>    /**
>>>     * struct dma_fence_array - fence to represent an array of fences
>>>     * @base: fence base class
>>> @@ -33,18 +23,17 @@ struct dma_fence_array_cb {
>>>     * @num_pending: fences in the array still pending
>>>     * @fences: array of the fences
>>>     * @work: internal irq_work function
>>> - * @callbacks: array of callback helpers
>>> + * @callback: callback structure for signaling
>>>     */
>>>    struct dma_fence_array {
>>>        struct dma_fence base;
>>>    -    unsigned num_fences;
>>> -    atomic_t num_pending;
>>> +    unsigned int num_fences;
>>> +    unsigned int num_pending;
>>>        struct dma_fence **fences;
>>>          struct irq_work work;
>>> -
>>> -    struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
>>> +    struct dma_fence_cb callback;
>>>    };
>>>      /**
>>> @@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
>>>        for (index = 0, fence = dma_fence_array_first(head); fence;    \
>>>             ++(index), fence = dma_fence_array_next(head, index))
>>>    -struct dma_fence_array *dma_fence_array_alloc(int num_fences);
>>> +struct dma_fence_array *dma_fence_array_alloc(void);
>>>    void dma_fence_array_init(struct dma_fence_array *array,
>>>                  int num_fences, struct dma_fence **fences,
>>>                  u64 context, unsigned seqno);
>>> -
>>>    struct dma_fence_array *dma_fence_array_create(int num_fences,
>>>                               struct dma_fence **fences,
>>>                               u64 context, unsigned seqno);
>>
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2026-05-04 15:55 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-22 10:30 [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Christian König
2026-04-22 10:30 ` [PATCH 2/2] dma-buf/dma_fence_array: optimize handling Christian König
2026-04-22 11:37   ` Tvrtko Ursulin
2026-05-04 14:55     ` Christian König
2026-05-04 15:55       ` Tvrtko Ursulin
2026-04-22 10:49 ` [PATCH 1/2] dma-buf/dma_fence_array: remove unused functionality v3 Tvrtko Ursulin
2026-05-04 14:46   ` Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox