* [PATCH] dma-buf/dma_fence_array: optimize handling v2
@ 2026-05-05 11:08 Christian König
2026-05-06 8:04 ` Tvrtko Ursulin
0 siblings, 1 reply; 3+ messages in thread
From: Christian König @ 2026-05-05 11:08 UTC (permalink / raw)
To: tursulin, sumit.semwal, dri-devel, linux-media, linaro-mm-sig
Removing the signal on any fence allows us to simplify the dma_fence_array
code a lot and saves us from the need to install a callback on all fences
at the same time.
This results in less memory and CPU overhead.
v2: fix potential double locking pointed out by Tvrtko
Signed-off-by: Christian König <christian.koenig@amd.com>
---
drivers/dma-buf/dma-fence-array.c | 134 +++++++++++++-----------------
drivers/gpu/drm/xe/xe_vm.c | 2 +-
include/linux/dma-fence-array.h | 22 ++---
3 files changed, 66 insertions(+), 92 deletions(-)
diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
index 5e10e8df372f..8b94c6287482 100644
--- a/drivers/dma-buf/dma-fence-array.c
+++ b/drivers/dma-buf/dma-fence-array.c
@@ -42,97 +42,88 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
cmpxchg(&array->base.error, PENDING_ERROR, 0);
}
-static void irq_dma_fence_array_work(struct irq_work *wrk)
+static void dma_fence_array_cb_func(struct dma_fence *f,
+ struct dma_fence_cb *cb)
{
- struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
-
- dma_fence_array_clear_pending_error(array);
+ struct dma_fence_array *array =
+ container_of(cb, struct dma_fence_array, callback);
- dma_fence_signal(&array->base);
- dma_fence_put(&array->base);
+ irq_work_queue(&array->work);
}
-static void dma_fence_array_cb_func(struct dma_fence *f,
- struct dma_fence_cb *cb)
+static bool dma_fence_array_try_add_cb(struct dma_fence_array *array)
{
- struct dma_fence_array_cb *array_cb =
- container_of(cb, struct dma_fence_array_cb, cb);
- struct dma_fence_array *array = array_cb->array;
+ while (array->num_pending) {
+ struct dma_fence *f = array->fences[array->num_pending - 1];
- dma_fence_array_set_pending_error(array, f->error);
+ if (!dma_fence_add_callback(f, &array->callback,
+ dma_fence_array_cb_func))
+ return true;
- if (atomic_dec_and_test(&array->num_pending))
- irq_work_queue(&array->work);
- else
+ dma_fence_array_set_pending_error(array, f->error);
+ --array->num_pending;
+ }
+ return false;
+}
+
+static void dma_fence_array_irq_work(struct irq_work *wrk)
+{
+ struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
+
+ --array->num_pending;
+ if (!dma_fence_array_try_add_cb(array)) {
+ dma_fence_signal(&array->base);
dma_fence_put(&array->base);
+ }
}
static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
{
struct dma_fence_array *array = to_dma_fence_array(fence);
- struct dma_fence_array_cb *cb = array->callbacks;
- unsigned i;
- for (i = 0; i < array->num_fences; ++i) {
- cb[i].array = array;
+ /*
+ * As we may report that the fence is signaled before all
+ * callbacks are complete, we need to take an additional
+ * reference count on the array so that we do not free it too
+ * early. The core fence handling will only hold the reference
+ * until we signal the array as complete (but that is now
+ * insufficient).
+ */
+ dma_fence_get(&array->base);
+ if (!dma_fence_array_try_add_cb(array)) {
/*
- * As we may report that the fence is signaled before all
- * callbacks are complete, we need to take an additional
- * reference count on the array so that we do not free it too
- * early. The core fence handling will only hold the reference
- * until we signal the array as complete (but that is now
- * insufficient).
+ * When all fences are already signaled we can drop the reference again
+ * and report to the caller that the array can be signaled as well.
*/
- dma_fence_get(&array->base);
- if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
- dma_fence_array_cb_func)) {
- int error = array->fences[i]->error;
-
- dma_fence_array_set_pending_error(array, error);
- dma_fence_put(&array->base);
- if (atomic_dec_and_test(&array->num_pending)) {
- dma_fence_array_clear_pending_error(array);
- return false;
- }
- }
+ dma_fence_put(&array->base);
+ return false;
}
-
return true;
}
static bool dma_fence_array_signaled(struct dma_fence *fence)
{
struct dma_fence_array *array = to_dma_fence_array(fence);
- int num_pending;
+ int num_pending, error = 0;
unsigned int i;
/*
- * We need to read num_pending before checking the enable_signal bit
- * to avoid racing with the enable_signaling() implementation, which
- * might decrement the counter, and cause a partial check.
- * atomic_read_acquire() pairs with atomic_dec_and_test() in
- * dma_fence_array_enable_signaling()
- *
- * The !--num_pending check is here to account for the any_signaled case
- * if we race with enable_signaling(), that means the !num_pending check
- * in the is_signalling_enabled branch might be outdated (num_pending
- * might have been decremented), but that's fine. The user will get the
- * right value when testing again later.
+ * Reading num_pending without a memory barrier here is correct since
+ * that is only for optimization, it is perfectly acceptable to have a
+ * stale value for it. In all other cases num_pending is accessed by a
+ * single call chain.
*/
- num_pending = atomic_read_acquire(&array->num_pending);
- if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
- if (num_pending <= 0)
- goto signal;
- return false;
- }
+ num_pending = READ_ONCE(array->num_pending);
+ for (i = 0; i < num_pending; ++i) {
+ struct dma_fence *f = array->fences[i];
- for (i = 0; i < array->num_fences; ++i) {
- if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
- goto signal;
- }
- return false;
+ if (!dma_fence_is_signaled(f))
+ return false;
-signal:
+ if (!error)
+ error = f->error;
+ }
+ dma_fence_array_set_pending_error(array, error);
dma_fence_array_clear_pending_error(array);
return true;
}
@@ -171,15 +162,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
/**
* dma_fence_array_alloc - Allocate a custom fence array
- * @num_fences: [in] number of fences to add in the array
*
* Return dma fence array on success, NULL on failure
*/
-struct dma_fence_array *dma_fence_array_alloc(int num_fences)
+struct dma_fence_array *dma_fence_array_alloc(void)
{
- struct dma_fence_array *array;
-
- return kzalloc_flex(*array, callbacks, num_fences);
+ return kzalloc_obj(struct dma_fence_array);
}
EXPORT_SYMBOL(dma_fence_array_alloc);
@@ -203,10 +191,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
WARN_ON(!num_fences || !fences);
array->num_fences = num_fences;
+ array->num_pending = num_fences;
+ array->fences = fences;
+ array->base.error = PENDING_ERROR;
dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
seqno);
- init_irq_work(&array->work, irq_dma_fence_array_work);
+ init_irq_work(&array->work, dma_fence_array_irq_work);
/*
* dma_fence_array_enable_signaling() is invoked while holding
@@ -220,11 +211,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
*/
lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
- atomic_set(&array->num_pending, num_fences);
- array->fences = fences;
-
- array->base.error = PENDING_ERROR;
-
/*
* dma_fence_array objects should never contain any other fence
* containers or otherwise we run into recursion and potential kernel
@@ -265,7 +251,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
{
struct dma_fence_array *array;
- array = dma_fence_array_alloc(num_fences);
+ array = dma_fence_array_alloc();
if (!array)
return NULL;
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 62a87a051be7..8f472911469d 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3370,7 +3370,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
goto err_trace;
}
- cf = dma_fence_array_alloc(n_fence);
+ cf = dma_fence_array_alloc();
if (!cf) {
fence = ERR_PTR(-ENOMEM);
goto err_out;
diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
index 1b1d87579c38..3ee55c0e2fa4 100644
--- a/include/linux/dma-fence-array.h
+++ b/include/linux/dma-fence-array.h
@@ -15,16 +15,6 @@
#include <linux/dma-fence.h>
#include <linux/irq_work.h>
-/**
- * struct dma_fence_array_cb - callback helper for fence array
- * @cb: fence callback structure for signaling
- * @array: reference to the parent fence array object
- */
-struct dma_fence_array_cb {
- struct dma_fence_cb cb;
- struct dma_fence_array *array;
-};
-
/**
* struct dma_fence_array - fence to represent an array of fences
* @base: fence base class
@@ -33,18 +23,17 @@ struct dma_fence_array_cb {
* @num_pending: fences in the array still pending
* @fences: array of the fences
* @work: internal irq_work function
- * @callbacks: array of callback helpers
+ * @callback: callback structure for signaling
*/
struct dma_fence_array {
struct dma_fence base;
- unsigned num_fences;
- atomic_t num_pending;
+ unsigned int num_fences;
+ unsigned int num_pending;
struct dma_fence **fences;
struct irq_work work;
-
- struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
+ struct dma_fence_cb callback;
};
/**
@@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
for (index = 0, fence = dma_fence_array_first(head); fence; \
++(index), fence = dma_fence_array_next(head, index))
-struct dma_fence_array *dma_fence_array_alloc(int num_fences);
+struct dma_fence_array *dma_fence_array_alloc(void);
void dma_fence_array_init(struct dma_fence_array *array,
int num_fences, struct dma_fence **fences,
u64 context, unsigned seqno);
-
struct dma_fence_array *dma_fence_array_create(int num_fences,
struct dma_fence **fences,
u64 context, unsigned seqno);
--
2.43.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] dma-buf/dma_fence_array: optimize handling v2
2026-05-05 11:08 [PATCH] dma-buf/dma_fence_array: optimize handling v2 Christian König
@ 2026-05-06 8:04 ` Tvrtko Ursulin
2026-05-06 9:46 ` Christian König
0 siblings, 1 reply; 3+ messages in thread
From: Tvrtko Ursulin @ 2026-05-06 8:04 UTC (permalink / raw)
To: christian.koenig, sumit.semwal, dri-devel, linux-media,
linaro-mm-sig
On 05/05/2026 12:08, Christian König wrote:
> Removing the signal on any feature allows to simplfy the dma_fence_array
> code a lot and saves us from the need to install a callback on all fences
> at the same time.
>
> This results is less memory and CPU overhead.
Code looks good but I still worry about the new potential for num_fences
irq work latencies whereas the existing implementation only has one.
Also, whether or not current or this implementation uses less or more
CPU overhead depends on the signalling pattern (time distribution) of
the fences in the array.
Apart from more latency it could even be more CPU usage in the
pathological case.
It would be less if, when the last fence in the array signals, all
others have already signaled. Although it would still need to go through
all dma_fence_add_callback() calls so that part is the same as the
current implementation. Only the CPU cycles from the signaling side
would be saved.
But in the pathological case, where fences signal one by one from the
first to last, and are spaced more in time than a single irq work
latency, the new implementation needs more CPU time and more latency.
I do agree it would be nice to be able to drop the callbacks array but
for the above reasons I am worried whether it is safe.
Some other minor comments below.
> v2: fix potential double locking pointed out by Tvrtko
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
> drivers/dma-buf/dma-fence-array.c | 134 +++++++++++++-----------------
> drivers/gpu/drm/xe/xe_vm.c | 2 +-
> include/linux/dma-fence-array.h | 22 ++---
> 3 files changed, 66 insertions(+), 92 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
> index 5e10e8df372f..8b94c6287482 100644
> --- a/drivers/dma-buf/dma-fence-array.c
> +++ b/drivers/dma-buf/dma-fence-array.c
> @@ -42,97 +42,88 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
> cmpxchg(&array->base.error, PENDING_ERROR, 0);
> }
>
> -static void irq_dma_fence_array_work(struct irq_work *wrk)
> +static void dma_fence_array_cb_func(struct dma_fence *f,
> + struct dma_fence_cb *cb)
> {
> - struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
> -
> - dma_fence_array_clear_pending_error(array);
> + struct dma_fence_array *array =
> + container_of(cb, struct dma_fence_array, callback);
>
> - dma_fence_signal(&array->base);
> - dma_fence_put(&array->base);
> + irq_work_queue(&array->work);
> }
>
> -static void dma_fence_array_cb_func(struct dma_fence *f,
> - struct dma_fence_cb *cb)
> +static bool dma_fence_array_try_add_cb(struct dma_fence_array *array)
> {
> - struct dma_fence_array_cb *array_cb =
> - container_of(cb, struct dma_fence_array_cb, cb);
> - struct dma_fence_array *array = array_cb->array;
> + while (array->num_pending) {
> + struct dma_fence *f = array->fences[array->num_pending - 1];
Maybe add above this line something like:
/*
* Install callbacks from the reverse so the check in
* dma_fence_array_signaled() can be optimized.
*/
>
> - dma_fence_array_set_pending_error(array, f->error);
> + if (!dma_fence_add_callback(f, &array->callback,
> + dma_fence_array_cb_func))
> + return true;
>
> - if (atomic_dec_and_test(&array->num_pending))
> - irq_work_queue(&array->work);
> - else
> + dma_fence_array_set_pending_error(array, f->error);
> + --array->num_pending;
> + }
> + return false;
> +}
> +
> +static void dma_fence_array_irq_work(struct irq_work *wrk)
> +{
> + struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
> +
> + --array->num_pending;
> + if (!dma_fence_array_try_add_cb(array)) {
> + dma_fence_signal(&array->base);
> dma_fence_put(&array->base);
> + }
> }
>
> static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
> {
> struct dma_fence_array *array = to_dma_fence_array(fence);
> - struct dma_fence_array_cb *cb = array->callbacks;
> - unsigned i;
>
> - for (i = 0; i < array->num_fences; ++i) {
> - cb[i].array = array;
> + /*
> + * As we may report that the fence is signaled before all
> + * callbacks are complete, we need to take an additional
> + * reference count on the array so that we do not free it too
> + * early. The core fence handling will only hold the reference
> + * until we signal the array as complete (but that is now
> + * insufficient).
> + */
> + dma_fence_get(&array->base);
> + if (!dma_fence_array_try_add_cb(array)) {
> /*
> - * As we may report that the fence is signaled before all
> - * callbacks are complete, we need to take an additional
> - * reference count on the array so that we do not free it too
> - * early. The core fence handling will only hold the reference
> - * until we signal the array as complete (but that is now
> - * insufficient).
> + * When all fences are already signaled we can drop the reference again
> + * and report to the caller that the array can be signaled as well.
Optional nit - above two lines end up only lines over 80 in the file.
Regards,
Tvrtko
> */
> - dma_fence_get(&array->base);
> - if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
> - dma_fence_array_cb_func)) {
> - int error = array->fences[i]->error;
> -
> - dma_fence_array_set_pending_error(array, error);
> - dma_fence_put(&array->base);
> - if (atomic_dec_and_test(&array->num_pending)) {
> - dma_fence_array_clear_pending_error(array);
> - return false;
> - }
> - }
> + dma_fence_put(&array->base);
> + return false;
> }
> -
> return true;
> }
>
> static bool dma_fence_array_signaled(struct dma_fence *fence)
> {
> struct dma_fence_array *array = to_dma_fence_array(fence);
> - int num_pending;
> + int num_pending, error = 0;
> unsigned int i;
>
> /*
> - * We need to read num_pending before checking the enable_signal bit
> - * to avoid racing with the enable_signaling() implementation, which
> - * might decrement the counter, and cause a partial check.
> - * atomic_read_acquire() pairs with atomic_dec_and_test() in
> - * dma_fence_array_enable_signaling()
> - *
> - * The !--num_pending check is here to account for the any_signaled case
> - * if we race with enable_signaling(), that means the !num_pending check
> - * in the is_signalling_enabled branch might be outdated (num_pending
> - * might have been decremented), but that's fine. The user will get the
> - * right value when testing again later.
> + * Reading num_pending without a memory barrier here is correct since
> + * that is only for optimization, it is perfectly acceptable to have a
> + * stale value for it. In all other cases num_pending is accessed by a
> + * single call chain.
> */
> - num_pending = atomic_read_acquire(&array->num_pending);
> - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
> - if (num_pending <= 0)
> - goto signal;
> - return false;
> - }
> + num_pending = READ_ONCE(array->num_pending);
> + for (i = 0; i < num_pending; ++i) {
> + struct dma_fence *f = array->fences[i];
>
> - for (i = 0; i < array->num_fences; ++i) {
> - if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
> - goto signal;
> - }
> - return false;
> + if (!dma_fence_is_signaled(f))
> + return false;
>
> -signal:
> + if (!error)
> + error = f->error;
> + }
> + dma_fence_array_set_pending_error(array, error);
> dma_fence_array_clear_pending_error(array);
> return true;
> }
> @@ -171,15 +162,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
>
> /**
> * dma_fence_array_alloc - Allocate a custom fence array
> - * @num_fences: [in] number of fences to add in the array
> *
> * Return dma fence array on success, NULL on failure
> */
> -struct dma_fence_array *dma_fence_array_alloc(int num_fences)
> +struct dma_fence_array *dma_fence_array_alloc(void)
> {
> - struct dma_fence_array *array;
> -
> - return kzalloc_flex(*array, callbacks, num_fences);
> + return kzalloc_obj(struct dma_fence_array);
> }
> EXPORT_SYMBOL(dma_fence_array_alloc);
>
> @@ -203,10 +191,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
> WARN_ON(!num_fences || !fences);
>
> array->num_fences = num_fences;
> + array->num_pending = num_fences;
> + array->fences = fences;
> + array->base.error = PENDING_ERROR;
>
> dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
> seqno);
> - init_irq_work(&array->work, irq_dma_fence_array_work);
> + init_irq_work(&array->work, dma_fence_array_irq_work);
>
> /*
> * dma_fence_array_enable_signaling() is invoked while holding
> @@ -220,11 +211,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
> */
> lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>
> - atomic_set(&array->num_pending, num_fences);
> - array->fences = fences;
> -
> - array->base.error = PENDING_ERROR;
> -
> /*
> * dma_fence_array objects should never contain any other fence
> * containers or otherwise we run into recursion and potential kernel
> @@ -265,7 +251,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
> {
> struct dma_fence_array *array;
>
> - array = dma_fence_array_alloc(num_fences);
> + array = dma_fence_array_alloc();
> if (!array)
> return NULL;
>
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 62a87a051be7..8f472911469d 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3370,7 +3370,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
> goto err_trace;
> }
>
> - cf = dma_fence_array_alloc(n_fence);
> + cf = dma_fence_array_alloc();
> if (!cf) {
> fence = ERR_PTR(-ENOMEM);
> goto err_out;
> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
> index 1b1d87579c38..3ee55c0e2fa4 100644
> --- a/include/linux/dma-fence-array.h
> +++ b/include/linux/dma-fence-array.h
> @@ -15,16 +15,6 @@
> #include <linux/dma-fence.h>
> #include <linux/irq_work.h>
>
> -/**
> - * struct dma_fence_array_cb - callback helper for fence array
> - * @cb: fence callback structure for signaling
> - * @array: reference to the parent fence array object
> - */
> -struct dma_fence_array_cb {
> - struct dma_fence_cb cb;
> - struct dma_fence_array *array;
> -};
> -
> /**
> * struct dma_fence_array - fence to represent an array of fences
> * @base: fence base class
> @@ -33,18 +23,17 @@ struct dma_fence_array_cb {
> * @num_pending: fences in the array still pending
> * @fences: array of the fences
> * @work: internal irq_work function
> - * @callbacks: array of callback helpers
> + * @callback: callback structure for signaling
> */
> struct dma_fence_array {
> struct dma_fence base;
>
> - unsigned num_fences;
> - atomic_t num_pending;
> + unsigned int num_fences;
> + unsigned int num_pending;
> struct dma_fence **fences;
>
> struct irq_work work;
> -
> - struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
> + struct dma_fence_cb callback;
> };
>
> /**
> @@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
> for (index = 0, fence = dma_fence_array_first(head); fence; \
> ++(index), fence = dma_fence_array_next(head, index))
>
> -struct dma_fence_array *dma_fence_array_alloc(int num_fences);
> +struct dma_fence_array *dma_fence_array_alloc(void);
> void dma_fence_array_init(struct dma_fence_array *array,
> int num_fences, struct dma_fence **fences,
> u64 context, unsigned seqno);
> -
> struct dma_fence_array *dma_fence_array_create(int num_fences,
> struct dma_fence **fences,
> u64 context, unsigned seqno);
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] dma-buf/dma_fence_array: optimize handling v2
2026-05-06 8:04 ` Tvrtko Ursulin
@ 2026-05-06 9:46 ` Christian König
0 siblings, 0 replies; 3+ messages in thread
From: Christian König @ 2026-05-06 9:46 UTC (permalink / raw)
To: Tvrtko Ursulin, sumit.semwal, dri-devel, linux-media,
linaro-mm-sig
On 5/6/26 10:04, Tvrtko Ursulin wrote:
>
> On 05/05/2026 12:08, Christian König wrote:
>> Removing the signal on any feature allows to simplfy the dma_fence_array
>> code a lot and saves us from the need to install a callback on all fences
>> at the same time.
>>
>> This results is less memory and CPU overhead.
>
> Code looks good but I still worry about the new potential for num_fences irq work latencies whereas the existing implementation only has one.
>
> Also, whether or not current or this implementation uses less or more CPU overhead depends on the signalling pattern (time distribution) of the fences in the array.
>
> Apart from more latency it could even be more CPU usage in the pathological case.
>
> It would be less if, when the last fence in the array signals, all others have already signaled. Although it would still need to go through all dma_fence_add_callback() calls so that part is the same as the current implementation. Only the CPU cycles from the signaling side would be saved.
>
> But in the pathological case, where fences signal one by one from the first to last, and are spaced more in time than a single irq work latency, the new implementation needs more CPU time and more latency.
Mhm, interesting point. I haven't considered that.
As far as I understood it the irq_work runs at the end of the interrupt handling. So when the dma_fence was signaled by an interrupt that is pretty much a no-op.
It only becomes a problem when the fence was signaled by polling, cause then the irq work only runs on the next timer tick if I'm not completely mistaken.
> I do agree it would be nice to be able to drop the callbacks array but for the above reasons I am worried whether it is safe.
The problem is not the memory footprint, but rather that the callbacks array adds work to the interrupt handler of each fence.
That's actually a huge bunch of extra overhead for each driver which could be completely avoid if we install only one callback at a time.
> Some other minor comments below.
Going to address them.
Thanks,
Christian.
>
>> v2: fix potential double locking pointed out by Tvrtko
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>> drivers/dma-buf/dma-fence-array.c | 134 +++++++++++++-----------------
>> drivers/gpu/drm/xe/xe_vm.c | 2 +-
>> include/linux/dma-fence-array.h | 22 ++---
>> 3 files changed, 66 insertions(+), 92 deletions(-)
>>
>> diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
>> index 5e10e8df372f..8b94c6287482 100644
>> --- a/drivers/dma-buf/dma-fence-array.c
>> +++ b/drivers/dma-buf/dma-fence-array.c
>> @@ -42,97 +42,88 @@ static void dma_fence_array_clear_pending_error(struct dma_fence_array *array)
>> cmpxchg(&array->base.error, PENDING_ERROR, 0);
>> }
>> -static void irq_dma_fence_array_work(struct irq_work *wrk)
>> +static void dma_fence_array_cb_func(struct dma_fence *f,
>> + struct dma_fence_cb *cb)
>> {
>> - struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>> -
>> - dma_fence_array_clear_pending_error(array);
>> + struct dma_fence_array *array =
>> + container_of(cb, struct dma_fence_array, callback);
>> - dma_fence_signal(&array->base);
>> - dma_fence_put(&array->base);
>> + irq_work_queue(&array->work);
>> }
>> -static void dma_fence_array_cb_func(struct dma_fence *f,
>> - struct dma_fence_cb *cb)
>> +static bool dma_fence_array_try_add_cb(struct dma_fence_array *array)
>> {
>> - struct dma_fence_array_cb *array_cb =
>> - container_of(cb, struct dma_fence_array_cb, cb);
>> - struct dma_fence_array *array = array_cb->array;
>> + while (array->num_pending) {
>> + struct dma_fence *f = array->fences[array->num_pending - 1];
>
> Maybe add above this line something like:
>
> /*
> * Install callbacks from the reverse so the check in
> * dma_fence_array_signaled() can be optimized.
> */
>
>> - dma_fence_array_set_pending_error(array, f->error);
>> + if (!dma_fence_add_callback(f, &array->callback,
>> + dma_fence_array_cb_func))
>> + return true;
>> - if (atomic_dec_and_test(&array->num_pending))
>> - irq_work_queue(&array->work);
>> - else
>> + dma_fence_array_set_pending_error(array, f->error);
>> + --array->num_pending;
>> + }
>> + return false;
>> +}
>> +
>> +static void dma_fence_array_irq_work(struct irq_work *wrk)
>> +{
>> + struct dma_fence_array *array = container_of(wrk, typeof(*array), work);
>> +
>> + --array->num_pending;
>> + if (!dma_fence_array_try_add_cb(array)) {
>> + dma_fence_signal(&array->base);
>> dma_fence_put(&array->base);
>> + }
>> }
>> static bool dma_fence_array_enable_signaling(struct dma_fence *fence)
>> {
>> struct dma_fence_array *array = to_dma_fence_array(fence);
>> - struct dma_fence_array_cb *cb = array->callbacks;
>> - unsigned i;
>> - for (i = 0; i < array->num_fences; ++i) {
>> - cb[i].array = array;
>> + /*
>> + * As we may report that the fence is signaled before all
>> + * callbacks are complete, we need to take an additional
>> + * reference count on the array so that we do not free it too
>> + * early. The core fence handling will only hold the reference
>> + * until we signal the array as complete (but that is now
>> + * insufficient).
>> + */
>> + dma_fence_get(&array->base);
>> + if (!dma_fence_array_try_add_cb(array)) {
>> /*
>> - * As we may report that the fence is signaled before all
>> - * callbacks are complete, we need to take an additional
>> - * reference count on the array so that we do not free it too
>> - * early. The core fence handling will only hold the reference
>> - * until we signal the array as complete (but that is now
>> - * insufficient).
>> + * When all fences are already signaled we can drop the reference again
>> + * and report to the caller that the array can be signaled as well.
>
> Optional nit - above two lines end up only lines over 80 in the file.
>
> Regards,
>
> Tvrtko
>
>> */
>> - dma_fence_get(&array->base);
>> - if (dma_fence_add_callback(array->fences[i], &cb[i].cb,
>> - dma_fence_array_cb_func)) {
>> - int error = array->fences[i]->error;
>> -
>> - dma_fence_array_set_pending_error(array, error);
>> - dma_fence_put(&array->base);
>> - if (atomic_dec_and_test(&array->num_pending)) {
>> - dma_fence_array_clear_pending_error(array);
>> - return false;
>> - }
>> - }
>> + dma_fence_put(&array->base);
>> + return false;
>> }
>> -
>> return true;
>> }
>> static bool dma_fence_array_signaled(struct dma_fence *fence)
>> {
>> struct dma_fence_array *array = to_dma_fence_array(fence);
>> - int num_pending;
>> + int num_pending, error = 0;
>> unsigned int i;
>> /*
>> - * We need to read num_pending before checking the enable_signal bit
>> - * to avoid racing with the enable_signaling() implementation, which
>> - * might decrement the counter, and cause a partial check.
>> - * atomic_read_acquire() pairs with atomic_dec_and_test() in
>> - * dma_fence_array_enable_signaling()
>> - *
>> - * The !--num_pending check is here to account for the any_signaled case
>> - * if we race with enable_signaling(), that means the !num_pending check
>> - * in the is_signalling_enabled branch might be outdated (num_pending
>> - * might have been decremented), but that's fine. The user will get the
>> - * right value when testing again later.
>> + * Reading num_pending without a memory barrier here is correct since
>> + * that is only for optimization, it is perfectly acceptable to have a
>> + * stale value for it. In all other cases num_pending is accessed by a
>> + * single call chain.
>> */
>> - num_pending = atomic_read_acquire(&array->num_pending);
>> - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &array->base.flags)) {
>> - if (num_pending <= 0)
>> - goto signal;
>> - return false;
>> - }
>> + num_pending = READ_ONCE(array->num_pending);
>> + for (i = 0; i < num_pending; ++i) {
>> + struct dma_fence *f = array->fences[i];
>> - for (i = 0; i < array->num_fences; ++i) {
>> - if (dma_fence_is_signaled(array->fences[i]) && !--num_pending)
>> - goto signal;
>> - }
>> - return false;
>> + if (!dma_fence_is_signaled(f))
>> + return false;
>> -signal:
>> + if (!error)
>> + error = f->error;
>> + }
>> + dma_fence_array_set_pending_error(array, error);
>> dma_fence_array_clear_pending_error(array);
>> return true;
>> }
>> @@ -171,15 +162,12 @@ EXPORT_SYMBOL(dma_fence_array_ops);
>> /**
>> * dma_fence_array_alloc - Allocate a custom fence array
>> - * @num_fences: [in] number of fences to add in the array
>> *
>> * Return dma fence array on success, NULL on failure
>> */
>> -struct dma_fence_array *dma_fence_array_alloc(int num_fences)
>> +struct dma_fence_array *dma_fence_array_alloc(void)
>> {
>> - struct dma_fence_array *array;
>> -
>> - return kzalloc_flex(*array, callbacks, num_fences);
>> + return kzalloc_obj(struct dma_fence_array);
>> }
>> EXPORT_SYMBOL(dma_fence_array_alloc);
>> @@ -203,10 +191,13 @@ void dma_fence_array_init(struct dma_fence_array *array,
>> WARN_ON(!num_fences || !fences);
>> array->num_fences = num_fences;
>> + array->num_pending = num_fences;
>> + array->fences = fences;
>> + array->base.error = PENDING_ERROR;
>> dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context,
>> seqno);
>> - init_irq_work(&array->work, irq_dma_fence_array_work);
>> + init_irq_work(&array->work, dma_fence_array_irq_work);
>> /*
>> * dma_fence_array_enable_signaling() is invoked while holding
>> @@ -220,11 +211,6 @@ void dma_fence_array_init(struct dma_fence_array *array,
>> */
>> lockdep_set_class(&array->base.inline_lock, &dma_fence_array_lock_key);
>> - atomic_set(&array->num_pending, num_fences);
>> - array->fences = fences;
>> -
>> - array->base.error = PENDING_ERROR;
>> -
>> /*
>> * dma_fence_array objects should never contain any other fence
>> * containers or otherwise we run into recursion and potential kernel
>> @@ -265,7 +251,7 @@ struct dma_fence_array *dma_fence_array_create(int num_fences,
>> {
>> struct dma_fence_array *array;
>> - array = dma_fence_array_alloc(num_fences);
>> + array = dma_fence_array_alloc();
>> if (!array)
>> return NULL;
>> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
>> index 62a87a051be7..8f472911469d 100644
>> --- a/drivers/gpu/drm/xe/xe_vm.c
>> +++ b/drivers/gpu/drm/xe/xe_vm.c
>> @@ -3370,7 +3370,7 @@ static struct dma_fence *ops_execute(struct xe_vm *vm,
>> goto err_trace;
>> }
>> - cf = dma_fence_array_alloc(n_fence);
>> + cf = dma_fence_array_alloc();
>> if (!cf) {
>> fence = ERR_PTR(-ENOMEM);
>> goto err_out;
>> diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
>> index 1b1d87579c38..3ee55c0e2fa4 100644
>> --- a/include/linux/dma-fence-array.h
>> +++ b/include/linux/dma-fence-array.h
>> @@ -15,16 +15,6 @@
>> #include <linux/dma-fence.h>
>> #include <linux/irq_work.h>
>> -/**
>> - * struct dma_fence_array_cb - callback helper for fence array
>> - * @cb: fence callback structure for signaling
>> - * @array: reference to the parent fence array object
>> - */
>> -struct dma_fence_array_cb {
>> - struct dma_fence_cb cb;
>> - struct dma_fence_array *array;
>> -};
>> -
>> /**
>> * struct dma_fence_array - fence to represent an array of fences
>> * @base: fence base class
>> @@ -33,18 +23,17 @@ struct dma_fence_array_cb {
>> * @num_pending: fences in the array still pending
>> * @fences: array of the fences
>> * @work: internal irq_work function
>> - * @callbacks: array of callback helpers
>> + * @callback: callback structure for signaling
>> */
>> struct dma_fence_array {
>> struct dma_fence base;
>> - unsigned num_fences;
>> - atomic_t num_pending;
>> + unsigned int num_fences;
>> + unsigned int num_pending;
>> struct dma_fence **fences;
>> struct irq_work work;
>> -
>> - struct dma_fence_array_cb callbacks[] __counted_by(num_fences);
>> + struct dma_fence_cb callback;
>> };
>> /**
>> @@ -78,11 +67,10 @@ to_dma_fence_array(struct dma_fence *fence)
>> for (index = 0, fence = dma_fence_array_first(head); fence; \
>> ++(index), fence = dma_fence_array_next(head, index))
>> -struct dma_fence_array *dma_fence_array_alloc(int num_fences);
>> +struct dma_fence_array *dma_fence_array_alloc(void);
>> void dma_fence_array_init(struct dma_fence_array *array,
>> int num_fences, struct dma_fence **fences,
>> u64 context, unsigned seqno);
>> -
>> struct dma_fence_array *dma_fence_array_create(int num_fences,
>> struct dma_fence **fences,
>> u64 context, unsigned seqno);
>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-05-06 9:46 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-05 11:08 [PATCH] dma-buf/dma_fence_array: optimize handling v2 Christian König
2026-05-06 8:04 ` Tvrtko Ursulin
2026-05-06 9:46 ` Christian König
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox