On 2025-10-31 09:16, Christian König wrote: > This should allow amdkfd_fences to outlive the amdgpu module. > > Signed-off-by: Christian König > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 6 ++++ > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 36 +++++++------------ > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 7 ++-- > drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +-- > 4 files changed, 24 insertions(+), 29 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index 9e120c934cc1..35c59c784b7b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -196,6 +196,7 @@ int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data); > #endif > #if IS_ENABLED(CONFIG_HSA_AMD) > bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm); > +void amdkfd_fence_signal(struct dma_fence *f); > struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f); > void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo); > int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, > @@ -210,6 +211,11 @@ bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) > return false; > } > > +static inline > +void amdkfd_fence_signal(struct dma_fence *f) > +{ > +} > + > static inline > struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f) > { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c > index 09c919f72b6c..69bca4536326 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c > @@ -127,29 +127,9 @@ static bool amdkfd_fence_enable_signaling(struct dma_fence *f) > if (!svm_range_schedule_evict_svm_bo(fence)) > return true; > } > - return false; > -} > - > -/** > - * amdkfd_fence_release - callback that fence can be freed > - * > - * @f: dma_fence > - * > - * This 
function is called when the reference count becomes zero. > - * Drops the mm_struct reference and RCU schedules freeing up the fence. > - */ > -static void amdkfd_fence_release(struct dma_fence *f) > -{ > - struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); > - > - /* Unconditionally signal the fence. The process is getting > - * terminated. > - */ > - if (WARN_ON(!fence)) > - return; /* Not an amdgpu_amdkfd_fence */ > - > mmdrop(fence->mm); > - kfree_rcu(f, rcu); > + fence->mm = NULL; > + return false; > } > > /** > @@ -174,9 +154,19 @@ bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) > return false; > } > > +void amdkfd_fence_signal(struct dma_fence *f) > +{ > + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); > + > + if (fence) { > + mmdrop(fence->mm); > + fence->mm = NULL; > + } > + dma_fence_signal(f); > +} > + I'm still concerned about possible race conditions between amdkfd_fence_signal and amdkfd_fence_enable_signaling. I think the latter is always called with the fence->lock held. So this could be fixed by taking the fence->lock in amdkfd_fence_signal: void amdkfd_fence_signal(struct dma_fence *f) { struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); unsigned long flags; spin_lock_irqsave(f->lock, flags); if (fence && fence->mm) { mmdrop(fence->mm); fence->mm = NULL; } dma_fence_signal_locked(f); spin_unlock_irqrestore(f->lock, flags); } Also note that you need to NULL-check fence->mm (here and in enable_signaling) because mmdrop doesn't have a check. 
Regards,   Felix > static const struct dma_fence_ops amdkfd_fence_ops = { > .get_driver_name = amdkfd_fence_get_driver_name, > .get_timeline_name = amdkfd_fence_get_timeline_name, > .enable_signaling = amdkfd_fence_enable_signaling, > - .release = amdkfd_fence_release, > }; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index ddfe30c13e9d..779d7701bac9 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -1177,7 +1177,7 @@ static void kfd_process_wq_release(struct work_struct *work) > synchronize_rcu(); > ef = rcu_access_pointer(p->ef); > if (ef) > - dma_fence_signal(ef); > + amdkfd_fence_signal(ef); > > kfd_process_remove_sysfs(p); > kfd_debugfs_remove_process(p); > @@ -1986,7 +1986,6 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node, > static int signal_eviction_fence(struct kfd_process *p) > { > struct dma_fence *ef; > - int ret; > > rcu_read_lock(); > ef = dma_fence_get_rcu_safe(&p->ef); > @@ -1994,10 +1993,10 @@ static int signal_eviction_fence(struct kfd_process *p) > if (!ef) > return -EINVAL; > > - ret = dma_fence_signal(ef); > + amdkfd_fence_signal(ef); > dma_fence_put(ef); > > - return ret; > + return 0; > } > > static void evict_process_worker(struct work_struct *work) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > index 9d72411c3379..5d62d231a865 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > @@ -428,7 +428,7 @@ static void svm_range_bo_release(struct kref *kref) > > if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) > /* We're not in the eviction worker. Signal the fence. 
*/ > - dma_fence_signal(&svm_bo->eviction_fence->base); > + amdkfd_fence_signal(&svm_bo->eviction_fence->base); > dma_fence_put(&svm_bo->eviction_fence->base); > amdgpu_bo_unref(&svm_bo->bo); > kfree(svm_bo); > @@ -3622,7 +3622,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) > mmap_read_unlock(mm); > mmput(mm); > > - dma_fence_signal(&svm_bo->eviction_fence->base); > + amdkfd_fence_signal(&svm_bo->eviction_fence->base); > > /* This is the last reference to svm_bo, after svm_range_vram_node_free > * has been called in svm_migrate_vram_to_ram