Re: [PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level

From: zhoucm1 <david1.zhou-5C7GfCeVMHo@public.gmane.org>
To: Felix Kuehling <felix.kuehling-5C7GfCeVMHo@public.gmane.org>,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: Re: [PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level
Date: Wed, 12 Jul 2017 13:52:01 +0800	[thread overview]
Message-ID: <5965B901.6000102@amd.com> (raw)
In-Reply-To: <4e7dee07-81d0-381d-579b-52e7e5969112-5C7GfCeVMHo@public.gmane.org>


On 2017年07月12日 12:56, Felix Kuehling wrote:
> Hi David,
>
> Responses inline ...
Yeah, I see your mean, you mainly want to save some cpu overhead. If 
others have no object, feel free add my RB then.

Regards,
David Zhou
>
> On 17-07-12 12:30 AM, zhoucm1 wrote:
>> nice improvement, just some nitpick inline, otherwise the patch is
>> Reviewed-by: Chunming Zhou <david1.zhou@amd.com>.
>>
>> On 2017年07月12日 11:56, Felix Kuehling wrote:
>>> When lots of virtual address spaces is used, there can be thousands
>>> of page table BOs. amdgpu_vm_update_level iterates over all of them
>>> recursively. In many cases only a few or none at all need to be
>>> updated. Minimize unnecessary code execution and memory usage in
>>> those cases.
>>>
>>> This speeds up memory mapping in a synthetic KFD memory mapping
>>> benchmark by roughly a factor two.
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109
>>> +++++++++++++++++----------------
>>>    1 file changed, 55 insertions(+), 54 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index ff5de3a..23b899b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>    {
>>>        struct amdgpu_bo *shadow;
>>>        struct amdgpu_ring *ring = NULL;
>>> -    uint64_t pd_addr, shadow_addr = 0;
>>> +    uint64_t pd_addr = 0, shadow_addr = 0;
>>>        uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>>        uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>>        unsigned count = 0, pt_idx, ndw = 0;
>>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>          WARN_ON(vm->use_cpu_for_update && shadow);
>>>        if (vm->use_cpu_for_update && !shadow) {
>>> -        r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> -        if (r)
>>> -            return r;
>>> -        r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> -        if (unlikely(r)) {
>>> -            amdgpu_bo_kunmap(parent->bo);
>>> -            return r;
>>> -        }
>>> +        /* Defer kmapping until it's actually needed. Some
>>> +         * PDBs may need no update at all
>>> +         */
>>>            params.func = amdgpu_vm_cpu_set_ptes;
>>> +        params.ib = (void *)(long)-1;
>>>        } else {
>>> -        if (shadow) {
>>> -            r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>>> -            if (r)
>>> -                return r;
>>> -        }
>>> -        ring = container_of(vm->entity.sched, struct amdgpu_ring,
>>> -                    sched);
>>> -
>>> -        /* padding, etc. */
>>> -        ndw = 64;
>>> -
>>> -        /* assume the worst case */
>>> -        ndw += parent->last_entry_used * 6;
>>> -
>>> -        pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>>> -
>>> -        if (shadow) {
>>> -            shadow_addr = amdgpu_bo_gpu_offset(shadow);
>>> -            ndw *= 2;
>>> -        } else {
>>> -            shadow_addr = 0;
>>> -        }
>>> -
>>> -        r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> -        if (r)
>>> -            return r;
>>> -
>>> -        params.ib = &job->ibs[0];
>>> +        /* Defer IB allocation until it's actually
>>> +         * needed. Some PDBs may need no update at all
>>> +         */
>>> +        params.ib = NULL;
>>>            params.func = amdgpu_vm_do_set_ptes;
>>>        }
>>>    -
>>>        /* walk over the address space and update the directory */
>>>        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>            struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>            if (bo == NULL)
>>>                continue;
>>>    -        if (bo->shadow) {
>>> -            struct amdgpu_bo *pt_shadow = bo->shadow;
>>> -
>>> -            r = amdgpu_ttm_bind(&pt_shadow->tbo,
>>> -                        &pt_shadow->tbo.mem);
>>> -            if (r)
>>> -                return r;
>>> -        }
>>> -
>>> -        pt = amdgpu_bo_gpu_offset(bo);
>>> -        pt = amdgpu_gart_get_vm_pde(adev, pt);
>>> +        pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>> Getting offset from amdgpu_bo_gpu_offset(bo)  is better I think,
>> although we can assume pt bo is a known domain.
> amdgpu_bo_gpu_offset does a bunch of sanity checks (WARN_ON_ONCE). They
> are redundant here, because we can assume that the page table BOs have
> been properly created and validated by kernel code we can trust more
> than user mode. I saw amdgpu_bo_gpu_offset near the top with "perf top"
> when running my memory mapping benchmark, so I think it makes sense to
> eliminate the function call and redundant sanity checks in this code path.
>
>>>            if (parent->entries[pt_idx].addr == pt)
>>>                continue;
>>>              parent->entries[pt_idx].addr = pt;
>>>    +        if (!params.ib) {
>>> +            if (shadow) {
>>> +                r = amdgpu_ttm_bind(&shadow->tbo,
>>> +                            &shadow->tbo.mem);
>>> +                if (r)
>>> +                    return r;
>>> +            }
>>> +
>>> +            ring = container_of(vm->entity.sched,
>>> +                        struct amdgpu_ring, sched);
>>> +
>>> +            /* padding, etc. */
>>> +            ndw = 64;
>>> +
>>> +            /* assume the worst case */
>>> +            ndw += (parent->last_entry_used - pt_idx) * 6;
>>> +
>>> +            pd_addr = parent->bo->tbo.offset;
>>> +
>>> +            if (shadow) {
>>> +                shadow_addr = shadow->tbo.offset;
>>> +                ndw *= 2;
>>> +            } else {
>>> +                shadow_addr = 0;
>>> +            }
>>> +            r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> +            if (r)
>>> +                return r;
>>> +
>>> +            params.ib = &job->ibs[0];
>>> +        } else if (!pd_addr) {
>>> +            r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> +            if (r)
>>> +                return r;
>>> +            r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> +            if (unlikely(r)) {
>>> +                amdgpu_bo_kunmap(parent->bo);
>>> +                return r;
>>> +            }
>>> +        }
>>> +
>>>            pde = pd_addr + pt_idx * 8;
>>>            if (((last_pde + 8 * count) != pde) ||
>>>                ((last_pt + incr * count) != pt) ||
>>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>          if (params.func == amdgpu_vm_cpu_set_ptes)
>>>            amdgpu_bo_kunmap(parent->bo);
>>> -    else if (params.ib->length_dw == 0) {
>>> +    else if (params.ib && params.ib->length_dw == 0) {
>>>            amdgpu_job_free(job);
>>> -    } else {
>>> +    } else if (params.ib) {
>>>            amdgpu_ring_pad_ib(ring, params.ib);
>>>            amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>>                     AMDGPU_FENCE_OWNER_VM);
>>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>              amdgpu_bo_fence(parent->bo, fence, true);
>>>            dma_fence_put(vm->last_dir_update);
>>> -        vm->last_dir_update = dma_fence_get(fence);
>>> -        dma_fence_put(fence);
>>> +        vm->last_dir_update = fence;
>> Here I prefer to keep previous code, the logic is more clear.:)
> Again, this is a code path that can be called very frequently. So I'd
> like to eliminate redundant atomic operations here.
>
>>>        }
>>>        /*
>>>         * Recurse into the subdirectories. This recursion is harmless
>>> because
>>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>            struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>>>    -        if (!entry->bo)
>>> +        if (!entry->bo || !entry->entries)
>> In fact, !entry->entries would be checked at the begin of this
>> function in next loop, but either way is ok to me here.
> Doing the check here eliminates the lowest level of recursive function
> calls. That's where it counts the most. Each successive level of the
> recursion runs 512 times as often as the level above it. So completely
> eliminating the function call overhead of the lowest level is a good thing.
>
> Regards,
>    Felix
>
>> Regards,
>> David Zhou
>>>                continue;
>>>              r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx