All of lore.kernel.org
 help / color / mirror / Atom feed
From: zhoucm1 <david1.zhou-5C7GfCeVMHo@public.gmane.org>
To: Felix Kuehling <felix.kuehling-5C7GfCeVMHo@public.gmane.org>,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: Re: [PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level
Date: Wed, 12 Jul 2017 13:52:01 +0800	[thread overview]
Message-ID: <5965B901.6000102@amd.com> (raw)
In-Reply-To: <4e7dee07-81d0-381d-579b-52e7e5969112-5C7GfCeVMHo@public.gmane.org>



On 2017年07月12日 12:56, Felix Kuehling wrote:
> Hi David,
>
> Responses inline ...
Yeah, I see your mean, you mainly want to save some cpu overhead. If 
others have no object, feel free add my RB then.

Regards,
David Zhou
>
> On 17-07-12 12:30 AM, zhoucm1 wrote:
>> nice improvement, just some nitpick inline, otherwise the patch is
>> Reviewed-by: Chunming Zhou <david1.zhou@amd.com>.
>>
>> On 2017年07月12日 11:56, Felix Kuehling wrote:
>>> When lots of virtual address spaces is used, there can be thousands
>>> of page table BOs. amdgpu_vm_update_level iterates over all of them
>>> recursively. In many cases only a few or none at all need to be
>>> updated. Minimize unnecessary code execution and memory usage in
>>> those cases.
>>>
>>> This speeds up memory mapping in a synthetic KFD memory mapping
>>> benchmark by roughly a factor two.
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109
>>> +++++++++++++++++----------------
>>>    1 file changed, 55 insertions(+), 54 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index ff5de3a..23b899b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>    {
>>>        struct amdgpu_bo *shadow;
>>>        struct amdgpu_ring *ring = NULL;
>>> -    uint64_t pd_addr, shadow_addr = 0;
>>> +    uint64_t pd_addr = 0, shadow_addr = 0;
>>>        uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>>        uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>>        unsigned count = 0, pt_idx, ndw = 0;
>>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>          WARN_ON(vm->use_cpu_for_update && shadow);
>>>        if (vm->use_cpu_for_update && !shadow) {
>>> -        r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> -        if (r)
>>> -            return r;
>>> -        r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> -        if (unlikely(r)) {
>>> -            amdgpu_bo_kunmap(parent->bo);
>>> -            return r;
>>> -        }
>>> +        /* Defer kmapping until it's actually needed. Some
>>> +         * PDBs may need no update at all
>>> +         */
>>>            params.func = amdgpu_vm_cpu_set_ptes;
>>> +        params.ib = (void *)(long)-1;
>>>        } else {
>>> -        if (shadow) {
>>> -            r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>>> -            if (r)
>>> -                return r;
>>> -        }
>>> -        ring = container_of(vm->entity.sched, struct amdgpu_ring,
>>> -                    sched);
>>> -
>>> -        /* padding, etc. */
>>> -        ndw = 64;
>>> -
>>> -        /* assume the worst case */
>>> -        ndw += parent->last_entry_used * 6;
>>> -
>>> -        pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>>> -
>>> -        if (shadow) {
>>> -            shadow_addr = amdgpu_bo_gpu_offset(shadow);
>>> -            ndw *= 2;
>>> -        } else {
>>> -            shadow_addr = 0;
>>> -        }
>>> -
>>> -        r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> -        if (r)
>>> -            return r;
>>> -
>>> -        params.ib = &job->ibs[0];
>>> +        /* Defer IB allocation until it's actually
>>> +         * needed. Some PDBs may need no update at all
>>> +         */
>>> +        params.ib = NULL;
>>>            params.func = amdgpu_vm_do_set_ptes;
>>>        }
>>>    -
>>>        /* walk over the address space and update the directory */
>>>        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>            struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>            if (bo == NULL)
>>>                continue;
>>>    -        if (bo->shadow) {
>>> -            struct amdgpu_bo *pt_shadow = bo->shadow;
>>> -
>>> -            r = amdgpu_ttm_bind(&pt_shadow->tbo,
>>> -                        &pt_shadow->tbo.mem);
>>> -            if (r)
>>> -                return r;
>>> -        }
>>> -
>>> -        pt = amdgpu_bo_gpu_offset(bo);
>>> -        pt = amdgpu_gart_get_vm_pde(adev, pt);
>>> +        pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>> Getting offset from amdgpu_bo_gpu_offset(bo)  is better I think,
>> although we can assume pt bo is a known domain.
> amdgpu_bo_gpu_offset does a bunch of sanity checks (WARN_ON_ONCE). They
> are redundant here, because we can assume that the page table BOs have
> been properly created and validated by kernel code we can trust more
> than user mode. I saw amdgpu_bo_gpu_offset near the top with "perf top"
> when running my memory mapping benchmark, so I think it makes sense to
> eliminate the function call and redundant sanity checks in this code path.
>
>>>            if (parent->entries[pt_idx].addr == pt)
>>>                continue;
>>>              parent->entries[pt_idx].addr = pt;
>>>    +        if (!params.ib) {
>>> +            if (shadow) {
>>> +                r = amdgpu_ttm_bind(&shadow->tbo,
>>> +                            &shadow->tbo.mem);
>>> +                if (r)
>>> +                    return r;
>>> +            }
>>> +
>>> +            ring = container_of(vm->entity.sched,
>>> +                        struct amdgpu_ring, sched);
>>> +
>>> +            /* padding, etc. */
>>> +            ndw = 64;
>>> +
>>> +            /* assume the worst case */
>>> +            ndw += (parent->last_entry_used - pt_idx) * 6;
>>> +
>>> +            pd_addr = parent->bo->tbo.offset;
>>> +
>>> +            if (shadow) {
>>> +                shadow_addr = shadow->tbo.offset;
>>> +                ndw *= 2;
>>> +            } else {
>>> +                shadow_addr = 0;
>>> +            }
>>> +            r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> +            if (r)
>>> +                return r;
>>> +
>>> +            params.ib = &job->ibs[0];
>>> +        } else if (!pd_addr) {
>>> +            r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> +            if (r)
>>> +                return r;
>>> +            r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> +            if (unlikely(r)) {
>>> +                amdgpu_bo_kunmap(parent->bo);
>>> +                return r;
>>> +            }
>>> +        }
>>> +
>>>            pde = pd_addr + pt_idx * 8;
>>>            if (((last_pde + 8 * count) != pde) ||
>>>                ((last_pt + incr * count) != pt) ||
>>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>          if (params.func == amdgpu_vm_cpu_set_ptes)
>>>            amdgpu_bo_kunmap(parent->bo);
>>> -    else if (params.ib->length_dw == 0) {
>>> +    else if (params.ib && params.ib->length_dw == 0) {
>>>            amdgpu_job_free(job);
>>> -    } else {
>>> +    } else if (params.ib) {
>>>            amdgpu_ring_pad_ib(ring, params.ib);
>>>            amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>>                     AMDGPU_FENCE_OWNER_VM);
>>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>              amdgpu_bo_fence(parent->bo, fence, true);
>>>            dma_fence_put(vm->last_dir_update);
>>> -        vm->last_dir_update = dma_fence_get(fence);
>>> -        dma_fence_put(fence);
>>> +        vm->last_dir_update = fence;
>> Here I prefer to keep previous code, the logic is more clear.:)
> Again, this is a code path that can be called very frequently. So I'd
> like to eliminate redundant atomic operations here.
>
>>>        }
>>>        /*
>>>         * Recurse into the subdirectories. This recursion is harmless
>>> because
>>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>            struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>>>    -        if (!entry->bo)
>>> +        if (!entry->bo || !entry->entries)
>> In fact, !entry->entries would be checked at the begin of this
>> function in next loop, but either way is ok to me here.
> Doing the check here eliminates the lowest level of recursive function
> calls. That's where it counts the most. Each successive level of the
> recursion runs 512 times as often as the level above it. So completely
> eliminating the function call overhead of the lowest level is a good thing.
>
> Regards,
>    Felix
>
>> Regards,
>> David Zhou
>>>                continue;
>>>              r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  parent reply	other threads:[~2017-07-12  5:52 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-07-12  3:56 [PATCH] drm/amdgpu: Optimize recursion in amdgpu_vm_update_level Felix Kuehling
     [not found] ` <1499831769-3764-1-git-send-email-Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
2017-07-12  4:30   ` zhoucm1
     [not found]     ` <5965A5F1.80902-5C7GfCeVMHo@public.gmane.org>
2017-07-12  4:56       ` Felix Kuehling
     [not found]         ` <4e7dee07-81d0-381d-579b-52e7e5969112-5C7GfCeVMHo@public.gmane.org>
2017-07-12  5:52           ` zhoucm1 [this message]
2017-07-12  7:50   ` Christian König
     [not found]     ` <c50326a0-6813-d484-b289-5681b306d723-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-07-12 19:38       ` Felix Kuehling
     [not found]         ` <083c3f4e-72b3-1ad1-6d11-a9eba48e3c9c-5C7GfCeVMHo@public.gmane.org>
2017-07-13 13:45           ` Christian König

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5965B901.6000102@amd.com \
    --to=david1.zhou-5c7gfcevmho@public.gmane.org \
    --cc=amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org \
    --cc=felix.kuehling-5C7GfCeVMHo@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.