From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.7]) by gabe.freedesktop.org (Postfix) with ESMTPS id 2EB3210E306 for ; Thu, 14 Dec 2023 14:14:18 +0000 (UTC) Message-ID: <419ceb92-a73a-4d9b-b8c8-c6f4f7347d4b@linux.intel.com> Date: Thu, 14 Dec 2023 15:13:49 +0100 MIME-Version: 1.0 Subject: Re: [PATCH i-g-t] benchmarks/gem_wsim: Support gens without relocations Content-Language: en-US To: Tvrtko Ursulin , igt-dev@lists.freedesktop.org References: <20231212113449.26800-1-marcin.bernatowicz@linux.intel.com> <1de0993e-22dc-4952-b024-1b4022d0a8c5@linux.intel.com> From: "Bernatowicz, Marcin" In-Reply-To: <1de0993e-22dc-4952-b024-1b4022d0a8c5@linux.intel.com> Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: 8bit List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: igt-dev-bounces@lists.freedesktop.org Sender: "igt-dev" List-ID: Hi, On 12/13/2023 2:47 PM, Tvrtko Ursulin wrote: > > On 12/12/2023 11:34, Marcin Bernatowicz wrote: >> This commit adopts the approach present in lib/igt_dummyload >> to support generations that do not use relocations. >> The intel_allocator is leveraged to compute offsets on these generations. >> Apart of this change, the 'struct vm' is now shared by both i915 and >> Xe. It includes a 'vm_id' that the intel_allocator uses for address >> assignment. >> This modification enhances gem_wsim compatibility with newer platforms >> running i915. >> >> Signed-off-by: Marcin Bernatowicz >> --- >>   benchmarks/gem_wsim.c | 106 +++++++++++++++++++++++++++++++----------- >>   1 file changed, 80 insertions(+), 26 deletions(-) >> >> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c >> index 30673da8f..7455cd6a1 100644 >> --- a/benchmarks/gem_wsim.c >> +++ b/benchmarks/gem_wsim.c >> @@ -205,7 +205,7 @@ struct w_step { >>       uint32_t bb_handle; >>   }; >> -struct xe_vm { >> +struct vm { >>       uint32_t id; >>       bool compute_mode; >>       uint64_t ahnd; >> @@ -226,9 +226,9 @@ struct ctx { >>       struct bond *bonds; >>       bool load_balance; >>       uint64_t sseu; >> +    /* reference to vm */ >> +    struct vm *vm; >>       struct { >> -        /* reference to vm */ >> -        struct xe_vm *vm; >>           /* exec queues */ >>           unsigned int nr_queues; >>           struct xe_exec_queue *queue_list; >> @@ -256,10 +256,8 @@ struct workload { >>       unsigned int nr_ctxs; >>       struct ctx *ctx_list; >> -    struct { >> -        unsigned int nr_vms; >> -        struct xe_vm *vm_list; >> -    } xe; >> +    unsigned int nr_vms; >> +    struct vm *vm_list; >>       struct working_set **working_sets; /* array indexed by set id */ >>       int max_working_set_id; >> @@ -1504,7 +1502,16 @@ static unsigned int create_bb(struct w_step *w, >> int self) >>       gem_set_domain(fd, w->bb_handle, >>                  I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC); >> -    cs = ptr = gem_mmap__wc(fd, w->bb_handle, 0, w->bb_size, >> PROT_WRITE); >> +    if (__gem_set_caching(fd, w->bb_handle, >> +                  I915_CACHING_CACHED) == 0) { >> +        cs = ptr = gem_mmap__cpu(fd, w->bb_handle, >> +                       0, w->bb_size, >> +                       PROT_READ | PROT_WRITE); >> +    } else >> +        cs = ptr = gem_mmap__device_coherent(fd, >> +                               w->bb_handle, >> +                               0, w->bb_size, >> +                               PROT_READ | PROT_WRITE); > > How is this hink related I did not get it? Should it be a separate patch? It appears that 'gem_mmap__wc' is not working for atsm (it might be necessary to use I915_MMAP_OFFSET_FIXED for discrete?). Without extensive investigation, I've adopted the mapping approach from lib/igt_dummyload. This modification could potentially be applied in a separate patch. > >>       /* Store initial 64b timestamp: start */ >>       *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_CS_MMIO_DST; >> @@ -1539,9 +1546,11 @@ static unsigned int create_bb(struct w_step *w, >> int self) >>       /* Save delta for indirect read by COND_BBE */ >>       *cs++ = MI_STORE_REGISTER_MEM_CMD | (1 + use_64b) | MI_CS_MMIO_DST; >>       *cs++ = CS_GPR(NOW_TS); >> +    w->i915.reloc[r].presumed_offset = w->i915.obj[self].offset; >>       w->i915.reloc[r].target_handle = self; >>       w->i915.reloc[r].offset = offset_in_page(cs); >> -    *cs++ = w->i915.reloc[r].delta = 4000; >> +    w->i915.reloc[r].delta = 4000; >> +    *cs++ = w->i915.reloc[r].presumed_offset + w->i915.reloc[r].delta; >>       *cs++ = 0; >>       r++; >> @@ -1556,17 +1565,21 @@ static unsigned int create_bb(struct w_step >> *w, int self) >>       *cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b); >>       w->i915.bb_duration = cs; >>       *cs++ = 0; >> +    w->i915.reloc[r].presumed_offset = w->i915.obj[self].offset; >>       w->i915.reloc[r].target_handle = self; >>       w->i915.reloc[r].offset = offset_in_page(cs); >> -    *cs++ = w->i915.reloc[r].delta = 4000; >> +    w->i915.reloc[r].delta = 4000; >> +    *cs++ = w->i915.reloc[r].presumed_offset + w->i915.reloc[r].delta; >>       *cs++ = 0; >>       r++; >>       /* Otherwise back to recalculating delta */ >>       *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b; >> +    w->i915.reloc[r].presumed_offset = w->i915.obj[self].offset; >>       w->i915.reloc[r].target_handle = self; >>       w->i915.reloc[r].offset = offset_in_page(cs); >> -    *cs++ = w->i915.reloc[r].delta = offset_in_page(jmp); >> +    w->i915.reloc[r].delta = offset_in_page(jmp); >> +    *cs++ = w->i915.reloc[r].presumed_offset + w->i915.reloc[r].delta; >>       *cs++ = 0; >>       r++; >> @@ -1648,10 +1661,10 @@ xe_get_eq(struct workload *wrk, const struct >> w_step *w) >>       return eq; >>   } >> -static struct xe_vm * >> -xe_get_vm(struct workload *wrk, const struct w_step *w) >> +static struct vm * >> +get_vm(struct workload *wrk, const struct w_step *w) >>   { >> -    return wrk->xe.vm_list; >> +    return wrk->vm_list; > > Why does this have 'w' as an argument but doesn't use it? > At present, we are operating with a single vm. However, if additional vms are introduced, the 'w' parameter will be utilized to select the appropriate one (return get_ctx(wrk, w)->vm). >>   } >>   static uint32_t alloc_bo(int i915, unsigned long *size) >> @@ -1673,6 +1686,16 @@ alloc_step_batch(struct workload *wrk, struct >> w_step *w) >>       struct dep_entry *dep; >>       unsigned int j = 0; >>       unsigned int nr_obj = 2 + w->data_deps.nr; >> +    unsigned int objflags = 0; >> +    uint64_t addr; >> +    struct vm *vm = get_vm(wrk, w); >> + >> +    addr = gem_aperture_size(fd) / 2; >> +    if (addr >> 32) >> +        objflags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; >> + >> +    if (vm->ahnd) >> +        objflags |= EXEC_OBJECT_PINNED; >>       w->i915.obj = calloc(nr_obj, sizeof(*w->i915.obj)); >>       igt_assert(w->i915.obj); >> @@ -1680,11 +1703,18 @@ alloc_step_batch(struct workload *wrk, struct >> w_step *w) >>       w->bb_size = PAGE_SIZE; >>       w->i915.obj[j].handle = alloc_bo(fd, &w->bb_size); >>       w->i915.obj[j].flags = EXEC_OBJECT_WRITE; >> +    if (vm->ahnd) { >> +        addr = get_offset(vm->ahnd, w->i915.obj[j].handle, >> w->bb_size, 0); >> +        w->i915.obj[j].offset = CANONICAL(addr); >> +        w->i915.obj[j].flags |= objflags; >> +    } >> + >>       j++; >>       igt_assert(j < nr_obj); >>       for_each_dep(dep, w->data_deps) { >>           uint32_t dep_handle; >> +        uint64_t dep_size; > > w->bb_size is unsigned long, the two should probably match. I selected the dep_size type to match the uint64_t parameter type of the get_offset function. While the final result will remain unchanged, it might be more coherent to use unsigned long, aligning with the type of values dep_size is assigned, and pass that to the get_offset function. > >>           if (dep->working_set == -1) { >>               int dep_idx = w->idx + dep->target; >> @@ -1694,6 +1724,7 @@ alloc_step_batch(struct workload *wrk, struct >> w_step *w) >>               igt_assert(wrk->steps[dep_idx].type == BATCH); >>               dep_handle = wrk->steps[dep_idx].i915.obj[0].handle; >> +            dep_size = w->bb_size; >>           } else { >>               struct working_set *set; >> @@ -1707,18 +1738,33 @@ alloc_step_batch(struct workload *wrk, struct >> w_step *w) >>               igt_assert(set->sizes[dep->target].size); >>               dep_handle = set->handles[dep->target]; >> +            dep_size = set->sizes[dep->target].size; >>           } >>           w->i915.obj[j].flags = dep->write ? EXEC_OBJECT_WRITE : 0; >>           w->i915.obj[j].handle = dep_handle; >> +        if (vm->ahnd) { >> +            addr = get_offset(vm->ahnd, w->i915.obj[j].handle, >> dep_size, 0); >> +            w->i915.obj[j].offset = CANONICAL(addr); >> +            w->i915.obj[j].flags |= objflags; >> +        } >>           j++; >>           igt_assert(j < nr_obj); >>       } >>       w->bb_handle = w->i915.obj[j].handle = alloc_bo(fd, &w->bb_size); >> +    if (vm->ahnd) { >> +        addr = get_offset(vm->ahnd, w->i915.obj[j].handle, >> w->bb_size, 0); >> +        w->i915.obj[j].offset = CANONICAL(addr); >> +        w->i915.obj[j].flags |= objflags; >> +    } >>       w->i915.obj[j].relocation_count = create_bb(w, j); >> -    igt_assert(w->i915.obj[j].relocation_count <= >> ARRAY_SIZE(w->i915.reloc)); >> -    w->i915.obj[j].relocs_ptr = to_user_pointer(&w->i915.reloc); >> +    if (vm->ahnd) { >> +        w->i915.obj[j].relocation_count = 0; >> +    } else { >> +        igt_assert(w->i915.obj[j].relocation_count <= >> ARRAY_SIZE(w->i915.reloc)); >> +        w->i915.obj[j].relocs_ptr = to_user_pointer(&w->i915.reloc); >> +    } >>       w->i915.eb.buffers_ptr = to_user_pointer(w->i915.obj); >>       w->i915.eb.buffer_count = j + 1; >> @@ -1738,7 +1784,7 @@ alloc_step_batch(struct workload *wrk, struct >> w_step *w) >>   static void >>   xe_alloc_step_batch(struct workload *wrk, struct w_step *w) >>   { >> -    struct xe_vm *vm = xe_get_vm(wrk, w); >> +    struct vm *vm = get_vm(wrk, w); >>       struct xe_exec_queue *eq = xe_get_eq(wrk, w); >>       struct dep_entry *dep; >>       int i; >> @@ -2032,7 +2078,7 @@ static void measure_active_set(struct workload >> *wrk) >>   #define alloca0(sz) ({ size_t sz__ = (sz); memset(alloca(sz__), 0, >> sz__); }) >> -static void xe_vm_create_(struct xe_vm *vm) >> +static void xe_vm_create_(struct vm *vm) >>   { >>       uint32_t flags = 0; >> @@ -2046,7 +2092,7 @@ static void xe_vm_create_(struct xe_vm *vm) >>   static void xe_exec_queue_create_(struct ctx *ctx, struct >> xe_exec_queue *eq) >>   { >>       struct drm_xe_exec_queue_create create = { >> -        .vm_id = ctx->xe.vm->id, >> +        .vm_id = ctx->vm->id, >>           .width = 1, >>           .num_placements = eq->nr_hwes, >>           .instances = to_user_pointer(eq->hwe_list), >> @@ -2159,6 +2205,12 @@ static int prepare_contexts(unsigned int id, >> struct workload *wrk) >>                   gem_context_get_param(fd, ¶m); >>                   igt_assert(param.value); >>                   share_vm = param.value; >> +                wrk->nr_vms = 1; >> +                wrk->vm_list = calloc(wrk->nr_vms, sizeof(struct vm)); >> +                igt_assert(wrk->vm_list); >> +                wrk->vm_list->id = share_vm; >> +                wrk->vm_list->ahnd = get_reloc_ahnd(fd, share_vm); >> +                ctx2->vm = wrk->vm_list; >>                   break; >>               } >> @@ -2174,6 +2226,7 @@ static int prepare_contexts(unsigned int id, >> struct workload *wrk) >>           ctx_id = args.ctx_id; >>           ctx->id = ctx_id; >>           ctx->sseu = device_sseu.slice_mask; >> +        ctx->vm = wrk->vm_list; >>           __configure_context(ctx_id, wrk->prio); >> @@ -2267,16 +2320,17 @@ static int xe_prepare_contexts(unsigned int >> id, struct workload *wrk) >>       unsigned int i; >>       /* shortcut, create one vm */ >> -    wrk->xe.nr_vms = 1; >> -    wrk->xe.vm_list = calloc(wrk->xe.nr_vms, sizeof(struct xe_vm)); >> -    wrk->xe.vm_list->compute_mode = false; >> -    xe_vm_create_(wrk->xe.vm_list); >> -    wrk->xe.vm_list->ahnd = intel_allocator_open(fd, >> wrk->xe.vm_list->id, >> +    wrk->nr_vms = 1; >> +    wrk->vm_list = calloc(wrk->nr_vms, sizeof(struct vm)); >> +    igt_assert(wrk->vm_list); >> +    wrk->vm_list->compute_mode = false; >> +    xe_vm_create_(wrk->vm_list); >> +    wrk->vm_list->ahnd = intel_allocator_open(fd, wrk->vm_list->id, >>                                INTEL_ALLOCATOR_RELOC) > > Can some of this go to common, now that struct vm is common? Maybe not.. > remind me, plan is for xe backend to support more than one vm? That's > why get_vm has w_step as argument? Yes, that's correct. I should tackle it when adding support for more vms. For the time being, the primary requirement for this patch is a vm id to align with intel_allocator's needs. -- marcin > > Regards, > > Tvrtko > >>       __for_each_ctx(ctx, wrk, ctx_idx) { >>           /* link with vm */ >> -        ctx->xe.vm = wrk->xe.vm_list; >> +        ctx->vm = wrk->vm_list; >>           for_each_w_step(w, wrk) { >>               if (w->context != ctx_idx) >>                   continue; >> @@ -2846,7 +2900,7 @@ static void *run_workload(void *data) >>                   w_step_sync(w); >>                   syncobj_destroy(fd, w->xe.syncs[0].handle); >>                   free(w->xe.syncs); >> -                xe_vm_unbind_sync(fd, xe_get_vm(wrk, w)->id, 0, >> w->xe.exec.address, >> +                xe_vm_unbind_sync(fd, get_vm(wrk, w)->id, 0, >> w->xe.exec.address, >>                             w->bb_size); >>                   gem_munmap(w->xe.data, w->bb_size); >>                   gem_close(fd, w->bb_handle);