Date: Fri, 13 Jan 2023 11:38:53 +0000
From: Matthew Auld
To: Niranjana Vishwanathapura, igt-dev@lists.freedesktop.org
Cc: tvrtko.ursulin@intel.com, thomas.hellstrom@intel.com, daniel.vetter@intel.com, petri.latvala@intel.com
Subject: Re: [igt-dev] [PATCH i-g-t v9 17/19] lib/vm_bind: Add execbuf3 support to igt_spin_factory
References: <20221212231254.2303-1-niranjana.vishwanathapura@intel.com> <20221212231254.2303-18-niranjana.vishwanathapura@intel.com>
In-Reply-To: <20221212231254.2303-18-niranjana.vishwanathapura@intel.com>

On 12/12/2022 23:12, Niranjana Vishwanathapura wrote:
> In preparation for some VM_BIND+execbuf3 related tests,
> add execbuf3 support to igt_spin_factory.
>
> Only add the required support, which can be expanded as needed
> in the future.

Would it make sense to explicitly reject/assert anything that is not
currently supported? Is it just a case of rejecting some flags/modes, or
is it more complicated? Just to make it more obvious to future users
what is expected to work? (See the sketch at the bottom of this mail for
the kind of guard I have in mind.)

Reviewed-by: Matthew Auld

>
> Signed-off-by: Niranjana Vishwanathapura
> ---
> lib/igt_dummyload.c | 245 ++++++++++++++++++++++++++++++++++++++++++--
> lib/igt_dummyload.h | 8 ++
> 2 files changed, 246 insertions(+), 7 deletions(-)
>
> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
> index 17ae21f56..869002b8d 100644
> --- a/lib/igt_dummyload.c
> +++ b/lib/igt_dummyload.c
> @@ -35,11 +35,13 @@
> #include "i915/gem_engine_topology.h"
> #include "i915/gem_mman.h"
> #include "i915/gem_submission.h"
> +#include "i915/i915_vm_bind.h"
> #include "igt_aux.h"
> #include "igt_core.h"
> #include "igt_device.h"
> #include "igt_dummyload.h"
> #include "igt_gt.h"
> +#include "igt_syncobj.h"
> #include "igt_vgem.h"
> #include "intel_allocator.h"
> #include "intel_chipset.h"
> @@ -424,6 +426,214 @@ emit_recursive_batch(igt_spin_t *spin,
> return fence_fd;
> }
>
> +static int
> +emit_recursive_batch_execbuf3(igt_spin_t *spin, int fd,
> + const struct igt_spin_factory *opts)
> +{
> +#define SCRATCH 0
> +#define BATCH IGT_SPIN_BATCH
> + const unsigned int devid = intel_get_drm_devid(fd);
> + uint64_t addr, addr_scratch, ahnd = opts->ahnd;
> + struct drm_i915_gem_timeline_fence *out_fence;
> + const unsigned int gen = intel_gen(devid);
> + struct drm_i915_gem_execbuffer3 *execbuf3;
> + struct drm_i915_gem_exec_object2 *obj;
> + unsigned int flags[GEM_MAX_ENGINES];
> + uint64_t presumed_offset, delta;
> + unsigned int nengine;
> + int fence_fd = -1;
> + uint32_t *cs;
> + int i;
> +
> + igt_assert(ahnd);
> + igt_assert(i915_vm_bind_version(fd) == 1);
> + igt_assert(!(opts->flags & (IGT_SPIN_FENCE_OUT | IGT_SPIN_FENCE_IN |
> + IGT_SPIN_FENCE_SUBMIT)));
> + igt_assert(!(opts->ctx && opts->ctx_id));
> +
> + execbuf3 = memset(&spin->execbuf3, 0, sizeof(spin->execbuf3));
> + execbuf3->ctx_id = opts->ctx ? opts->ctx->id : opts->ctx_id;
> + obj = memset(spin->obj, 0, sizeof(spin->obj));
> +
> + nengine = 0;
> + if (opts->engine == ALL_ENGINES) {
> + struct intel_execution_engine2 *engine;
> +
> + igt_assert(opts->ctx);
> + for_each_ctx_engine(fd, opts->ctx, engine) {
> + if (opts->flags & IGT_SPIN_POLL_RUN &&
> + !gem_class_can_store_dword(fd, engine->class))
> + continue;
> +
> + flags[nengine++] = engine->flags;
> + }
> + } else {
> + flags[nengine++] = opts->engine;
> + }
> + igt_require(nengine);
> + spin->nengines = nengine;
> +
> + obj[BATCH].handle =
> + handle_create(fd, BATCH_SIZE, opts->flags, &spin->batch);
> + if (!spin->batch) {
> + spin->batch = gem_mmap__device_coherent(fd, obj[BATCH].handle,
> + 0, BATCH_SIZE, PROT_WRITE);
> + gem_set_domain(fd, obj[BATCH].handle,
> + I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
> + }
> + cs = spin->batch;
> +
> + addr = intel_allocator_alloc_with_strategy(ahnd, obj[BATCH].handle,
> + BATCH_SIZE, 0,
> + ALLOC_STRATEGY_LOW_TO_HIGH);
> + obj[BATCH].offset = CANONICAL(addr);
> + i915_vm_bind(fd, opts->vm_id, obj[BATCH].offset, obj[BATCH].handle, 0, BATCH_SIZE, 0, 0, 0);
> +
> + execbuf3->batch_address = obj[BATCH].offset;
> + addr += BATCH_SIZE;
> +
> + if (opts->dependency) {
> + igt_assert(!(opts->flags & IGT_SPIN_POLL_RUN));
> + addr_scratch = intel_allocator_alloc_with_strategy(ahnd, opts->dependency,
> + BATCH_SIZE, 0,
> + ALLOC_STRATEGY_LOW_TO_HIGH);
> +
> + obj[SCRATCH].handle = opts->dependency;
> + obj[SCRATCH].offset = CANONICAL(addr_scratch);
> + i915_vm_bind(fd, opts->vm_id, obj[SCRATCH].offset, obj[SCRATCH].handle, 0, BATCH_SIZE, 0, 0, 0);
> + } else if (opts->flags & IGT_SPIN_POLL_RUN) {
> + igt_assert(!opts->dependency);
> +
> + spin->poll_handle =
> + handle_create(fd, 4096, opts->flags, &spin->poll);
> + obj[SCRATCH].handle = spin->poll_handle;
> +
> + if (!spin->poll) {
> + if (__gem_set_caching(fd, spin->poll_handle,
> + I915_CACHING_CACHED) == 0)
> + spin->poll = gem_mmap__cpu(fd, spin->poll_handle,
> + 0, 4096,
> + PROT_READ | PROT_WRITE);
> + else
> + spin->poll = gem_mmap__device_coherent(fd,
> + spin->poll_handle,
> + 0, 4096,
> + PROT_READ | PROT_WRITE);
> + }
> +
> + addr = intel_allocator_alloc_with_strategy(ahnd,
> + spin->poll_handle,
> + BATCH_SIZE * 3, 0,
> + ALLOC_STRATEGY_LOW_TO_HIGH);
> + addr += 4096; /* guard page */
> + obj[SCRATCH].offset = CANONICAL(addr);
> + i915_vm_bind(fd, opts->vm_id, obj[SCRATCH].offset, obj[SCRATCH].handle, 0, 4096, 0, 0, 0);
> + addr += 4096;
> +
> + igt_assert_eq(spin->poll[SPIN_POLL_START_IDX], 0);
> +
> + presumed_offset = obj[SCRATCH].offset;
> + delta = sizeof(uint32_t) * SPIN_POLL_START_IDX;
> +
> + *cs++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
> +
> + if (gen >= 8) {
> + *cs++ = presumed_offset + delta;
> + *cs++ = (presumed_offset + delta) >> 32;
> + }
> +
> + *cs++ = 1;
> + }
> +
> + spin->handle = obj[BATCH].handle;
> +
> + igt_assert_lt(cs - spin->batch, LOOP_START_OFFSET / sizeof(*cs));
> + spin->condition = spin->batch + LOOP_START_OFFSET / sizeof(*cs);
> + cs = spin->condition;
> +
> + /* Allow ourselves to be preempted */
> + if (!(opts->flags & IGT_SPIN_NO_PREEMPTION))
> + *cs++ = MI_ARB_CHK;
> + if (opts->flags & IGT_SPIN_INVALID_CS) {
> + igt_assert(opts->ctx);
> + if (!gem_engine_has_cmdparser(fd, &opts->ctx->cfg, opts->engine))
> + *cs++ = 0xdeadbeef;
> + }
> +
> + /* Pad with a few nops so that we do not completely hog the system.
> + *
> + * Part of the attraction of using a recursive batch is that it is
> + * hard on the system (executing the "function" call is apparently
> + * quite expensive). However, the GPU may hog the entire system for
> + * a few minutes, preventing even NMI. Quite why this is so is unclear,
> + * but presumably it relates to the PM_INTRMSK workaround on gen6/gen7.
> + * If we give the system a break by having the GPU execute a few nops
> + * between function calls, that appears enough to keep SNB out of
> + * trouble. See https://bugs.freedesktop.org/show_bug.cgi?id=102262
> + */
> + if (!(opts->flags & IGT_SPIN_FAST))
> + cs += 960;
> +
> + /*
> + * When using a cmdparser, the batch is copied into a read only location
> + * and validated. We are then unable to alter the executing batch,
> + * breaking the older *spin->condition = MI_BB_END termination.
> + * Instead we can use a conditional MI_BB_END here that looks at
> + * the user's copy of the batch and terminates when they modified it,
> + * no matter how they modify it (from either the GPU or CPU).
> + *
> + * On Sandybridge+ the comparison is a strict greater-than:
> + * if the value at spin->condition is greater than BB_END,
> + * we loop back to the beginning.
> + * Beginning with Kabylake, we can select the comparison mode
> + * and loop back to the beginning if spin->condition != BB_END
> + * (using 5 << 12).
> + * For simplicity, we try to stick to a one-size fits all.
> + */
> + spin->condition = spin->batch + BATCH_SIZE / sizeof(*spin->batch) - 2;
> + spin->condition[0] = 0xffffffff;
> + spin->condition[1] = 0xffffffff;
> +
> + presumed_offset = obj[BATCH].offset;
> + delta = (spin->condition - spin->batch) * sizeof(*cs);
> + igt_assert(delta < 4096);
> +
> + *cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | 2;
> + *cs++ = MI_BATCH_BUFFER_END;
> + *cs++ = presumed_offset + delta;
> + *cs++ = presumed_offset >> 32;
> +
> + /* recurse */
> + presumed_offset = obj[BATCH].offset;
> + delta = LOOP_START_OFFSET;
> +
> + if (gen >= 8) {
> + *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
> + *cs++ = presumed_offset + delta;
> + *cs++ = (presumed_offset + delta) >> 32;
> + }
> +
> + out_fence = memset(&spin->exec3_out_syncobj, 0, sizeof(*out_fence));
> + out_fence->flags = I915_TIMELINE_FENCE_SIGNAL;
> + execbuf3->fence_count = 1;
> + for (i = 0; i < nengine; i++) {
> + spin->out_syncobj[i] = syncobj_create(fd, 0);
> + out_fence->handle = spin->out_syncobj[i];
> +
> + execbuf3->engine_idx = flags[i];
> + execbuf3->timeline_fences = to_user_pointer(out_fence);
> +
> + gem_execbuf3(fd, execbuf3);
> + }
> +
> + igt_assert_lt(cs - spin->batch, BATCH_SIZE / sizeof(*cs));
> +
> + spin->cmd_precondition = *spin->condition;
> + spin->opts = *opts;
> +
> + return fence_fd;
> +}
> +
> static igt_spin_t *
> spin_create(int fd, const struct igt_spin_factory *opts)
> {
> @@ -433,8 +643,10 @@ spin_create(int fd, const struct igt_spin_factory *opts)
> igt_assert(spin);
>
> spin->timerfd = -1;
> - spin->out_fence = emit_recursive_batch(spin, fd, opts);
> -
> + if (opts->flags & IGT_SPIN_EXECBUF3)
> + emit_recursive_batch_execbuf3(spin, fd, opts);
> + else
> + spin->out_fence = emit_recursive_batch(spin, fd, opts);
> pthread_mutex_lock(&list_lock);
> igt_list_add(&spin->link, &spin_list);
> pthread_mutex_unlock(&list_lock);
> @@ -464,6 +676,7 @@ igt_spin_t *
> igt_spin_factory(int fd, const struct igt_spin_factory *opts)
> {
> igt_spin_t *spin;
> + int i = 0;
>
> if ((opts->flags & IGT_SPIN_POLL_RUN) && opts->engine != ALL_ENGINES) {
> unsigned int class;
> @@ -481,11 +694,17 @@ igt_spin_factory(int fd, const struct igt_spin_factory *opts)
>
> spin = spin_create(fd, opts);
>
> - if (!(opts->flags & IGT_SPIN_INVALID_CS)) {
> - /*
> - * When injecting invalid CS into the batch, the spinner may
> - * be killed immediately -- i.e. may already be completed!
> - */
> + /*
> + * When injecting invalid CS into the batch, the spinner may
> + * be killed immediately -- i.e. may already be completed!
> + */
> + if (opts->flags & IGT_SPIN_INVALID_CS)
> + return spin;
> +
> + if (opts->flags & IGT_SPIN_EXECBUF3) {
> + while (spin->out_syncobj[i])
> + igt_assert(syncobj_busy(fd, spin->out_syncobj[i++]));
> + } else {
> igt_assert(gem_bo_busy(fd, spin->handle));
> if (opts->flags & IGT_SPIN_FENCE_OUT) {
> struct pollfd pfd = { spin->out_fence, POLLIN };
> @@ -595,6 +814,8 @@ void igt_spin_end(igt_spin_t *spin)
>
> static void __igt_spin_free(int fd, igt_spin_t *spin)
> {
> + int i = 0;
> +
> if (spin->timerfd >= 0) {
> pthread_cancel(spin->timer_thread);
> igt_assert(pthread_join(spin->timer_thread, NULL) == 0);
> @@ -608,6 +829,10 @@ static void __igt_spin_free(int fd, igt_spin_t *spin)
> if (spin->batch)
> gem_munmap(spin->batch, BATCH_SIZE);
>
> + if ((spin->poll_handle || spin->opts.dependency) &&
> + (spin->opts.flags & IGT_SPIN_EXECBUF3))
> + i915_vm_unbind(fd, spin->opts.vm_id, spin->obj[SCRATCH].offset, 4096);
> +
> if (spin->poll_handle) {
> gem_close(fd, spin->poll_handle);
> if (spin->opts.ahnd)
> @@ -615,6 +840,9 @@ static void __igt_spin_free(int fd, igt_spin_t *spin)
> }
>
> if (spin->handle) {
> + if (spin->opts.flags & IGT_SPIN_EXECBUF3)
> + i915_vm_unbind(fd, spin->opts.vm_id, spin->obj[BATCH].offset, BATCH_SIZE);
> +
> gem_close(fd, spin->handle);
> if (spin->opts.ahnd)
> intel_allocator_free(spin->opts.ahnd, spin->handle);
> @@ -623,6 +851,9 @@ static void __igt_spin_free(int fd, igt_spin_t *spin)
> if (spin->out_fence >= 0)
> close(spin->out_fence);
>
> + while (spin->out_syncobj[i])
> + syncobj_destroy(fd, spin->out_syncobj[i++]);
> +
> free(spin);
> }
>
> diff --git a/lib/igt_dummyload.h b/lib/igt_dummyload.h
> index b33507971..aacd61dc3 100644
> --- a/lib/igt_dummyload.h
> +++ b/lib/igt_dummyload.h
> @@ -31,6 +31,7 @@
> #include "igt_core.h"
> #include "igt_list.h"
> #include "i915_drm.h"
> +#include "i915/i915_drm_local.h"
> #include "intel_ctx.h"
>
>
> @@ -48,6 +49,7 @@
> typedef struct igt_spin_factory {
> uint32_t ctx_id;
> const intel_ctx_t *ctx;
> + uint32_t vm_id;
> uint32_t dependency;
> unsigned int engine;
> unsigned int flags;
> @@ -78,6 +80,11 @@ typedef struct igt_spin {
> #define IGT_SPIN_BATCH 1
> struct drm_i915_gem_execbuffer2 execbuf;
>
> + struct drm_i915_gem_execbuffer3 execbuf3;
> + uint32_t out_syncobj[GEM_MAX_ENGINES];
> + struct drm_i915_gem_timeline_fence exec3_out_syncobj;
> + unsigned int nengines;
> +
> unsigned int flags;
> #define SPIN_CLFLUSH (1 << 0)
>
> @@ -94,6 +101,7 @@ typedef struct igt_spin {
> #define IGT_SPIN_INVALID_CS (1 << 6)
> #define IGT_SPIN_USERPTR (1 << 7)
> #define IGT_SPIN_SOFTDEP (1 << 8)
> +#define IGT_SPIN_EXECBUF3 (1 << 9)
>
> igt_spin_t *
> __igt_spin_factory(int fd, const igt_spin_factory_t *opts);
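
To make the reject/assert suggestion above a bit more concrete, I had
something like the below in mind for the top of
emit_recursive_batch_execbuf3(). The allow-list is only my guess at what
the new path currently handles, so treat it as a sketch rather than the
definitive set:

  /*
   * Hypothetical allow-list of the spin flags the execbuf3 path
   * implements today; adjust to match reality. Everything else trips
   * a loud assert instead of silently doing the wrong thing.
   */
  #define EXEC3_SUPPORTED_SPIN_FLAGS \
          (IGT_SPIN_EXECBUF3 | IGT_SPIN_POLL_RUN | IGT_SPIN_FAST | \
           IGT_SPIN_NO_PREEMPTION | IGT_SPIN_INVALID_CS)

          igt_assert_f(!(opts->flags & ~EXEC3_SUPPORTED_SPIN_FLAGS),
                       "spin flags 0x%x not supported on the execbuf3 path\n",
                       opts->flags & ~EXEC3_SUPPORTED_SPIN_FLAGS);

That would also subsume the existing FENCE_OUT/FENCE_IN/FENCE_SUBMIT
assert, and a future user combining an unsupported flag with
IGT_SPIN_EXECBUF3 would get a useful message rather than a stray failure
further down the line.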
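For the record, this is roughly how I'd expect a test to drive the new
path once the series lands. The ctx/vm_id/engine setup is elided and
follows my reading of the earlier patches in the series, so purely
illustrative:

  /* Sketch: spinner submitted via execbuf3 on a vm_bind-mode context. */
  uint64_t ahnd = intel_allocator_open(fd, 0, INTEL_ALLOCATOR_SIMPLE);
  igt_spin_t *spin;

  spin = igt_spin_new(fd,
                      .ahnd = ahnd,
                      .ctx = ctx,        /* vm_bind-mode context, from test setup */
                      .vm_id = vm_id,    /* VM the batch gets bound into */
                      .engine = e->flags,
                      .flags = IGT_SPIN_EXECBUF3 | IGT_SPIN_POLL_RUN);
  igt_spin_busywait_until_started(spin);

  /* ... exercise whatever the test actually cares about ... */

  igt_spin_end(spin);
  igt_spin_free(fd, spin);        /* unbinds the vmas, destroys the syncobjs */
  intel_allocator_close(ahnd);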